MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: ExtractFromPDBFiles.pl,v $
   4 # $Date: 2011/12/16 00:03:30 $
   5 # $Revision: 1.32 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2012 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use PDBFileUtil;
  38 use AminoAcids;
  39 use SequenceFileUtil;
  40 
  41 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  42 
  43 # Autoflush STDOUT
  44 $| = 1;
  45 
  46 # Starting message...
  47 $ScriptName = basename($0);
  48 print "\n$ScriptName: Starting...\n\n";
  49 $StartTime = new Benchmark;
  50 
  51 # Get the options and setup script...
  52 SetupScriptUsage();
  53 if ($Options{help} || @ARGV < 1) {
  54   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  55 }
  56 
  57 my(@PDBFilesList);
  58 @PDBFilesList = ExpandFileNames(\@ARGV, "pdb");
  59 
  60 # Process options...
  61 print "Processing options...\n";
  62 my(%OptionsInfo);
  63 ProcessOptions();
  64 
  65 # Setup information about input files...
  66 print "Checking input PDB file(s)...\n";
  67 my(%PDBFilesInfo);
  68 RetrievePDBFilesInfo();
  69 
  70 # Process input files..
  71 my($FileIndex);
  72 if (@PDBFilesList > 1) {
  73   print "\nProcessing PDB files...\n";
  74 }
  75 for $FileIndex (0 .. $#PDBFilesList) {
  76   if ($PDBFilesInfo{FileOkay}[$FileIndex]) {
  77     print "\nProcessing file $PDBFilesList[$FileIndex]...\n";
  78     ExtractFromPDBFiles($FileIndex);
  79   }
  80 }
  81 print "\n$ScriptName:Done...\n\n";
  82 
  83 $EndTime = new Benchmark;
  84 $TotalTime = timediff ($EndTime, $StartTime);
  85 print "Total time: ", timestr($TotalTime), "\n";
  86 
  87 ###############################################################################
  88 
  89 # Extract appropriate information...
  90 sub ExtractFromPDBFiles {
  91   my($FileIndex) = @_;
  92   my($PDBFile, $PDBRecordLinesRef);
  93 
  94   # Get PDB data...
  95   $PDBFile = $PDBFilesList[$FileIndex];
  96   $PDBRecordLinesRef = ReadPDBFile($PDBFile);
  97 
  98   if ($OptionsInfo{Mode} =~ /Chains/i) {
  99     ExtractChains($FileIndex, $PDBRecordLinesRef);
 100   }
 101   elsif ($OptionsInfo{Mode} =~ /Sequences/i) {
 102     ExtractSequences($FileIndex, $PDBRecordLinesRef);
 103   }
 104   elsif ($OptionsInfo{Mode} =~ /Atoms/i) {
 105     ExtractAtoms($FileIndex, $PDBRecordLinesRef);
 106   }
 107   elsif ($OptionsInfo{Mode} =~ /CAlphas/i) {
 108     ExtractCAlphas($FileIndex, $PDBRecordLinesRef);
 109   }
 110   elsif ($OptionsInfo{Mode} =~ /^(ResidueNums|ResiduesRange)$/i) {
 111     ExtractByResidues($FileIndex, $PDBRecordLinesRef);
 112   }
 113   elsif ($OptionsInfo{Mode} =~ /Distance/i) {
 114     ExtractByDistance($FileIndex, $PDBRecordLinesRef);
 115   }
 116   elsif ($OptionsInfo{Mode} =~ /NonWater/i) {
 117     ExtractNonWaterRecords($FileIndex, $PDBRecordLinesRef);
 118   }
 119   elsif ($OptionsInfo{Mode} =~ /NonHydrogens/i) {
 120     ExtractNonHydrogenRecords($FileIndex, $PDBRecordLinesRef);
 121   }
 122 }
 123 
 124 # Extract chains and generate new PDB files...
 125 #
 126 sub ExtractChains {
 127   my($FileIndex, $PDBRecordLinesRef) = @_;
 128   my($ChainIndex, $ChainID, $ChainLabel, $PDBFileName, $RecordLine, $ChainsAndResiduesInfoRef, $AtomNumber, $AtomName, $ResidueName, $AtomChainID, $ResidueNumber, $AlternateLocation, $InsertionCode, $ConectRecordLinesRef, %ChainAtomNumbersMap);
 129 
 130   # Get chains and residues data...
 131   $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', 0, 1);
 132 
 133   if ($OptionsInfo{CombineChains}) {
 134     $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
 135     print "Generating PDBFileName file $PDBFileName...\n";
 136 
 137     open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
 138 
 139     # Write out header and other older recors...
 140     WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
 141   }
 142 
 143   for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
 144     $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];
 145     $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex];
 146 
 147     if (!$OptionsInfo{CombineChains}) {
 148       $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex];
 149       print "Generating PDBFileName file $PDBFileName...\n";
 150 
 151       open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
 152 
 153       # Write out header and other older recors...
 154       WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
 155     }
 156 
 157     # Write out ATOM/HETATM line for chain and collect all ATOM/HETATM serial numbers
 158     # for writing out appropriate CONECT records...
 159     %ChainAtomNumbersMap = ();
 160     for $RecordLine (@{$ChainsAndResiduesInfoRef->{Lines}{$ChainID}}) {
 161       print OUTFILE "$RecordLine\n";
 162       ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode) = ParseAtomRecordLine($RecordLine);
 163       $AtomNumber = int $AtomNumber;
 164       $ChainAtomNumbersMap{$AtomNumber} = $AtomName;
 165     }
 166     # Write out TER record using information from last chain record...
 167     $AtomNumber += 1;
 168     print OUTFILE GenerateTerRecordLine($AtomNumber, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode), "\n";
 169 
 170     # Write out CONECT records...
 171     $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%ChainAtomNumbersMap);
 172 
 173     for $RecordLine (@{$ConectRecordLinesRef}) {
 174       print OUTFILE "$RecordLine\n";
 175     }
 176 
 177     if (!$OptionsInfo{CombineChains}) {
 178       # Write out END record...
 179       print OUTFILE GenerateEndRecordLine(), "\n";
 180 
 181       close OUTFILE;
 182     }
 183   }
 184 
 185   if ($OptionsInfo{CombineChains}) {
 186     # Write out END record...
 187     print OUTFILE GenerateEndRecordLine(), "\n";
 188 
 189     close OUTFILE;
 190   }
 191 
 192 }
 193 
 194 
 195 # Extract sequences for individual chains or combine all the chains...
 196 sub ExtractSequences {
 197   my($FileIndex, $PDBRecordLinesRef) = @_;
 198   my($ChainIndex, $ChainID, $ChainLabel, $SequenceFileName, $Residue, $ResidueCode, $StandardResidue, $ChainSequence, $WrappedChainSequence, $ChainSequenceID, $ChainsAndResiduesInfoRef, $ChainResiduesRef, %ChainSequencesDataMap);
 199 
 200   if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) {
 201     $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes');
 202   }
 203   else {
 204     $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef);
 205   }
 206 
 207   # Generate sequence data for all the chains...
 208   %ChainSequencesDataMap = ();
 209   @{$ChainSequencesDataMap{IDs}} = ();
 210   %{$ChainSequencesDataMap{Sequence}} = ();
 211   %{$ChainSequencesDataMap{Description}} = ();
 212 
 213   for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
 214     $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];
 215     $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex];
 216 
 217     # Setup sequence ID...
 218     $ChainSequenceID = $PDBFilesInfo{ChainSequenceIDs}[$FileIndex][$ChainIndex];
 219     push @{$ChainSequencesDataMap{IDs}}, $ChainSequenceID;
 220     $ChainSequencesDataMap{Description}{$ChainID} = $ChainSequenceID;
 221 
 222     # Collect sequence data for the chain...
 223     if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes/i) {
 224       $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
 225     }
 226     else {
 227       $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
 228     }
 229     # Setup sequence data...
 230     $ChainSequence = '';
 231     RESIDUE: for $Residue (@{$ChainResiduesRef}) {
 232       ($ResidueCode, $StandardResidue) = GetResidueCode($Residue);
 233       if (!$StandardResidue) {
 234         if ($OptionsInfo{KeepNonStandardSequences}) {
 235           $ResidueCode = $OptionsInfo{NonStandardSequenceCode};
 236           warn "Warning: Keeping nonstandard residue $Residue in $ChainLabel...\n";
 237         }
 238         else {
 239           warn "Warning: Ignoring nonstandard residue $Residue in $ChainLabel...\n";
 240           next RESIDUE;
 241         }
 242       }
 243       $ChainSequence .= $ResidueCode;
 244     }
 245     $ChainSequencesDataMap{Sequence}{$ChainID} = $ChainSequence;
 246 
 247   }
 248 
 249   # Write out the sequence files...
 250   my($SequenceID, $SequenceDescription, $Sequence, %SequencesDataMap );
 251   if ($OptionsInfo{CombineChainSequences}) {
 252     # Combine all the chain sequences...
 253     $Sequence = '';
 254     for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
 255       $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];
 256 
 257       $Sequence .= $ChainSequencesDataMap{Sequence}{$ChainID};
 258     }
 259     $SequenceID = $PDBFilesInfo{ChainSequenceIDsPrefix}[$FileIndex][0] . "_CombinedChains|PDB";;
 260     $SequenceDescription = $SequenceID;
 261     $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
 262 
 263     print "Generating sequence file $SequenceFileName...\n";
 264     %SequencesDataMap = ();
 265     @{$SequencesDataMap{IDs}} = ();
 266     %{$SequencesDataMap{Sequence}} = ();
 267     %{$SequencesDataMap{Description}} = ();
 268 
 269     push @{$SequencesDataMap{IDs}}, $SequenceID;
 270     $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription;
 271     $SequencesDataMap{Sequence}{$SequenceID} = $Sequence;
 272 
 273     WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength});
 274   }
 275   else {
 276     # For each specifed chain, write out the sequences...
 277     for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
 278       $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];
 279 
 280       $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex];
 281 
 282       $SequenceID = $ChainSequencesDataMap{IDs}[$ChainIndex];
 283       $SequenceDescription = $ChainSequencesDataMap{Description}{$ChainID};
 284       $Sequence = $ChainSequencesDataMap{Sequence}{$ChainID};
 285 
 286       print "Generating sequence file $SequenceFileName...\n";
 287       %SequencesDataMap = ();
 288       @{$SequencesDataMap{IDs}} = ();
 289       %{$SequencesDataMap{Sequence}} = ();
 290       %{$SequencesDataMap{Description}} = ();
 291 
 292       push @{$SequencesDataMap{IDs}}, $SequenceID;
 293       $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription;
 294       $SequencesDataMap{Sequence}{$SequenceID} = $Sequence;
 295 
 296       WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength});
 297     }
 298   }
 299 }
 300 
 301 # Extract atoms...
 302 sub ExtractAtoms {
 303   my($FileIndex, $PDBRecordLinesRef) = @_;
 304   my($PDBFileName,  $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $ConectRecordLinesRef, %AtomNumbersMap);
 305 
 306   $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
 307   print "Generating PDBFileName file $PDBFileName...\n";
 308   open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
 309 
 310   # Write out header and other older recors...
 311   WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
 312 
 313   # Write out all ATOM records along with TER and model records to indicate
 314   # chains and multiple models..
 315   %AtomNumbersMap = ();
 316   $ChainRecordCount = 0;
 317   for $RecordLine (@{$PDBRecordLinesRef}) {
 318     if (IsAtomRecordType($RecordLine)) {
 319       $ChainRecordCount++;
 320       print OUTFILE "$RecordLine\n";
 321       ($AtomNumber, $AtomName) = ParseAtomRecordLine($RecordLine);
 322 
 323       # Store the atom number for extracting CONECT records...
 324       $AtomNumber = int $AtomNumber;
 325       $AtomNumbersMap{$AtomNumber} = $AtomName;
 326     }
 327     elsif (IsTerRecordType($RecordLine)) {
 328       if ($ChainRecordCount) {
 329         print OUTFILE GenerateTerRecordLine(), "\n";
 330       }
 331       $ChainRecordCount = 0;
 332     }
 333     elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
 334       print OUTFILE "$RecordLine\n";
 335     }
 336   }
 337 
 338   # Write out appropriate CONECT records...
 339   $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
 340   for $RecordLine (@{$ConectRecordLinesRef}) {
 341     print OUTFILE "$RecordLine\n";
 342   }
 343 
 344   # Write out END record...
 345   print OUTFILE GenerateEndRecordLine(), "\n";
 346 
 347   close OUTFILE;
 348 }
 349 
 350 # Extract CAlphas...
 351 sub ExtractCAlphas {
 352   my($FileIndex, $PDBRecordLinesRef) = @_;
 353   my($PDBFileName,  $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $ConectRecordLinesRef, %AtomNumbersMap);
 354 
 355   $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
 356   print "Generating PDBFileName file $PDBFileName...\n";
 357   open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
 358 
 359   # Write out header and other older recors...
 360   WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
 361 
 362   # Write out all ATOM records for CAlphas along with TER and model records to indicate
 363   # chains and multiple models..
 364   %AtomNumbersMap = ();
 365   $ChainRecordCount = 0;
 366   for $RecordLine (@{$PDBRecordLinesRef}) {
 367     if (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) {
 368       ($AtomNumber, $AtomName) = ParseAtomRecordLine($RecordLine);
 369       if ($AtomName =~ /^CA$/i) {
 370         $ChainRecordCount++;
 371         print OUTFILE "$RecordLine\n";
 372         $AtomNumber = int $AtomNumber;
 373         $AtomNumbersMap{$AtomNumber} = $AtomName;
 374       }
 375     }
 376     elsif (IsTerRecordType($RecordLine)) {
 377       if ($ChainRecordCount) {
 378         print OUTFILE GenerateTerRecordLine(), "\n";
 379       }
 380       $ChainRecordCount = 0;
 381     }
 382     elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
 383       print OUTFILE "$RecordLine\n";
 384     }
 385   }
 386 
 387   # Write out appropriate CONECT records...
 388   $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
 389   for $RecordLine (@{$ConectRecordLinesRef}) {
 390     print OUTFILE "$RecordLine\n";
 391   }
 392   # Write out END record...
 393   print OUTFILE GenerateEndRecordLine(), "\n";
 394 
 395   close OUTFILE;
 396 }
 397 
 398 # Extract residues...
 399 sub ExtractByResidues {
 400   my($FileIndex, $PDBRecordLinesRef) = @_;
 401   my($PDBFileName,  $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $ConectRecordLinesRef, $IgnoreRecord, %AtomNumbersMap);
 402 
 403   $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
 404   print "Generating PDBFileName file $PDBFileName...\n";
 405   open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
 406 
 407   # Write out header and other older recors...
 408   WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
 409 
 410   # Write out all ATOM records for specified residues with TER and model records to indicate
 411   # chains and multiple models...
 412   %AtomNumbersMap = ();
 413   $ChainRecordCount = 0;
 414   for $RecordLine (@{$PDBRecordLinesRef}) {
 415     if (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) {
 416       ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber) = ParseAtomRecordLine($RecordLine);
 417 
 418       # Check residue numbers...
 419       $IgnoreRecord = 1;
 420       if ($OptionsInfo{Mode} =~ /^ResidueNums$/i) {
 421         if (exists $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNumber}) {
 422           $IgnoreRecord = 0;
 423         }
 424       }
 425       elsif ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) {
 426         if ($ResidueNumber >= $OptionsInfo{SpecifiedStartResidueNum} && $ResidueNumber <= $OptionsInfo{SpecifiedEndResidueNum}) {
 427           $IgnoreRecord = 0;
 428         }
 429       }
 430       if (!$IgnoreRecord) {
 431         $ChainRecordCount++;
 432         print OUTFILE "$RecordLine\n";
 433         $AtomNumber = int $AtomNumber;
 434         $AtomNumbersMap{$AtomNumber} = $AtomName;
 435       }
 436     }
 437     elsif (IsTerRecordType($RecordLine)) {
 438       if ($ChainRecordCount) {
 439         print OUTFILE GenerateTerRecordLine(), "\n";
 440       }
 441       $ChainRecordCount = 0;
 442     }
 443     elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
 444       print OUTFILE "$RecordLine\n";
 445     }
 446   }
 447 
 448   # Write out appropriate CONECT records...
 449   $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
 450   for $RecordLine (@{$ConectRecordLinesRef}) {
 451     print OUTFILE "$RecordLine\n";
 452   }
 453   # Write out END record...
 454   print OUTFILE GenerateEndRecordLine(), "\n";
 455 
 456   close OUTFILE;
 457 }
 458 
 459 # Extract non water records...
 460 sub ExtractNonWaterRecords {
 461   my($FileIndex, $PDBRecordLinesRef) = @_;
 462   my($PDBFileName,  $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ConectRecordLinesRef, %AtomNumbersMap);
 463 
 464   $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
 465   print "Generating PDBFileName file $PDBFileName...\n";
 466   open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
 467 
 468   # Write out header and other older recors...
 469   WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
 470 
 471   # Write out all ATOM/HETATM non water records along with TER and model records to indicate
 472   # chains and multiple models..
 473   %AtomNumbersMap = ();
 474   $ChainRecordCount = 0;
 475   for $RecordLine (@{$PDBRecordLinesRef}) {
 476     if (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) {
 477       ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName) = ParseAtomRecordLine($RecordLine);
 478       if (! exists $OptionsInfo{SpecifiedWaterResiduesMap}{$ResidueName} ) {
 479         $ChainRecordCount++;
 480         print OUTFILE "$RecordLine\n";
 481         $AtomNumber = int $AtomNumber;
 482         $AtomNumbersMap{$AtomNumber} = $AtomName;
 483       }
 484     }
 485     elsif (IsTerRecordType($RecordLine)) {
 486       if ($ChainRecordCount) {
 487         print OUTFILE GenerateTerRecordLine(), "\n";
 488       }
 489       $ChainRecordCount = 0;
 490     }
 491     elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
 492       print OUTFILE "$RecordLine\n";
 493     }
 494   }
 495 
 496   # Write out appropriate CONECT records...
 497   $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
 498   for $RecordLine (@{$ConectRecordLinesRef}) {
 499     print OUTFILE "$RecordLine\n";
 500   }
 501   # Write out END record...
 502   print OUTFILE GenerateEndRecordLine(), "\n";
 503 
 504   close OUTFILE;
 505 }
 506 
 507 # Extract non hydrogen records...
 508 sub ExtractNonHydrogenRecords {
 509   my($FileIndex, $PDBRecordLinesRef) = @_;
 510   my($PDBFileName,  $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, $ConectRecordLinesRef, %AtomNumbersMap);
 511 
 512   $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
 513   print "Generating PDBFileName file $PDBFileName...\n";
 514   open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
 515 
 516   # Write out header and other older recors...
 517   WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
 518 
 519   # Write out all ATOM/HETATM non hydrogen records along with TER and model records to indicate
 520   # chains and multiple models..
 521   %AtomNumbersMap = ();
 522   $ChainRecordCount = 0;
 523   for $RecordLine (@{$PDBRecordLinesRef}) {
 524     if (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) {
 525       ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine);
 526       if ($ElementSymbol !~ /^H$/i) {
 527         $ChainRecordCount++;
 528         print OUTFILE "$RecordLine\n";
 529         $AtomNumber = int $AtomNumber;
 530         $AtomNumbersMap{$AtomNumber} = $AtomName;
 531       }
 532     }
 533     elsif (IsTerRecordType($RecordLine)) {
 534       if ($ChainRecordCount) {
 535         print OUTFILE GenerateTerRecordLine(), "\n";
 536       }
 537       $ChainRecordCount = 0;
 538     }
 539     elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
 540       print OUTFILE "$RecordLine\n";
 541     }
 542   }
 543 
 544   # Write out appropriate CONECT records...
 545   $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
 546   for $RecordLine (@{$ConectRecordLinesRef}) {
 547     print OUTFILE "$RecordLine\n";
 548   }
 549   # Write out END record...
 550   print OUTFILE GenerateEndRecordLine(), "\n";
 551 
 552   close OUTFILE;
 553 }
 554 
 555 # Extract ATOM/HETATM records by distance...
 556 sub ExtractByDistance {
 557   my($FileIndex, $PDBRecordLinesRef) = @_;
 558   my($PDBFileName,  $RecordLine, $ChainRecordCount, $ConectRecordLinesRef, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, @OriginCoords, @Coords, %AtomNumbersMap);
 559 
 560   $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
 561   print "Generating PDBFileName file $PDBFileName...\n";
 562   open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
 563 
 564   # Write out header and other older recors...
 565   WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
 566 
 567   # Setup coordinates of origin to calculate distance...
 568   @OriginCoords = ();
 569   push @OriginCoords, @{$PDBFilesInfo{DistanceOrigin}[$FileIndex]};
 570 
 571   # Write out all ATOM records for which meet specified criteria along with TER and model records to indicate
 572   # chains and multiple models...
 573   %AtomNumbersMap = ();
 574   $ChainRecordCount = 0;
 575   for $RecordLine (@{$PDBRecordLinesRef}) {
 576     if (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) {
 577       ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine);
 578       @Coords = (); push @Coords, ($X, $Y, $Z);
 579       if (CheckDistance(\@Coords, \@OriginCoords)) {
 580         $ChainRecordCount++;
 581         print OUTFILE "$RecordLine\n";
 582         $AtomNumber = int $AtomNumber;
 583         $AtomNumbersMap{$AtomNumber} = $AtomName;
 584       }
 585     }
 586     elsif (IsTerRecordType($RecordLine)) {
 587       if ($ChainRecordCount) {
 588         print OUTFILE GenerateTerRecordLine(), "\n";
 589       }
 590       $ChainRecordCount = 0;
 591     }
 592     elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
 593       print OUTFILE "$RecordLine\n";
 594     }
 595   }
 596 
 597   # Write out appropriate CONECT records...
 598   $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
 599   for $RecordLine (@{$ConectRecordLinesRef}) {
 600     print OUTFILE "$RecordLine\n";
 601   }
 602 
 603   # Write out END record...
 604   print OUTFILE GenerateEndRecordLine(), "\n";
 605 
 606   close OUTFILE;
 607 }
 608 
 609 # Does record meets distance citerion specified by the user?
 610 sub CheckDistance {
 611   my($CoordsRef, $OriginCoordsRef) = @_;
 612   my($Status, $Index, $Distance, $DistanceSquare);
 613 
 614   $Status = 0;
 615 
 616   if ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) {
 617     # Go over coordinates of all the atoms in the residue...
 618     my($ResidueCoordsCount) = scalar @{$OriginCoordsRef};
 619     INDEX: for ($Index = 0; $Index < $ResidueCoordsCount; $Index += 3) {
 620       $DistanceSquare = ($CoordsRef->[0] - $OriginCoordsRef->[$Index])**2 + ($CoordsRef->[1] - $OriginCoordsRef->[$Index + 1])**2 + ($CoordsRef->[2] - $OriginCoordsRef->[$Index + 2])**2;
 621       $Distance = sqrt $DistanceSquare;
 622       if ($Distance <= $OptionsInfo{MaxExtractionDistance}) {
 623         $Status = 1;
 624         last INDEX;
 625       }
 626     }
 627   }
 628   else {
 629     $DistanceSquare = 0;
 630     for $Index (0 .. 2) {
 631       $DistanceSquare += ($CoordsRef->[$Index] - $OriginCoordsRef->[$Index])**2;
 632     }
 633     $Distance = sqrt $DistanceSquare;
 634     $Status = ($Distance <= $OptionsInfo{MaxExtractionDistance}) ? 1 : 0;
 635   }
 636 
 637   return $Status;
 638 }
 639 
 640 # Write out modifed header and other older records...
 641 sub WriteHeaderAndOlderRecords {
 642   my($OutFileRef, $PDBRecordLinesRef) = @_;
 643 
 644   if ($OptionsInfo{ModifyHeaderRecord}) {
 645     # Write out modified HEADER record...
 646     my($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef);
 647     $Classification = 'Data extracted using MayaChemTools';
 648     print $OutFileRef GenerateHeaderRecordLine($IDCode, $Classification), "\n";
 649   }
 650   else {
 651     print $OutFileRef $PDBRecordLinesRef->[0], "\n";
 652   }
 653 
 654   # Write out any old records...
 655   if ($OptionsInfo{KeepOldRecords}) {
 656     my($RecordLineIndex, $RecordLine);
 657     # Skip HEADER record and write out older records all the way upto first MODEL/ATOM/HETATM records from input file...
 658     RECORDLINE: for $RecordLineIndex (1 .. $#{$PDBRecordLinesRef}) {
 659       $RecordLine = $PDBRecordLinesRef->[$RecordLineIndex];
 660       if (IsModelRecordType($RecordLine) || IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) {
 661         last RECORDLINE;
 662       }
 663       print $OutFileRef "$RecordLine\n";
 664     }
 665   }
 666 }
 667 
 668 # Get header record information assuming it's the first record...
 669 sub GetHeaderRecordInformation {
 670   my($PDBRecordLinesRef) = @_;
 671   my($Classification, $DepositionDate, $IDCode, $HeaderRecordLine);
 672 
 673   ($Classification, $DepositionDate, $IDCode) = ('') x 3;
 674   $HeaderRecordLine = $PDBRecordLinesRef->[0];
 675   if (IsHeaderRecordType($HeaderRecordLine)) {
 676     ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($HeaderRecordLine);
 677   }
 678   return ($Classification, $DepositionDate, $IDCode);
 679 }
 680 
 681 # Get one letter residue code...
 682 sub GetResidueCode {
 683   my($ResidueName) = @_;
 684   my($ResidueCode, $StandardResidue);
 685 
 686   $ResidueCode = $OptionsInfo{NonStandardSequenceCode};
 687   $StandardResidue = 0;
 688 
 689   if (length($ResidueName) == 3) {
 690     # Assume it's an amino acid...
 691     if (AminoAcids::IsAminoAcid($ResidueName)) {
 692       # Standard amino acid...
 693       $ResidueCode = AminoAcids::GetAminoAcidOneLetterCode($ResidueName);
 694       $StandardResidue = 1;
 695     }
 696   }
 697   elsif (length($ResidueName) == 1) {
 698     # Assume it's a nucleic acid...
 699     if ($ResidueName =~ /^(A|G|T|U|C)$/i) {
 700       $ResidueCode = $ResidueName;
 701       $StandardResidue = 1;
 702     }
 703   }
 704 
 705   return ($ResidueCode, $StandardResidue);
 706 }
 707 
 708 # Process option values...
 709 sub ProcessOptions {
 710   %OptionsInfo = ();
 711   $OptionsInfo{Mode} = $Options{mode};
 712 
 713   my(@SpecifiedChains) = ();
 714   if ($Options{chains} =~ /^(First|All)$/i) {
 715     $OptionsInfo{ChainsToExtract} = $Options{chains};
 716   }
 717   else {
 718     @SpecifiedChains = split /\,/, $Options{chains};
 719     $OptionsInfo{ChainsToExtract} = 'Specified';
 720   }
 721   @{$OptionsInfo{SpecifiedChains}} = ();
 722   push @{$OptionsInfo{SpecifiedChains}}, @SpecifiedChains;
 723 
 724   $OptionsInfo{CombineChains} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0;
 725 
 726   $OptionsInfo{CombineChainSequences} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0;
 727 
 728   my($ResidueNum, $StartResidueNum, $EndResNum, @SpecifiedResidueNumsList);
 729 
 730   @SpecifiedResidueNumsList = ();
 731   ($StartResidueNum, $EndResNum) = (0, 0);
 732 
 733   if ($OptionsInfo{Mode} =~ /^(ResidueNums|ResiduesRange)$/i) {
 734     if (!$Options{residues}) {
 735       die "Error: You must specify a value for \"--Residues\" option in \"ResidueNums or ResiduesRange\" \"-m, --mode\". \n";
 736     }
 737     $OptionsInfo{Residues} = $Options{residues};
 738     my($ResidueNum);
 739 
 740     $OptionsInfo{Residues} =~ s/ //g;
 741 
 742     @SpecifiedResidueNumsList = split /\,/, $OptionsInfo{Residues};
 743     for $ResidueNum (@SpecifiedResidueNumsList) {
 744       if (!IsPositiveInteger($ResidueNum)) {
 745         die "Error: Invalid number of residue number value, $ResidueNum, for \"--Residues\" option during \"ResidueNumes\" or \"ResiduesRange\"value of \"-m --mode\" option: Residue number must be a positive integer.\n";
 746       }
 747     }
 748     if ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) {
 749       if (@SpecifiedResidueNumsList != 2) {
 750         die "Error: Invalid number of residue number values, ", scalar(@SpecifiedResidueNumsList), ", for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The number of values must be 2 corresponding to start and end residue numbers.\n";
 751       }
 752       if ($SpecifiedResidueNumsList[0] > $SpecifiedResidueNumsList[1]) {
 753         die "Error: Invalid residue number values, @SpecifiedResidueNumsList, for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The start residue numbers must be less than end residue number.\n";
 754       }
 755       ($StartResidueNum, $EndResNum) = @SpecifiedResidueNumsList;
 756     }
 757 
 758   }
 759   @{$OptionsInfo{SpecifiedResidueNumsList}} = ();
 760   push @{$OptionsInfo{SpecifiedResidueNumsList}}, @SpecifiedResidueNumsList;
 761 
 762   $OptionsInfo{SpecifiedStartResidueNum} = $StartResidueNum;
 763   $OptionsInfo{SpecifiedEndResidueNum} = $EndResNum;
 764 
 765   # Set up a specified residue numbers map...
 766   %{$OptionsInfo{SpecifiedResidueNumsMap}} = ();
 767   for $ResidueNum (@{$OptionsInfo{SpecifiedResidueNumsList}}) {
 768     $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNum} = $ResidueNum;
 769   }
 770 
 771   $OptionsInfo{MaxExtractionDistance} = $Options{distance};
 772   $OptionsInfo{ExtractionDistanceMode} = $Options{distancemode};
 773   $OptionsInfo{ExtractionDistanceOrigin} = $Options{distanceorigin} ? $Options{distanceorigin} : '';
 774   my(@SpecifiedDistanceOrigin) = ();
 775 
 776   if ($OptionsInfo{Mode} =~ /^Distance$/i) {
 777     if (!$Options{distanceorigin}) {
 778       die "Error: You must specify a value for \"--distanceorigin\" option in \"Distance\" \"-m, --mode\". \n";
 779     }
 780     @SpecifiedDistanceOrigin = split /\,/, $Options{distanceorigin};
 781     if ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) {
 782       if (@SpecifiedDistanceOrigin != 2) {
 783         die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\" : The number of values must be 2.\n";
 784       }
 785       if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) {
 786         die "Error: Invalid atom number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\". Allowed values: > 0\n";
 787       }
 788     }
 789     elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Hetatm$/i) {
 790       if (@SpecifiedDistanceOrigin != 2) {
 791         die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\" : The number of values must be 2.\n";
 792       }
 793       if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) {
 794         die "Error: Invalid hetatm number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\". Allowed values: > 0\n";
 795       }
 796     }
 797     elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) {
 798       if (!(@SpecifiedDistanceOrigin == 2 || @SpecifiedDistanceOrigin == 3)) {
 799         die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\" : The number of values must be either 2 or 3.\n";
 800       }
 801       if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) {
 802         die "Error: Invalid residue number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\". Allowed values: > 0\n";
 803       }
 804     }
 805     elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) {
 806       if (@SpecifiedDistanceOrigin != 3) {
 807         die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\" : The number of values must be 3.\n";
 808       }
 809       my($Value);
 810       for $Value (@SpecifiedDistanceOrigin) {
 811         if (!IsNumerical($Value)) {
 812           die "Error: Invalid coordinate value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\". Allowed values: numerical\n";
 813         }
 814       }
 815     }
 816   }
 817   @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} = ();
 818   push @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}, @SpecifiedDistanceOrigin;
 819 
 820   $OptionsInfo{WaterResidueNames} = $Options{waterresiduenames};
 821   @{$OptionsInfo{SpecifiedWaterResiduesList}} = ();
 822   %{$OptionsInfo{SpecifiedWaterResiduesMap}} = ();
 823 
 824   my(@SpecifiedWaterResiduesList);
 825   @SpecifiedWaterResiduesList = ();
 826 
 827   if ($OptionsInfo{Mode} =~ /^NonWater$/i) {
 828     my($WaterResidueName);
 829     if ($OptionsInfo{WaterResidueNames} =~ /Automatic/i) {
 830       push @SpecifiedWaterResiduesList, ('HOH', 'WAT', 'H2O');
 831     }
 832     else {
 833       @SpecifiedWaterResiduesList = split /\,/, $Options{waterresiduenames};
 834     }
 835     for $WaterResidueName (@SpecifiedWaterResiduesList) {
 836       $OptionsInfo{SpecifiedWaterResiduesMap}{$WaterResidueName} = $WaterResidueName;
 837     }
 838   }
 839   push @{$OptionsInfo{SpecifiedWaterResiduesList}}, @SpecifiedWaterResiduesList;
 840 
 841 
 842   $OptionsInfo{KeepOldRecords} = ($Options{keepoldrecords} =~ /^Yes$/i) ? 1 : 0;
 843 
 844   $OptionsInfo{ModifyHeaderRecord} = ($Options{modifyheader} =~ /^Yes$/i) ? 1 : 0;
 845 
 846   $OptionsInfo{KeepNonStandardSequences} = ($Options{nonstandardkeep} =~ /^Yes$/i) ? 1 : 0;
 847   $OptionsInfo{NonStandardSequenceCode} = $Options{nonstandardcode};
 848   $OptionsInfo{MaxSequenceLength} = $Options{sequencelength};
 849   $OptionsInfo{SequenceRecordSource} = $Options{sequencerecords};
 850   $OptionsInfo{SequenceIDPrefixSource} = $Options{sequenceidprefix};
 851 
 852   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 853   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 854 }
 855 
 856 # Retrieve information about PDB files...
 857 sub RetrievePDBFilesInfo {
 858   my($Index, $PDBFile, $PDBRecordLinesRef, $ChainID, $ChainLabel, $ChainsAndResiduesInfoRef, $Mode, $FileDir, $FileName, $FileExt, $OutFileName, $OutFileRoot, @SpecifiedChains, @DistanceOrigin, @OutFileNames, @ChainLabels, @ChainSequenceIDs, @ChainSequenceIDsPrefix);
 859 
 860   %PDBFilesInfo = ();
 861   @{$PDBFilesInfo{FileOkay}} = ();
 862   @{$PDBFilesInfo{OutFileRoot}} = ();
 863   @{$PDBFilesInfo{OutFileNames}} = ();
 864   @{$PDBFilesInfo{ChainLabels}} = ();
 865   @{$PDBFilesInfo{ChainSequenceIDs}} = ();
 866   @{$PDBFilesInfo{ChainSequenceIDsPrefix}} = ();
 867   @{$PDBFilesInfo{SpecifiedChains}} = ();
 868   @{$PDBFilesInfo{DistanceOrigin}} = ();
 869 
 870   FILELIST: for $Index (0 .. $#PDBFilesList) {
 871     $PDBFilesInfo{FileOkay}[$Index] = 0;
 872 
 873     $PDBFilesInfo{OutFileRoot}[$Index] = '';
 874     @{$PDBFilesInfo{OutFileNames}[$Index]} = ();
 875     @{$PDBFilesInfo{OutFileNames}[$Index]} = ();
 876     @{$PDBFilesInfo{ChainLabels}[$Index]} = ();
 877     @{$PDBFilesInfo{ChainSequenceIDs}[$Index]} = ();
 878     @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]} = ();
 879     @{$PDBFilesInfo{SpecifiedChains}[$Index]} = ();
 880     @{$PDBFilesInfo{DistanceOrigin}[$Index]} = ();
 881 
 882     $PDBFile = $PDBFilesList[$Index];
 883     if (!(-e $PDBFile)) {
 884       warn "Warning: Ignoring file $PDBFile: It doesn't exist\n";
 885       next FILELIST;
 886     }
 887     if (!CheckFileType($PDBFile, "pdb")) {
 888       warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n";
 889       next FILELIST;
 890     }
 891     if (! open PDBFILE, "$PDBFile") {
 892       warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n";
 893       next FILELIST;
 894     }
 895     close PDBFILE;
 896 
 897     # Get PDB data...
 898     $PDBRecordLinesRef = ReadPDBFile($PDBFile);
 899     if ($OptionsInfo{Mode} =~ /^Sequences$/i && $OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) {
 900       $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes');
 901     }
 902     else {
 903       $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef);
 904     }
 905     if (!scalar @{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 906       warn "Warning: Ignoring file $PDBFile: No chains found \n";
 907       next FILELIST;
 908     }
 909 
 910     # Make sure specified chains exist in PDB file...
 911     @SpecifiedChains = ();
 912     if ($OptionsInfo{ChainsToExtract} =~ /^Specified$/i) {
 913       for $ChainID (@{$OptionsInfo{SpecifiedChains}}) {
 914         if (exists $ChainsAndResiduesInfoRef->{Residues}{$ChainID}) {
 915           push @SpecifiedChains, $ChainID;
 916         }
 917         else {
 918           warn "Warning: Ignoring file $PDBFile: Specified chain, $ChainID, in \"-c, --chains\" option doesn't exist.\n";
 919           next FILELIST;
 920         }
 921       }
 922     }
 923     elsif ($OptionsInfo{ChainsToExtract} =~ /^First$/i) {
 924       push @SpecifiedChains, $ChainsAndResiduesInfoRef->{ChainIDs}[0];
 925     }
 926     elsif ($OptionsInfo{ChainsToExtract} =~ /^All$/i) {
 927       push @SpecifiedChains, @{$ChainsAndResiduesInfoRef->{ChainIDs}};
 928     }
 929     # Setup chain labels to use for sequence IDs and generating output files...
 930     @ChainLabels = ();
 931     for $ChainID (@SpecifiedChains) {
 932       $ChainLabel = $ChainID; $ChainLabel =~ s/^None//ig;
 933       $ChainLabel = "Chain${ChainLabel}";
 934       push @ChainLabels, $ChainLabel;
 935     }
 936 
 937     # Make sure specified distance origin is valid...
 938     @DistanceOrigin = ();
 939     if ($OptionsInfo{Mode} =~ /^Distance$/i) {
 940       if ($OptionsInfo{ExtractionDistanceMode} =~ /^(Atom|Hetatm)$/i) {
 941         my($RecordType, $SpecifiedAtomName, $SpecifiedAtomNumber, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine);
 942         $RecordType = $OptionsInfo{ExtractionDistanceMode};
 943         ($SpecifiedAtomNumber, $SpecifiedAtomName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
 944         $RecordFound = 0;
 945         LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
 946           if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
 947             next LINE;
 948           }
 949           ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine);
 950           $AtomName = RemoveLeadingAndTrailingWhiteSpaces($AtomName);
 951           if (($RecordType =~ /^Atom$/i && IsAtomRecordType($RecordLine)) || ($RecordType =~ /^Hetatm$/i && IsHetatmRecordType($RecordLine))) {
 952             if ($AtomNumber == $SpecifiedAtomNumber && $AtomName eq $SpecifiedAtomName) {
 953               $RecordFound = 1;
 954               last LINE;
 955             }
 956           }
 957         }
 958         if (!$RecordFound) {
 959           warn "Warning: Ignoring file $PDBFile: ", uc($RecordType), " record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n";
 960           next FILELIST;
 961         }
 962         push @DistanceOrigin, ($X, $Y, $Z);
 963       }
 964       elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) {
 965         my($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine);
 966         $SpecifiedChainID = '';
 967         if (@{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} == 3) {
 968           ($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
 969         }
 970         else {
 971           ($SpecifiedResidueNumber, $SpecifiedResidueName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
 972         }
 973         $RecordFound = 0;
 974         LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
 975           if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
 976             next LINE;
 977           }
 978           ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine);
 979           $ResidueName = RemoveLeadingAndTrailingWhiteSpaces($ResidueName);
 980           $ChainID = RemoveLeadingAndTrailingWhiteSpaces($ChainID);
 981           if ($SpecifiedChainID && ($SpecifiedChainID ne $ChainID)) {
 982             next LINE;
 983           }
 984           if ($ResidueNumber == $SpecifiedResidueNumber && $ResidueName eq $SpecifiedResidueName) {
 985             # Store coordinates for all the atoms...
 986             $RecordFound = 1;
 987             push @DistanceOrigin, ($X, $Y, $Z);
 988             next LINE;
 989           }
 990         }
 991         if (!$RecordFound) {
 992           warn "Warning: Ignoring file $PDBFile: ATOM/HETATM record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n";
 993           next FILELIST;
 994         }
 995       }
 996       elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) {
 997         push @DistanceOrigin, @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
 998       }
 999     }
1000     # Setup output file names...
1001     @OutFileNames = ();
1002     $FileDir = ""; $FileName = ""; $FileExt = "";
1003     ($FileDir, $FileName, $FileExt) = ParseFileName($PDBFile);
1004     if ($OptionsInfo{OutFileRoot} && (@PDBFilesList == 1)) {
1005       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
1006       if ($RootFileName && $RootFileExt) {
1007         $FileName = $RootFileName;
1008       }
1009       else {
1010         $FileName = $OptionsInfo{OutFileRoot};
1011       }
1012       $OutFileRoot = $FileName;
1013     }
1014     else {
1015       $OutFileRoot = $FileName;
1016     }
1017     $Mode = $OptionsInfo{Mode};
1018     if ($Mode =~ /^(Atoms|CAlphas|ResidueNums|ResiduesRange|Distance|NonWater|NonHydrogens)$/i) {
1019       $OutFileName = '';
1020       if ($Mode =~ /^CAlphas$/i) {
1021         $OutFileName = "${OutFileRoot}CAlphas.pdb";
1022       }
1023       elsif ($Mode =~ /^Atoms$/i) {
1024         $OutFileName = "${OutFileRoot}Atoms.pdb";
1025       }
1026       elsif ($Mode =~ /^ResidueNums$/i) {
1027         $OutFileName = "${OutFileRoot}ResidueNums.pdb";
1028       }
1029       elsif ($Mode =~ /^ResiduesRange$/i) {
1030         $OutFileName = "${OutFileRoot}ResiduesRange.pdb";
1031       }
1032       elsif ($Mode =~ /^NonWater$/i) {
1033         $OutFileName = "${OutFileRoot}NonWater.pdb";
1034       }
1035       elsif ($Mode =~ /^NonHydrogens$/i) {
1036         $OutFileName = "${OutFileRoot}NonHydrogens.pdb";
1037       }
1038       elsif ($Mode =~ /^Distance$/i) {
1039         my($DistanceMode) = '';
1040         if ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) {
1041           $DistanceMode = 'Atom';
1042         }
1043         elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Hetatm$/i) {
1044           $DistanceMode = 'Hetatm';
1045         }
1046         elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) {
1047           $DistanceMode = 'Residue';
1048         }
1049         elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) {
1050           $DistanceMode = 'XYZ';
1051         }
1052         $OutFileName = "${OutFileRoot}DistanceBy${DistanceMode}.pdb";
1053       }
1054       push @OutFileNames, $OutFileName;
1055       if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) {
1056         warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n";
1057         next FILELIST;
1058       }
1059     }
1060     elsif ($Mode =~ /^(Chains|Sequences)$/i) {
1061       if ($OptionsInfo{CombineChainSequences}) {
1062         $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}ExtractedChains.pdb" : "${OutFileRoot}SequencesChainsCombined.fasta";
1063         push @OutFileNames, $OutFileName;
1064         if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) {
1065           warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n";
1066           next FILELIST;
1067         }
1068       }
1069       else {
1070         for $ChainLabel (@ChainLabels) {
1071           $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}${ChainLabel}.pdb" : "${OutFileRoot}Sequences${ChainLabel}.fasta";
1072           push @OutFileNames, $OutFileName;
1073           if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) {
1074             warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n";
1075             next FILELIST;
1076           }
1077         }
1078       }
1079     }
1080     @ChainSequenceIDs = ();
1081     @ChainSequenceIDsPrefix = ();
1082     if ($Mode =~ /^Sequences$/i) {
1083       my($HeaderRecordLine, $Classification, $DepositionDate, $IDCode, $IDPrefix);
1084       ($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef);
1085 
1086       if ($OptionsInfo{SequenceIDPrefixSource} =~ /^FileName$/i) {
1087         $IDPrefix = $FileName;
1088       }
1089       elsif ($OptionsInfo{SequenceIDPrefixSource} =~ /^HeaderRecord$/i) {
1090         $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : '';
1091       }
1092       else {
1093         $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : $FileName;
1094       }
1095 
1096       for $ChainLabel (@ChainLabels) {
1097         push @ChainSequenceIDsPrefix, $IDPrefix;
1098         push @ChainSequenceIDs, "${IDPrefix}_${ChainLabel}|PDB";
1099       }
1100     }
1101 
1102     $PDBFilesInfo{FileOkay}[$Index] = 1;
1103     $PDBFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
1104 
1105     push @{$PDBFilesInfo{OutFileNames}[$Index]}, @OutFileNames;
1106     push @{$PDBFilesInfo{ChainLabels}[$Index]}, @ChainLabels;
1107     push @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]}, @ChainSequenceIDsPrefix;
1108     push @{$PDBFilesInfo{ChainSequenceIDs}[$Index]}, @ChainSequenceIDs;
1109     push @{$PDBFilesInfo{SpecifiedChains}[$Index]}, @SpecifiedChains;
1110     push @{$PDBFilesInfo{DistanceOrigin}[$Index]}, @DistanceOrigin;
1111   }
1112 }
1113 
1114 
1115 # Setup script usage  and retrieve command line arguments specified using various options...
1116 sub SetupScriptUsage {
1117 
1118   # Retrieve all the options...
1119   %Options = ();
1120   $Options{chains} = 'First';
1121   $Options{combinechains} = 'no';
1122   $Options{distance} = 10.0;
1123   $Options{distancemode} = 'XYZ';
1124   $Options{keepoldrecords} = 'no';
1125   $Options{mode} = 'NonWater';
1126   $Options{modifyheader} = 'yes';
1127   $Options{nonstandardkeep} = 'yes';
1128   $Options{nonstandardcode} = 'X';
1129   $Options{sequencelength} = 80;
1130   $Options{sequenceidprefix} = 'Automatic';
1131   $Options{sequencerecords} = 'Atom';
1132   $Options{waterresiduenames} = 'Automatic';
1133 
1134   if (!GetOptions(\%Options, "chains|c=s", "combinechains=s", "distance|d=f", "distancemode=s", "distanceorigin=s", "help|h", "keepoldrecords|k=s", "mode|m=s", "modifyheader=s", "nonstandardkeep=s", "nonstandardcode=s", "overwrite|o", "root|r=s", "residues=s", "sequencelength=i", "sequenceidprefix=s", "sequencerecords=s", "waterresiduenames=s", "workingdir|w=s")) {
1135     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
1136   }
1137   if ($Options{workingdir}) {
1138     if (! -d $Options{workingdir}) {
1139       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
1140     }
1141     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
1142   }
1143   if ($Options{combinechains} !~ /^(yes|no)$/i) {
1144     die "Error: The value specified, $Options{combinechains}, for option \"--CombineChains\" is not valid. Allowed values: yes or no\n";
1145   }
1146   if ($Options{distancemode} !~ /^(Atom|Hetatm|Residue|XYZ)$/i) {
1147     die "Error: The value specified, $Options{distancemode}, for option \"--DistanceMode\" is not valid. Allowed values: Atom, Hetatm, Residue, or XYZ\n";
1148   }
1149   if ($Options{keepoldrecords} !~ /^(yes|no)$/i) {
1150     die "Error: The value specified, $Options{keepoldrecords}, for option \"--KeepOldRecords\" is not valid. Allowed values: yes or no\n";
1151   }
1152   if ($Options{mode} !~ /^(Chains|Sequences|Atoms|CAlphas|ResidueNums|ResiduesRange|Distance|NonWater|NonHydrogens)$/i) {
1153     die "Error: The value specified, $Options{mode}, for option \"m, --mode\" is not valid. Allowed values: Chains, Sequences, Atoms, CAlphas, ResidueNums, ResiduesRange, Distance, NonWater, NonHydrogens\n";
1154   }
1155   if ($Options{modifyheader} !~ /^(yes|no)$/i) {
1156     die "Error: The value specified, $Options{modifyheader}, for option \"--ModifyHeader\" is not valid. Allowed values: yes or no\n";
1157   }
1158   if ($Options{nonstandardkeep} !~ /^(yes|no)$/i) {
1159     die "Error: The value specified, $Options{nonstandardkeep}, for option \"--NonStandardKeep\" is not valid. Allowed values: yes or no\n";
1160   }
1161   if ($Options{nonstandardcode} !~ /^(\?|\-|X)$/i) {
1162     die "Error: The value specified, $Options{nonstandardcode}, for option \"--NonStandardCode\" is not valid. Allowed values: ?, -, or X\n";
1163   }
1164   if (!IsPositiveInteger($Options{sequencelength})) {
1165     die "Error: The value specified, $Options{sequencelength}, for option \"--SequenceLength\" is not valid. Allowed values: >0\n";
1166   }
1167   if ($Options{sequencerecords} !~ /^(Atom|SeqRes)$/i) {
1168     die "Error: The value specified, $Options{sequencerecords}, for option \"--SequenceRecords\" is not valid. Allowed values: Atom or SeqRes\n";
1169   }
1170   if ($Options{sequenceidprefix} !~ /^(FileName|HeaderRecord|Automatic)$/i) {
1171     die "Error: The value specified, $Options{sequenceidprefix}, for option \"--SequenceIDPrefix\" is not valid. Allowed values: FileName, HeaderRecord, or AutomaticAtom\n";
1172   }
1173 }
1174