1 #!/usr/bin/perl -w 2 # 3 # $RCSfile: ExtractFromPDBFiles.pl,v $ 4 # $Date: 2011/12/16 00:03:30 $ 5 # $Revision: 1.32 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2004-2012 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use FindBin; use lib "$FindBin::Bin/../lib"; 31 use Getopt::Long; 32 use File::Basename; 33 use Text::ParseWords; 34 use Benchmark; 35 use FileUtil; 36 use TextUtil; 37 use PDBFileUtil; 38 use AminoAcids; 39 use SequenceFileUtil; 40 41 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); 42 43 # Autoflush STDOUT 44 $| = 1; 45 46 # Starting message... 47 $ScriptName = basename($0); 48 print "\n$ScriptName: Starting...\n\n"; 49 $StartTime = new Benchmark; 50 51 # Get the options and setup script... 52 SetupScriptUsage(); 53 if ($Options{help} || @ARGV < 1) { 54 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); 55 } 56 57 my(@PDBFilesList); 58 @PDBFilesList = ExpandFileNames(\@ARGV, "pdb"); 59 60 # Process options... 61 print "Processing options...\n"; 62 my(%OptionsInfo); 63 ProcessOptions(); 64 65 # Setup information about input files... 66 print "Checking input PDB file(s)...\n"; 67 my(%PDBFilesInfo); 68 RetrievePDBFilesInfo(); 69 70 # Process input files.. 71 my($FileIndex); 72 if (@PDBFilesList > 1) { 73 print "\nProcessing PDB files...\n"; 74 } 75 for $FileIndex (0 .. $#PDBFilesList) { 76 if ($PDBFilesInfo{FileOkay}[$FileIndex]) { 77 print "\nProcessing file $PDBFilesList[$FileIndex]...\n"; 78 ExtractFromPDBFiles($FileIndex); 79 } 80 } 81 print "\n$ScriptName:Done...\n\n"; 82 83 $EndTime = new Benchmark; 84 $TotalTime = timediff ($EndTime, $StartTime); 85 print "Total time: ", timestr($TotalTime), "\n"; 86 87 ############################################################################### 88 89 # Extract appropriate information... 90 sub ExtractFromPDBFiles { 91 my($FileIndex) = @_; 92 my($PDBFile, $PDBRecordLinesRef); 93 94 # Get PDB data... 95 $PDBFile = $PDBFilesList[$FileIndex]; 96 $PDBRecordLinesRef = ReadPDBFile($PDBFile); 97 98 if ($OptionsInfo{Mode} =~ /Chains/i) { 99 ExtractChains($FileIndex, $PDBRecordLinesRef); 100 } 101 elsif ($OptionsInfo{Mode} =~ /Sequences/i) { 102 ExtractSequences($FileIndex, $PDBRecordLinesRef); 103 } 104 elsif ($OptionsInfo{Mode} =~ /Atoms/i) { 105 ExtractAtoms($FileIndex, $PDBRecordLinesRef); 106 } 107 elsif ($OptionsInfo{Mode} =~ /CAlphas/i) { 108 ExtractCAlphas($FileIndex, $PDBRecordLinesRef); 109 } 110 elsif ($OptionsInfo{Mode} =~ /^(ResidueNums|ResiduesRange)$/i) { 111 ExtractByResidues($FileIndex, $PDBRecordLinesRef); 112 } 113 elsif ($OptionsInfo{Mode} =~ /Distance/i) { 114 ExtractByDistance($FileIndex, $PDBRecordLinesRef); 115 } 116 elsif ($OptionsInfo{Mode} =~ /NonWater/i) { 117 ExtractNonWaterRecords($FileIndex, $PDBRecordLinesRef); 118 } 119 elsif ($OptionsInfo{Mode} =~ /NonHydrogens/i) { 120 ExtractNonHydrogenRecords($FileIndex, $PDBRecordLinesRef); 121 } 122 } 123 124 # Extract chains and generate new PDB files... 125 # 126 sub ExtractChains { 127 my($FileIndex, $PDBRecordLinesRef) = @_; 128 my($ChainIndex, $ChainID, $ChainLabel, $PDBFileName, $RecordLine, $ChainsAndResiduesInfoRef, $AtomNumber, $AtomName, $ResidueName, $AtomChainID, $ResidueNumber, $AlternateLocation, $InsertionCode, $ConectRecordLinesRef, %ChainAtomNumbersMap); 129 130 # Get chains and residues data... 131 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', 0, 1); 132 133 if ($OptionsInfo{CombineChains}) { 134 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 135 print "Generating PDBFileName file $PDBFileName...\n"; 136 137 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 138 139 # Write out header and other older recors... 140 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 141 } 142 143 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { 144 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 145 $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex]; 146 147 if (!$OptionsInfo{CombineChains}) { 148 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex]; 149 print "Generating PDBFileName file $PDBFileName...\n"; 150 151 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 152 153 # Write out header and other older recors... 154 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 155 } 156 157 # Write out ATOM/HETATM line for chain and collect all ATOM/HETATM serial numbers 158 # for writing out appropriate CONECT records... 159 %ChainAtomNumbersMap = (); 160 for $RecordLine (@{$ChainsAndResiduesInfoRef->{Lines}{$ChainID}}) { 161 print OUTFILE "$RecordLine\n"; 162 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode) = ParseAtomRecordLine($RecordLine); 163 $AtomNumber = int $AtomNumber; 164 $ChainAtomNumbersMap{$AtomNumber} = $AtomName; 165 } 166 # Write out TER record using information from last chain record... 167 $AtomNumber += 1; 168 print OUTFILE GenerateTerRecordLine($AtomNumber, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode), "\n"; 169 170 # Write out CONECT records... 171 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%ChainAtomNumbersMap); 172 173 for $RecordLine (@{$ConectRecordLinesRef}) { 174 print OUTFILE "$RecordLine\n"; 175 } 176 177 if (!$OptionsInfo{CombineChains}) { 178 # Write out END record... 179 print OUTFILE GenerateEndRecordLine(), "\n"; 180 181 close OUTFILE; 182 } 183 } 184 185 if ($OptionsInfo{CombineChains}) { 186 # Write out END record... 187 print OUTFILE GenerateEndRecordLine(), "\n"; 188 189 close OUTFILE; 190 } 191 192 } 193 194 195 # Extract sequences for individual chains or combine all the chains... 196 sub ExtractSequences { 197 my($FileIndex, $PDBRecordLinesRef) = @_; 198 my($ChainIndex, $ChainID, $ChainLabel, $SequenceFileName, $Residue, $ResidueCode, $StandardResidue, $ChainSequence, $WrappedChainSequence, $ChainSequenceID, $ChainsAndResiduesInfoRef, $ChainResiduesRef, %ChainSequencesDataMap); 199 200 if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) { 201 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes'); 202 } 203 else { 204 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef); 205 } 206 207 # Generate sequence data for all the chains... 208 %ChainSequencesDataMap = (); 209 @{$ChainSequencesDataMap{IDs}} = (); 210 %{$ChainSequencesDataMap{Sequence}} = (); 211 %{$ChainSequencesDataMap{Description}} = (); 212 213 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { 214 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 215 $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex]; 216 217 # Setup sequence ID... 218 $ChainSequenceID = $PDBFilesInfo{ChainSequenceIDs}[$FileIndex][$ChainIndex]; 219 push @{$ChainSequencesDataMap{IDs}}, $ChainSequenceID; 220 $ChainSequencesDataMap{Description}{$ChainID} = $ChainSequenceID; 221 222 # Collect sequence data for the chain... 223 if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes/i) { 224 $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; 225 } 226 else { 227 $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; 228 } 229 # Setup sequence data... 230 $ChainSequence = ''; 231 RESIDUE: for $Residue (@{$ChainResiduesRef}) { 232 ($ResidueCode, $StandardResidue) = GetResidueCode($Residue); 233 if (!$StandardResidue) { 234 if ($OptionsInfo{KeepNonStandardSequences}) { 235 $ResidueCode = $OptionsInfo{NonStandardSequenceCode}; 236 warn "Warning: Keeping nonstandard residue $Residue in $ChainLabel...\n"; 237 } 238 else { 239 warn "Warning: Ignoring nonstandard residue $Residue in $ChainLabel...\n"; 240 next RESIDUE; 241 } 242 } 243 $ChainSequence .= $ResidueCode; 244 } 245 $ChainSequencesDataMap{Sequence}{$ChainID} = $ChainSequence; 246 247 } 248 249 # Write out the sequence files... 250 my($SequenceID, $SequenceDescription, $Sequence, %SequencesDataMap ); 251 if ($OptionsInfo{CombineChainSequences}) { 252 # Combine all the chain sequences... 253 $Sequence = ''; 254 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { 255 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 256 257 $Sequence .= $ChainSequencesDataMap{Sequence}{$ChainID}; 258 } 259 $SequenceID = $PDBFilesInfo{ChainSequenceIDsPrefix}[$FileIndex][0] . "_CombinedChains|PDB";; 260 $SequenceDescription = $SequenceID; 261 $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 262 263 print "Generating sequence file $SequenceFileName...\n"; 264 %SequencesDataMap = (); 265 @{$SequencesDataMap{IDs}} = (); 266 %{$SequencesDataMap{Sequence}} = (); 267 %{$SequencesDataMap{Description}} = (); 268 269 push @{$SequencesDataMap{IDs}}, $SequenceID; 270 $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription; 271 $SequencesDataMap{Sequence}{$SequenceID} = $Sequence; 272 273 WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength}); 274 } 275 else { 276 # For each specifed chain, write out the sequences... 277 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { 278 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 279 280 $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex]; 281 282 $SequenceID = $ChainSequencesDataMap{IDs}[$ChainIndex]; 283 $SequenceDescription = $ChainSequencesDataMap{Description}{$ChainID}; 284 $Sequence = $ChainSequencesDataMap{Sequence}{$ChainID}; 285 286 print "Generating sequence file $SequenceFileName...\n"; 287 %SequencesDataMap = (); 288 @{$SequencesDataMap{IDs}} = (); 289 %{$SequencesDataMap{Sequence}} = (); 290 %{$SequencesDataMap{Description}} = (); 291 292 push @{$SequencesDataMap{IDs}}, $SequenceID; 293 $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription; 294 $SequencesDataMap{Sequence}{$SequenceID} = $Sequence; 295 296 WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength}); 297 } 298 } 299 } 300 301 # Extract atoms... 302 sub ExtractAtoms { 303 my($FileIndex, $PDBRecordLinesRef) = @_; 304 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $ConectRecordLinesRef, %AtomNumbersMap); 305 306 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 307 print "Generating PDBFileName file $PDBFileName...\n"; 308 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 309 310 # Write out header and other older recors... 311 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 312 313 # Write out all ATOM records along with TER and model records to indicate 314 # chains and multiple models.. 315 %AtomNumbersMap = (); 316 $ChainRecordCount = 0; 317 for $RecordLine (@{$PDBRecordLinesRef}) { 318 if (IsAtomRecordType($RecordLine)) { 319 $ChainRecordCount++; 320 print OUTFILE "$RecordLine\n"; 321 ($AtomNumber, $AtomName) = ParseAtomRecordLine($RecordLine); 322 323 # Store the atom number for extracting CONECT records... 324 $AtomNumber = int $AtomNumber; 325 $AtomNumbersMap{$AtomNumber} = $AtomName; 326 } 327 elsif (IsTerRecordType($RecordLine)) { 328 if ($ChainRecordCount) { 329 print OUTFILE GenerateTerRecordLine(), "\n"; 330 } 331 $ChainRecordCount = 0; 332 } 333 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 334 print OUTFILE "$RecordLine\n"; 335 } 336 } 337 338 # Write out appropriate CONECT records... 339 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 340 for $RecordLine (@{$ConectRecordLinesRef}) { 341 print OUTFILE "$RecordLine\n"; 342 } 343 344 # Write out END record... 345 print OUTFILE GenerateEndRecordLine(), "\n"; 346 347 close OUTFILE; 348 } 349 350 # Extract CAlphas... 351 sub ExtractCAlphas { 352 my($FileIndex, $PDBRecordLinesRef) = @_; 353 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $ConectRecordLinesRef, %AtomNumbersMap); 354 355 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 356 print "Generating PDBFileName file $PDBFileName...\n"; 357 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 358 359 # Write out header and other older recors... 360 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 361 362 # Write out all ATOM records for CAlphas along with TER and model records to indicate 363 # chains and multiple models.. 364 %AtomNumbersMap = (); 365 $ChainRecordCount = 0; 366 for $RecordLine (@{$PDBRecordLinesRef}) { 367 if (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) { 368 ($AtomNumber, $AtomName) = ParseAtomRecordLine($RecordLine); 369 if ($AtomName =~ /^CA$/i) { 370 $ChainRecordCount++; 371 print OUTFILE "$RecordLine\n"; 372 $AtomNumber = int $AtomNumber; 373 $AtomNumbersMap{$AtomNumber} = $AtomName; 374 } 375 } 376 elsif (IsTerRecordType($RecordLine)) { 377 if ($ChainRecordCount) { 378 print OUTFILE GenerateTerRecordLine(), "\n"; 379 } 380 $ChainRecordCount = 0; 381 } 382 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 383 print OUTFILE "$RecordLine\n"; 384 } 385 } 386 387 # Write out appropriate CONECT records... 388 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 389 for $RecordLine (@{$ConectRecordLinesRef}) { 390 print OUTFILE "$RecordLine\n"; 391 } 392 # Write out END record... 393 print OUTFILE GenerateEndRecordLine(), "\n"; 394 395 close OUTFILE; 396 } 397 398 # Extract residues... 399 sub ExtractByResidues { 400 my($FileIndex, $PDBRecordLinesRef) = @_; 401 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $ConectRecordLinesRef, $IgnoreRecord, %AtomNumbersMap); 402 403 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 404 print "Generating PDBFileName file $PDBFileName...\n"; 405 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 406 407 # Write out header and other older recors... 408 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 409 410 # Write out all ATOM records for specified residues with TER and model records to indicate 411 # chains and multiple models... 412 %AtomNumbersMap = (); 413 $ChainRecordCount = 0; 414 for $RecordLine (@{$PDBRecordLinesRef}) { 415 if (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) { 416 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber) = ParseAtomRecordLine($RecordLine); 417 418 # Check residue numbers... 419 $IgnoreRecord = 1; 420 if ($OptionsInfo{Mode} =~ /^ResidueNums$/i) { 421 if (exists $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNumber}) { 422 $IgnoreRecord = 0; 423 } 424 } 425 elsif ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) { 426 if ($ResidueNumber >= $OptionsInfo{SpecifiedStartResidueNum} && $ResidueNumber <= $OptionsInfo{SpecifiedEndResidueNum}) { 427 $IgnoreRecord = 0; 428 } 429 } 430 if (!$IgnoreRecord) { 431 $ChainRecordCount++; 432 print OUTFILE "$RecordLine\n"; 433 $AtomNumber = int $AtomNumber; 434 $AtomNumbersMap{$AtomNumber} = $AtomName; 435 } 436 } 437 elsif (IsTerRecordType($RecordLine)) { 438 if ($ChainRecordCount) { 439 print OUTFILE GenerateTerRecordLine(), "\n"; 440 } 441 $ChainRecordCount = 0; 442 } 443 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 444 print OUTFILE "$RecordLine\n"; 445 } 446 } 447 448 # Write out appropriate CONECT records... 449 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 450 for $RecordLine (@{$ConectRecordLinesRef}) { 451 print OUTFILE "$RecordLine\n"; 452 } 453 # Write out END record... 454 print OUTFILE GenerateEndRecordLine(), "\n"; 455 456 close OUTFILE; 457 } 458 459 # Extract non water records... 460 sub ExtractNonWaterRecords { 461 my($FileIndex, $PDBRecordLinesRef) = @_; 462 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ConectRecordLinesRef, %AtomNumbersMap); 463 464 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 465 print "Generating PDBFileName file $PDBFileName...\n"; 466 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 467 468 # Write out header and other older recors... 469 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 470 471 # Write out all ATOM/HETATM non water records along with TER and model records to indicate 472 # chains and multiple models.. 473 %AtomNumbersMap = (); 474 $ChainRecordCount = 0; 475 for $RecordLine (@{$PDBRecordLinesRef}) { 476 if (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) { 477 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName) = ParseAtomRecordLine($RecordLine); 478 if (! exists $OptionsInfo{SpecifiedWaterResiduesMap}{$ResidueName} ) { 479 $ChainRecordCount++; 480 print OUTFILE "$RecordLine\n"; 481 $AtomNumber = int $AtomNumber; 482 $AtomNumbersMap{$AtomNumber} = $AtomName; 483 } 484 } 485 elsif (IsTerRecordType($RecordLine)) { 486 if ($ChainRecordCount) { 487 print OUTFILE GenerateTerRecordLine(), "\n"; 488 } 489 $ChainRecordCount = 0; 490 } 491 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 492 print OUTFILE "$RecordLine\n"; 493 } 494 } 495 496 # Write out appropriate CONECT records... 497 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 498 for $RecordLine (@{$ConectRecordLinesRef}) { 499 print OUTFILE "$RecordLine\n"; 500 } 501 # Write out END record... 502 print OUTFILE GenerateEndRecordLine(), "\n"; 503 504 close OUTFILE; 505 } 506 507 # Extract non hydrogen records... 508 sub ExtractNonHydrogenRecords { 509 my($FileIndex, $PDBRecordLinesRef) = @_; 510 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, $ConectRecordLinesRef, %AtomNumbersMap); 511 512 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 513 print "Generating PDBFileName file $PDBFileName...\n"; 514 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 515 516 # Write out header and other older recors... 517 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 518 519 # Write out all ATOM/HETATM non hydrogen records along with TER and model records to indicate 520 # chains and multiple models.. 521 %AtomNumbersMap = (); 522 $ChainRecordCount = 0; 523 for $RecordLine (@{$PDBRecordLinesRef}) { 524 if (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) { 525 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); 526 if ($ElementSymbol !~ /^H$/i) { 527 $ChainRecordCount++; 528 print OUTFILE "$RecordLine\n"; 529 $AtomNumber = int $AtomNumber; 530 $AtomNumbersMap{$AtomNumber} = $AtomName; 531 } 532 } 533 elsif (IsTerRecordType($RecordLine)) { 534 if ($ChainRecordCount) { 535 print OUTFILE GenerateTerRecordLine(), "\n"; 536 } 537 $ChainRecordCount = 0; 538 } 539 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 540 print OUTFILE "$RecordLine\n"; 541 } 542 } 543 544 # Write out appropriate CONECT records... 545 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 546 for $RecordLine (@{$ConectRecordLinesRef}) { 547 print OUTFILE "$RecordLine\n"; 548 } 549 # Write out END record... 550 print OUTFILE GenerateEndRecordLine(), "\n"; 551 552 close OUTFILE; 553 } 554 555 # Extract ATOM/HETATM records by distance... 556 sub ExtractByDistance { 557 my($FileIndex, $PDBRecordLinesRef) = @_; 558 my($PDBFileName, $RecordLine, $ChainRecordCount, $ConectRecordLinesRef, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, @OriginCoords, @Coords, %AtomNumbersMap); 559 560 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 561 print "Generating PDBFileName file $PDBFileName...\n"; 562 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 563 564 # Write out header and other older recors... 565 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 566 567 # Setup coordinates of origin to calculate distance... 568 @OriginCoords = (); 569 push @OriginCoords, @{$PDBFilesInfo{DistanceOrigin}[$FileIndex]}; 570 571 # Write out all ATOM records for which meet specified criteria along with TER and model records to indicate 572 # chains and multiple models... 573 %AtomNumbersMap = (); 574 $ChainRecordCount = 0; 575 for $RecordLine (@{$PDBRecordLinesRef}) { 576 if (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) { 577 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); 578 @Coords = (); push @Coords, ($X, $Y, $Z); 579 if (CheckDistance(\@Coords, \@OriginCoords)) { 580 $ChainRecordCount++; 581 print OUTFILE "$RecordLine\n"; 582 $AtomNumber = int $AtomNumber; 583 $AtomNumbersMap{$AtomNumber} = $AtomName; 584 } 585 } 586 elsif (IsTerRecordType($RecordLine)) { 587 if ($ChainRecordCount) { 588 print OUTFILE GenerateTerRecordLine(), "\n"; 589 } 590 $ChainRecordCount = 0; 591 } 592 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 593 print OUTFILE "$RecordLine\n"; 594 } 595 } 596 597 # Write out appropriate CONECT records... 598 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 599 for $RecordLine (@{$ConectRecordLinesRef}) { 600 print OUTFILE "$RecordLine\n"; 601 } 602 603 # Write out END record... 604 print OUTFILE GenerateEndRecordLine(), "\n"; 605 606 close OUTFILE; 607 } 608 609 # Does record meets distance citerion specified by the user? 610 sub CheckDistance { 611 my($CoordsRef, $OriginCoordsRef) = @_; 612 my($Status, $Index, $Distance, $DistanceSquare); 613 614 $Status = 0; 615 616 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { 617 # Go over coordinates of all the atoms in the residue... 618 my($ResidueCoordsCount) = scalar @{$OriginCoordsRef}; 619 INDEX: for ($Index = 0; $Index < $ResidueCoordsCount; $Index += 3) { 620 $DistanceSquare = ($CoordsRef->[0] - $OriginCoordsRef->[$Index])**2 + ($CoordsRef->[1] - $OriginCoordsRef->[$Index + 1])**2 + ($CoordsRef->[2] - $OriginCoordsRef->[$Index + 2])**2; 621 $Distance = sqrt $DistanceSquare; 622 if ($Distance <= $OptionsInfo{MaxExtractionDistance}) { 623 $Status = 1; 624 last INDEX; 625 } 626 } 627 } 628 else { 629 $DistanceSquare = 0; 630 for $Index (0 .. 2) { 631 $DistanceSquare += ($CoordsRef->[$Index] - $OriginCoordsRef->[$Index])**2; 632 } 633 $Distance = sqrt $DistanceSquare; 634 $Status = ($Distance <= $OptionsInfo{MaxExtractionDistance}) ? 1 : 0; 635 } 636 637 return $Status; 638 } 639 640 # Write out modifed header and other older records... 641 sub WriteHeaderAndOlderRecords { 642 my($OutFileRef, $PDBRecordLinesRef) = @_; 643 644 if ($OptionsInfo{ModifyHeaderRecord}) { 645 # Write out modified HEADER record... 646 my($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef); 647 $Classification = 'Data extracted using MayaChemTools'; 648 print $OutFileRef GenerateHeaderRecordLine($IDCode, $Classification), "\n"; 649 } 650 else { 651 print $OutFileRef $PDBRecordLinesRef->[0], "\n"; 652 } 653 654 # Write out any old records... 655 if ($OptionsInfo{KeepOldRecords}) { 656 my($RecordLineIndex, $RecordLine); 657 # Skip HEADER record and write out older records all the way upto first MODEL/ATOM/HETATM records from input file... 658 RECORDLINE: for $RecordLineIndex (1 .. $#{$PDBRecordLinesRef}) { 659 $RecordLine = $PDBRecordLinesRef->[$RecordLineIndex]; 660 if (IsModelRecordType($RecordLine) || IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) { 661 last RECORDLINE; 662 } 663 print $OutFileRef "$RecordLine\n"; 664 } 665 } 666 } 667 668 # Get header record information assuming it's the first record... 669 sub GetHeaderRecordInformation { 670 my($PDBRecordLinesRef) = @_; 671 my($Classification, $DepositionDate, $IDCode, $HeaderRecordLine); 672 673 ($Classification, $DepositionDate, $IDCode) = ('') x 3; 674 $HeaderRecordLine = $PDBRecordLinesRef->[0]; 675 if (IsHeaderRecordType($HeaderRecordLine)) { 676 ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($HeaderRecordLine); 677 } 678 return ($Classification, $DepositionDate, $IDCode); 679 } 680 681 # Get one letter residue code... 682 sub GetResidueCode { 683 my($ResidueName) = @_; 684 my($ResidueCode, $StandardResidue); 685 686 $ResidueCode = $OptionsInfo{NonStandardSequenceCode}; 687 $StandardResidue = 0; 688 689 if (length($ResidueName) == 3) { 690 # Assume it's an amino acid... 691 if (AminoAcids::IsAminoAcid($ResidueName)) { 692 # Standard amino acid... 693 $ResidueCode = AminoAcids::GetAminoAcidOneLetterCode($ResidueName); 694 $StandardResidue = 1; 695 } 696 } 697 elsif (length($ResidueName) == 1) { 698 # Assume it's a nucleic acid... 699 if ($ResidueName =~ /^(A|G|T|U|C)$/i) { 700 $ResidueCode = $ResidueName; 701 $StandardResidue = 1; 702 } 703 } 704 705 return ($ResidueCode, $StandardResidue); 706 } 707 708 # Process option values... 709 sub ProcessOptions { 710 %OptionsInfo = (); 711 $OptionsInfo{Mode} = $Options{mode}; 712 713 my(@SpecifiedChains) = (); 714 if ($Options{chains} =~ /^(First|All)$/i) { 715 $OptionsInfo{ChainsToExtract} = $Options{chains}; 716 } 717 else { 718 @SpecifiedChains = split /\,/, $Options{chains}; 719 $OptionsInfo{ChainsToExtract} = 'Specified'; 720 } 721 @{$OptionsInfo{SpecifiedChains}} = (); 722 push @{$OptionsInfo{SpecifiedChains}}, @SpecifiedChains; 723 724 $OptionsInfo{CombineChains} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0; 725 726 $OptionsInfo{CombineChainSequences} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0; 727 728 my($ResidueNum, $StartResidueNum, $EndResNum, @SpecifiedResidueNumsList); 729 730 @SpecifiedResidueNumsList = (); 731 ($StartResidueNum, $EndResNum) = (0, 0); 732 733 if ($OptionsInfo{Mode} =~ /^(ResidueNums|ResiduesRange)$/i) { 734 if (!$Options{residues}) { 735 die "Error: You must specify a value for \"--Residues\" option in \"ResidueNums or ResiduesRange\" \"-m, --mode\". \n"; 736 } 737 $OptionsInfo{Residues} = $Options{residues}; 738 my($ResidueNum); 739 740 $OptionsInfo{Residues} =~ s/ //g; 741 742 @SpecifiedResidueNumsList = split /\,/, $OptionsInfo{Residues}; 743 for $ResidueNum (@SpecifiedResidueNumsList) { 744 if (!IsPositiveInteger($ResidueNum)) { 745 die "Error: Invalid number of residue number value, $ResidueNum, for \"--Residues\" option during \"ResidueNumes\" or \"ResiduesRange\"value of \"-m --mode\" option: Residue number must be a positive integer.\n"; 746 } 747 } 748 if ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) { 749 if (@SpecifiedResidueNumsList != 2) { 750 die "Error: Invalid number of residue number values, ", scalar(@SpecifiedResidueNumsList), ", for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The number of values must be 2 corresponding to start and end residue numbers.\n"; 751 } 752 if ($SpecifiedResidueNumsList[0] > $SpecifiedResidueNumsList[1]) { 753 die "Error: Invalid residue number values, @SpecifiedResidueNumsList, for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The start residue numbers must be less than end residue number.\n"; 754 } 755 ($StartResidueNum, $EndResNum) = @SpecifiedResidueNumsList; 756 } 757 758 } 759 @{$OptionsInfo{SpecifiedResidueNumsList}} = (); 760 push @{$OptionsInfo{SpecifiedResidueNumsList}}, @SpecifiedResidueNumsList; 761 762 $OptionsInfo{SpecifiedStartResidueNum} = $StartResidueNum; 763 $OptionsInfo{SpecifiedEndResidueNum} = $EndResNum; 764 765 # Set up a specified residue numbers map... 766 %{$OptionsInfo{SpecifiedResidueNumsMap}} = (); 767 for $ResidueNum (@{$OptionsInfo{SpecifiedResidueNumsList}}) { 768 $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNum} = $ResidueNum; 769 } 770 771 $OptionsInfo{MaxExtractionDistance} = $Options{distance}; 772 $OptionsInfo{ExtractionDistanceMode} = $Options{distancemode}; 773 $OptionsInfo{ExtractionDistanceOrigin} = $Options{distanceorigin} ? $Options{distanceorigin} : ''; 774 my(@SpecifiedDistanceOrigin) = (); 775 776 if ($OptionsInfo{Mode} =~ /^Distance$/i) { 777 if (!$Options{distanceorigin}) { 778 die "Error: You must specify a value for \"--distanceorigin\" option in \"Distance\" \"-m, --mode\". \n"; 779 } 780 @SpecifiedDistanceOrigin = split /\,/, $Options{distanceorigin}; 781 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) { 782 if (@SpecifiedDistanceOrigin != 2) { 783 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\" : The number of values must be 2.\n"; 784 } 785 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) { 786 die "Error: Invalid atom number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\". Allowed values: > 0\n"; 787 } 788 } 789 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Hetatm$/i) { 790 if (@SpecifiedDistanceOrigin != 2) { 791 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\" : The number of values must be 2.\n"; 792 } 793 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) { 794 die "Error: Invalid hetatm number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\". Allowed values: > 0\n"; 795 } 796 } 797 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { 798 if (!(@SpecifiedDistanceOrigin == 2 || @SpecifiedDistanceOrigin == 3)) { 799 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\" : The number of values must be either 2 or 3.\n"; 800 } 801 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) { 802 die "Error: Invalid residue number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\". Allowed values: > 0\n"; 803 } 804 } 805 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) { 806 if (@SpecifiedDistanceOrigin != 3) { 807 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\" : The number of values must be 3.\n"; 808 } 809 my($Value); 810 for $Value (@SpecifiedDistanceOrigin) { 811 if (!IsNumerical($Value)) { 812 die "Error: Invalid coordinate value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\". Allowed values: numerical\n"; 813 } 814 } 815 } 816 } 817 @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} = (); 818 push @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}, @SpecifiedDistanceOrigin; 819 820 $OptionsInfo{WaterResidueNames} = $Options{waterresiduenames}; 821 @{$OptionsInfo{SpecifiedWaterResiduesList}} = (); 822 %{$OptionsInfo{SpecifiedWaterResiduesMap}} = (); 823 824 my(@SpecifiedWaterResiduesList); 825 @SpecifiedWaterResiduesList = (); 826 827 if ($OptionsInfo{Mode} =~ /^NonWater$/i) { 828 my($WaterResidueName); 829 if ($OptionsInfo{WaterResidueNames} =~ /Automatic/i) { 830 push @SpecifiedWaterResiduesList, ('HOH', 'WAT', 'H2O'); 831 } 832 else { 833 @SpecifiedWaterResiduesList = split /\,/, $Options{waterresiduenames}; 834 } 835 for $WaterResidueName (@SpecifiedWaterResiduesList) { 836 $OptionsInfo{SpecifiedWaterResiduesMap}{$WaterResidueName} = $WaterResidueName; 837 } 838 } 839 push @{$OptionsInfo{SpecifiedWaterResiduesList}}, @SpecifiedWaterResiduesList; 840 841 842 $OptionsInfo{KeepOldRecords} = ($Options{keepoldrecords} =~ /^Yes$/i) ? 1 : 0; 843 844 $OptionsInfo{ModifyHeaderRecord} = ($Options{modifyheader} =~ /^Yes$/i) ? 1 : 0; 845 846 $OptionsInfo{KeepNonStandardSequences} = ($Options{nonstandardkeep} =~ /^Yes$/i) ? 1 : 0; 847 $OptionsInfo{NonStandardSequenceCode} = $Options{nonstandardcode}; 848 $OptionsInfo{MaxSequenceLength} = $Options{sequencelength}; 849 $OptionsInfo{SequenceRecordSource} = $Options{sequencerecords}; 850 $OptionsInfo{SequenceIDPrefixSource} = $Options{sequenceidprefix}; 851 852 $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0; 853 $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0; 854 } 855 856 # Retrieve information about PDB files... 857 sub RetrievePDBFilesInfo { 858 my($Index, $PDBFile, $PDBRecordLinesRef, $ChainID, $ChainLabel, $ChainsAndResiduesInfoRef, $Mode, $FileDir, $FileName, $FileExt, $OutFileName, $OutFileRoot, @SpecifiedChains, @DistanceOrigin, @OutFileNames, @ChainLabels, @ChainSequenceIDs, @ChainSequenceIDsPrefix); 859 860 %PDBFilesInfo = (); 861 @{$PDBFilesInfo{FileOkay}} = (); 862 @{$PDBFilesInfo{OutFileRoot}} = (); 863 @{$PDBFilesInfo{OutFileNames}} = (); 864 @{$PDBFilesInfo{ChainLabels}} = (); 865 @{$PDBFilesInfo{ChainSequenceIDs}} = (); 866 @{$PDBFilesInfo{ChainSequenceIDsPrefix}} = (); 867 @{$PDBFilesInfo{SpecifiedChains}} = (); 868 @{$PDBFilesInfo{DistanceOrigin}} = (); 869 870 FILELIST: for $Index (0 .. $#PDBFilesList) { 871 $PDBFilesInfo{FileOkay}[$Index] = 0; 872 873 $PDBFilesInfo{OutFileRoot}[$Index] = ''; 874 @{$PDBFilesInfo{OutFileNames}[$Index]} = (); 875 @{$PDBFilesInfo{OutFileNames}[$Index]} = (); 876 @{$PDBFilesInfo{ChainLabels}[$Index]} = (); 877 @{$PDBFilesInfo{ChainSequenceIDs}[$Index]} = (); 878 @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]} = (); 879 @{$PDBFilesInfo{SpecifiedChains}[$Index]} = (); 880 @{$PDBFilesInfo{DistanceOrigin}[$Index]} = (); 881 882 $PDBFile = $PDBFilesList[$Index]; 883 if (!(-e $PDBFile)) { 884 warn "Warning: Ignoring file $PDBFile: It doesn't exist\n"; 885 next FILELIST; 886 } 887 if (!CheckFileType($PDBFile, "pdb")) { 888 warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n"; 889 next FILELIST; 890 } 891 if (! open PDBFILE, "$PDBFile") { 892 warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n"; 893 next FILELIST; 894 } 895 close PDBFILE; 896 897 # Get PDB data... 898 $PDBRecordLinesRef = ReadPDBFile($PDBFile); 899 if ($OptionsInfo{Mode} =~ /^Sequences$/i && $OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) { 900 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes'); 901 } 902 else { 903 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef); 904 } 905 if (!scalar @{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 906 warn "Warning: Ignoring file $PDBFile: No chains found \n"; 907 next FILELIST; 908 } 909 910 # Make sure specified chains exist in PDB file... 911 @SpecifiedChains = (); 912 if ($OptionsInfo{ChainsToExtract} =~ /^Specified$/i) { 913 for $ChainID (@{$OptionsInfo{SpecifiedChains}}) { 914 if (exists $ChainsAndResiduesInfoRef->{Residues}{$ChainID}) { 915 push @SpecifiedChains, $ChainID; 916 } 917 else { 918 warn "Warning: Ignoring file $PDBFile: Specified chain, $ChainID, in \"-c, --chains\" option doesn't exist.\n"; 919 next FILELIST; 920 } 921 } 922 } 923 elsif ($OptionsInfo{ChainsToExtract} =~ /^First$/i) { 924 push @SpecifiedChains, $ChainsAndResiduesInfoRef->{ChainIDs}[0]; 925 } 926 elsif ($OptionsInfo{ChainsToExtract} =~ /^All$/i) { 927 push @SpecifiedChains, @{$ChainsAndResiduesInfoRef->{ChainIDs}}; 928 } 929 # Setup chain labels to use for sequence IDs and generating output files... 930 @ChainLabels = (); 931 for $ChainID (@SpecifiedChains) { 932 $ChainLabel = $ChainID; $ChainLabel =~ s/^None//ig; 933 $ChainLabel = "Chain${ChainLabel}"; 934 push @ChainLabels, $ChainLabel; 935 } 936 937 # Make sure specified distance origin is valid... 938 @DistanceOrigin = (); 939 if ($OptionsInfo{Mode} =~ /^Distance$/i) { 940 if ($OptionsInfo{ExtractionDistanceMode} =~ /^(Atom|Hetatm)$/i) { 941 my($RecordType, $SpecifiedAtomName, $SpecifiedAtomNumber, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine); 942 $RecordType = $OptionsInfo{ExtractionDistanceMode}; 943 ($SpecifiedAtomNumber, $SpecifiedAtomName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 944 $RecordFound = 0; 945 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 946 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 947 next LINE; 948 } 949 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); 950 $AtomName = RemoveLeadingAndTrailingWhiteSpaces($AtomName); 951 if (($RecordType =~ /^Atom$/i && IsAtomRecordType($RecordLine)) || ($RecordType =~ /^Hetatm$/i && IsHetatmRecordType($RecordLine))) { 952 if ($AtomNumber == $SpecifiedAtomNumber && $AtomName eq $SpecifiedAtomName) { 953 $RecordFound = 1; 954 last LINE; 955 } 956 } 957 } 958 if (!$RecordFound) { 959 warn "Warning: Ignoring file $PDBFile: ", uc($RecordType), " record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n"; 960 next FILELIST; 961 } 962 push @DistanceOrigin, ($X, $Y, $Z); 963 } 964 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { 965 my($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine); 966 $SpecifiedChainID = ''; 967 if (@{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} == 3) { 968 ($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 969 } 970 else { 971 ($SpecifiedResidueNumber, $SpecifiedResidueName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 972 } 973 $RecordFound = 0; 974 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 975 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 976 next LINE; 977 } 978 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); 979 $ResidueName = RemoveLeadingAndTrailingWhiteSpaces($ResidueName); 980 $ChainID = RemoveLeadingAndTrailingWhiteSpaces($ChainID); 981 if ($SpecifiedChainID && ($SpecifiedChainID ne $ChainID)) { 982 next LINE; 983 } 984 if ($ResidueNumber == $SpecifiedResidueNumber && $ResidueName eq $SpecifiedResidueName) { 985 # Store coordinates for all the atoms... 986 $RecordFound = 1; 987 push @DistanceOrigin, ($X, $Y, $Z); 988 next LINE; 989 } 990 } 991 if (!$RecordFound) { 992 warn "Warning: Ignoring file $PDBFile: ATOM/HETATM record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n"; 993 next FILELIST; 994 } 995 } 996 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) { 997 push @DistanceOrigin, @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 998 } 999 } 1000 # Setup output file names... 1001 @OutFileNames = (); 1002 $FileDir = ""; $FileName = ""; $FileExt = ""; 1003 ($FileDir, $FileName, $FileExt) = ParseFileName($PDBFile); 1004 if ($OptionsInfo{OutFileRoot} && (@PDBFilesList == 1)) { 1005 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot}); 1006 if ($RootFileName && $RootFileExt) { 1007 $FileName = $RootFileName; 1008 } 1009 else { 1010 $FileName = $OptionsInfo{OutFileRoot}; 1011 } 1012 $OutFileRoot = $FileName; 1013 } 1014 else { 1015 $OutFileRoot = $FileName; 1016 } 1017 $Mode = $OptionsInfo{Mode}; 1018 if ($Mode =~ /^(Atoms|CAlphas|ResidueNums|ResiduesRange|Distance|NonWater|NonHydrogens)$/i) { 1019 $OutFileName = ''; 1020 if ($Mode =~ /^CAlphas$/i) { 1021 $OutFileName = "${OutFileRoot}CAlphas.pdb"; 1022 } 1023 elsif ($Mode =~ /^Atoms$/i) { 1024 $OutFileName = "${OutFileRoot}Atoms.pdb"; 1025 } 1026 elsif ($Mode =~ /^ResidueNums$/i) { 1027 $OutFileName = "${OutFileRoot}ResidueNums.pdb"; 1028 } 1029 elsif ($Mode =~ /^ResiduesRange$/i) { 1030 $OutFileName = "${OutFileRoot}ResiduesRange.pdb"; 1031 } 1032 elsif ($Mode =~ /^NonWater$/i) { 1033 $OutFileName = "${OutFileRoot}NonWater.pdb"; 1034 } 1035 elsif ($Mode =~ /^NonHydrogens$/i) { 1036 $OutFileName = "${OutFileRoot}NonHydrogens.pdb"; 1037 } 1038 elsif ($Mode =~ /^Distance$/i) { 1039 my($DistanceMode) = ''; 1040 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) { 1041 $DistanceMode = 'Atom'; 1042 } 1043 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Hetatm$/i) { 1044 $DistanceMode = 'Hetatm'; 1045 } 1046 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { 1047 $DistanceMode = 'Residue'; 1048 } 1049 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) { 1050 $DistanceMode = 'XYZ'; 1051 } 1052 $OutFileName = "${OutFileRoot}DistanceBy${DistanceMode}.pdb"; 1053 } 1054 push @OutFileNames, $OutFileName; 1055 if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) { 1056 warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n"; 1057 next FILELIST; 1058 } 1059 } 1060 elsif ($Mode =~ /^(Chains|Sequences)$/i) { 1061 if ($OptionsInfo{CombineChainSequences}) { 1062 $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}ExtractedChains.pdb" : "${OutFileRoot}SequencesChainsCombined.fasta"; 1063 push @OutFileNames, $OutFileName; 1064 if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) { 1065 warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n"; 1066 next FILELIST; 1067 } 1068 } 1069 else { 1070 for $ChainLabel (@ChainLabels) { 1071 $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}${ChainLabel}.pdb" : "${OutFileRoot}Sequences${ChainLabel}.fasta"; 1072 push @OutFileNames, $OutFileName; 1073 if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) { 1074 warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n"; 1075 next FILELIST; 1076 } 1077 } 1078 } 1079 } 1080 @ChainSequenceIDs = (); 1081 @ChainSequenceIDsPrefix = (); 1082 if ($Mode =~ /^Sequences$/i) { 1083 my($HeaderRecordLine, $Classification, $DepositionDate, $IDCode, $IDPrefix); 1084 ($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef); 1085 1086 if ($OptionsInfo{SequenceIDPrefixSource} =~ /^FileName$/i) { 1087 $IDPrefix = $FileName; 1088 } 1089 elsif ($OptionsInfo{SequenceIDPrefixSource} =~ /^HeaderRecord$/i) { 1090 $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : ''; 1091 } 1092 else { 1093 $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : $FileName; 1094 } 1095 1096 for $ChainLabel (@ChainLabels) { 1097 push @ChainSequenceIDsPrefix, $IDPrefix; 1098 push @ChainSequenceIDs, "${IDPrefix}_${ChainLabel}|PDB"; 1099 } 1100 } 1101 1102 $PDBFilesInfo{FileOkay}[$Index] = 1; 1103 $PDBFilesInfo{OutFileRoot}[$Index] = $OutFileRoot; 1104 1105 push @{$PDBFilesInfo{OutFileNames}[$Index]}, @OutFileNames; 1106 push @{$PDBFilesInfo{ChainLabels}[$Index]}, @ChainLabels; 1107 push @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]}, @ChainSequenceIDsPrefix; 1108 push @{$PDBFilesInfo{ChainSequenceIDs}[$Index]}, @ChainSequenceIDs; 1109 push @{$PDBFilesInfo{SpecifiedChains}[$Index]}, @SpecifiedChains; 1110 push @{$PDBFilesInfo{DistanceOrigin}[$Index]}, @DistanceOrigin; 1111 } 1112 } 1113 1114 1115 # Setup script usage and retrieve command line arguments specified using various options... 1116 sub SetupScriptUsage { 1117 1118 # Retrieve all the options... 1119 %Options = (); 1120 $Options{chains} = 'First'; 1121 $Options{combinechains} = 'no'; 1122 $Options{distance} = 10.0; 1123 $Options{distancemode} = 'XYZ'; 1124 $Options{keepoldrecords} = 'no'; 1125 $Options{mode} = 'NonWater'; 1126 $Options{modifyheader} = 'yes'; 1127 $Options{nonstandardkeep} = 'yes'; 1128 $Options{nonstandardcode} = 'X'; 1129 $Options{sequencelength} = 80; 1130 $Options{sequenceidprefix} = 'Automatic'; 1131 $Options{sequencerecords} = 'Atom'; 1132 $Options{waterresiduenames} = 'Automatic'; 1133 1134 if (!GetOptions(\%Options, "chains|c=s", "combinechains=s", "distance|d=f", "distancemode=s", "distanceorigin=s", "help|h", "keepoldrecords|k=s", "mode|m=s", "modifyheader=s", "nonstandardkeep=s", "nonstandardcode=s", "overwrite|o", "root|r=s", "residues=s", "sequencelength=i", "sequenceidprefix=s", "sequencerecords=s", "waterresiduenames=s", "workingdir|w=s")) { 1135 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 1136 } 1137 if ($Options{workingdir}) { 1138 if (! -d $Options{workingdir}) { 1139 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 1140 } 1141 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 1142 } 1143 if ($Options{combinechains} !~ /^(yes|no)$/i) { 1144 die "Error: The value specified, $Options{combinechains}, for option \"--CombineChains\" is not valid. Allowed values: yes or no\n"; 1145 } 1146 if ($Options{distancemode} !~ /^(Atom|Hetatm|Residue|XYZ)$/i) { 1147 die "Error: The value specified, $Options{distancemode}, for option \"--DistanceMode\" is not valid. Allowed values: Atom, Hetatm, Residue, or XYZ\n"; 1148 } 1149 if ($Options{keepoldrecords} !~ /^(yes|no)$/i) { 1150 die "Error: The value specified, $Options{keepoldrecords}, for option \"--KeepOldRecords\" is not valid. Allowed values: yes or no\n"; 1151 } 1152 if ($Options{mode} !~ /^(Chains|Sequences|Atoms|CAlphas|ResidueNums|ResiduesRange|Distance|NonWater|NonHydrogens)$/i) { 1153 die "Error: The value specified, $Options{mode}, for option \"m, --mode\" is not valid. Allowed values: Chains, Sequences, Atoms, CAlphas, ResidueNums, ResiduesRange, Distance, NonWater, NonHydrogens\n"; 1154 } 1155 if ($Options{modifyheader} !~ /^(yes|no)$/i) { 1156 die "Error: The value specified, $Options{modifyheader}, for option \"--ModifyHeader\" is not valid. Allowed values: yes or no\n"; 1157 } 1158 if ($Options{nonstandardkeep} !~ /^(yes|no)$/i) { 1159 die "Error: The value specified, $Options{nonstandardkeep}, for option \"--NonStandardKeep\" is not valid. Allowed values: yes or no\n"; 1160 } 1161 if ($Options{nonstandardcode} !~ /^(\?|\-|X)$/i) { 1162 die "Error: The value specified, $Options{nonstandardcode}, for option \"--NonStandardCode\" is not valid. Allowed values: ?, -, or X\n"; 1163 } 1164 if (!IsPositiveInteger($Options{sequencelength})) { 1165 die "Error: The value specified, $Options{sequencelength}, for option \"--SequenceLength\" is not valid. Allowed values: >0\n"; 1166 } 1167 if ($Options{sequencerecords} !~ /^(Atom|SeqRes)$/i) { 1168 die "Error: The value specified, $Options{sequencerecords}, for option \"--SequenceRecords\" is not valid. Allowed values: Atom or SeqRes\n"; 1169 } 1170 if ($Options{sequenceidprefix} !~ /^(FileName|HeaderRecord|Automatic)$/i) { 1171 die "Error: The value specified, $Options{sequenceidprefix}, for option \"--SequenceIDPrefix\" is not valid. Allowed values: FileName, HeaderRecord, or AutomaticAtom\n"; 1172 } 1173 } 1174