1 #!/usr/bin/perl -w 2 # 3 # $RCSfile: ExtractFromPDBFiles.pl,v $ 4 # $Date: 2008/01/30 21:44:45 $ 5 # $Revision: 1.20 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use 5.006; 30 use strict; 31 use FindBin; use lib "$FindBin::Bin/../lib"; 32 use Getopt::Long; 33 use File::Basename; 34 use Text::ParseWords; 35 use Benchmark; 36 use FileUtil; 37 use TextUtil; 38 use PDBFileUtil; 39 use AminoAcids; 40 use SequenceFileUtil; 41 42 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); 43 44 # Autoflush STDOUT 45 $| = 1; 46 47 # Starting message... 48 $ScriptName = basename($0); 49 print "\n$ScriptName: Starting...\n\n"; 50 $StartTime = new Benchmark; 51 52 # Get the options and setup script... 53 SetupScriptUsage(); 54 if ($Options{help} || @ARGV < 1) { 55 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); 56 } 57 58 my(@PDBFilesList); 59 @PDBFilesList = ExpandFileNames(\@ARGV, "pdb"); 60 61 # Process options... 62 my(%OptionsInfo); 63 print "Processing options...\n"; 64 ProcessOptions(); 65 66 # Setup information about input files... 67 print "Checking input PDB file(s)...\n"; 68 my(%PDBFilesInfo); 69 RetrievePDBFilesInfo(); 70 71 # Process input files.. 72 my($FileIndex, $PDBFile, $FileProcessingMsg); 73 $FileProcessingMsg = "Processing file"; 74 if (@PDBFilesList > 1) { 75 print "Processing PDB files...\n"; 76 $FileProcessingMsg = "\n$FileProcessingMsg"; 77 } 78 79 for $FileIndex (0 .. $#PDBFilesList) { 80 if ($PDBFilesInfo{FileOkay}[$FileIndex]) { 81 $PDBFile = $PDBFilesList[$FileIndex]; 82 print "$FileProcessingMsg $PDBFile...\n"; 83 ExtractFromPDBFiles($FileIndex); 84 } 85 } 86 87 print "$ScriptName:Done...\n\n"; 88 89 $EndTime = new Benchmark; 90 $TotalTime = timediff ($EndTime, $StartTime); 91 print "Total time: ", timestr($TotalTime), "\n"; 92 93 ############################################################################### 94 95 # Extract appropriate information... 96 sub ExtractFromPDBFiles { 97 my($FileIndex) = @_; 98 my($PDBFile, $PDBRecordLinesRef); 99 100 # Get PDB data... 101 $PDBFile = $PDBFilesList[$FileIndex]; 102 $PDBRecordLinesRef = ReadPDBFile($PDBFile); 103 104 if ($OptionsInfo{Mode} =~ /Chains/i) { 105 ExtractChains($FileIndex, $PDBRecordLinesRef); 106 } 107 elsif ($OptionsInfo{Mode} =~ /Sequences/i) { 108 ExtractSequences($FileIndex, $PDBRecordLinesRef); 109 } 110 elsif ($OptionsInfo{Mode} =~ /Atoms/i) { 111 ExtractAtoms($FileIndex, $PDBRecordLinesRef); 112 } 113 elsif ($OptionsInfo{Mode} =~ /CAlphas/i) { 114 ExtractCAlphas($FileIndex, $PDBRecordLinesRef); 115 } 116 elsif ($OptionsInfo{Mode} =~ /Distance/i) { 117 ExtractByDistance($FileIndex, $PDBRecordLinesRef); 118 } 119 elsif ($OptionsInfo{Mode} =~ /NonWater/i) { 120 ExtractNonWaterRecords($FileIndex, $PDBRecordLinesRef); 121 } 122 elsif ($OptionsInfo{Mode} =~ /NonHydrogens/i) { 123 ExtractNonHydrogenRecords($FileIndex, $PDBRecordLinesRef); 124 } 125 } 126 127 # Extract chains and generate new PDB files... 128 # 129 sub ExtractChains { 130 my($FileIndex, $PDBRecordLinesRef) = @_; 131 my($ChainIndex, $ChainID, $ChainLabel, $PDBFileName, $RecordLine, $ChainsAndResiduesInfoRef, $AtomNumber, $AtomName, $ResidueName, $AtomChainID, $ResidueNumber, $AlternateLocation, $InsertionCode, $ConectRecordLinesRef, %ChainAtomNumbersMap); 132 133 # Get chains and residues data... 134 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', 0, 1); 135 # $ConectRecordTypesCountRef = GetRecordTypesCount($PDBRecordLinesRef, 'CONECT', 1); 136 137 for $ChainIndex (0 .. $#{@{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}}) { 138 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 139 $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex]; 140 141 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex]; 142 print "Generating PDBFileName file $PDBFileName...\n"; 143 144 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 145 146 # Write out header and other older recors... 147 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 148 149 # Write out ATOM/HETATM line for chain and collect all ATOM/HETATM serial numbers 150 # for writing out appropriate CONECT records... 151 %ChainAtomNumbersMap = (); 152 for $RecordLine (@{$ChainsAndResiduesInfoRef->{Lines}{$ChainID}}) { 153 print OUTFILE "$RecordLine\n"; 154 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode) = ParseAtomRecordLine($RecordLine); 155 $AtomNumber = int $AtomNumber; 156 $ChainAtomNumbersMap{$AtomNumber} = $AtomName; 157 } 158 # Write out TER record using information from last chain record... 159 $AtomNumber += 1; 160 print OUTFILE GenerateTerRecordLine($AtomNumber, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode), "\n"; 161 162 # Write out CONECT records... 163 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%ChainAtomNumbersMap); 164 165 for $RecordLine (@{$ConectRecordLinesRef}) { 166 print OUTFILE "$RecordLine\n"; 167 } 168 169 # Write out END record... 170 print OUTFILE GenerateEndRecordLine(), "\n"; 171 172 close OUTFILE; 173 } 174 } 175 176 177 # Extract sequences for individual chains or combine all the chains... 178 sub ExtractSequences { 179 my($FileIndex, $PDBRecordLinesRef) = @_; 180 my($ChainIndex, $ChainID, $ChainLabel, $SequenceFileName, $Residue, $ResidueCode, $StandardResidue, $ChainSequence, $WrappedChainSequence, $ChainSequenceID, $ChainsAndResiduesInfoRef, $ChainResiduesRef, %ChainSequencesDataMap); 181 182 if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) { 183 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes'); 184 } 185 else { 186 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef); 187 } 188 189 # Generate sequence data for all the chains... 190 %ChainSequencesDataMap = (); 191 @{$ChainSequencesDataMap{IDs}} = (); 192 %{$ChainSequencesDataMap{Sequence}} = (); 193 %{$ChainSequencesDataMap{Description}} = (); 194 195 for $ChainIndex (0 .. $#{@{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}}) { 196 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 197 $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex]; 198 199 # Setup sequence ID... 200 $ChainSequenceID = $PDBFilesInfo{ChainSequenceIDs}[$FileIndex][$ChainIndex]; 201 push @{$ChainSequencesDataMap{IDs}}, $ChainSequenceID; 202 $ChainSequencesDataMap{Description}{$ChainID} = $ChainSequenceID; 203 204 # Collect sequence data for the chain... 205 if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes/i) { 206 $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; 207 } 208 else { 209 $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; 210 } 211 # Setup sequence data... 212 $ChainSequence = ''; 213 RESIDUE: for $Residue (@{$ChainResiduesRef}) { 214 ($ResidueCode, $StandardResidue) = GetResidueCode($Residue); 215 if (!$StandardResidue) { 216 if ($OptionsInfo{KeepNonStandardSequences}) { 217 $ResidueCode = $OptionsInfo{NonStandardSequenceCode}; 218 warn "Warning: Keeping nonstandard residue $Residue in $ChainLabel...\n"; 219 } 220 else { 221 warn "Warning: Ignoring nonstandard residue $Residue in $ChainLabel...\n"; 222 next RESIDUE; 223 } 224 } 225 $ChainSequence .= $ResidueCode; 226 } 227 $ChainSequencesDataMap{Sequence}{$ChainID} = $ChainSequence; 228 229 } 230 231 # Write out the sequence files... 232 my($SequenceID, $SequenceDescription, $Sequence, %SequencesDataMap ); 233 if ($OptionsInfo{CombineChainSequences}) { 234 # Combine all the chain sequences... 235 $Sequence = ''; 236 for $ChainIndex (0 .. $#{@{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}}) { 237 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 238 239 $Sequence .= $ChainSequencesDataMap{Sequence}{$ChainID}; 240 } 241 $SequenceID = $PDBFilesInfo{ChainSequenceIDsPrefix}[$FileIndex][0] . "_CombinedChains|PDB";; 242 $SequenceDescription = $SequenceID; 243 $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 244 245 print "Generating sequence file $SequenceFileName...\n"; 246 %SequencesDataMap = (); 247 @{$SequencesDataMap{IDs}} = (); 248 %{$SequencesDataMap{Sequence}} = (); 249 %{$SequencesDataMap{Description}} = (); 250 251 push @{$SequencesDataMap{IDs}}, $SequenceID; 252 $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription; 253 $SequencesDataMap{Sequence}{$SequenceID} = $Sequence; 254 255 WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength}); 256 } 257 else { 258 # For each specifed chain, write out the sequences... 259 for $ChainIndex (0 .. $#{@{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}}) { 260 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 261 262 $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex]; 263 264 $SequenceID = $ChainSequencesDataMap{IDs}[$ChainIndex]; 265 $SequenceDescription = $ChainSequencesDataMap{Description}{$ChainID}; 266 $Sequence = $ChainSequencesDataMap{Sequence}{$ChainID}; 267 268 print "Generating sequence file $SequenceFileName...\n"; 269 %SequencesDataMap = (); 270 @{$SequencesDataMap{IDs}} = (); 271 %{$SequencesDataMap{Sequence}} = (); 272 %{$SequencesDataMap{Description}} = (); 273 274 push @{$SequencesDataMap{IDs}}, $SequenceID; 275 $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription; 276 $SequencesDataMap{Sequence}{$SequenceID} = $Sequence; 277 278 WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength}); 279 } 280 } 281 } 282 283 # Extract atoms... 284 sub ExtractAtoms { 285 my($FileIndex, $PDBRecordLinesRef) = @_; 286 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $ConectRecordLinesRef, %AtomNumbersMap); 287 288 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 289 print "Generating PDBFileName file $PDBFileName...\n"; 290 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 291 292 # Write out header and other older recors... 293 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 294 295 # Write out all ATOM records along with TER and model records to indicate 296 # chains and multiple models.. 297 %AtomNumbersMap = (); 298 $ChainRecordCount = 0; 299 for $RecordLine (@{$PDBRecordLinesRef}) { 300 if (IsAtomRecordType($RecordLine)) { 301 $ChainRecordCount++; 302 print OUTFILE "$RecordLine\n"; 303 ($AtomNumber, $AtomName) = ParseAtomRecordLine($RecordLine); 304 305 # Store the atom number for extracting CONECT records... 306 $AtomNumber = int $AtomNumber; 307 $AtomNumbersMap{$AtomNumber} = $AtomName; 308 } 309 elsif (IsTerRecordType($RecordLine)) { 310 if ($ChainRecordCount) { 311 print OUTFILE GenerateTerRecordLine(), "\n"; 312 } 313 $ChainRecordCount = 0; 314 } 315 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 316 print OUTFILE "$RecordLine\n"; 317 } 318 } 319 320 # Write out appropriate CONECT records... 321 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 322 for $RecordLine (@{$ConectRecordLinesRef}) { 323 print OUTFILE "$RecordLine\n"; 324 } 325 326 # Write out END record... 327 print OUTFILE GenerateEndRecordLine(), "\n"; 328 329 close OUTFILE; 330 } 331 332 # Extract CAlphas... 333 sub ExtractCAlphas { 334 my($FileIndex, $PDBRecordLinesRef) = @_; 335 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $ConectRecordLinesRef, %AtomNumbersMap); 336 337 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 338 print "Generating PDBFileName file $PDBFileName...\n"; 339 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 340 341 # Write out header and other older recors... 342 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 343 344 # Write out all ATOM records for CAlphas along with TER and model records to indicate 345 # chains and multiple models.. 346 %AtomNumbersMap = (); 347 $ChainRecordCount = 0; 348 for $RecordLine (@{$PDBRecordLinesRef}) { 349 if (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) { 350 ($AtomNumber, $AtomName) = ParseAtomRecordLine($RecordLine); 351 if ($AtomName =~ /^CA$/i) { 352 $ChainRecordCount++; 353 print OUTFILE "$RecordLine\n"; 354 $AtomNumber = int $AtomNumber; 355 $AtomNumbersMap{$AtomNumber} = $AtomName; 356 } 357 } 358 elsif (IsTerRecordType($RecordLine)) { 359 if ($ChainRecordCount) { 360 print OUTFILE GenerateTerRecordLine(), "\n"; 361 } 362 $ChainRecordCount = 0; 363 } 364 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 365 print OUTFILE "$RecordLine\n"; 366 } 367 } 368 369 # Write out appropriate CONECT records... 370 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 371 for $RecordLine (@{$ConectRecordLinesRef}) { 372 print OUTFILE "$RecordLine\n"; 373 } 374 # Write out END record... 375 print OUTFILE GenerateEndRecordLine(), "\n"; 376 377 close OUTFILE; 378 } 379 380 # Extract non water records... 381 sub ExtractNonWaterRecords { 382 my($FileIndex, $PDBRecordLinesRef) = @_; 383 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ConectRecordLinesRef, %AtomNumbersMap); 384 385 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 386 print "Generating PDBFileName file $PDBFileName...\n"; 387 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 388 389 # Write out header and other older recors... 390 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 391 392 # Write out all ATOM/HETATM non water records along with TER and model records to indicate 393 # chains and multiple models.. 394 %AtomNumbersMap = (); 395 $ChainRecordCount = 0; 396 for $RecordLine (@{$PDBRecordLinesRef}) { 397 if (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) { 398 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName) = ParseAtomRecordLine($RecordLine); 399 if (! exists $OptionsInfo{SpecifiedWaterResiduesMap}{$ResidueName} ) { 400 $ChainRecordCount++; 401 print OUTFILE "$RecordLine\n"; 402 $AtomNumber = int $AtomNumber; 403 $AtomNumbersMap{$AtomNumber} = $AtomName; 404 } 405 } 406 elsif (IsTerRecordType($RecordLine)) { 407 if ($ChainRecordCount) { 408 print OUTFILE GenerateTerRecordLine(), "\n"; 409 } 410 $ChainRecordCount = 0; 411 } 412 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 413 print OUTFILE "$RecordLine\n"; 414 } 415 } 416 417 # Write out appropriate CONECT records... 418 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 419 for $RecordLine (@{$ConectRecordLinesRef}) { 420 print OUTFILE "$RecordLine\n"; 421 } 422 # Write out END record... 423 print OUTFILE GenerateEndRecordLine(), "\n"; 424 425 close OUTFILE; 426 } 427 428 # Extract non hydrogen records... 429 sub ExtractNonHydrogenRecords { 430 my($FileIndex, $PDBRecordLinesRef) = @_; 431 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, $ConectRecordLinesRef, %AtomNumbersMap); 432 433 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 434 print "Generating PDBFileName file $PDBFileName...\n"; 435 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 436 437 # Write out header and other older recors... 438 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 439 440 # Write out all ATOM/HETATM non hydrogen records along with TER and model records to indicate 441 # chains and multiple models.. 442 %AtomNumbersMap = (); 443 $ChainRecordCount = 0; 444 for $RecordLine (@{$PDBRecordLinesRef}) { 445 if (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) { 446 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); 447 if ($ElementSymbol !~ /^H$/i) { 448 $ChainRecordCount++; 449 print OUTFILE "$RecordLine\n"; 450 $AtomNumber = int $AtomNumber; 451 $AtomNumbersMap{$AtomNumber} = $AtomName; 452 } 453 } 454 elsif (IsTerRecordType($RecordLine)) { 455 if ($ChainRecordCount) { 456 print OUTFILE GenerateTerRecordLine(), "\n"; 457 } 458 $ChainRecordCount = 0; 459 } 460 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 461 print OUTFILE "$RecordLine\n"; 462 } 463 } 464 465 # Write out appropriate CONECT records... 466 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 467 for $RecordLine (@{$ConectRecordLinesRef}) { 468 print OUTFILE "$RecordLine\n"; 469 } 470 # Write out END record... 471 print OUTFILE GenerateEndRecordLine(), "\n"; 472 473 close OUTFILE; 474 } 475 476 # Extract ATOM/HETATM records by distance... 477 sub ExtractByDistance { 478 my($FileIndex, $PDBRecordLinesRef) = @_; 479 my($PDBFileName, $RecordLine, $ChainRecordCount, $ConectRecordLinesRef, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, @OriginCoords, @Coords, %AtomNumbersMap); 480 481 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 482 print "Generating PDBFileName file $PDBFileName...\n"; 483 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 484 485 # Write out header and other older recors... 486 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 487 488 # Setup coordinates of origin to calculate distance... 489 @OriginCoords = (); 490 push @OriginCoords, @{$PDBFilesInfo{DistanceOrigin}[$FileIndex]}; 491 492 # Write out all ATOM records for which meet specified criteria along with TER and model records to indicate 493 # chains and multiple models... 494 %AtomNumbersMap = (); 495 $ChainRecordCount = 0; 496 for $RecordLine (@{$PDBRecordLinesRef}) { 497 if (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) { 498 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); 499 @Coords = (); push @Coords, ($X, $Y, $Z); 500 if (CheckDistance(\@Coords, \@OriginCoords)) { 501 $ChainRecordCount++; 502 print OUTFILE "$RecordLine\n"; 503 $AtomNumber = int $AtomNumber; 504 $AtomNumbersMap{$AtomNumber} = $AtomName; 505 } 506 } 507 elsif (IsTerRecordType($RecordLine)) { 508 if ($ChainRecordCount) { 509 print OUTFILE GenerateTerRecordLine(), "\n"; 510 } 511 $ChainRecordCount = 0; 512 } 513 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 514 print OUTFILE "$RecordLine\n"; 515 } 516 } 517 518 # Write out appropriate CONECT records... 519 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 520 for $RecordLine (@{$ConectRecordLinesRef}) { 521 print OUTFILE "$RecordLine\n"; 522 } 523 524 # Write out END record... 525 print OUTFILE GenerateEndRecordLine(), "\n"; 526 527 close OUTFILE; 528 } 529 530 # Does record meets distance citerion specified by the user? 531 sub CheckDistance { 532 my($CoordsRef, $OriginCoordsRef) = @_; 533 my($Status, $Index, $Distance, $DistanceSquare); 534 535 $Status = 0; 536 537 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { 538 # Go over coordinates of all the atoms in the residue... 539 my($ResidueCoordsCount) = scalar @{$OriginCoordsRef}; 540 INDEX: for ($Index = 0; $Index < $ResidueCoordsCount; $Index += 3) { 541 $DistanceSquare = ($CoordsRef->[0] - $OriginCoordsRef->[$Index])**2 + ($CoordsRef->[1] - $OriginCoordsRef->[$Index + 1])**2 + ($CoordsRef->[2] - $OriginCoordsRef->[$Index + 2])**2; 542 $Distance = sqrt $DistanceSquare; 543 if ($Distance <= $OptionsInfo{MaxExtractionDistance}) { 544 $Status = 1; 545 last INDEX; 546 } 547 } 548 } 549 else { 550 $DistanceSquare = 0; 551 for $Index (0 .. 2) { 552 $DistanceSquare += ($CoordsRef->[$Index] - $OriginCoordsRef->[$Index])**2; 553 } 554 $Distance = sqrt $DistanceSquare; 555 $Status = ($Distance <= $OptionsInfo{MaxExtractionDistance}) ? 1 : 0; 556 } 557 558 return $Status; 559 } 560 561 # Write out modifed header and other older records... 562 sub WriteHeaderAndOlderRecords { 563 my($OutFileRef, $PDBRecordLinesRef) = @_; 564 565 if ($OptionsInfo{ModifyHeaderRecord}) { 566 # Write out modified HEADER record... 567 my($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef); 568 $Classification = 'Data extracted using MayaChemTools'; 569 print $OutFileRef GenerateHeaderRecordLine($IDCode, $Classification), "\n"; 570 } 571 else { 572 print $OutFileRef $PDBRecordLinesRef->[0], "\n"; 573 } 574 575 # Write out any old records... 576 if ($OptionsInfo{KeepOldRecords}) { 577 my($RecordLineIndex, $RecordLine); 578 # Skip HEADER record and write out older records all the way upto first MODEL/ATOM/HETATM records from input file... 579 RECORDLINE: for $RecordLineIndex (1 .. $#{$PDBRecordLinesRef}) { 580 $RecordLine = $PDBRecordLinesRef->[$RecordLineIndex]; 581 if (IsModelRecordType($RecordLine) || IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) { 582 last RECORDLINE; 583 } 584 print $OutFileRef "$RecordLine\n"; 585 } 586 } 587 } 588 589 # Get header record information assuming it's the first record... 590 sub GetHeaderRecordInformation { 591 my($PDBRecordLinesRef) = @_; 592 my($Classification, $DepositionDate, $IDCode, $HeaderRecordLine); 593 594 ($Classification, $DepositionDate, $IDCode) = ('') x 3; 595 $HeaderRecordLine = $PDBRecordLinesRef->[0]; 596 if (IsHeaderRecordType($HeaderRecordLine)) { 597 ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($HeaderRecordLine); 598 } 599 return ($Classification, $DepositionDate, $IDCode); 600 } 601 602 # Get one letter residue code... 603 sub GetResidueCode { 604 my($ResidueName) = @_; 605 my($ResidueCode, $StandardResidue); 606 607 $ResidueCode = $OptionsInfo{NonStandardSequenceCode}; 608 $StandardResidue = 0; 609 610 if (length($ResidueName) == 3) { 611 # Assume it's an amino acid... 612 if (AminoAcids::IsAminoAcid($ResidueName)) { 613 # Standard amino acid... 614 $ResidueCode = AminoAcids::GetAminoAcidOneLetterCode($ResidueName); 615 $StandardResidue = 1; 616 } 617 } 618 elsif (length($ResidueName) == 1) { 619 # Assume it's a nucleic acid... 620 if ($ResidueName =~ /^(A|G|T|U|C)$/i) { 621 $ResidueCode = $ResidueName; 622 $StandardResidue = 1; 623 } 624 } 625 626 return ($ResidueCode, $StandardResidue); 627 } 628 629 # Process option values... 630 sub ProcessOptions { 631 %OptionsInfo = (); 632 $OptionsInfo{Mode} = $Options{mode}; 633 634 my(@SpecifiedChains) = (); 635 if ($Options{chains} =~ /^(First|All)$/i) { 636 $OptionsInfo{ChainsToExtract} = $Options{chains}; 637 } 638 else { 639 @SpecifiedChains = split /\,/, $Options{chains}; 640 $OptionsInfo{ChainsToExtract} = 'Specified'; 641 } 642 @{$OptionsInfo{SpecifiedChains}} = (); 643 push @{$OptionsInfo{SpecifiedChains}}, @SpecifiedChains; 644 645 $OptionsInfo{CombineChainSequences} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0; 646 647 $OptionsInfo{MaxExtractionDistance} = $Options{distance}; 648 $OptionsInfo{ExtractionDistanceMode} = $Options{distancemode}; 649 $OptionsInfo{ExtractionDistanceOrigin} = $Options{distanceorigin} ? $Options{distanceorigin} : ''; 650 my(@SpecifiedDistanceOrigin) = (); 651 652 if ($OptionsInfo{Mode} =~ /^Distance$/i) { 653 if (!$Options{distanceorigin}) { 654 die "Error: You must specify a value for \"--distanceorigin\" option in \"Distance\" \"-m, --mode\". \n"; 655 } 656 @SpecifiedDistanceOrigin = split /\,/, $Options{distanceorigin}; 657 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) { 658 if (@SpecifiedDistanceOrigin != 2) { 659 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\" : The number of values must be 2.\n"; 660 } 661 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) { 662 die "Error: Invalid atom number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\". Allowed values: > 0\n"; 663 } 664 } 665 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Hetatm$/i) { 666 if (@SpecifiedDistanceOrigin != 2) { 667 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\" : The number of values must be 2.\n"; 668 } 669 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) { 670 die "Error: Invalid hetatm number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\". Allowed values: > 0\n"; 671 } 672 } 673 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { 674 if (!(@SpecifiedDistanceOrigin == 2 || @SpecifiedDistanceOrigin == 3)) { 675 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\" : The number of values must be either 2 or 3.\n"; 676 } 677 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) { 678 die "Error: Invalid residue number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\". Allowed values: > 0\n"; 679 } 680 } 681 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) { 682 if (@SpecifiedDistanceOrigin != 3) { 683 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\" : The number of values must be 3.\n"; 684 } 685 my($Value); 686 for $Value (@SpecifiedDistanceOrigin) { 687 if (!IsNumerical($Value)) { 688 die "Error: Invalid coordinate value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\". Allowed values: numerical\n"; 689 } 690 } 691 } 692 } 693 @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} = (); 694 push @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}, @SpecifiedDistanceOrigin; 695 696 $OptionsInfo{WaterResidueNames} = $Options{waterresiduenames}; 697 @{$OptionsInfo{SpecifiedWaterResiduesList}} = (); 698 %{$OptionsInfo{SpecifiedWaterResiduesMap}} = (); 699 700 my(@SpecifiedWaterResiduesList); 701 @SpecifiedWaterResiduesList = (); 702 703 if ($OptionsInfo{Mode} =~ /^NonWater$/i) { 704 my($WaterResidueName); 705 if ($OptionsInfo{WaterResidueNames} =~ /Automatic/i) { 706 push @SpecifiedWaterResiduesList, ('HOH', 'WAT', 'H2O'); 707 } 708 else { 709 @SpecifiedWaterResiduesList = split /\,/, $Options{waterresiduenames}; 710 } 711 for $WaterResidueName (@SpecifiedWaterResiduesList) { 712 $OptionsInfo{SpecifiedWaterResiduesMap}{$WaterResidueName} = $WaterResidueName; 713 } 714 } 715 push @{$OptionsInfo{SpecifiedWaterResiduesList}}, @SpecifiedWaterResiduesList; 716 717 718 $OptionsInfo{KeepOldRecords} = ($Options{keepoldrecords} =~ /^Yes$/i) ? 1 : 0; 719 720 $OptionsInfo{ModifyHeaderRecord} = ($Options{modifyheader} =~ /^Yes$/i) ? 1 : 0; 721 722 $OptionsInfo{KeepNonStandardSequences} = ($Options{nonstandardkeep} =~ /^Yes$/i) ? 1 : 0; 723 $OptionsInfo{NonStandardSequenceCode} = $Options{nonstandardcode}; 724 $OptionsInfo{MaxSequenceLength} = $Options{sequencelength}; 725 $OptionsInfo{SequenceRecordSource} = $Options{sequencerecords}; 726 $OptionsInfo{SequenceIDPrefixSource} = $Options{sequenceidprefix}; 727 728 $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0; 729 $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0; 730 } 731 732 # Retrieve information about PDB files... 733 sub RetrievePDBFilesInfo { 734 my($Index, $PDBFile, $PDBRecordLinesRef, $ChainID, $ChainLabel, $ChainsAndResiduesInfoRef, $Mode, $FileDir, $FileName, $FileExt, $OutFileName, $OutFileRoot, @SpecifiedChains, @DistanceOrigin, @OutFileNames, @ChainLabels, @ChainSequenceIDs, @ChainSequenceIDsPrefix); 735 736 %PDBFilesInfo = (); 737 @{$PDBFilesInfo{FileOkay}} = (); 738 @{$PDBFilesInfo{OutFileRoot}} = (); 739 @{$PDBFilesInfo{OutFileNames}} = (); 740 @{$PDBFilesInfo{ChainLabels}} = (); 741 @{$PDBFilesInfo{ChainSequenceIDs}} = (); 742 @{$PDBFilesInfo{ChainSequenceIDsPrefix}} = (); 743 @{$PDBFilesInfo{SpecifiedChains}} = (); 744 @{$PDBFilesInfo{DistanceOrigin}} = (); 745 746 FILELIST: for $Index (0 .. $#PDBFilesList) { 747 $PDBFilesInfo{FileOkay}[$Index] = 0; 748 749 $PDBFilesInfo{OutFileRoot}[$Index] = ''; 750 @{$PDBFilesInfo{OutFileNames}[$Index]} = (); 751 @{$PDBFilesInfo{OutFileNames}[$Index]} = (); 752 @{$PDBFilesInfo{ChainLabels}[$Index]} = (); 753 @{$PDBFilesInfo{ChainSequenceIDs}[$Index]} = (); 754 @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]} = (); 755 @{$PDBFilesInfo{SpecifiedChains}[$Index]} = (); 756 @{$PDBFilesInfo{DistanceOrigin}[$Index]} = (); 757 758 $PDBFile = $PDBFilesList[$Index]; 759 if (!(-e $PDBFile)) { 760 warn "Warning: Ignoring file $PDBFile: It doesn't exist\n"; 761 next FILELIST; 762 } 763 if (!CheckFileType($PDBFile, "pdb")) { 764 warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n"; 765 next FILELIST; 766 } 767 if (! open PDBFILE, "$PDBFile") { 768 warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n"; 769 next FILELIST; 770 } 771 close PDBFILE; 772 773 # Get PDB data... 774 $PDBRecordLinesRef = ReadPDBFile($PDBFile); 775 if ($OptionsInfo{Mode} =~ /^Sequences$/i && $OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) { 776 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes'); 777 } 778 else { 779 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef); 780 } 781 if (!scalar @{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 782 warn "Warning: Ignoring file $PDBFile: No chains found \n"; 783 next FILELIST; 784 } 785 786 # Make sure specified chains exist in PDB file... 787 @SpecifiedChains = (); 788 if ($OptionsInfo{ChainsToExtract} =~ /^Specified$/i) { 789 for $ChainID (@{$OptionsInfo{SpecifiedChains}}) { 790 if (exists $ChainsAndResiduesInfoRef->{Residues}{$ChainID}) { 791 push @SpecifiedChains, $ChainID; 792 } 793 else { 794 warn "Warning: Ignoring file $PDBFile: Specified chain, $ChainID, in \"-c, --chains\" option doesn't exist.\n"; 795 next FILELIST; 796 } 797 } 798 } 799 elsif ($OptionsInfo{ChainsToExtract} =~ /^First$/i) { 800 push @SpecifiedChains, $ChainsAndResiduesInfoRef->{ChainIDs}[0]; 801 } 802 elsif ($OptionsInfo{ChainsToExtract} =~ /^All$/i) { 803 push @SpecifiedChains, @{$ChainsAndResiduesInfoRef->{ChainIDs}}; 804 } 805 # Setup chain labels to use for sequence IDs and generating output files... 806 @ChainLabels = (); 807 for $ChainID (@SpecifiedChains) { 808 $ChainLabel = $ChainID; $ChainLabel =~ s/^None//ig; 809 $ChainLabel = "Chain${ChainLabel}"; 810 push @ChainLabels, $ChainLabel; 811 } 812 813 # Make sure specified distance origin is valid... 814 @DistanceOrigin = (); 815 if ($OptionsInfo{Mode} =~ /^Distance$/i) { 816 if ($OptionsInfo{ExtractionDistanceMode} =~ /^(Atom|Hetatm)$/i) { 817 my($RecordType, $SpecifiedAtomName, $SpecifiedAtomNumber, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine); 818 $RecordType = $OptionsInfo{ExtractionDistanceMode}; 819 ($SpecifiedAtomNumber, $SpecifiedAtomName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 820 $RecordFound = 0; 821 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 822 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 823 next LINE; 824 } 825 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); 826 $AtomName = RemoveLeadingAndTrailingWhiteSpaces($AtomName); 827 if (($RecordType =~ /^Atom$/i && IsAtomRecordType($RecordLine)) || ($RecordType =~ /^Hetatm$/i && IsHetatmRecordType($RecordLine))) { 828 if ($AtomNumber == $SpecifiedAtomNumber && $AtomName eq $SpecifiedAtomName) { 829 $RecordFound = 1; 830 last LINE; 831 } 832 } 833 } 834 if (!$RecordFound) { 835 warn "Warning: Ignoring file $PDBFile: ", uc($RecordType), " record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n"; 836 next FILELIST; 837 } 838 push @DistanceOrigin, ($X, $Y, $Z); 839 } 840 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { 841 my($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine); 842 $SpecifiedChainID = ''; 843 if (@{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} == 3) { 844 ($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 845 } 846 else { 847 ($SpecifiedResidueNumber, $SpecifiedResidueName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 848 } 849 $RecordFound = 0; 850 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 851 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 852 next LINE; 853 } 854 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); 855 $ResidueName = RemoveLeadingAndTrailingWhiteSpaces($ResidueName); 856 $ChainID = RemoveLeadingAndTrailingWhiteSpaces($ChainID); 857 if ($SpecifiedChainID && ($SpecifiedChainID ne $ChainID)) { 858 next LINE; 859 } 860 if ($ResidueNumber == $SpecifiedResidueNumber && $ResidueName eq $SpecifiedResidueName) { 861 # Store coordinates for all the atoms... 862 $RecordFound = 1; 863 push @DistanceOrigin, ($X, $Y, $Z); 864 next LINE; 865 } 866 } 867 if (!$RecordFound) { 868 warn "Warning: Ignoring file $PDBFile: ATOM/HETATM record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n"; 869 next FILELIST; 870 } 871 } 872 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) { 873 push @DistanceOrigin, @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 874 } 875 } 876 # Setup output file names... 877 @OutFileNames = (); 878 $FileDir = ""; $FileName = ""; $FileExt = ""; 879 ($FileDir, $FileName, $FileExt) = ParseFileName($PDBFile); 880 if ($OptionsInfo{OutFileRoot} && (@PDBFilesList == 1)) { 881 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot}); 882 if ($RootFileName && $RootFileExt) { 883 $FileName = $RootFileName; 884 } 885 else { 886 $FileName = $OptionsInfo{OutFileRoot}; 887 } 888 $OutFileRoot = $FileName; 889 } 890 else { 891 $OutFileRoot = $FileName; 892 } 893 $Mode = $OptionsInfo{Mode}; 894 if ($Mode =~ /^(Atoms|CAlphas|Distance|NonWater|NonHydrogens)$/i) { 895 $OutFileName = ''; 896 if ($Mode =~ /^CAlphas$