1 package PDBFileUtil; 2 # 3 # $RCSfile: PDBFileUtil.pm,v $ 4 # $Date: 2011/12/16 00:04:11 $ 5 # $Revision: 1.28 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2004-2012 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use Exporter; 31 use Text::ParseWords; 32 use TextUtil; 33 use FileUtil; 34 use TimeUtil (); 35 36 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 37 38 @ISA = qw(Exporter); 39 @EXPORT = qw(GetPDBRecordType GetRecordTypesCount GetAllResidues GetConectRecordLines GetChainsAndResidues GetMinMaxCoords IsPDBFile IsAtomRecordType IsConectRecordType IsHeaderRecordType IsHetatmRecordType IsSeqresRecordType IsModelRecordType IsEndmdlRecordType IsTerRecordType IsMasterRecordType ReadPDBFile ParseHeaderRecordLine GenerateHeaderRecordLine GenerateHeaderRecordTimeStamp ParseAtomRecordLine GenerateAtomRecordLine ParseAtomOrHetatmRecordLine GenerateAtomOrHetatmRecordLine GenerateHetatmRecordLine ParseHetatmRecordLine ParseConectRecordLine GenerateConectRecordLine ParseSeqresRecordLine ParseTerRecordLine GenerateTerRecordLine ParseMasterRecordLine GenerateEndRecordLine); 40 @EXPORT_OK = qw(); 41 42 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 43 44 # Get PDB record type... 45 sub GetPDBRecordType { 46 my($Line) = @_; 47 48 return _GetRecordType($Line); 49 } 50 51 # Is it a PDB file? 52 sub IsPDBFile { 53 my($PDBFile) = @_; 54 my($Line, $Status); 55 56 $Status = 0; 57 open PDBFILE, "$PDBFile" or die "Can't open $PDBFile: $!\n"; 58 $Line = GetTextLine(\*PDBFILE); 59 $Status = ($Line =~ /^HEADER/i) ? 1 : 0; 60 close PDBFILE; 61 62 return $Status; 63 } 64 65 # Is it a atom record type? 66 sub IsAtomRecordType { 67 my($Line) = @_; 68 69 return _IsRecordType($Line, 'ATOM'); 70 } 71 72 # Is it a connect record type? 73 sub IsConectRecordType { 74 my($Line) = @_; 75 76 return _IsRecordType($Line, 'CONECT'); 77 } 78 79 # Is it a header atom record type? 80 sub IsHeaderRecordType { 81 my($Line) = @_; 82 83 return _IsRecordType($Line, 'HEADER'); 84 } 85 86 # Is it a hetro atom record type? 87 sub IsHetatmRecordType { 88 my($Line) = @_; 89 90 return _IsRecordType($Line, 'HETATM'); 91 } 92 93 # Is it a seqres record type? 94 sub IsSeqresRecordType { 95 my($Line) = @_; 96 97 return _IsRecordType($Line, 'SEQRES'); 98 } 99 100 # Is it a MODEL record type? 101 sub IsModelRecordType { 102 my($Line) = @_; 103 104 return _IsRecordType($Line, 'MODEL'); 105 } 106 107 # Is it a ENDMDL record type? 108 sub IsEndmdlRecordType { 109 my($Line) = @_; 110 111 return _IsRecordType($Line, 'ENDMDL'); 112 } 113 114 # Is it a TER record type? 115 sub IsTerRecordType { 116 my($Line) = @_; 117 118 return _IsRecordType($Line, 'TER'); 119 } 120 121 # Is it a MASTER record type? 122 sub IsMasterRecordType { 123 my($Line) = @_; 124 125 return _IsRecordType($Line, 'MASTER'); 126 } 127 128 # Count the number of each record type and a reference to data type with these key/value pairs: 129 # {RecordTypes} - An array of unique record types in order of their presence in the file 130 # {Count}{$RecordType} - Count of each record type 131 # {Lines}{$RecordType} - Optional lines data for a specific record type. 132 # 133 sub GetRecordTypesCount { 134 my($PDBRecordLinesRef, $SpecifiedRecordType, $GetRecordLinesFlag, $RecordType, $RecordLine, %RecordTypeDataMap); 135 136 %RecordTypeDataMap = (); 137 @{$RecordTypeDataMap{RecordTypes}} = (); 138 %{$RecordTypeDataMap{Count}} = (); 139 %{$RecordTypeDataMap{Lines}} = (); 140 141 $SpecifiedRecordType = ''; 142 $GetRecordLinesFlag = 0; 143 if (@_ == 3) { 144 ($PDBRecordLinesRef, $SpecifiedRecordType, $GetRecordLinesFlag) = @_; 145 $SpecifiedRecordType = uc $SpecifiedRecordType; 146 } 147 elsif (@_ == 2) { 148 ($PDBRecordLinesRef, $SpecifiedRecordType) = @_; 149 $SpecifiedRecordType = uc $SpecifiedRecordType; 150 } 151 else { 152 ($PDBRecordLinesRef) = @_; 153 } 154 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 155 $RecordType = _GetRecordType($RecordLine); 156 if ($SpecifiedRecordType && ($SpecifiedRecordType ne $RecordType)) { 157 next LINE; 158 } 159 if (exists $RecordTypeDataMap{Count}{$RecordType}) { 160 # Update count... 161 $RecordTypeDataMap{Count}{$RecordType} += 1; 162 163 if ($GetRecordLinesFlag) { 164 push @{$RecordTypeDataMap{Lines}{$RecordType}}, $RecordLine; 165 } 166 } 167 else { 168 # New record type... 169 push @{$RecordTypeDataMap{RecordTypes}}, $RecordType; 170 $RecordTypeDataMap{Count}{$RecordType} = 1; 171 172 if ($GetRecordLinesFlag) { 173 @{$RecordTypeDataMap{Lines}{$RecordType}} = (); 174 push @{$RecordTypeDataMap{Lines}{$RecordType}}, $RecordLine; 175 } 176 } 177 } 178 return (\%RecordTypeDataMap); 179 } 180 181 # Collect CONECT record lines for specific atom number, modified specified data to exclude any atom 182 # number not present in the list of specified atom numbers and return a reference to list of 183 # CONECT record lines. 184 # 185 sub GetConectRecordLines { 186 my($PDBRecordLinesRef, $AtomNumbersMapRef) = @_; 187 my($AtomNumber, $ConectAtomNumber, $RecordLine, @ConectRecordAtomNums, @ConectRecordLines); 188 189 @ConectRecordLines = (); 190 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 191 if (!IsConectRecordType($RecordLine)) { 192 next LINE; 193 } 194 @ConectRecordAtomNums = (); 195 push @ConectRecordAtomNums, ParseConectRecordLine($RecordLine); 196 ATOMNUMBER: for $ConectAtomNumber (@ConectRecordAtomNums) { 197 if (defined $ConectAtomNumber) { 198 $AtomNumber = $ConectAtomNumber; 199 if ($AtomNumber) { 200 if (! exists $AtomNumbersMapRef->{$AtomNumber}) { 201 next LINE; 202 } 203 } 204 } 205 } 206 push @ConectRecordLines, $RecordLine; 207 } 208 return \@ConectRecordLines; 209 } 210 211 # Get chains and residue information using ATOM/HETATM or SEQRES records. And return a reference to a 212 # hash with these keys: 213 # 214 # @{$ChainsDataMap{ChainIDs}} - List of chain IDs with 'None' for no chain identification 215 # @{$ChainsDataMap{Residues}{$ChainID}} - List of residues in order of their appearance in a chain 216 # @{$ChainsDataMap{ResidueNumbers}{$ChainID}} - List of residue numbers in order of their appearance in a chain 217 # %{$ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}} - Count of specific residues in a chain 218 # 219 # Notes: 220 # . Chains and residue data can be extacted using either ATOM/HETATM records or SEQRES records. 221 # . In addition to a different chain ID in ATOM/HETATM a TER record also indicates end of an existing chain 222 # and start of a new one: ChainID in ATOM/HETATM records might still be emtpy. 223 # . ATOM/HETATM records after the first ENDMDL records are simply ingnored. 224 # 225 sub GetChainsAndResidues { 226 my($PDBRecordLinesRef, $RecordsSource, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag); 227 228 $RecordsSource = 'AtomAndHetatm'; 229 $GetChainResiduesBeyondTERFlag = 0; 230 $GetRecordLinesFlag = 0; 231 232 if (@_ == 4) { 233 ($PDBRecordLinesRef, $RecordsSource, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag) = @_; 234 } 235 elsif (@_ == 3) { 236 ($PDBRecordLinesRef, $RecordsSource, $GetChainResiduesBeyondTERFlag) = @_; 237 } 238 elsif (@_ == 2) { 239 ($PDBRecordLinesRef, $RecordsSource) = @_; 240 } 241 else { 242 ($PDBRecordLinesRef) = @_; 243 } 244 245 if ($RecordsSource =~ /^AtomAndHetatm$/i) { 246 return _GetChainsAndResiduesFromAtomHetatmRecords($PDBRecordLinesRef, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag); 247 } 248 elsif ($RecordsSource =~ /^Seqres$/i) { 249 return _GetChainsAndResiduesFromSeqresRecords($PDBRecordLinesRef); 250 } 251 else { 252 my(%ChainsDataMap); 253 %ChainsDataMap = (); 254 @{$ChainsDataMap{ChainIDs}} = (); 255 %{$ChainsDataMap{Residues}} = (); 256 %{$ChainsDataMap{ResidueNumbers}} = (); 257 %{$ChainsDataMap{ResidueCount}} = (); 258 259 return \%ChainsDataMap; 260 } 261 } 262 263 264 # Get residue information using ATOM/HETATM records and return a reference to a hash with 265 # these keys: 266 # 267 # @{$ResiduesDataMap{ResidueNames}} - List of all the residues 268 # %{$ResiduesDataMap{ResidueCount}{$ResidueName}} - Count of residues 269 # @{$ResiduesDataMap{AtomResidueNames}} - List of all the residues 270 # %{$ResiduesDataMap{AtomResidueCount}{$ResidueName}} - Count of residues in ATOM records 271 # @{$ResiduesDataMap{HetatomResidueNames}} - List of all the residues 272 # %{$ResiduesDataMap{HetatmResidueCount}{$ResidueName}} - Count of residues HETATM records 273 # 274 # Notes: 275 # . ATOM/HETATM records after the first ENDMDL records are simply ingnored. 276 # 277 sub GetAllResidues { 278 my($PDBRecordLinesRef) = @_; 279 280 my($PreviousChainID, $PreviousResidueNumber, $RecordLine, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, %ResiduesDataMap); 281 282 %ResiduesDataMap = (); 283 @{$ResiduesDataMap{ResidueNames}} = (); 284 %{$ResiduesDataMap{ResidueCount}} = (); 285 @{$ResiduesDataMap{AtomResidueNames}} = (); 286 %{$ResiduesDataMap{AtomResidueCount}} = (); 287 @{$ResiduesDataMap{HetatmResidueNames}} = (); 288 %{$ResiduesDataMap{HetatmResidueCount}} = (); 289 290 $PreviousChainID = ''; 291 $PreviousResidueNumber = 0; 292 293 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 294 if (IsEndmdlRecordType($RecordLine)) { 295 last LINE; 296 } 297 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 298 next LINE; 299 } 300 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); 301 302 if ($PreviousChainID eq $ChainID) { 303 if ($ResidueNumber == $PreviousResidueNumber) { 304 next LINE; 305 } 306 $PreviousResidueNumber = $ResidueNumber; 307 } 308 else { 309 # New chain... 310 $PreviousChainID = $ChainID; 311 $PreviousResidueNumber = $ResidueNumber; 312 } 313 314 # Store the residue and update its count... 315 push @{$ResiduesDataMap{ResidueNames}}, $ResidueName; 316 if (exists $ResiduesDataMap{ResidueCount}{$ResidueName}) { 317 $ResiduesDataMap{ResidueCount}{$ResidueName} += 1; 318 } 319 else { 320 $ResiduesDataMap{ResidueCount}{$ResidueName} = 1; 321 } 322 # Update ATOM residue data... 323 if (IsAtomRecordType($RecordLine)) { 324 push @{$ResiduesDataMap{AtomResidueNames}}, $ResidueName; 325 if (exists $ResiduesDataMap{AtomResidueCount}{$ResidueName}) { 326 $ResiduesDataMap{AtomResidueCount}{$ResidueName} += 1; 327 } 328 else { 329 $ResiduesDataMap{AtomResidueCount}{$ResidueName} = 1; 330 } 331 } 332 # Update HETATM residue data... 333 if (IsHetatmRecordType($RecordLine)) { 334 push @{$ResiduesDataMap{HetatmResidueNames}}, $ResidueName; 335 if (exists $ResiduesDataMap{HetatmResidueCount}{$ResidueName}) { 336 $ResiduesDataMap{HetatmResidueCount}{$ResidueName} += 1; 337 } 338 else { 339 $ResiduesDataMap{HetatmResidueCount}{$ResidueName} = 1; 340 } 341 } 342 } 343 344 return \%ResiduesDataMap; 345 } 346 347 # Return min/max XYZ coordinates for ATOM/HETATM records... 348 sub GetMinMaxCoords { 349 my($PDBRecordLinesRef) = @_; 350 351 my($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax, $RecordLine, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); 352 353 ($XMin, $YMin, $ZMin) = (99999) x 3; 354 ($XMax, $YMax, $ZMax) = (-99999) x 3; 355 356 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 357 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 358 next LINE; 359 } 360 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); 361 362 $XMin = ($X < $XMin) ? $X : $XMin; 363 $YMin = ($Y < $YMin) ? $Y : $YMin; 364 $ZMin = ($Z < $ZMin) ? $Z : $ZMin; 365 366 $XMax = ($X > $XMax) ? $X : $XMax; 367 $YMax = ($Y > $YMax) ? $Y : $YMax; 368 $ZMax = ($Z > $ZMax) ? $Z : $ZMax; 369 } 370 371 if ($XMin == 99999) { $XMin = undef; } 372 if ($YMin == 99999) { $YMin = undef; } 373 if ($ZMin == 99999) { $ZMin = undef; } 374 if ($XMax == -99999) { $XMax = undef; } 375 if ($YMax == -99999) { $YMax = undef; } 376 if ($ZMax == -99999) { $ZMax = undef; } 377 378 return ($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax); 379 } 380 381 # Read PDB file and return reference to record lines.. 382 sub ReadPDBFile { 383 my($PDBFile) = @_; 384 385 my($Line, @PDBRecordLines); 386 387 @PDBRecordLines = (); 388 open PDBFILE, "$PDBFile" or die "Can't open $PDBFile: $!\n"; 389 while ($Line = GetTextLine(\*PDBFILE)) { 390 push @PDBRecordLines, $Line; 391 } 392 393 close PDBFILE; 394 395 return (\@PDBRecordLines); 396 } 397 398 # 399 # Parse HEADER record line... 400 sub ParseHeaderRecordLine { 401 my($Line) = @_; 402 my($Classification, $DepositionDate, $IDCode) = (undef, undef, undef); 403 404 if ($Line !~ /^HEADER/i) { 405 return ($Classification, $DepositionDate, $IDCode); 406 } 407 ($Classification, $DepositionDate, $IDCode) = unpack("x10A40A9x3A4", $Line); 408 $Classification = RemoveLeadingAndTrailingWhiteSpaces($Classification); 409 $DepositionDate =~ s/ //g; 410 $IDCode =~ s/ //g; 411 412 return ($Classification, $DepositionDate, $IDCode); 413 } 414 415 # 416 # Generate HEADER record line... 417 sub GenerateHeaderRecordLine { 418 my($Classification, $Date, $IDCode, $Line); 419 420 $Classification = "Created using MayaChemTools"; 421 $Date = GenerateHeaderRecordTimeStamp(); 422 if (@_ == 3) { 423 ($IDCode, $Classification, $Date) = @_; 424 } 425 elsif (@_ == 2) { 426 ($IDCode, $Classification) = @_; 427 } 428 elsif (@_ == 1) { 429 ($IDCode) = @_; 430 } 431 432 $Line = sprintf "HEADER %-40.40s%9.9s %4.4s", $Classification, $Date, $IDCode; 433 return $Line; 434 } 435 436 # Generate PDB header time stamp... 437 sub GenerateHeaderRecordTimeStamp { 438 return TimeUtil::PDBFileTimeStamp(); 439 } 440 441 # 442 # Parse ATOM record line. 443 # 444 sub ParseAtomRecordLine { 445 my($Line) = @_; 446 447 return _ParseAtomOrHetatmRecordLine($Line); 448 } 449 450 # Generate ATOM record line... 451 sub GenerateAtomRecordLine { 452 my($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_; 453 454 return _GenerateAtomOrHetatmRecordLine('ATOM', $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); 455 } 456 457 # 458 # Parse ATOM/HETATm record line. 459 # 460 sub ParseAtomOrHetatmRecordLine { 461 my($Line) = @_; 462 463 return _ParseAtomOrHetatmRecordLine($Line); 464 } 465 466 # Generate ATOM/HETATM record line... 467 sub GenerateAtomOrHetatmRecordLine { 468 my($RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_; 469 470 return _GenerateAtomOrHetatmRecordLine($RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); 471 } 472 # 473 # Parse HETATM record line... 474 # 475 sub ParseHetatmRecordLine { 476 my($Line) = @_; 477 478 return _ParseAtomOrHetatmRecordLine($Line); 479 } 480 481 # Generate HETATM record line... 482 sub GenerateHetatmRecordLine { 483 my($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_; 484 485 return _GenerateAtomOrHetatmRecordLine('HETATM', $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); 486 } 487 # 488 # Parse SEQRES record line... 489 # 490 # SEQRES format: 491 # 492 # 1 - 6 Record name "SEQRES" 493 # 9 - 10 Serial number of the SEQRES record for the current chain. Starts at 1 and increments by one each line. Reset to 1 for each chain. 494 # 12 - Chain identifier 495 # 14 - 17 Integer numRes Number of residues in the chain 496 # 20 - 22 24 -26 ... ... 68 - 70 Residue name resName Residue name. 497 # 498 sub ParseSeqresRecordLine { 499 my($Line) = @_; 500 501 if ($Line !~ /^SEQRES/i) { 502 return ((undef) x 5); 503 } 504 my($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNames) = unpack("x8A2x1A1x1A4x2A51" , $Line); 505 $RecordSerialNumber =~ s/ //g; 506 $ChainID =~ s/ //g; 507 $NumOfResidues =~ s/ //g; 508 $ResidueNames = RemoveLeadingAndTrailingWhiteSpaces($ResidueNames); 509 510 return ($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNames); 511 } 512 513 # 514 # Parse CONECT record line... 515 # 516 # CONECT format: 517 # 518 # 1 - 6 Record name "CONECT" 519 # 7 - 11 Atom number 520 # 12 - 16, 17 - 21, 22 - 26, 27 - 31 Atom number of bonded atom 521 # 522 # 32 - 36, 37 - 41 Atom number of hydrogen bonded atom 523 # 42 - 46 Atom number of salt bridged atom 524 # 47 - 51, 52 -56 Atom number of hydrogen bonded atom 525 # 57 - 61 Atom number of salt bridged atom 526 # 527 sub ParseConectRecordLine { 528 my($Line) = @_; 529 530 if ($Line !~ /^CONECT/i) { 531 return ((undef) x 11); 532 } 533 my($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2) = map {s/ //g; $_} unpack("x6A5A5A5A5A5A5A5A5A5A5A5", $Line); 534 535 return ($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2); 536 } 537 538 # Generate CONECT record line... 539 sub GenerateConectRecordLine { 540 my($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2) = @_; 541 my($Line); 542 543 $Line = sprintf "CONECT%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s", $AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2; 544 545 return $Line; 546 } 547 548 # 549 # Parse TER record line... 550 # 551 # TER format: 552 # 553 #1 - 6 Record name "TER " 554 # 7 - 11 Serial number 555 # 18 - 20 Residue name 556 # 22 Chain identifier 557 # 23 - 26 Residue sequence number 558 # 27 Insertion code 559 # 560 sub ParseTerRecordLine { 561 my($Line) = @_; 562 563 if ($Line !~ /^TER/i) { 564 return ((undef) x 5); 565 } 566 my($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) = map {s/ //g; $_} unpack("x6A5x6A3xA1A4A1", $Line); 567 568 return ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode); 569 } 570 571 # Generate TER record line... 572 sub GenerateTerRecordLine { 573 my($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $Line) = ('') x 6; 574 575 if (@_ == 5) { 576 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) = @_; 577 } 578 elsif (@_ == 4) { 579 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber) = @_; 580 } 581 elsif (@_ == 3) { 582 ($SerialNumber, $ResidueName) = @_; 583 } 584 elsif (@_ == 2) { 585 ($SerialNumber, $ResidueName) = @_; 586 } 587 elsif (@_ == 1) { 588 ($SerialNumber) = @_; 589 } 590 $Line = sprintf "TER %5.5s %-3.3s %1.1s%4.4s%1.1s", $SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode; 591 592 return $Line; 593 } 594 595 # 596 # Parse MASTER record line... 597 # 598 # MASTER record format: 599 # 600 #1 - 6 Record name "MASTER" 601 # 11 - 15 Number of REMARK records 602 # 16 - 20 "0" 603 # 21 - 25 Number of HET records 604 # 26 - 30 Number of HELIX records 605 # 31 - 35 Number of SHEET records 606 # 36 - 40 Number of TURN records 607 # 41 - 45 Number of SITE records 608 # 46 - 50 Number of coordinate transformation records (ORIGXn+SCALEn+MTRIXn) 609 # 51 - 55 Number of atomic coordinate records (ATOM+HETATM) 610 # 56 - 60 Number of TER records 611 # 61 - 65 Number of CONECT records 612 # 66 - 70 Number of SEQRES records 613 # 614 sub ParseMasterRecordLine { 615 my($Line) = @_; 616 617 if ($Line !~ /^MASTER/i) { 618 return ((undef) x 11); 619 } 620 my($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = map {s/ //g; $_} unpack("x6x4A5x5A5A5A5A5A5A5A5A5A5A5", $Line); 621 622 return ($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords); 623 } 624 625 # End record... 626 sub GenerateEndRecordLine { 627 my($Line); 628 $Line = 'END '; 629 return $Line; 630 } 631 632 # ATOM/HETATM record format: 633 # 634 # 1 - 6 Record name 635 # 7 - 11 Atom serial number - right justified 636 # 13 - 16 Atom name 637 # 17 Alternate location indicator. 638 # 18 - 20 Residue name - right justified 639 # 22 Chain identifier. 640 # 23 - 26 Residue sequence number - right justified 641 # 27 Code for insertion of residues. 642 # 31 - 38 Real(8.3), Orthogonal coordinates for X in Angstroms. 643 # 39 - 46 Real(8.3), Orthogonal coordinates for Y in Angstroms. 644 # 47 - 54 Real(8.3), Orthogonal coordinates for Z in Angstroms. 645 # 55 - 60 Real(6.2), Occupancy 646 # 61 - 66 Real(6.2), Temperature factor 647 # 73 - 76 LString(4), Segment identifier, left-justified. 648 # 77 - 78 LString(2), Element symbol, right-justified. 649 #79 - 80 LString(2), Charge on the atom. 650 # 651 # Notes: 652 # . Atom names starting with C, N, O and S are left justified starting with column 14 653 # and others are left justified starting with column 13. 654 # 655 sub _ParseAtomOrHetatmRecordLine { 656 my($Line) = @_; 657 658 if ($Line !~ /^(ATOM|HETATM)/i) { 659 return ((undef) x 15); 660 } 661 my($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, $Length); 662 663 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ('') x 15; 664 665 $Length = length $Line; 666 667 if ($Length < 60) { 668 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = map {s/ //g; $_} unpack("x6A5xA4A1A3xA1A4A1x3A8A8A8", $Line); 669 } 670 else { 671 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = map {s/ //g; $_} unpack("x6A5xA4A1A3xA1A4A1x3A8A8A8A6A6x6A4A2A2", $Line); 672 } 673 return($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); 674 } 675 676 # Generate ATOM/HETATM record line... 677 sub _GenerateAtomOrHetatmRecordLine { 678 my($RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_; 679 my($Line, $AtomNameFormat); 680 681 if ($AtomName =~ /^(C|N|O|S)/i) { 682 # Left justified starting at column 14... 683 $AtomNameFormat = " %-3.3s"; 684 } 685 else { 686 $AtomNameFormat = "%-4.4s"; 687 } 688 $Line = sprintf "%-6.6s%5.5s ${AtomNameFormat}%1.1s%3.3s %1.1s%4.4s%1.1s %8.8s%8.8s%8.8s%6.6s%6.6s %-4.4s%2.2s%2.2s", $RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge; 689 690 return $Line; 691 } 692 693 # Check record type... 694 sub _IsRecordType { 695 my($Line, $SpecifiedType) = @_; 696 my($Type, $Status); 697 698 ($Type) = map {s/ //g; $_} unpack("A6", $Line); 699 700 $Status = ($SpecifiedType eq $Type) ? 1 : 0; 701 702 return $Status; 703 } 704 705 # Get record type... 706 sub _GetRecordType { 707 my($Line) = @_; 708 my($Type); 709 710 ($Type) = map {s/ //g; $_} unpack("A6", $Line); 711 712 return $Type; 713 } 714 715 # Get chains and residues data using ATOM/HETATM records... 716 # 717 sub _GetChainsAndResiduesFromAtomHetatmRecords { 718 my($PDBRecordLinesRef, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag) = @_; 719 720 my($LineCount, $TotalChainCount, $PreviousResidueNumber, $ChainCount, $DefaultChainID, $DefaultChainLabel, $RecordLine, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, %ChainsDataMap); 721 722 # Do a quick chain count using TER record... 723 $TotalChainCount = 0; 724 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 725 if (IsEndmdlRecordType($RecordLine)) { 726 last LINE; 727 } 728 if (IsTerRecordType($RecordLine)) { 729 $TotalChainCount++; 730 } 731 } 732 733 %ChainsDataMap = (); 734 @{$ChainsDataMap{ChainIDs}} = (); 735 %{$ChainsDataMap{Residues}} = (); 736 %{$ChainsDataMap{ResidueNumbers}} = (); 737 %{$ChainsDataMap{Lines}} = (); 738 %{$ChainsDataMap{ResidueCount}} = (); 739 740 $LineCount = 0; 741 $ChainCount = 0; 742 $DefaultChainLabel = 'None'; 743 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1); 744 $PreviousResidueNumber = 0; 745 746 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 747 $LineCount++; 748 if (IsTerRecordType($RecordLine)) { 749 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1); 750 $ChainCount++; 751 if ($ChainCount == $TotalChainCount) { 752 last LINE; 753 } 754 else { 755 next LINE; 756 } 757 } 758 elsif (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 759 next LINE; 760 } 761 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); 762 763 if (IsEmpty($ChainID)) { 764 $ChainID = $DefaultChainID; 765 } 766 if (exists $ChainsDataMap{Residues}{$ChainID}) { 767 # Data for existing chain... 768 if ($GetRecordLinesFlag) { 769 push @{$ChainsDataMap{Lines}{$ChainID}}, $RecordLine; 770 } 771 772 if ($ResidueNumber != $PreviousResidueNumber) { 773 # Next residue with in the chain... 774 push @{$ChainsDataMap{Residues}{$ChainID}}, $ResidueName; 775 push @{$ChainsDataMap{ResidueNumbers}{$ChainID}}, $ResidueNumber; 776 777 if (exists $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}) { 778 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} += 1; 779 } 780 else { 781 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1; 782 } 783 $PreviousResidueNumber = $ResidueNumber; 784 } 785 } 786 else { 787 # Data for new chain... 788 push @{$ChainsDataMap{ChainIDs}}, $ChainID; 789 790 @{$ChainsDataMap{Residues}{$ChainID}} = (); 791 push @{$ChainsDataMap{Residues}{$ChainID}}, $ResidueName; 792 793 @{$ChainsDataMap{ResidueNumbers}{$ChainID}} = (); 794 push @{$ChainsDataMap{ResidueNumbers}{$ChainID}}, $ResidueNumber; 795 796 @{$ChainsDataMap{Lines}{$ChainID}} = (); 797 if ($GetRecordLinesFlag) { 798 push @{$ChainsDataMap{Lines}{$ChainID}}, $RecordLine; 799 } 800 801 %{$ChainsDataMap{ResidueCount}{$ChainID}} = (); 802 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1; 803 $PreviousResidueNumber = $ResidueNumber; 804 } 805 } 806 if (!$GetChainResiduesBeyondTERFlag) { 807 return \%ChainsDataMap; 808 } 809 # Look for any HETATM residues specified outside TER records which could belong to an existing chain... 810 my($LineIndex, $PreviousChainID); 811 $PreviousChainID = ''; 812 $PreviousResidueNumber = 0; 813 LINE: for $LineIndex (($LineCount - 1) .. $#{$PDBRecordLinesRef}) { 814 $RecordLine = $PDBRecordLinesRef->[$LineIndex]; 815 if (IsEndmdlRecordType($RecordLine)) { 816 last LINE; 817 } 818 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 819 next LINE; 820 } 821 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); 822 if (IsEmpty($ChainID)) { 823 # Ignore the chains with no ids... 824 next LINE; 825 } 826 if (! exists($ChainsDataMap{Residues}{$ChainID})) { 827 # Don't collect any new chains after TER record... 828 next LINE; 829 } 830 if ($GetRecordLinesFlag) { 831 push @{$ChainsDataMap{Lines}{$ChainID}}, $RecordLine; 832 } 833 if ($ResidueNumber != $PreviousResidueNumber || $ChainID ne $PreviousChainID) { 834 835 push @{$ChainsDataMap{Residues}{$ChainID}}, $ResidueName; 836 push @{$ChainsDataMap{ResidueNumbers}{$ChainID}}, $ResidueNumber; 837 838 if (exists $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}) { 839 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} += 1; 840 } 841 else { 842 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1; 843 } 844 $PreviousChainID = $ChainID; 845 $PreviousResidueNumber = $ResidueNumber; 846 } 847 } 848 return \%ChainsDataMap; 849 } 850 851 # Get chains and residues data using SEQRES records... 852 # 853 sub _GetChainsAndResiduesFromSeqresRecords { 854 my($PDBRecordLinesRef) = @_; 855 856 my($ChainCount, $DefaultChainLabel, $DefaultChainID, $RecordLine, $RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueName, $ResidueNamesString, @ResidueNamesList, %ChainsDataMap); 857 858 %ChainsDataMap = (); 859 @{$ChainsDataMap{ChainIDs}} = (); 860 %{$ChainsDataMap{Residues}} = (); 861 %{$ChainsDataMap{ResidueNumbers}} = (); 862 %{$ChainsDataMap{ResidueCount}} = (); 863 864 $ChainCount = 0; 865 $DefaultChainLabel = 'None'; 866 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1); 867 868 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 869 if (!IsSeqresRecordType($RecordLine)) { 870 next LINE; 871 } 872 ($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNamesString) = ParseSeqresRecordLine($RecordLine); 873 if ($RecordSerialNumber == 1) { 874 # Indicates start of a new chain... 875 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1); 876 $ChainCount++; 877 } 878 if (IsEmpty($ChainID)) { 879 $ChainID = $DefaultChainID; 880 } 881 # Process the residues... 882 @ResidueNamesList = split /[ ]+/, $ResidueNamesString; 883 884 if (exists $ChainsDataMap{Residues}{$ChainID}) { 885 # Data for existing chain... 886 push @{$ChainsDataMap{Residues}{$ChainID}}, @ResidueNamesList; 887 } 888 else { 889 # Data for new chain... 890 push @{$ChainsDataMap{ChainIDs}}, $ChainID; 891 @{$ChainsDataMap{Residues}{$ChainID}} = (); 892 push @{$ChainsDataMap{Residues}{$ChainID}}, @ResidueNamesList; 893 } 894 895 # Setup residue count... 896 for $ResidueName (@ResidueNamesList) { 897 if (exists $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}) { 898 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} += 1; 899 } 900 else { 901 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1; 902 } 903 } 904 } 905 return \%ChainsDataMap; 906 } 907