1 package PDBFileUtil; 2 # 3 # $RCSfile: PDBFileUtil.pm,v $ 4 # $Date: 2008/04/19 16:11:02 $ 5 # $Revision: 1.18 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 use 5.006; 29 use strict; 30 use Exporter; 31 use Text::ParseWords; 32 use TextUtil; 33 use FileUtil; 34 35 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 36 37 $VERSION = '1.00'; 38 @ISA = qw(Exporter); 39 @EXPORT = qw(GetPDBRecordType GetRecordTypesCount GetAllResidues GetConectRecordLines GetChainsAndResidues GetMinMaxCoords IsPDBFile IsAtomRecordType IsConectRecordType IsHeaderRecordType IsHetatmRecordType IsSeqresRecordType IsModelRecordType IsEndmdlRecordType IsTerRecordType IsMasterRecordType ReadPDBFile ParseHeaderRecordLine GenerateHeaderRecordLine GenerateHeaderRecordTimeStamp ParseAtomRecordLine GenerateAtomRecordLine ParseAtomOrHetatmRecordLine GenerateAtomOrHetatmRecordLine GenerateHetatmRecordLine ParseHetatmRecordLine ParseConectRecordLine GenerateConectRecordLine ParseSeqresRecordLine ParseTerRecordLine GenerateTerRecordLine ParseMasterRecordLine GenerateEndRecordLine); 40 @EXPORT_OK = qw(); 41 42 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 43 44 # Get PDB record type... 45 sub GetPDBRecordType { 46 my($Line) = @_; 47 48 return _GetRecordType($Line); 49 } 50 51 # Is it a PDB file? 52 sub IsPDBFile { 53 my($PDBFile) = @_; 54 my($Line, $Status); 55 56 $Status = 0; 57 open PDBFILE, "$PDBFile" or die "Can't open $PDBFile: $!\n"; 58 $Line = GetTextLine(\*PDBFILE); 59 $Status = ($Line =~ /^HEADER/i) ? 1 : 0; 60 close PDBFILE; 61 62 return $Status; 63 } 64 65 # Is it a atom record type? 66 sub IsAtomRecordType { 67 my($Line) = @_; 68 69 return _IsRecordType($Line, 'ATOM'); 70 } 71 72 # Is it a connect record type? 73 sub IsConectRecordType { 74 my($Line) = @_; 75 76 return _IsRecordType($Line, 'CONECT'); 77 } 78 79 # Is it a header atom record type? 80 sub IsHeaderRecordType { 81 my($Line) = @_; 82 83 return _IsRecordType($Line, 'HEADER'); 84 } 85 86 # Is it a hetro atom record type? 87 sub IsHetatmRecordType { 88 my($Line) = @_; 89 90 return _IsRecordType($Line, 'HETATM'); 91 } 92 93 # Is it a seqres record type? 94 sub IsSeqresRecordType { 95 my($Line) = @_; 96 97 return _IsRecordType($Line, 'SEQRES'); 98 } 99 100 # Is it a MODEL record type? 101 sub IsModelRecordType { 102 my($Line) = @_; 103 104 return _IsRecordType($Line, 'MODEL'); 105 } 106 107 # Is it a ENDMDL record type? 108 sub IsEndmdlRecordType { 109 my($Line) = @_; 110 111 return _IsRecordType($Line, 'ENDMDL'); 112 } 113 114 # Is it a TER record type? 115 sub IsTerRecordType { 116 my($Line) = @_; 117 118 return _IsRecordType($Line, 'TER'); 119 } 120 121 # Is it a MASTER record type? 122 sub IsMasterRecordType { 123 my($Line) = @_; 124 125 return _IsRecordType($Line, 'MASTER'); 126 } 127 128 # Count the number of each record type and a reference to data type with these key/value pairs: 129 # {RecordTypes} - An array of unique record types in order of their presence in the file 130 # {Count}{$RecordType} - Count of each record type 131 # {Lines}{$RecordType} - Optional lines data for a specific record type. 132 # 133 sub GetRecordTypesCount { 134 my($PDBRecordLinesRef, $SpecifiedRecordType, $GetRecordLinesFlag, $RecordType, $RecordLine, %RecordTypeDataMap); 135 136 %RecordTypeDataMap = (); 137 @{$RecordTypeDataMap{RecordTypes}} = (); 138 %{$RecordTypeDataMap{Count}} = (); 139 %{$RecordTypeDataMap{Lines}} = (); 140 141 $SpecifiedRecordType = ''; 142 $GetRecordLinesFlag = 0; 143 if (@_ == 3) { 144 ($PDBRecordLinesRef, $SpecifiedRecordType, $GetRecordLinesFlag) = @_; 145 $SpecifiedRecordType = uc $SpecifiedRecordType; 146 } 147 elsif (@_ == 2) { 148 ($PDBRecordLinesRef, $SpecifiedRecordType) = @_; 149 $SpecifiedRecordType = uc $SpecifiedRecordType; 150 } 151 else { 152 ($PDBRecordLinesRef) = @_; 153 } 154 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 155 $RecordType = _GetRecordType($RecordLine); 156 if ($SpecifiedRecordType && ($SpecifiedRecordType ne $RecordType)) { 157 next LINE; 158 } 159 if (exists $RecordTypeDataMap{Count}{$RecordType}) { 160 # Update count... 161 $RecordTypeDataMap{Count}{$RecordType} += 1; 162 163 if ($GetRecordLinesFlag) { 164 push @{$RecordTypeDataMap{Lines}{$RecordType}}, $RecordLine; 165 } 166 } 167 else { 168 # New record type... 169 push @{$RecordTypeDataMap{RecordTypes}}, $RecordType; 170 $RecordTypeDataMap{Count}{$RecordType} = 1; 171 172 if ($GetRecordLinesFlag) { 173 @{$RecordTypeDataMap{Lines}{$RecordType}} = (); 174 push @{$RecordTypeDataMap{Lines}{$RecordType}}, $RecordLine; 175 } 176 } 177 } 178 return (\%RecordTypeDataMap); 179 } 180 181 # Collect CONECT record lines for specific atom number, modified specified data to exclude any atom 182 # number not present in the list of specified atom numbers and return a reference to list of 183 # CONECT record lines. 184 # 185 sub GetConectRecordLines { 186 my($PDBRecordLinesRef, $AtomNumbersMapRef) = @_; 187 my($AtomNumber, $ConectAtomNumber, $RecordLine, @ConectRecordAtomNums, @ConectRecordLines); 188 189 @ConectRecordLines = (); 190 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 191 if (!IsConectRecordType($RecordLine)) { 192 next LINE; 193 } 194 @ConectRecordAtomNums = (); 195 push @ConectRecordAtomNums, ParseConectRecordLine($RecordLine); 196 ATOMNUMBER: for $ConectAtomNumber (@ConectRecordAtomNums) { 197 if (defined $ConectAtomNumber) { 198 $AtomNumber = $ConectAtomNumber; 199 if ($AtomNumber) { 200 if (! exists $AtomNumbersMapRef->{$AtomNumber}) { 201 next LINE; 202 } 203 } 204 } 205 } 206 push @ConectRecordLines, $RecordLine; 207 } 208 return \@ConectRecordLines; 209 } 210 211 # Get chains and residue information using ATOM/HETATM or SEQRES records. And return a reference to a 212 # hash with these keys: 213 # 214 # @{$ChainsDataMap{ChainIDs}} - List of chain IDs with 'None' for no chain identification 215 # @{$ChainsDataMap{Residues}{$ChainID}} - List of residues in order of their appearance in a chain 216 # %{$ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}} - Count of specific residues in a chain 217 # 218 # Notes: 219 # . Chains and residue data can be extacted using either ATOM/HETATM records or SEQRES records. 220 # . In addition to a different chain ID in ATOM/HETATM a TER record also indicates end of an existing chain 221 # and start of a new one: ChainID in ATOM/HETATM records might still be emtpy. 222 # . ATOM/HETATM records after the first ENDMDL records are simply ingnored. 223 # 224 sub GetChainsAndResidues { 225 my($PDBRecordLinesRef, $RecordsSource, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag); 226 227 $RecordsSource = 'AtomAndHetatm'; 228 $GetChainResiduesBeyondTERFlag = 0; 229 $GetRecordLinesFlag = 0; 230 231 if (@_ == 4) { 232 ($PDBRecordLinesRef, $RecordsSource, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag) = @_; 233 } 234 elsif (@_ == 3) { 235 ($PDBRecordLinesRef, $RecordsSource, $GetChainResiduesBeyondTERFlag) = @_; 236 } 237 elsif (@_ == 2) { 238 ($PDBRecordLinesRef, $RecordsSource) = @_; 239 } 240 else { 241 ($PDBRecordLinesRef) = @_; 242 } 243 244 if ($RecordsSource =~ /^AtomAndHetatm$/i) { 245 return _GetChainsAndResiduesFromAtomHetatmRecords($PDBRecordLinesRef, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag); 246 } 247 elsif ($RecordsSource =~ /^Seqres$/i) { 248 return _GetChainsAndResiduesFromSeqresRecords($PDBRecordLinesRef); 249 } 250 else { 251 my(%ChainsDataMap); 252 %ChainsDataMap = (); 253 @{$ChainsDataMap{ChainIDs}} = (); 254 %{$ChainsDataMap{Residues}} = (); 255 %{$ChainsDataMap{ResidueCount}} = (); 256 257 return \%ChainsDataMap; 258 } 259 } 260 261 262 # Get residue information using ATOM/HETATM records and return a reference to a hash with 263 # these keys: 264 # 265 # @{$ResiduesDataMap{ResidueNames}} - List of all the residues 266 # %{$ResiduesDataMap{ResidueCount}{$ResidueName}} - Count of residues 267 # @{$ResiduesDataMap{AtomResidueNames}} - List of all the residues 268 # %{$ResiduesDataMap{AtomResidueCount}{$ResidueName}} - Count of residues in ATOM records 269 # @{$ResiduesDataMap{HetatomResidueNames}} - List of all the residues 270 # %{$ResiduesDataMap{HetatmResidueCount}{$ResidueName}} - Count of residues HETATM records 271 # 272 # Notes: 273 # . ATOM/HETATM records after the first ENDMDL records are simply ingnored. 274 # 275 sub GetAllResidues { 276 my($PDBRecordLinesRef) = @_; 277 278 my($PreviousChainID, $PreviousResidueNumber, $RecordLine, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, %ResiduesDataMap); 279 280 %ResiduesDataMap = (); 281 @{$ResiduesDataMap{ResidueNames}} = (); 282 %{$ResiduesDataMap{ResidueCount}} = (); 283 @{$ResiduesDataMap{AtomResidueNames}} = (); 284 %{$ResiduesDataMap{AtomResidueCount}} = (); 285 @{$ResiduesDataMap{HetatmResidueNames}} = (); 286 %{$ResiduesDataMap{HetatmResidueCount}} = (); 287 288 $PreviousChainID = ''; 289 $PreviousResidueNumber = 0; 290 291 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 292 if (IsEndmdlRecordType($RecordLine)) { 293 last LINE; 294 } 295 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 296 next LINE; 297 } 298 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); 299 300 if ($PreviousChainID eq $ChainID) { 301 if ($ResidueNumber == $PreviousResidueNumber) { 302 next LINE; 303 } 304 $PreviousResidueNumber = $ResidueNumber; 305 } 306 else { 307 # New chain... 308 $PreviousChainID = $ChainID; 309 $PreviousResidueNumber = $ResidueNumber; 310 } 311 312 # Store the residue and update its count... 313 push @{$ResiduesDataMap{ResidueNames}}, $ResidueName; 314 if (exists $ResiduesDataMap{ResidueCount}{$ResidueName}) { 315 $ResiduesDataMap{ResidueCount}{$ResidueName} += 1; 316 } 317 else { 318 $ResiduesDataMap{ResidueCount}{$ResidueName} = 1; 319 } 320 # Update ATOM residue data... 321 if (IsAtomRecordType($RecordLine)) { 322 push @{$ResiduesDataMap{AtomResidueNames}}, $ResidueName; 323 if (exists $ResiduesDataMap{AtomResidueCount}{$ResidueName}) { 324 $ResiduesDataMap{AtomResidueCount}{$ResidueName} += 1; 325 } 326 else { 327 $ResiduesDataMap{AtomResidueCount}{$ResidueName} = 1; 328 } 329 } 330 # Update HETATM residue data... 331 if (IsHetatmRecordType($RecordLine)) { 332 push @{$ResiduesDataMap{HetatmResidueNames}}, $ResidueName; 333 if (exists $ResiduesDataMap{HetatmResidueCount}{$ResidueName}) { 334 $ResiduesDataMap{HetatmResidueCount}{$ResidueName} += 1; 335 } 336 else { 337 $ResiduesDataMap{HetatmResidueCount}{$ResidueName} = 1; 338 } 339 } 340 } 341 342 return \%ResiduesDataMap; 343 } 344 345 # Return min/max XYZ coordinates for ATOM/HETATM records... 346 sub GetMinMaxCoords { 347 my($PDBRecordLinesRef) = @_; 348 349 my($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax, $RecordLine, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); 350 351 ($XMin, $YMin, $ZMin) = (99999) x 3; 352 ($XMax, $YMax, $ZMax) = (-99999) x 3; 353 354 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 355 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 356 next LINE; 357 } 358 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); 359 360 $XMin = ($X < $XMin) ? $X : $XMin; 361 $YMin = ($Y < $YMin) ? $Y : $YMin; 362 $ZMin = ($Z < $ZMin) ? $Z : $ZMin; 363 364 $XMax = ($X > $XMax) ? $X : $XMax; 365 $YMax = ($Y > $YMax) ? $Y : $YMax; 366 $ZMax = ($Z > $ZMax) ? $Z : $ZMax; 367 } 368 369 if ($XMin == 99999) { $XMin = undef; } 370 if ($YMin == 99999) { $YMin = undef; } 371 if ($ZMin == 99999) { $ZMin = undef; } 372 if ($XMax == -99999) { $XMax = undef; } 373 if ($YMax == -99999) { $YMax = undef; } 374 if ($ZMax == -99999) { $ZMax = undef; } 375 376 return ($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax); 377 } 378 379 # Read PDB file and return reference to record lines.. 380 sub ReadPDBFile { 381 my($PDBFile) = @_; 382 383 my($Line, @PDBRecordLines); 384 385 @PDBRecordLines = (); 386 open PDBFILE, "$PDBFile" or die "Can't open $PDBFile: $!\n"; 387 while ($Line = GetTextLine(\*PDBFILE)) { 388 push @PDBRecordLines, $Line; 389 } 390 391 close PDBFILE; 392 393 return (\@PDBRecordLines); 394 } 395 396 # 397 # Parse HEADER record line... 398 sub ParseHeaderRecordLine { 399 my($Line) = @_; 400 my($Classification, $DepositionDate, $IDCode) = (undef, undef, undef); 401 402 if ($Line !~ /^HEADER/i) { 403 return ($Classification, $DepositionDate, $IDCode); 404 } 405 ($Classification, $DepositionDate, $IDCode) = unpack("x10A40A9x3A4", $Line); 406 $Classification = RemoveLeadingAndTrailingWhiteSpaces($Classification); 407 $DepositionDate =~ s/ //g; 408 $IDCode =~ s/ //g; 409 410 return ($Classification, $DepositionDate, $IDCode); 411 } 412 413 # 414 # Generate HEADER record line... 415 sub GenerateHeaderRecordLine { 416 my($Classification, $Date, $IDCode, $Line); 417 418 $Classification = "Created using MayaChemTools"; 419 $Date = GenerateHeaderRecordTimeStamp(); 420 if (@_ == 3) { 421 ($IDCode, $Classification, $Date) = @_; 422 } 423 elsif (@_ == 2) { 424 ($IDCode, $Classification) = @_; 425 } 426 elsif (@_ == 1) { 427 ($IDCode) = @_; 428 } 429 430 $Line = sprintf "HEADER %-40.40s%9.9s %4.4s", $Classification, $Date, $IDCode; 431 return $Line; 432 } 433 434 # Generate PDB header time stamp... 435 sub GenerateHeaderRecordTimeStamp { 436 my($Date, $Sec, $Min, $Hour, $MDay, $Mon, $Year, $WDay, $YDay, $IsDst); 437 my($MonthName, %MonthNameHash); 438 439 %MonthNameHash = (1 => 'JAN', 2 => 'FEB', 3 => 'MAR', 4 => 'APR', 5 => 'MAY', 6 => 'JUN', 7 => 'JUL', 8 => 'AUG', 9 => 'SEP', 10 => 'OCT', 11 => 'NOV', 12 => 'DEC'); 440 441 ($Sec, $Min, $Hour, $MDay, $Mon, $Year, $WDay, $YDay, $IsDst) = localtime; 442 443 $Mon += 1; 444 $MonthName = $MonthNameHash{$Mon}; 445 446 $Year = substr($Year, -1, 2); 447 448 $Date = sprintf "%2i-%3s-%2i", $MDay, $MonthName, $Year; 449 $Date =~ s/ /0/g; 450 451 return $Date; 452 } 453 454 # 455 # Parse ATOM record line. 456 # 457 sub ParseAtomRecordLine { 458 my($Line) = @_; 459 460 return _ParseAtomOrHetatmRecordLine($Line); 461 } 462 463 # Generate ATOM record line... 464 sub GenerateAtomRecordLine { 465 my($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_; 466 467 return _GenerateAtomOrHetatmRecordLine('ATOM', $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); 468 } 469 470 # 471 # Parse ATOM/HETATm record line. 472 # 473 sub ParseAtomOrHetatmRecordLine { 474 my($Line) = @_; 475 476 return _ParseAtomOrHetatmRecordLine($Line); 477 } 478 479 # Generate ATOM/HETATM record line... 480 sub GenerateAtomOrHetatmRecordLine { 481 my($RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_; 482 483 return _GenerateAtomOrHetatmRecordLine($RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); 484 } 485 # 486 # Parse HETATM record line... 487 # 488 sub ParseHetatmRecordLine { 489 my($Line) = @_; 490 491 return _ParseAtomOrHetatmRecordLine($Line); 492 } 493 494 # Generate HETATM record line... 495 sub GenerateHetatmRecordLine { 496 my($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_; 497 498 return _GenerateAtomOrHetatmRecordLine('HETATM', $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); 499 } 500 # 501 # Parse SEQRES record line... 502 # 503 # SEQRES format: 504 # 505 # 1 - 6 Record name "SEQRES" 506 # 9 - 10 Serial number of the SEQRES record for the current chain. Starts at 1 and increments by one each line. Reset to 1 for each chain. 507 # 12 - Chain identifier 508 # 14 - 17 Integer numRes Number of residues in the chain 509 # 20 - 22 24 -26 ... ... 68 - 70 Residue name resName Residue name. 510 # 511 sub ParseSeqresRecordLine { 512 my($Line) = @_; 513 514 if ($Line !~ /^SEQRES/i) { 515 return ((undef) x 5); 516 } 517 my($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNames) = unpack("x8A2x1A1x1A4x2A51" , $Line); 518 $RecordSerialNumber =~ s/ //g; 519 $ChainID =~ s/ //g; 520 $NumOfResidues =~ s/ //g; 521 $ResidueNames = RemoveLeadingAndTrailingWhiteSpaces($ResidueNames); 522 523 return ($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNames); 524 } 525 526 # 527 # Parse CONECT record line... 528 # 529 # CONECT format: 530 # 531 # 1 - 6 Record name "CONECT" 532 # 7 - 11 Atom number 533 # 12 - 16, 17 - 21, 22 - 26, 27 - 31 Atom number of bonded atom 534 # 535 # 32 - 36, 37 - 41 Atom number of hydrogen bonded atom 536 # 42 - 46 Atom number of salt bridged atom 537 # 47 - 51, 52 -56 Atom number of hydrogen bonded atom 538 # 57 - 61 Atom number of salt bridged atom 539 # 540 sub ParseConectRecordLine { 541 my($Line) = @_; 542 543 if ($Line !~ /^CONECT/i) { 544 return ((undef) x 11); 545 } 546 my($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2) = map {s/ //g; $_} unpack("x6A5A5A5A5A5A5A5A5A5A5A5", $Line); 547 548 return ($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2); 549 } 550 551 # Generate CONECT record line... 552 sub GenerateConectRecordLine { 553 my($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2) = @_; 554 my($Line); 555 556 $Line = sprintf "CONECT%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s", $AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2; 557 558 return $Line; 559 } 560 561 # 562 # Parse TER record line... 563 # 564 # TER format: 565 # 566 #1 - 6 Record name "TER " 567 # 7 - 11 Serial number 568 # 18 - 20 Residue name 569 # 22 Chain identifier 570 # 23 - 26 Residue sequence number 571 # 27 Insertion code 572 # 573 sub ParseTerRecordLine { 574 my($Line) = @_; 575 576 if ($Line !~ /^TER/i) { 577 return ((undef) x 5); 578 } 579 my($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) = map {s/ //g; $_} unpack("x6A5x6A3xA1A4A1", $Line); 580 581 return ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode); 582 } 583 584 # Generate TER record line... 585 sub GenerateTerRecordLine { 586 my($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $Line) = ('') x 6; 587 588 if (@_ == 5) { 589 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) = @_; 590 } 591 elsif (@_ == 4) { 592 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber) = @_; 593 } 594 elsif (@_ == 3) { 595 ($SerialNumber, $ResidueName) = @_; 596 } 597 elsif (@_ == 2) { 598 ($SerialNumber, $ResidueName) = @_; 599 } 600 elsif (@_ == 1) { 601 ($SerialNumber) = @_; 602 } 603 $Line = sprintf "TER %5.5s %-3.3s %1.1s%4.4s%1.1s", $SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode; 604 605 return $Line; 606 } 607 608 # 609 # Parse MASTER record line... 610 # 611 # MASTER record format: 612 # 613 #1 - 6 Record name "MASTER" 614 # 11 - 15 Number of REMARK records 615 # 16 - 20 "0" 616 # 21 - 25 Number of HET records 617 # 26 - 30 Number of HELIX records 618 # 31 - 35 Number of SHEET records 619 # 36 - 40 Number of TURN records 620 # 41 - 45 Number of SITE records 621 # 46 - 50 Number of coordinate transformation records (ORIGXn+SCALEn+MTRIXn) 622 # 51 - 55 Number of atomic coordinate records (ATOM+HETATM) 623 # 56 - 60 Number of TER records 624 # 61 - 65 Number of CONECT records 625 # 66 - 70 Number of SEQRES records 626 # 627 sub ParseMasterRecordLine { 628 my($Line) = @_; 629 630 if ($Line !~ /^MASTER/i) { 631 return ((undef) x 11); 632 } 633 my($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = map {s/ //g; $_} unpack("x6x4A5x5A5A5A5A5A5A5A5A5A5A5", $Line); 634 635 return ($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords); 636 } 637 638 # End record... 639 sub GenerateEndRecordLine { 640 my($Line); 641 $Line = 'END '; 642 return $Line; 643 } 644 645 # ATOM/HETATM record format: 646 # 647 # 1 - 6 Record name 648 # 7 - 11 Atom serial number - right justified 649 # 13 - 16 Atom name 650 # 17 Alternate location indicator. 651 # 18 - 20 Residue name - right justified 652 # 22 Chain identifier. 653 # 23 - 26 Residue sequence number - right justified 654 # 27 Code for insertion of residues. 655 # 31 - 38 Real(8.3), Orthogonal coordinates for X in Angstroms. 656 # 39 - 46 Real(8.3), Orthogonal coordinates for Y in Angstroms. 657 # 47 - 54 Real(8.3), Orthogonal coordinates for Z in Angstroms. 658 # 55 - 60 Real(6.2), Occupancy 659 # 61 - 66 Real(6.2), Temperature factor 660 # 73 - 76 LString(4), Segment identifier, left-justified. 661 # 77 - 78 LString(2), Element symbol, right-justified. 662 #79 - 80 LString(2), Charge on the atom. 663 # 664 # Notes: 665 # . Atom names starting with C, N, O and S are left justified starting with column 14 666 # and others are left justified starting with column 13. 667 # 668 sub _ParseAtomOrHetatmRecordLine { 669 my($Line) = @_; 670 671 if ($Line !~ /^(ATOM|HETATM)/i) { 672 return ((undef) x 15); 673 } 674 my($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = map {s/ //g; $_} unpack("x6A5xA4A1A3xA1A4A1x3A8A8A8A6A6x6A4A2A2", $Line); 675 676 return($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); 677 } 678 679 # Generate ATOM/HETATM record line... 680 sub _GenerateAtomOrHetatmRecordLine { 681 my($RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_; 682 my($Line, $AtomNameFormat); 683 684 if ($AtomName =~ /^(C|N|O|S)/i) { 685 # Left justified starting at column 14... 686 $AtomNameFormat = " %-3.3s"; 687 } 688 else { 689 $AtomNameFormat = "%-4.4s"; 690 } 691 $Line = sprintf "%-6.6s%5.5s ${AtomNameFormat}%1.1s%3.3s %1.1s%4.4s%1.1s %8.8s%8.8s%8.8s%6.6s%6.6s %-4.4s%2.2s%2.2s", $RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge; 692 693 return $Line; 694 } 695 696 # Check record type... 697 sub _IsRecordType { 698 my($Line, $SpecifiedType) = @_; 699 my($Type, $Status); 700 701 ($Type) = map {s/ //g; $_} unpack("A6", $Line); 702 703 $Status = ($SpecifiedType eq $Type) ? 1 : 0; 704 705 return $Status; 706 } 707 708 # Get record type... 709 sub _GetRecordType { 710 my($Line) = @_; 711 my($Type); 712 713 ($Type) = map {s/ //g; $_} unpack("A6", $Line); 714 715 return $Type; 716 } 717 718 # Get chains and residues data using ATOM/HETATM records... 719 # 720 sub _GetChainsAndResiduesFromAtomHetatmRecords { 721 my($PDBRecordLinesRef, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag) = @_; 722 723 my($LineCount, $TotalChainCount, $PreviousResidueNumber, $ChainCount, $DefaultChainID, $DefaultChainLabel, $RecordLine, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, %ChainsDataMap); 724 725 # Do a quick chain count using TER record... 726 $TotalChainCount = 0; 727 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 728 if (IsEndmdlRecordType($RecordLine)) { 729 last LINE; 730 } 731 if (IsTerRecordType($RecordLine)) { 732 $TotalChainCount++; 733 } 734 } 735 736 %ChainsDataMap = (); 737 @{$ChainsDataMap{ChainIDs}} = (); 738 %{$ChainsDataMap{Residues}} = (); 739 %{$ChainsDataMap{Lines}} = (); 740 %{$ChainsDataMap{ResidueCount}} = (); 741 742 $LineCount = 0; 743 $ChainCount = 0; 744 $DefaultChainLabel = 'None'; 745 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1); 746 $PreviousResidueNumber = 0; 747 748 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 749 $LineCount++; 750 if (IsTerRecordType($RecordLine)) { 751 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1); 752 $ChainCount++; 753 if ($ChainCount == $TotalChainCount) { 754 last LINE; 755 } 756 else { 757 next LINE; 758 } 759 } 760 elsif (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 761 next LINE; 762 } 763 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); 764 765 if (IsEmpty($ChainID)) { 766 $ChainID = $DefaultChainID; 767 } 768 if (exists $ChainsDataMap{Residues}{$ChainID}) { 769 # Data for existing chain... 770 if ($GetRecordLinesFlag) { 771 push @{$ChainsDataMap{Lines}{$ChainID}}, $RecordLine; 772 } 773 774 if ($ResidueNumber != $PreviousResidueNumber) { 775 # Next residue with in the chain... 776 push @{$ChainsDataMap{Residues}{$ChainID}}, $ResidueName; 777 778 if (exists $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}) { 779 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} += 1; 780 } 781 else { 782 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1; 783 } 784 $PreviousResidueNumber = $ResidueNumber; 785 } 786 } 787 else { 788 # Data for new chain... 789 push @{$ChainsDataMap{ChainIDs}}, $ChainID; 790 791 @{$ChainsDataMap{Residues}{$ChainID}} = (); 792 push @{$ChainsDataMap{Residues}{$ChainID}}, $ResidueName; 793 794 @{$ChainsDataMap{Lines}{$ChainID}} = (); 795 if ($GetRecordLinesFlag) { 796 push @{$ChainsDataMap{Lines}{$ChainID}}, $RecordLine; 797 } 798 799 %{$ChainsDataMap{ResidueCount}{$ChainID}} = (); 800 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1; 801 $PreviousResidueNumber = $ResidueNumber; 802 } 803 } 804 if (!$GetChainResiduesBeyondTERFlag) { 805 return \%ChainsDataMap; 806 } 807 # Look for any HETATM residues specified outside TER records which could belong to an existing chain... 808 my($LineIndex, $PreviousChainID); 809 $PreviousChainID = ''; 810 $PreviousResidueNumber = 0; 811 LINE: for $LineIndex (($LineCount - 1) .. $#{$PDBRecordLinesRef}) { 812 $RecordLine = $PDBRecordLinesRef->[$LineIndex]; 813 if (IsEndmdlRecordType($RecordLine)) { 814 last LINE; 815 } 816 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 817 next LINE; 818 } 819 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); 820 if (IsEmpty($ChainID)) { 821 # Ignore the chains with no ids... 822 next LINE; 823 } 824 if (! exists($ChainsDataMap{Residues}{$ChainID})) { 825 # Don't collect any new chains after TER record... 826 next LINE; 827 } 828 if ($GetRecordLinesFlag) { 829 push @{$ChainsDataMap{Lines}{$ChainID}}, $RecordLine; 830 } 831 if ($ResidueNumber != $PreviousResidueNumber || $ChainID ne $PreviousChainID) { 832 833 push @{$ChainsDataMap{Residues}{$ChainID}}, $ResidueName; 834 835 if (exists $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}) { 836 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} += 1; 837 } 838 else { 839 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1; 840 } 841 $PreviousChainID = $ChainID; 842 $PreviousResidueNumber = $ResidueNumber; 843 } 844 } 845 return \%ChainsDataMap; 846 } 847 848 # Get chains and residues data using SEQRES records... 849 # 850 sub _GetChainsAndResiduesFromSeqresRecords { 851 my($PDBRecordLinesRef) = @_; 852 853 my($ChainCount, $DefaultChainLabel, $DefaultChainID, $RecordLine, $RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueName, $ResidueNamesString, @ResidueNamesList, %ChainsDataMap); 854 855 %ChainsDataMap = (); 856 @{$ChainsDataMap{ChainIDs}} = (); 857 %{$ChainsDataMap{Residues}} = (); 858 %{$ChainsDataMap{ResidueCount}} = (); 859 860 $ChainCount = 0; 861 $DefaultChainLabel = 'None'; 862 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1); 863 864 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 865 if (!IsSeqresRecordType($RecordLine)) { 866 next LINE; 867 } 868 ($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNamesString) = ParseSeqresRecordLine($RecordLine); 869 if ($RecordSerialNumber == 1) { 870 # Indicates start of a new chain... 871 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1); 872 $ChainCount++; 873 } 874 if (IsEmpty($ChainID)) { 875 $ChainID = $DefaultChainID; 876 } 877 # Process the residues... 878 @ResidueNamesList = split /[ ]+/, $ResidueNamesString; 879 880 if (exists $ChainsDataMap{Residues}{$ChainID}) { 881 # Data for existing chain... 882 push @{$ChainsDataMap{Residues}{$ChainID}}, @ResidueNamesList; 883 } 884 else { 885 # Data for new chain... 886 push @{$ChainsDataMap{ChainIDs}}, $ChainID; 887 @{$ChainsDataMap{Residues}{$ChainID}} = (); 888 push @{$ChainsDataMap{Residues}{$ChainID}}, @ResidueNamesList; 889 } 890 891 # Setup residue count... 892 for $ResidueName (@ResidueNamesList) { 893 if (exists $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}) { 894 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} += 1; 895 } 896 else { 897 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1; 898 } 899 } 900 } 901 return \%ChainsDataMap; 902 } 903