MayaChemTools

   1 package PDBFileUtil;
   2 #
   3 # $RCSfile: PDBFileUtil.pm,v $
   4 # $Date: 2011/12/16 00:04:11 $
   5 # $Revision: 1.28 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2012 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use Exporter;
  31 use Text::ParseWords;
  32 use TextUtil;
  33 use FileUtil;
  34 use TimeUtil ();
  35 
  36 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  37 
  38 @ISA = qw(Exporter);
  39 @EXPORT = qw(GetPDBRecordType GetRecordTypesCount GetAllResidues GetConectRecordLines GetChainsAndResidues GetMinMaxCoords IsPDBFile IsAtomRecordType IsConectRecordType IsHeaderRecordType IsHetatmRecordType IsSeqresRecordType IsModelRecordType IsEndmdlRecordType IsTerRecordType IsMasterRecordType ReadPDBFile ParseHeaderRecordLine GenerateHeaderRecordLine GenerateHeaderRecordTimeStamp ParseAtomRecordLine GenerateAtomRecordLine ParseAtomOrHetatmRecordLine GenerateAtomOrHetatmRecordLine GenerateHetatmRecordLine ParseHetatmRecordLine ParseConectRecordLine GenerateConectRecordLine ParseSeqresRecordLine ParseTerRecordLine GenerateTerRecordLine ParseMasterRecordLine GenerateEndRecordLine);
  40 @EXPORT_OK = qw();
  41 
  42 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  43 
  44 # Get PDB record type...
  45 sub GetPDBRecordType {
  46   my($Line) = @_;
  47 
  48   return _GetRecordType($Line);
  49 }
  50 
  51 # Is it a PDB file?
  52 sub IsPDBFile {
  53   my($PDBFile) = @_;
  54   my($Line, $Status);
  55 
  56   $Status = 0;
  57   open PDBFILE, "$PDBFile" or die "Can't open $PDBFile: $!\n";
  58   $Line = GetTextLine(\*PDBFILE);
  59   $Status = ($Line =~ /^HEADER/i) ? 1 : 0;
  60   close PDBFILE;
  61 
  62   return $Status;
  63 }
  64 
  65 # Is it a atom record type?
  66 sub IsAtomRecordType {
  67   my($Line) = @_;
  68 
  69   return _IsRecordType($Line, 'ATOM');
  70 }
  71 
  72 # Is it a connect record type?
  73 sub IsConectRecordType {
  74   my($Line) = @_;
  75 
  76   return _IsRecordType($Line, 'CONECT');
  77 }
  78 
  79 # Is it a header atom record type?
  80 sub IsHeaderRecordType {
  81   my($Line) = @_;
  82 
  83   return _IsRecordType($Line, 'HEADER');
  84 }
  85 
  86 # Is it a hetro atom record type?
  87 sub IsHetatmRecordType {
  88   my($Line) = @_;
  89 
  90   return _IsRecordType($Line, 'HETATM');
  91 }
  92 
  93 # Is it a seqres record type?
  94 sub IsSeqresRecordType {
  95   my($Line) = @_;
  96 
  97   return _IsRecordType($Line, 'SEQRES');
  98 }
  99 
 100 # Is it a MODEL record type?
 101 sub IsModelRecordType {
 102   my($Line) = @_;
 103 
 104   return _IsRecordType($Line, 'MODEL');
 105 }
 106 
 107 # Is it a ENDMDL record type?
 108 sub IsEndmdlRecordType {
 109   my($Line) = @_;
 110 
 111   return _IsRecordType($Line, 'ENDMDL');
 112 }
 113 
 114 # Is it a TER record type?
 115 sub IsTerRecordType {
 116   my($Line) = @_;
 117 
 118   return _IsRecordType($Line, 'TER');
 119 }
 120 
 121 # Is it a MASTER record type?
 122 sub IsMasterRecordType {
 123   my($Line) = @_;
 124 
 125   return _IsRecordType($Line, 'MASTER');
 126 }
 127 
 128 # Count the number of each record type and a reference to data type with these key/value pairs:
 129 # {RecordTypes} - An array of unique record types in order of their presence in the file
 130 # {Count}{$RecordType} - Count of each record type
 131 # {Lines}{$RecordType} - Optional lines data for a specific record type.
 132 #
 133 sub GetRecordTypesCount {
 134   my($PDBRecordLinesRef, $SpecifiedRecordType, $GetRecordLinesFlag, $RecordType, $RecordLine, %RecordTypeDataMap);
 135 
 136   %RecordTypeDataMap = ();
 137   @{$RecordTypeDataMap{RecordTypes}} = ();
 138   %{$RecordTypeDataMap{Count}} = ();
 139   %{$RecordTypeDataMap{Lines}} = ();
 140 
 141   $SpecifiedRecordType = '';
 142   $GetRecordLinesFlag = 0;
 143   if (@_ == 3) {
 144     ($PDBRecordLinesRef, $SpecifiedRecordType, $GetRecordLinesFlag) = @_;
 145     $SpecifiedRecordType = uc $SpecifiedRecordType;
 146   }
 147   elsif (@_ == 2) {
 148     ($PDBRecordLinesRef, $SpecifiedRecordType) = @_;
 149     $SpecifiedRecordType = uc $SpecifiedRecordType;
 150   }
 151   else {
 152     ($PDBRecordLinesRef) = @_;
 153   }
 154   LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
 155     $RecordType = _GetRecordType($RecordLine);
 156     if ($SpecifiedRecordType && ($SpecifiedRecordType ne $RecordType)) {
 157       next LINE;
 158     }
 159     if (exists $RecordTypeDataMap{Count}{$RecordType}) {
 160       # Update count...
 161       $RecordTypeDataMap{Count}{$RecordType} += 1;
 162 
 163       if ($GetRecordLinesFlag) {
 164         push @{$RecordTypeDataMap{Lines}{$RecordType}}, $RecordLine;
 165       }
 166     }
 167     else {
 168       # New record type...
 169       push @{$RecordTypeDataMap{RecordTypes}}, $RecordType;
 170       $RecordTypeDataMap{Count}{$RecordType} = 1;
 171 
 172       if ($GetRecordLinesFlag) {
 173         @{$RecordTypeDataMap{Lines}{$RecordType}} = ();
 174         push @{$RecordTypeDataMap{Lines}{$RecordType}}, $RecordLine;
 175       }
 176     }
 177   }
 178   return (\%RecordTypeDataMap);
 179 }
 180 
 181 # Collect CONECT record lines for specific atom number, modified specified data to exclude any atom
 182 # number not present in the list of specified atom numbers and return a reference to list of
 183 # CONECT record lines.
 184 #
 185 sub GetConectRecordLines {
 186   my($PDBRecordLinesRef, $AtomNumbersMapRef) = @_;
 187   my($AtomNumber, $ConectAtomNumber, $RecordLine, @ConectRecordAtomNums, @ConectRecordLines);
 188 
 189   @ConectRecordLines = ();
 190   LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
 191     if (!IsConectRecordType($RecordLine)) {
 192       next LINE;
 193     }
 194     @ConectRecordAtomNums = ();
 195     push @ConectRecordAtomNums, ParseConectRecordLine($RecordLine);
 196     ATOMNUMBER: for $ConectAtomNumber (@ConectRecordAtomNums) {
 197       if (defined $ConectAtomNumber) {
 198         $AtomNumber = $ConectAtomNumber;
 199         if ($AtomNumber) {
 200           if (! exists $AtomNumbersMapRef->{$AtomNumber}) {
 201             next LINE;
 202           }
 203         }
 204       }
 205     }
 206     push @ConectRecordLines, $RecordLine;
 207   }
 208   return \@ConectRecordLines;
 209 }
 210 
 211 # Get chains and residue information using ATOM/HETATM or SEQRES records. And return a reference to a
 212 #  hash with these keys:
 213 #
 214 # @{$ChainsDataMap{ChainIDs}} - List of chain IDs with 'None' for no chain identification
 215 # @{$ChainsDataMap{Residues}{$ChainID}} - List of residues in order of their appearance in a chain
 216 # @{$ChainsDataMap{ResidueNumbers}{$ChainID}} - List of residue numbers in order of their appearance in a chain
 217 # %{$ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}} - Count of specific residues in a chain
 218 #
 219 # Notes:
 220 #  . Chains and residue data can be extacted using either ATOM/HETATM records or SEQRES records.
 221 #  . In addition to a different chain ID in ATOM/HETATM a TER record also indicates end of an existing chain
 222 #    and start of a new one: ChainID in ATOM/HETATM records might still be emtpy.
 223 #   . ATOM/HETATM records after the first ENDMDL records are simply ingnored.
 224 #
 225 sub GetChainsAndResidues {
 226   my($PDBRecordLinesRef, $RecordsSource, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag);
 227 
 228   $RecordsSource = 'AtomAndHetatm';
 229   $GetChainResiduesBeyondTERFlag = 0;
 230   $GetRecordLinesFlag = 0;
 231 
 232   if (@_ == 4) {
 233     ($PDBRecordLinesRef, $RecordsSource, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag) = @_;
 234   }
 235   elsif (@_ == 3) {
 236     ($PDBRecordLinesRef, $RecordsSource, $GetChainResiduesBeyondTERFlag) = @_;
 237   }
 238   elsif (@_ == 2) {
 239     ($PDBRecordLinesRef, $RecordsSource) = @_;
 240   }
 241   else {
 242     ($PDBRecordLinesRef) = @_;
 243   }
 244 
 245   if ($RecordsSource =~ /^AtomAndHetatm$/i) {
 246     return _GetChainsAndResiduesFromAtomHetatmRecords($PDBRecordLinesRef, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag);
 247   }
 248   elsif ($RecordsSource =~ /^Seqres$/i) {
 249     return _GetChainsAndResiduesFromSeqresRecords($PDBRecordLinesRef);
 250   }
 251   else {
 252     my(%ChainsDataMap);
 253     %ChainsDataMap = ();
 254     @{$ChainsDataMap{ChainIDs}} = ();
 255     %{$ChainsDataMap{Residues}} = ();
 256     %{$ChainsDataMap{ResidueNumbers}} = ();
 257     %{$ChainsDataMap{ResidueCount}} = ();
 258 
 259     return \%ChainsDataMap;
 260   }
 261 }
 262 
 263 
 264 # Get residue information using ATOM/HETATM records and return a reference to a hash with
 265 # these keys:
 266 #
 267 # @{$ResiduesDataMap{ResidueNames}} - List of all the residues
 268 # %{$ResiduesDataMap{ResidueCount}{$ResidueName}} - Count of residues
 269 # @{$ResiduesDataMap{AtomResidueNames}} - List of all the residues
 270 # %{$ResiduesDataMap{AtomResidueCount}{$ResidueName}} - Count of residues in ATOM records
 271 # @{$ResiduesDataMap{HetatomResidueNames}} - List of all the residues
 272 # %{$ResiduesDataMap{HetatmResidueCount}{$ResidueName}} - Count of residues HETATM records
 273 #
 274 # Notes:
 275 #  . ATOM/HETATM records after the first ENDMDL records are simply ingnored.
 276 #
 277 sub GetAllResidues {
 278   my($PDBRecordLinesRef) = @_;
 279 
 280   my($PreviousChainID, $PreviousResidueNumber, $RecordLine, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, %ResiduesDataMap);
 281 
 282   %ResiduesDataMap = ();
 283   @{$ResiduesDataMap{ResidueNames}} = ();
 284   %{$ResiduesDataMap{ResidueCount}} = ();
 285   @{$ResiduesDataMap{AtomResidueNames}} = ();
 286   %{$ResiduesDataMap{AtomResidueCount}} = ();
 287   @{$ResiduesDataMap{HetatmResidueNames}} = ();
 288   %{$ResiduesDataMap{HetatmResidueCount}} = ();
 289 
 290   $PreviousChainID = '';
 291   $PreviousResidueNumber = 0;
 292 
 293   LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
 294     if (IsEndmdlRecordType($RecordLine)) {
 295       last LINE;
 296     }
 297     if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
 298       next LINE;
 299     }
 300     ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine);
 301 
 302     if ($PreviousChainID eq $ChainID) {
 303       if ($ResidueNumber == $PreviousResidueNumber) {
 304         next LINE;
 305       }
 306       $PreviousResidueNumber = $ResidueNumber;
 307     }
 308     else {
 309       # New chain...
 310       $PreviousChainID = $ChainID;
 311       $PreviousResidueNumber = $ResidueNumber;
 312     }
 313 
 314     # Store the residue and update its count...
 315     push @{$ResiduesDataMap{ResidueNames}}, $ResidueName;
 316     if (exists $ResiduesDataMap{ResidueCount}{$ResidueName}) {
 317       $ResiduesDataMap{ResidueCount}{$ResidueName} += 1;
 318     }
 319     else {
 320       $ResiduesDataMap{ResidueCount}{$ResidueName} = 1;
 321     }
 322     # Update ATOM residue data...
 323     if (IsAtomRecordType($RecordLine)) {
 324       push @{$ResiduesDataMap{AtomResidueNames}}, $ResidueName;
 325       if (exists $ResiduesDataMap{AtomResidueCount}{$ResidueName}) {
 326         $ResiduesDataMap{AtomResidueCount}{$ResidueName} += 1;
 327       }
 328       else {
 329         $ResiduesDataMap{AtomResidueCount}{$ResidueName} = 1;
 330       }
 331     }
 332     # Update HETATM residue data...
 333     if (IsHetatmRecordType($RecordLine)) {
 334       push @{$ResiduesDataMap{HetatmResidueNames}}, $ResidueName;
 335       if (exists $ResiduesDataMap{HetatmResidueCount}{$ResidueName}) {
 336         $ResiduesDataMap{HetatmResidueCount}{$ResidueName} += 1;
 337       }
 338       else {
 339         $ResiduesDataMap{HetatmResidueCount}{$ResidueName} = 1;
 340       }
 341     }
 342   }
 343 
 344   return \%ResiduesDataMap;
 345 }
 346 
 347 # Return min/max XYZ coordinates for ATOM/HETATM records...
 348 sub GetMinMaxCoords {
 349   my($PDBRecordLinesRef) = @_;
 350 
 351   my($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax, $RecordLine, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge);
 352 
 353   ($XMin, $YMin, $ZMin) = (99999) x 3;
 354   ($XMax, $YMax, $ZMax) = (-99999) x 3;
 355 
 356   LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
 357     if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
 358       next LINE;
 359     }
 360     ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine);
 361 
 362     $XMin = ($X < $XMin) ? $X : $XMin;
 363     $YMin = ($Y < $YMin) ? $Y : $YMin;
 364     $ZMin = ($Z < $ZMin) ? $Z : $ZMin;
 365 
 366     $XMax = ($X > $XMax) ? $X : $XMax;
 367     $YMax = ($Y > $YMax) ? $Y : $YMax;
 368     $ZMax = ($Z > $ZMax) ? $Z : $ZMax;
 369   }
 370 
 371   if ($XMin == 99999) { $XMin = undef; }
 372   if ($YMin == 99999) { $YMin = undef; }
 373   if ($ZMin == 99999) { $ZMin = undef; }
 374   if ($XMax == -99999) { $XMax = undef; }
 375   if ($YMax == -99999) { $YMax = undef; }
 376   if ($ZMax == -99999) { $ZMax = undef; }
 377 
 378   return ($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax);
 379 }
 380 
 381 # Read PDB file and return reference to record lines..
 382 sub ReadPDBFile {
 383   my($PDBFile) = @_;
 384 
 385   my($Line, @PDBRecordLines);
 386 
 387   @PDBRecordLines = ();
 388   open PDBFILE, "$PDBFile" or die "Can't open $PDBFile: $!\n";
 389   while ($Line = GetTextLine(\*PDBFILE)) {
 390     push @PDBRecordLines, $Line;
 391   }
 392 
 393   close PDBFILE;
 394 
 395   return (\@PDBRecordLines);
 396 }
 397 
 398 #
 399 # Parse HEADER record line...
 400 sub ParseHeaderRecordLine {
 401   my($Line) = @_;
 402   my($Classification, $DepositionDate, $IDCode) = (undef, undef, undef);
 403 
 404   if ($Line !~ /^HEADER/i) {
 405     return ($Classification, $DepositionDate, $IDCode);
 406   }
 407   ($Classification, $DepositionDate, $IDCode) = unpack("x10A40A9x3A4", $Line);
 408   $Classification = RemoveLeadingAndTrailingWhiteSpaces($Classification);
 409   $DepositionDate =~ s/ //g;
 410   $IDCode =~ s/ //g;
 411 
 412   return ($Classification, $DepositionDate, $IDCode);
 413 }
 414 
 415 #
 416 # Generate HEADER record line...
 417 sub GenerateHeaderRecordLine {
 418   my($Classification, $Date, $IDCode, $Line);
 419 
 420   $Classification = "Created using MayaChemTools";
 421   $Date = GenerateHeaderRecordTimeStamp();
 422   if (@_ == 3) {
 423     ($IDCode, $Classification, $Date) = @_;
 424   }
 425   elsif (@_ == 2) {
 426     ($IDCode, $Classification) = @_;
 427   }
 428   elsif (@_ == 1) {
 429     ($IDCode) = @_;
 430   }
 431 
 432   $Line = sprintf "HEADER    %-40.40s%9.9s   %4.4s", $Classification, $Date, $IDCode;
 433   return $Line;
 434 }
 435 
 436 # Generate PDB header time stamp...
 437 sub GenerateHeaderRecordTimeStamp {
 438   return TimeUtil::PDBFileTimeStamp();
 439 }
 440 
 441 #
 442 # Parse ATOM record line.
 443 #
 444 sub ParseAtomRecordLine {
 445   my($Line) = @_;
 446 
 447   return _ParseAtomOrHetatmRecordLine($Line);
 448 }
 449 
 450 # Generate ATOM record line...
 451 sub GenerateAtomRecordLine {
 452   my($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_;
 453 
 454   return _GenerateAtomOrHetatmRecordLine('ATOM', $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge);
 455 }
 456 
 457 #
 458 # Parse ATOM/HETATm record line.
 459 #
 460 sub ParseAtomOrHetatmRecordLine {
 461   my($Line) = @_;
 462 
 463   return _ParseAtomOrHetatmRecordLine($Line);
 464 }
 465 
 466 # Generate ATOM/HETATM record line...
 467 sub GenerateAtomOrHetatmRecordLine {
 468   my($RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_;
 469 
 470   return _GenerateAtomOrHetatmRecordLine($RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge);
 471 }
 472 #
 473 # Parse HETATM record line...
 474 #
 475 sub ParseHetatmRecordLine {
 476   my($Line) = @_;
 477 
 478   return _ParseAtomOrHetatmRecordLine($Line);
 479 }
 480 
 481 # Generate HETATM record line...
 482 sub GenerateHetatmRecordLine {
 483   my($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_;
 484 
 485   return _GenerateAtomOrHetatmRecordLine('HETATM', $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge);
 486 }
 487 #
 488 # Parse SEQRES record line...
 489 #
 490 # SEQRES format:
 491 #
 492 # 1 - 6 Record name "SEQRES"
 493 # 9 - 10 Serial number of the SEQRES record for the current chain. Starts at 1 and increments by one each line. Reset to 1 for each chain.
 494 # 12 - Chain identifier
 495 # 14 - 17 Integer numRes Number of residues in the chain
 496 # 20 - 22 24 -26 ... ... 68 - 70 Residue name resName Residue name.
 497 #
 498 sub ParseSeqresRecordLine {
 499   my($Line) = @_;
 500 
 501   if ($Line !~ /^SEQRES/i) {
 502     return ((undef) x 5);
 503   }
 504   my($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNames) = unpack("x8A2x1A1x1A4x2A51" , $Line);
 505   $RecordSerialNumber =~ s/ //g;
 506   $ChainID =~ s/ //g;
 507   $NumOfResidues =~ s/ //g;
 508   $ResidueNames = RemoveLeadingAndTrailingWhiteSpaces($ResidueNames);
 509 
 510   return ($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNames);
 511 }
 512 
 513 #
 514 # Parse CONECT record line...
 515 #
 516 # CONECT format:
 517 #
 518 # 1 - 6 Record name "CONECT"
 519 # 7 - 11 Atom number
 520 # 12 - 16, 17 - 21, 22 - 26, 27 - 31 Atom number of bonded atom
 521 #
 522 # 32 - 36, 37 - 41 Atom number of hydrogen bonded atom
 523 # 42 - 46 Atom number of salt bridged atom
 524 # 47 - 51, 52 -56 Atom number of hydrogen bonded atom
 525 # 57 - 61 Atom number of salt bridged atom
 526 #
 527 sub ParseConectRecordLine {
 528   my($Line) = @_;
 529 
 530   if ($Line !~ /^CONECT/i) {
 531     return ((undef) x 11);
 532   }
 533   my($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2) = map {s/ //g; $_} unpack("x6A5A5A5A5A5A5A5A5A5A5A5", $Line);
 534 
 535   return ($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2);
 536 }
 537 
 538 # Generate CONECT record line...
 539 sub GenerateConectRecordLine {
 540   my($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2) = @_;
 541   my($Line);
 542 
 543   $Line = sprintf "CONECT%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s", $AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2;
 544 
 545   return $Line;
 546 }
 547 
 548 #
 549 # Parse TER record line...
 550 #
 551 # TER format:
 552 #
 553 #1 - 6 Record name "TER "
 554 # 7 - 11 Serial number
 555 # 18 - 20 Residue name
 556 # 22 Chain identifier
 557 # 23 - 26 Residue sequence number
 558 # 27 Insertion code
 559 #
 560 sub ParseTerRecordLine {
 561   my($Line) = @_;
 562 
 563   if ($Line !~ /^TER/i) {
 564     return ((undef) x 5);
 565   }
 566   my($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) = map {s/ //g; $_} unpack("x6A5x6A3xA1A4A1", $Line);
 567 
 568   return ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode);
 569 }
 570 
 571 # Generate TER record line...
 572 sub GenerateTerRecordLine {
 573   my($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $Line) = ('') x 6;
 574 
 575   if (@_ == 5) {
 576     ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) = @_;
 577   }
 578   elsif (@_ == 4) {
 579     ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber) = @_;
 580   }
 581   elsif (@_ == 3) {
 582     ($SerialNumber, $ResidueName) = @_;
 583   }
 584   elsif (@_ == 2) {
 585     ($SerialNumber, $ResidueName) = @_;
 586   }
 587   elsif (@_ == 1) {
 588     ($SerialNumber) = @_;
 589   }
 590   $Line = sprintf "TER   %5.5s      %-3.3s %1.1s%4.4s%1.1s", $SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode;
 591 
 592   return $Line;
 593 }
 594 
 595 #
 596 # Parse MASTER record line...
 597 #
 598 # MASTER record format:
 599 #
 600 #1 - 6 Record name "MASTER"
 601 # 11 - 15 Number of REMARK records
 602 # 16 - 20 "0"
 603 # 21 - 25 Number of HET records
 604 # 26 - 30 Number of HELIX records
 605 # 31 - 35 Number of SHEET records
 606 # 36 - 40 Number of TURN records
 607 # 41 - 45 Number of SITE records
 608 # 46 - 50 Number of coordinate transformation records (ORIGXn+SCALEn+MTRIXn)
 609 # 51 - 55 Number of atomic coordinate records (ATOM+HETATM)
 610 # 56 - 60 Number of TER records
 611 # 61 - 65 Number of CONECT records
 612 # 66 - 70 Number of SEQRES records
 613 #
 614 sub ParseMasterRecordLine {
 615   my($Line) = @_;
 616 
 617   if ($Line !~ /^MASTER/i) {
 618     return ((undef) x 11);
 619   }
 620   my($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = map {s/ //g; $_} unpack("x6x4A5x5A5A5A5A5A5A5A5A5A5A5", $Line);
 621 
 622   return ($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords);
 623 }
 624 
 625 # End record...
 626 sub GenerateEndRecordLine {
 627   my($Line);
 628   $Line = 'END   ';
 629   return $Line;
 630 }
 631 
 632 # ATOM/HETATM record format:
 633 #
 634 # 1 - 6 Record name
 635 # 7 - 11  Atom serial number - right justified
 636 # 13 - 16 Atom name
 637 # 17 Alternate location indicator.
 638 # 18 - 20 Residue name - right justified
 639 # 22 Chain identifier.
 640 # 23 - 26 Residue sequence number - right justified
 641 # 27 Code for insertion of residues.
 642 # 31 - 38 Real(8.3), Orthogonal coordinates for X in Angstroms.
 643 # 39 - 46 Real(8.3), Orthogonal coordinates for Y in Angstroms.
 644 # 47 - 54 Real(8.3), Orthogonal coordinates for Z in Angstroms.
 645 # 55 - 60 Real(6.2), Occupancy
 646 # 61 - 66 Real(6.2), Temperature factor
 647 # 73 - 76 LString(4), Segment identifier, left-justified.
 648 # 77 - 78 LString(2), Element symbol, right-justified.
 649 #79 - 80 LString(2), Charge on the atom.
 650 #
 651 # Notes:
 652 #  . Atom names starting with C, N, O and S are left justified starting with column 14
 653 #    and others are left justified starting with column 13.
 654 #
 655 sub _ParseAtomOrHetatmRecordLine {
 656   my($Line) = @_;
 657 
 658   if ($Line !~ /^(ATOM|HETATM)/i) {
 659     return ((undef) x 15);
 660   }
 661   my($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, $Length);
 662 
 663   ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ('') x 15;
 664 
 665   $Length = length $Line;
 666 
 667   if ($Length < 60) {
 668     ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = map {s/ //g; $_} unpack("x6A5xA4A1A3xA1A4A1x3A8A8A8", $Line);
 669   }
 670   else {
 671     ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = map {s/ //g; $_} unpack("x6A5xA4A1A3xA1A4A1x3A8A8A8A6A6x6A4A2A2", $Line);
 672   }
 673   return($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge);
 674 }
 675 
 676 # Generate ATOM/HETATM record line...
 677 sub _GenerateAtomOrHetatmRecordLine {
 678   my($RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_;
 679   my($Line, $AtomNameFormat);
 680 
 681   if ($AtomName =~ /^(C|N|O|S)/i) {
 682     # Left justified starting at column 14...
 683     $AtomNameFormat = " %-3.3s";
 684   }
 685   else {
 686     $AtomNameFormat = "%-4.4s";
 687   }
 688   $Line = sprintf "%-6.6s%5.5s ${AtomNameFormat}%1.1s%3.3s %1.1s%4.4s%1.1s   %8.8s%8.8s%8.8s%6.6s%6.6s      %-4.4s%2.2s%2.2s", $RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge;
 689 
 690   return $Line;
 691 }
 692 
 693 # Check record type...
 694 sub _IsRecordType {
 695   my($Line, $SpecifiedType) = @_;
 696   my($Type, $Status);
 697 
 698   ($Type) = map {s/ //g; $_} unpack("A6", $Line);
 699 
 700   $Status = ($SpecifiedType eq $Type) ? 1 : 0;
 701 
 702   return $Status;
 703 }
 704 
 705 # Get record type...
 706 sub _GetRecordType {
 707   my($Line) = @_;
 708   my($Type);
 709 
 710   ($Type) = map {s/ //g; $_} unpack("A6", $Line);
 711 
 712   return $Type;
 713 }
 714 
 715 # Get chains and residues data using ATOM/HETATM records...
 716 #
 717 sub _GetChainsAndResiduesFromAtomHetatmRecords {
 718   my($PDBRecordLinesRef, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag) = @_;
 719 
 720   my($LineCount, $TotalChainCount, $PreviousResidueNumber, $ChainCount, $DefaultChainID, $DefaultChainLabel, $RecordLine, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, %ChainsDataMap);
 721 
 722   # Do a quick chain count using TER record...
 723   $TotalChainCount = 0;
 724   LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
 725     if (IsEndmdlRecordType($RecordLine)) {
 726       last LINE;
 727     }
 728     if (IsTerRecordType($RecordLine)) {
 729       $TotalChainCount++;
 730     }
 731   }
 732 
 733   %ChainsDataMap = ();
 734   @{$ChainsDataMap{ChainIDs}} = ();
 735   %{$ChainsDataMap{Residues}} = ();
 736   %{$ChainsDataMap{ResidueNumbers}} = ();
 737   %{$ChainsDataMap{Lines}} = ();
 738   %{$ChainsDataMap{ResidueCount}} = ();
 739 
 740   $LineCount = 0;
 741   $ChainCount = 0;
 742   $DefaultChainLabel = 'None';
 743   $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1);
 744   $PreviousResidueNumber = 0;
 745 
 746   LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
 747       $LineCount++;
 748       if (IsTerRecordType($RecordLine)) {
 749         $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1);
 750         $ChainCount++;
 751         if ($ChainCount == $TotalChainCount) {
 752           last LINE;
 753         }
 754         else {
 755           next LINE;
 756         }
 757       }
 758       elsif (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
 759         next LINE;
 760       }
 761       ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine);
 762 
 763       if (IsEmpty($ChainID)) {
 764         $ChainID = $DefaultChainID;
 765       }
 766       if (exists $ChainsDataMap{Residues}{$ChainID}) {
 767         # Data for existing chain...
 768         if ($GetRecordLinesFlag) {
 769           push @{$ChainsDataMap{Lines}{$ChainID}}, $RecordLine;
 770         }
 771 
 772         if ($ResidueNumber != $PreviousResidueNumber) {
 773           # Next residue with in the chain...
 774           push @{$ChainsDataMap{Residues}{$ChainID}}, $ResidueName;
 775           push @{$ChainsDataMap{ResidueNumbers}{$ChainID}}, $ResidueNumber;
 776 
 777           if (exists $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}) {
 778             $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} += 1;
 779           }
 780           else {
 781             $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1;
 782           }
 783           $PreviousResidueNumber = $ResidueNumber;
 784         }
 785       }
 786       else {
 787         # Data for new chain...
 788         push @{$ChainsDataMap{ChainIDs}}, $ChainID;
 789 
 790         @{$ChainsDataMap{Residues}{$ChainID}} = ();
 791         push @{$ChainsDataMap{Residues}{$ChainID}}, $ResidueName;
 792 
 793         @{$ChainsDataMap{ResidueNumbers}{$ChainID}} = ();
 794         push @{$ChainsDataMap{ResidueNumbers}{$ChainID}}, $ResidueNumber;
 795 
 796         @{$ChainsDataMap{Lines}{$ChainID}} = ();
 797         if ($GetRecordLinesFlag) {
 798           push @{$ChainsDataMap{Lines}{$ChainID}}, $RecordLine;
 799         }
 800 
 801         %{$ChainsDataMap{ResidueCount}{$ChainID}} = ();
 802         $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1;
 803         $PreviousResidueNumber = $ResidueNumber;
 804       }
 805   }
 806   if (!$GetChainResiduesBeyondTERFlag) {
 807     return \%ChainsDataMap;
 808   }
 809   # Look for any HETATM residues specified outside TER records which could belong to an existing chain...
 810   my($LineIndex, $PreviousChainID);
 811   $PreviousChainID = '';
 812   $PreviousResidueNumber = 0;
 813   LINE: for $LineIndex (($LineCount - 1) .. $#{$PDBRecordLinesRef}) {
 814     $RecordLine = $PDBRecordLinesRef->[$LineIndex];
 815     if (IsEndmdlRecordType($RecordLine)) {
 816       last LINE;
 817     }
 818     if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
 819       next LINE;
 820     }
 821     ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine);
 822     if (IsEmpty($ChainID)) {
 823       # Ignore the chains with no ids...
 824       next LINE;
 825     }
 826     if (! exists($ChainsDataMap{Residues}{$ChainID})) {
 827       # Don't collect any new chains after TER record...
 828       next LINE;
 829     }
 830     if ($GetRecordLinesFlag) {
 831       push @{$ChainsDataMap{Lines}{$ChainID}}, $RecordLine;
 832     }
 833     if ($ResidueNumber != $PreviousResidueNumber || $ChainID ne $PreviousChainID) {
 834 
 835       push @{$ChainsDataMap{Residues}{$ChainID}}, $ResidueName;
 836       push @{$ChainsDataMap{ResidueNumbers}{$ChainID}}, $ResidueNumber;
 837 
 838       if (exists $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}) {
 839         $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} += 1;
 840       }
 841       else {
 842         $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1;
 843       }
 844       $PreviousChainID = $ChainID;
 845       $PreviousResidueNumber = $ResidueNumber;
 846     }
 847   }
 848   return \%ChainsDataMap;
 849 }
 850 
 851 # Get chains and residues data using SEQRES records...
 852 #
 853 sub _GetChainsAndResiduesFromSeqresRecords {
 854   my($PDBRecordLinesRef) = @_;
 855 
 856   my($ChainCount, $DefaultChainLabel, $DefaultChainID, $RecordLine, $RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueName, $ResidueNamesString, @ResidueNamesList, %ChainsDataMap);
 857 
 858   %ChainsDataMap = ();
 859   @{$ChainsDataMap{ChainIDs}} = ();
 860   %{$ChainsDataMap{Residues}} = ();
 861   %{$ChainsDataMap{ResidueNumbers}} = ();
 862   %{$ChainsDataMap{ResidueCount}} = ();
 863 
 864   $ChainCount = 0;
 865   $DefaultChainLabel = 'None';
 866   $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1);
 867 
 868   LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
 869       if (!IsSeqresRecordType($RecordLine)) {
 870         next LINE;
 871       }
 872       ($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNamesString) = ParseSeqresRecordLine($RecordLine);
 873       if ($RecordSerialNumber == 1) {
 874         # Indicates start of a new chain...
 875         $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1);
 876         $ChainCount++;
 877       }
 878       if (IsEmpty($ChainID)) {
 879         $ChainID = $DefaultChainID;
 880       }
 881       # Process the residues...
 882       @ResidueNamesList = split /[ ]+/, $ResidueNamesString;
 883 
 884       if (exists $ChainsDataMap{Residues}{$ChainID}) {
 885         # Data for existing chain...
 886         push @{$ChainsDataMap{Residues}{$ChainID}}, @ResidueNamesList;
 887       }
 888       else {
 889         # Data for new chain...
 890         push @{$ChainsDataMap{ChainIDs}}, $ChainID;
 891         @{$ChainsDataMap{Residues}{$ChainID}} = ();
 892         push @{$ChainsDataMap{Residues}{$ChainID}}, @ResidueNamesList;
 893       }
 894 
 895       # Setup residue count...
 896       for $ResidueName (@ResidueNamesList) {
 897         if (exists $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}) {
 898           $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} += 1;
 899         }
 900         else {
 901           $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1;
 902         }
 903       }
 904   }
 905   return \%ChainsDataMap;
 906 }
 907