MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: InfoPDBFiles.pl,v $
   4 # $Date: 2011/12/16 00:03:30 $
   5 # $Revision: 1.30 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2012 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use PDBFileUtil;
  38 
  39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  40 
  41 # Autoflush STDOUT
  42 $| = 1;
  43 
  44 # Starting message...
  45 $ScriptName = basename($0);
  46 print "\n$ScriptName: Starting...\n\n";
  47 $StartTime = new Benchmark;
  48 
  49 # Get the options and setup script...
  50 SetupScriptUsage();
  51 if ($Options{help} || @ARGV < 1) {
  52   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  53 }
  54 
  55 my(@PDBFilesList);
  56 @PDBFilesList = ExpandFileNames(\@ARGV, "pdb");
  57 
  58 # Process options...
  59 print "Processing options...\n";
  60 my(%OptionsInfo);
  61 ProcessOptions();
  62 
  63 # Setup information about input files...
  64 my(%PDBFilesInfo);
  65 print "Checking input PDB file(s)...\n";
  66 RetrievePDBFilesInfo();
  67 
  68 # Process input files..
  69 my($FileIndex);
  70 if (@PDBFilesList > 1) {
  71   print "\nProcessing PDB files...\n";
  72 }
  73 for $FileIndex (0 .. $#PDBFilesList) {
  74   if ($PDBFilesInfo{FileOkay}[$FileIndex]) {
  75     print "\nProcessing file $PDBFilesList[$FileIndex]...\n";
  76     ListPDBFileInfo($FileIndex);
  77   }
  78 }
  79 ListTotalSizeOfFiles();
  80 
  81 print "\n$ScriptName:Done...\n\n";
  82 
  83 $EndTime = new Benchmark;
  84 $TotalTime = timediff ($EndTime, $StartTime);
  85 print "Total time: ", timestr($TotalTime), "\n";
  86 
  87 ###############################################################################
  88 
  89 # List appropriate information...
  90 sub ListPDBFileInfo {
  91   my($Index) = @_;
  92   my($PDBFile, $PDBRecordLinesRef);
  93 
  94   $PDBFile = $PDBFilesList[$Index];
  95   $PDBRecordLinesRef = ReadPDBFile($PDBFile);
  96 
  97   # Header informaton...
  98   if ($OptionsInfo{ListHeaderInfo}) {
  99     ListHeaderInfo($PDBRecordLinesRef);
 100   }
 101 
 102   # Total number of records...
 103   my($TotalRecordsCount) = scalar @{$PDBRecordLinesRef};
 104   print "\nTotal number of records: $TotalRecordsCount\n";
 105 
 106   # List record type count information...
 107   ListRecordTypesInfo($PDBRecordLinesRef);
 108 
 109   if ($OptionsInfo{CountChains} || $OptionsInfo{CountResiduesInChains} || $OptionsInfo{ResiduesFrequencyInChains}) {
 110     ListChainsAndResiduesInfo($PDBRecordLinesRef);
 111   }
 112   if ($OptionsInfo{CountResiduesAll} || $OptionsInfo{ResiduesFrequencyAll}) {
 113     ListAllResiduesInfo($PDBRecordLinesRef);
 114   }
 115   if ($OptionsInfo{ResidueNumbersInfo}) {
 116     ListResidueNumbersInfo($PDBRecordLinesRef);
 117   }
 118   if ($OptionsInfo{CalculateBoundingBox}) {
 119     ListBoundingBox($PDBRecordLinesRef);
 120   }
 121 
 122   # File size and modification information...
 123   print "\nFile size: ", FormatFileSize($PDBFilesInfo{FileSize}[$Index]), " \n";
 124   print "Last modified: ", $PDBFilesInfo{FileLastModified}[$Index], " \n";
 125 }
 126 
 127 sub ListHeaderInfo {
 128   my($PDBRecordLinesRef) = @_;
 129   my($HeaderRecordLine, $Classification, $DepositionDate, $IDCode);
 130 
 131   ($Classification, $DepositionDate, $IDCode) = (undef) x 3;
 132   $HeaderRecordLine = $PDBRecordLinesRef->[0];
 133   if (IsHeaderRecordType($HeaderRecordLine)) {
 134     ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($HeaderRecordLine);
 135   }
 136 
 137   $Classification = IsEmpty($Classification) ? 'Not available' : $Classification;
 138   $DepositionDate = IsEmpty($DepositionDate) ? 'Not available' : $DepositionDate;
 139   $IDCode = IsEmpty($IDCode) ? 'Not available' : $IDCode;
 140 
 141   print "\nClassification: $Classification\nID: $IDCode\nDeposition date: $DepositionDate\n";
 142 }
 143 
 144 # List record type info...
 145 sub ListRecordTypesInfo {
 146   my($PDBRecordLinesRef) = @_;
 147   my($RecordType, $RecordCount, $RecordTypesCountRef, @RecordTypeCountInfo);
 148 
 149   $RecordTypesCountRef = GetRecordTypesCount($PDBRecordLinesRef);
 150 
 151   @RecordTypeCountInfo = ();
 152   if ($OptionsInfo{CountRecordType} =~ /^All$/i) {
 153     for $RecordType (@{$RecordTypesCountRef->{RecordTypes}}) {
 154       $RecordCount = $RecordTypesCountRef->{Count}{$RecordType};
 155       push @RecordTypeCountInfo, "$RecordType - $RecordCount";
 156     }
 157   }
 158   else {
 159     for $RecordType (@{$OptionsInfo{SpecifiedRecordTypes}}) {
 160       $RecordCount = (exists $RecordTypesCountRef->{Count}{$RecordType}) ? ($RecordTypesCountRef->{Count}{$RecordType}) : 0;
 161       push @RecordTypeCountInfo, "$RecordType - $RecordCount";
 162     }
 163   }
 164   print "Number of individual records: ", JoinWords(\@RecordTypeCountInfo, '; ', 0), "\n";
 165 
 166   if ($OptionsInfo{CheckMasterRecord}) {
 167     CheckMasterRecord($RecordTypesCountRef, $PDBRecordLinesRef);
 168   }
 169 }
 170 
 171 # List information about residues and chains...
 172 sub ListChainsAndResiduesInfo {
 173   my($PDBRecordLinesRef) = @_;
 174   my($ResidueName, $ResidueCount, $ChainCount, $ChainID, $CollectChainResiduesBeyondTER, $ChainsAndResiduesInfoRef);
 175 
 176   $CollectChainResiduesBeyondTER = 1;
 177   $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER);
 178   $ChainCount = @{$ChainsAndResiduesInfoRef->{ChainIDs}};
 179   if ($OptionsInfo{CountChains}) {
 180     print "\nNumber of chains: $ChainCount \n";
 181     my($ChainID, @ChainIDsList);
 182     @ChainIDsList = ();
 183     for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 184       push @ChainIDsList, CleanupChainID($ChainID);
 185     }
 186     print "Chain IDs: ", JoinWords(\@ChainIDsList, ', ', 0),"\n";
 187   }
 188 
 189   if ($OptionsInfo{CountResiduesInChains}) {
 190     my($TotalResiduesCount, $ResidueCountInfo, @ResiduesCountInfo);
 191     @ResiduesCountInfo = ();
 192     $TotalResiduesCount = 0;
 193     for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 194       $ResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
 195       $TotalResiduesCount += $ResidueCount;
 196       $ResidueCountInfo =  "Chain ${ChainID} - ${ResidueCount}";
 197       push @ResiduesCountInfo, $ResidueCountInfo;
 198     }
 199     print "\nNumber of residues in chain(s): ";
 200     if ($ChainCount > 1) {
 201       print "Total - $TotalResiduesCount; ", JoinWords(\@ResiduesCountInfo, '; ', 0),"\n";
 202     }
 203     else {
 204       print "$TotalResiduesCount\n";
 205     }
 206 
 207     # List of residues in each chain...
 208     if ($OptionsInfo{DetailLevel} >= 3) {
 209       print "List of residues in chain(s): \n";
 210       for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 211         if ($ChainCount > 1) {
 212           print "Chain ", CleanupChainID($ChainID), ": ";
 213         }
 214         print JoinWords(\@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}, ', ', 0),"\n";
 215       }
 216     }
 217   }
 218   if ($OptionsInfo{ResiduesFrequencyInChains}) {
 219     # Setup a hash using residue count as key for sorting the values...
 220     my(%ResiduesCountToNameMap);
 221     %ResiduesCountToNameMap = ();
 222     @{$ResiduesCountToNameMap{ChainIDs}} = ();
 223     %{$ResiduesCountToNameMap{ResidueNames}} = ();
 224 
 225     for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 226       push @{$ResiduesCountToNameMap{ChainIDs}}, $ChainID;
 227       %{$ResiduesCountToNameMap{ResidueNames}{$ChainID}} = ();
 228 
 229       for $ResidueName (sort keys %{$ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}}) {
 230         $ResidueCount = $ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}{$ResidueName};
 231         # Setup count value for each chain...
 232         if (exists $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount}) {
 233           $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount} .= "~${ResidueName}";
 234         }
 235         else {
 236           $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount} = $ResidueName;
 237         }
 238       }
 239     }
 240     # Collect data for all the residues in all the chains...
 241     my(%AllResiduesNameToCountMap, %AllResiduesCountToNameMap);
 242     %AllResiduesNameToCountMap = ();
 243     %AllResiduesCountToNameMap = ();
 244     if ($ChainCount > 1) {
 245       for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 246         for $ResidueName (keys %{$ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}}) {
 247           $ResidueCount = $ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}{$ResidueName};
 248           if (exists $AllResiduesNameToCountMap{$ResidueName}) {
 249             $AllResiduesNameToCountMap{$ResidueName} += $ResidueCount;
 250           }
 251           else {
 252             $AllResiduesNameToCountMap{$ResidueName} = $ResidueCount;
 253           }
 254         }
 255       }
 256       for $ResidueName (keys %AllResiduesNameToCountMap) {
 257         $ResidueCount = $AllResiduesNameToCountMap{$ResidueName};
 258         if (exists $AllResiduesCountToNameMap{$ResidueCount}) {
 259           $AllResiduesCountToNameMap{$ResidueCount} .= "~${ResidueName}";
 260         }
 261         else {
 262           $AllResiduesCountToNameMap{$ResidueCount} = $ResidueName;
 263         }
 264       }
 265     }
 266 
 267     # Setup distribution data for individual chains and the grand total as well...
 268     my($ChainResidueCount, $PercentResidueCount, $TotalResidueCount, $ResidueNames, @ResidueNamesList, %ResiduesFrequencyInfoMap);
 269     @{$ResiduesFrequencyInfoMap{ChainIDs}} = ();
 270     %{$ResiduesFrequencyInfoMap{Frequency}} = ();
 271     %{$ResiduesFrequencyInfoMap{PercentFrequency}} = ();
 272 
 273     @{$ResiduesFrequencyInfoMap{AllChainsFrequency}} = ();
 274     @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}} = ();
 275 
 276     $TotalResidueCount = 0;
 277 
 278     for $ChainID (@{$ResiduesCountToNameMap{ChainIDs}}) {
 279       push @{$ResiduesFrequencyInfoMap{ChainIDs}}, $ChainID;
 280       @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}} = ();
 281       @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}} = ();
 282 
 283       $ChainResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
 284       $TotalResidueCount += $ChainResidueCount;
 285 
 286       for $ResidueCount (sort {$b <=> $a} keys %{$ResiduesCountToNameMap{ResidueNames}{$ChainID}}) {
 287         $ResidueNames = $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount};
 288         @ResidueNamesList = split /~/, $ResidueNames;
 289         for $ResidueName (@ResidueNamesList) {
 290           push @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, "${ResidueName} - ${ResidueCount}";
 291           $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$ChainResidueCount)*100)) + 0;
 292           push @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, "${ResidueName} - ${PercentResidueCount}%";
 293         }
 294       }
 295     }
 296     if ($ChainCount > 1) {
 297       for $ResidueCount (sort {$b <=> $a} keys %AllResiduesCountToNameMap) {
 298         $ResidueNames = $AllResiduesCountToNameMap{$ResidueCount};
 299         @ResidueNamesList = split /~/, $ResidueNames;
 300         for $ResidueName (@ResidueNamesList) {
 301           push @{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, "${ResidueName} - ${ResidueCount}";
 302           $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 303           push @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, "${ResidueName} - ${PercentResidueCount}%";
 304         }
 305       }
 306     }
 307 
 308     # List distribution of residues
 309     print "\nDistribution of residues in chain(s): \n";
 310     for $ChainID (@{$ResiduesFrequencyInfoMap{ChainIDs}}) {
 311       if ($ChainCount > 1) {
 312         print "Chain ", CleanupChainID($ChainID), ": ";
 313       }
 314       print JoinWords(\@{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, '; ', 0), "\n";
 315     }
 316     if ($OptionsInfo{DetailLevel} >= 2) {
 317       print "\nPercent distribution of residues in chain(s): \n";
 318       for $ChainID (@{$ResiduesFrequencyInfoMap{ChainIDs}}) {
 319         if ($ChainCount > 1) {
 320           print "Chain ", CleanupChainID($ChainID), ": ";
 321         }
 322         print JoinWords(\@{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, '; ', 0), "\n";
 323       }
 324     }
 325     if ($ChainCount > 1) {
 326       print "\nDistribution of residues across all chains: \n";
 327       print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, '; ', 0), "\n";
 328 
 329       if ($OptionsInfo{DetailLevel} >= 2) {
 330         print "\nPercent distribution of residues across all chains: \n";
 331         print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, '; ', 0), "\n";
 332       }
 333     }
 334   }
 335 }
 336 
 337 # List information about all the residues...
 338 sub ListAllResiduesInfo {
 339   my($PDBRecordLinesRef) = @_;
 340   my($TotalResidueCount, $AtomResiduesCount, $HetatmResiduesCount, $ResiduesInfoRef);
 341 
 342   $ResiduesInfoRef = GetAllResidues($PDBRecordLinesRef);
 343   $TotalResidueCount = @{$ResiduesInfoRef->{ResidueNames}};
 344   $AtomResiduesCount = @{$ResiduesInfoRef->{AtomResidueNames}};
 345   $HetatmResiduesCount = @{$ResiduesInfoRef->{HetatmResidueNames}};
 346 
 347   if ($OptionsInfo{CountResiduesAll}) {
 348     print "\nTotal number of residues: Total - $TotalResidueCount; ATOM residues - $AtomResiduesCount; HETATM residues - $HetatmResiduesCount\n";
 349 
 350     if ($OptionsInfo{DetailLevel} >= 3) {
 351       print "List of residues: \n";
 352       if ($AtomResiduesCount) {
 353         print "ATOM residues: ", JoinWords(\@{$ResiduesInfoRef->{AtomResidueNames}}, ', ', 0), "\n";
 354       }
 355       if ($HetatmResiduesCount) {
 356         print "HETATM residues: ", JoinWords(\@{$ResiduesInfoRef->{HetatmResidueNames}}, ', ', 0), "\n";
 357       }
 358     }
 359   }
 360 
 361   if ($OptionsInfo{ResiduesFrequencyAll}) {
 362     my($ResidueName, $ResidueCount);
 363 
 364     # Setup a hash using residue count as key for sorting the values...
 365     my(%ResiduesCountToNameMap, %AtomResiduesCountToNameMap, %HetatmResiduesCountToNameMap);
 366     %ResiduesCountToNameMap = ();
 367     %{$ResiduesCountToNameMap{ResidueNames}} = ();
 368 
 369     %AtomResiduesCountToNameMap = ();
 370     %{$AtomResiduesCountToNameMap{ResidueNames}} = ();
 371 
 372     %HetatmResiduesCountToNameMap = ();
 373     %{$HetatmResiduesCountToNameMap{ResidueNames}} = ();
 374 
 375     for $ResidueName (keys %{$ResiduesInfoRef->{ResidueCount}}) {
 376       $ResidueCount = $ResiduesInfoRef->{ResidueCount}{$ResidueName};
 377       if (exists $ResiduesCountToNameMap{ResidueNames}{$ResidueCount}) {
 378         $ResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}";
 379       }
 380       else {
 381         $ResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName;
 382       }
 383     }
 384 
 385     if ($OptionsInfo{DetailLevel} >= 1) {
 386       for $ResidueName (keys %{$ResiduesInfoRef->{AtomResidueCount}}) {
 387         $ResidueCount = $ResiduesInfoRef->{AtomResidueCount}{$ResidueName};
 388         if (exists $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount}) {
 389           $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}";
 390         }
 391         else {
 392           $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName;
 393         }
 394       }
 395       for $ResidueName (keys %{$ResiduesInfoRef->{HetatmResidueCount}}) {
 396         $ResidueCount = $ResiduesInfoRef->{HetatmResidueCount}{$ResidueName};
 397         if (exists $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount}) {
 398           $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}";
 399         }
 400         else {
 401           $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName;
 402         }
 403       }
 404     }
 405 
 406     # Setup distribution of residues info...
 407     my($ResidueNames, $PercentResidueCount, @ResidueNamesList, %ResiduesCountInfoMap, %AtomResiduesCountInfoMap, %HetatmResiduesCountInfoMap);
 408 
 409     @{$ResiduesCountInfoMap{Frequency}} = ();
 410     @{$ResiduesCountInfoMap{PercentFrequency}} = ();
 411     for $ResidueCount (sort {$b <=> $a} keys %{$ResiduesCountToNameMap{ResidueNames}}) {
 412       $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 413       $ResidueNames = $ResiduesCountToNameMap{ResidueNames}{$ResidueCount};
 414       @ResidueNamesList = split /~/, $ResidueNames;
 415       for $ResidueName (@ResidueNamesList) {
 416         push @{$ResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}";
 417         push @{$ResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}";
 418       }
 419     }
 420     if ($OptionsInfo{DetailLevel} >= 1) {
 421       @{$AtomResiduesCountInfoMap{Frequency}} = ();
 422       @{$AtomResiduesCountInfoMap{PercentFrequency}} = ();
 423       for $ResidueCount (sort {$b <=> $a} keys %{$AtomResiduesCountToNameMap{ResidueNames}}) {
 424         $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 425         $ResidueNames = $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount};
 426         @ResidueNamesList = split /~/, $ResidueNames;
 427         for $ResidueName (@ResidueNamesList) {
 428           push @{$AtomResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}";
 429           push @{$AtomResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}";
 430         }
 431       }
 432       @{$HetatmResiduesCountInfoMap{Frequency}} = ();
 433       @{$HetatmResiduesCountInfoMap{PercentFrequency}} = ();
 434       for $ResidueCount (sort {$b <=> $a} keys %{$HetatmResiduesCountToNameMap{ResidueNames}}) {
 435         $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 436         $ResidueNames = $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount};
 437         @ResidueNamesList = split /~/, $ResidueNames;
 438         for $ResidueName (@ResidueNamesList) {
 439           push @{$HetatmResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}";
 440           push @{$HetatmResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}";
 441         }
 442       }
 443     }
 444 
 445     # List distribution of residues
 446     print "\nDistribution of residues: ", JoinWords(\@{$ResiduesCountInfoMap{Frequency}},'; ', 0), "\n";
 447     if ($OptionsInfo{DetailLevel} >= 2) {
 448       print "\nPercent distribution of residues: ", JoinWords(\@{$ResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n";
 449     }
 450 
 451     if ($OptionsInfo{DetailLevel} >= 1) {
 452       print "\nDistribution of ATOM residues: ", JoinWords(\@{$AtomResiduesCountInfoMap{Frequency}},'; ', 0), "\n";
 453       if ($OptionsInfo{DetailLevel} >= 2) {
 454         print "\nPercent distribution of ATOM residues: ", JoinWords(\@{$AtomResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n";
 455       }
 456 
 457       print "\nDistribution of HETATM residues: ", JoinWords(\@{$HetatmResiduesCountInfoMap{Frequency}},'; ', 0), "\n";
 458       if ($OptionsInfo{DetailLevel} >= 2) {
 459         print "\nPercent distribution of HETATM residues: ", JoinWords(\@{$HetatmResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n";
 460       }
 461     }
 462   }
 463 }
 464 
 465 # List information about residue numbers for each chain...
 466 sub ListResidueNumbersInfo {
 467   my($PDBRecordLinesRef) = @_;
 468   my($Index, $ResidueCount, $StartResidueNum, $EndResidueNum, $ChainID, $CollectChainResiduesBeyondTER, $ChainsAndResiduesInfoRef, $ResidueNum, $PreviousResidueNum, $ResidueName, $PreviousResidueName, $GapResiduePairsCount, $GapLength, $DescendingOrderResiduePairsCount, @DescendingOrderResiduePairs, @GapResiduePairs);
 469 
 470   $CollectChainResiduesBeyondTER = 0;
 471   $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER);
 472 
 473   print "\nATOM/HETATM residue numbers information for chains:\n";
 474 
 475   for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 476     print "\nChain ID - ",  CleanupChainID($ChainID), "";
 477 
 478     $ResidueCount = @{$ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}};
 479 
 480     # Start and end residue numbers...
 481     $StartResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[0];
 482     $EndResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$ResidueCount - 1];
 483     print "; Number of residues: $ResidueCount; Start residue number - $StartResidueNum; End residue number - $EndResidueNum\n";
 484 
 485     # Identify any gaps in residue numbers or non-ascending order residue numbers...
 486     $GapResiduePairsCount = 0;
 487     $DescendingOrderResiduePairsCount = 0;
 488 
 489     @DescendingOrderResiduePairs = ();
 490     @GapResiduePairs = ();
 491 
 492     RESIDUE: for $Index (1 .. ($ResidueCount - 1)) {
 493       $ResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$Index];
 494       $PreviousResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$Index - 1];
 495 
 496       $ResidueName = $ChainsAndResiduesInfoRef->{Residues}{$ChainID}[$Index];
 497       $PreviousResidueName = $ChainsAndResiduesInfoRef->{Residues}{$ChainID}[$Index - 1];
 498 
 499       if ($ResidueNum == ($PreviousResidueNum + 1)) {
 500         # All is good...
 501         next RESIDUE;
 502       }
 503 
 504       # Are residue in descending order?
 505       if ($ResidueNum < $PreviousResidueNum) {
 506         $DescendingOrderResiduePairsCount++;
 507         push @DescendingOrderResiduePairs, "<${PreviousResidueName}${PreviousResidueNum} - ${ResidueName}${ResidueNum}>";
 508       }
 509 
 510       # Track gaps in residue pairs...
 511       $GapResiduePairsCount++;
 512       $GapLength = abs($ResidueNum - $PreviousResidueNum) - 1;
 513 
 514       push @GapResiduePairs, "<${PreviousResidueName}${PreviousResidueNum} - ${ResidueName}${ResidueNum}; $GapLength>";
 515     }
 516 
 517     # Print gaps information...
 518     print "Gaps in residue numbers: ", $GapResiduePairsCount ? "Yes" : "None";
 519     if ($GapResiduePairsCount) {
 520       print "; Number of gap residue number pairs: $GapResiduePairsCount; Gap residue pairs: <StartRes-EndRes; GapLength> - ", JoinWords(\@GapResiduePairs, "; ", 0);
 521     }
 522     print "\n";
 523 
 524     # Print descending residue order information...
 525     print "Residue numbers in descending order: ", $DescendingOrderResiduePairsCount ? "Yes" : "None";
 526     if ($DescendingOrderResiduePairsCount) {
 527       print "; Number of descending residue number pairs: $DescendingOrderResiduePairsCount; Descending residue number pairs: <StartRes-EndRes> ", JoinWords(\@DescendingOrderResiduePairs, "; ", 0);
 528     }
 529     print "\n";
 530   }
 531 }
 532 
 533 # List min/max XYZ coordinates for ATOM/HETATM records...
 534 sub ListBoundingBox {
 535   my($PDBRecordLinesRef) = @_;
 536   my($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax, $XSize, $YSize, $ZSize);
 537 
 538   ($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax) = GetMinMaxCoords($PDBRecordLinesRef);
 539   $XSize = abs($XMax - $XMin);
 540   $YSize = abs($YMax - $YMin);
 541   $ZSize = abs($ZMax - $ZMin);
 542 
 543   $XMin = sprintf("%.3f", $XMin) + 0; $XMax = sprintf("%.3f", $XMax) + 0;
 544   $YMin = sprintf("%.3f", $YMin) + 0; $YMax = sprintf("%.3f", $YMax) + 0;
 545   $ZMin = sprintf("%.3f", $ZMin) + 0; $ZMax = sprintf("%.3f", $ZMax) + 0;
 546 
 547   $XSize = sprintf("%.3f", $XSize) + 0;
 548   $YSize = sprintf("%.3f", $YSize) + 0;
 549   $ZSize = sprintf("%.3f", $ZSize) + 0;
 550 
 551   print "\nBounding box coordinates: <XMin, XMax> - <$XMin, $XMax>; <YMin, YMax> - <$YMin, $YMax>; <ZMin, ZMax> - <$ZMin, $ZMax>;\n";
 552   print "Bounding box size in angstroms: XSize - $XSize; YSize - $YSize; ZSize - $ZSize\n";
 553 
 554 }
 555 
 556 # Check master record counts against actual record counts...
 557 sub CheckMasterRecord {
 558   my($RecordTypesCountRef, $PDBRecordLinesRef) = @_;
 559 
 560   # Get master record information...
 561   my($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = (undef) x 11;
 562   my($RecordLine, $MasterRecordFound);
 563   $MasterRecordFound = 0;
 564 
 565   LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
 566       if (IsMasterRecordType($RecordLine)) {
 567         ($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = ParseMasterRecordLine($RecordLine);
 568         $MasterRecordFound = 1;
 569         last LINE;
 570       }
 571   }
 572   if (!$MasterRecordFound) {
 573     print "\nWarning: MASTER record is missing.\n";
 574     return;
 575   }
 576   my(@MasterRecordValidationInfo);
 577   @MasterRecordValidationInfo = ();
 578   $NumOfRemarkRecords += 0;
 579   if (exists($RecordTypesCountRef->{Count}{REMARK}) && $NumOfRemarkRecords != $RecordTypesCountRef->{Count}{REMARK}) {
 580     push @MasterRecordValidationInfo, "Number of REMARK records, $NumOfRemarkRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{REMARK}.";
 581   }
 582   $NumOfHetRecords += 0;
 583   if (exists($RecordTypesCountRef->{Count}{HET}) && $NumOfHetRecords != $RecordTypesCountRef->{Count}{HET}) {
 584     push @MasterRecordValidationInfo, "Number of HET records, $NumOfHetRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{HET}.";
 585   }
 586   $NumOfHelixRecords += 0;
 587   if (exists($RecordTypesCountRef->{Count}{HELIX}) && $NumOfHelixRecords != $RecordTypesCountRef->{Count}{HELIX}) {
 588     push @MasterRecordValidationInfo, "Number of HELIX records, $NumOfHelixRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{HELIX}.";
 589   }
 590   $NumOfSheetRecords += 0;
 591   if (exists($RecordTypesCountRef->{Count}{SHEET}) && $NumOfSheetRecords != $RecordTypesCountRef->{Count}{SHEET}) {
 592     push @MasterRecordValidationInfo, "Number of SHEET records, $NumOfSheetRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SHEET}.";
 593   }
 594   $NumOfTurnRecords += 0;
 595   if (exists($RecordTypesCountRef->{Count}{TURN}) && $NumOfTurnRecords != $RecordTypesCountRef->{Count}{TURN}) {
 596     push @MasterRecordValidationInfo, "Number of TURN records, $NumOfTurnRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{REMARK}.";
 597   }
 598   $NumOfSiteRecords += 0;
 599   if (exists($RecordTypesCountRef->{Count}{SITE}) && $NumOfSiteRecords != $RecordTypesCountRef->{Count}{SITE}) {
 600     push @MasterRecordValidationInfo, "Number of SITE records, $NumOfSiteRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SITE}.";
 601   }
 602 
 603   $NumOfTransformationsRecords += 0;
 604   my($RecordsCount, $ID, $RecordID, $RecordLabel);
 605   $RecordsCount = 0;
 606   for $RecordLabel ('ORIGX', 'SCALE', 'MTRIX') {
 607     for $ID (1 .. 3) {
 608       $RecordID = "${RecordLabel}${ID}";
 609       if (exists $RecordTypesCountRef->{Count}{$RecordID}) {
 610         $RecordsCount += $RecordTypesCountRef->{Count}{$RecordID};
 611       }
 612     }
 613   }
 614   if ($NumOfTransformationsRecords != $RecordsCount) {
 615     push @MasterRecordValidationInfo, "Number of transformation records (ORIGXn+SCALEn+MTRIXn), $NumOfTransformationsRecords, specified in MASTER record doen't match its explict count, $RecordsCount.";
 616   }
 617 
 618   $RecordsCount = 0;
 619   for $RecordLabel ('ATOM', 'HETATM') {
 620       if (exists $RecordTypesCountRef->{Count}{$RecordLabel}) {
 621         $RecordsCount += $RecordTypesCountRef->{Count}{$RecordLabel};
 622       }
 623   }
 624   $NumOfAtomAndHetatmRecords += 0;
 625   if ($NumOfAtomAndHetatmRecords != $RecordsCount) {
 626     push @MasterRecordValidationInfo, "Number of ATOM + HETATM records, $NumOfAtomAndHetatmRecords, specified in MASTER record doen't match its explict count, $RecordsCount.";
 627   }
 628   $NumOfTerRecords += 0;
 629   if (exists($RecordTypesCountRef->{Count}{TER}) && $NumOfTerRecords != $RecordTypesCountRef->{Count}{TER}) {
 630     push @MasterRecordValidationInfo, "Number of TER records, $NumOfTerRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{TER}.";
 631   }
 632   $NumOfConectRecords += 0;
 633   if (exists($RecordTypesCountRef->{Count}{CONECT}) && $NumOfConectRecords != $RecordTypesCountRef->{Count}{CONECT}) {
 634     push @MasterRecordValidationInfo, "Number of CONECT records, $NumOfConectRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{CONECT}.";
 635   }
 636   $NumOfSeqresRecords += 0;
 637   if (exists($RecordTypesCountRef->{Count}{SEQRES}) && $NumOfSeqresRecords != $RecordTypesCountRef->{Count}{SEQRES}) {
 638     push @MasterRecordValidationInfo, "Number of SITE records, $NumOfSeqresRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SEQRES}.";
 639   }
 640 
 641   if (@MasterRecordValidationInfo) {
 642     print "\nMASTER record validation: Count mismatches found:\n";
 643     print JoinWords(\@MasterRecordValidationInfo, "\n", 0), "\n";
 644   }
 645   else {
 646     print "\nMASTER record validation: Count values match with the explicit count of the corresponding records.\n";
 647   }
 648 }
 649 
 650 # Total size of all the files...
 651 sub ListTotalSizeOfFiles {
 652   my($FileOkayCount, $TotalSize, $Index);
 653 
 654   $FileOkayCount = 0;
 655   $TotalSize = 0;
 656 
 657   for $Index (0 .. $#PDBFilesList) {
 658     if ($PDBFilesInfo{FileOkay}[$Index]) {
 659       $FileOkayCount++;
 660       $TotalSize += $PDBFilesInfo{FileSize}[$Index];
 661     }
 662   }
 663   if ($FileOkayCount > 1) {
 664     print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
 665   }
 666 
 667 }
 668 
 669 # Empty chain IDs are replaced with "None[1-9]". But for displaying purposes, take out any
 670 # numbers from label...
 671 sub CleanupChainID {
 672   my($ChainID) = @_;
 673 
 674   if ($ChainID =~ /^None/i) {
 675     return 'None';
 676   }
 677   return $ChainID;
 678 }
 679 
 680 # Process option values...
 681 sub ProcessOptions {
 682   %OptionsInfo = ();
 683 
 684   # Setup record types to count...
 685   if ($Options{count}) {
 686     $OptionsInfo{CountRecordType} = $Options{count};
 687   }
 688   else {
 689     $OptionsInfo{CountRecordType} = $Options{all} ? 'All' : 'ATOM,HETATM';
 690   }
 691   @{$OptionsInfo{SpecifiedRecordTypes}} =();
 692   if ($OptionsInfo{CountRecordType} !~ /^All$/i) {
 693     my(@RecordTypes);
 694     @RecordTypes = split /\,/, $OptionsInfo{CountRecordType};
 695     push @{$OptionsInfo{SpecifiedRecordTypes}}, @RecordTypes;
 696   }
 697   $OptionsInfo{CountChains} = ($Options{chains} || $Options{all}) ? 1 : 0;
 698   $OptionsInfo{CheckMasterRecord} = ($Options{mastercheck} || $Options{all}) ? 1 : 0;
 699 
 700   # Residue count is the default. So $Options{residues} is simply ignored.
 701   my($CountResidues) = 1;
 702   $OptionsInfo{CountResiduesInChains} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0;
 703   $OptionsInfo{CountResiduesAll} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0;
 704 
 705   $OptionsInfo{ResiduesFrequencyInChains} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0;
 706   $OptionsInfo{ResiduesFrequencyAll} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0;
 707 
 708   $OptionsInfo{ResidueNumbersInfo} = ($Options{residuenumbers} || $Options{all})  ? 1 : 0;
 709 
 710   $OptionsInfo{CalculateBoundingBox} = ($Options{boundingbox} || $Options{all}) ? 1 : 0;
 711 
 712   $OptionsInfo{ListHeaderInfo} = ($Options{header} || $Options{all}) ? 1 : 0;
 713   $OptionsInfo{DetailLevel} = $Options{detail};
 714 
 715 }
 716 
 717 # Retrieve information about PDB files...
 718 sub RetrievePDBFilesInfo {
 719   my($Index, $PDBFile, $ModifiedTimeString, $ModifiedDateString);
 720 
 721   %PDBFilesInfo = ();
 722   @{$PDBFilesInfo{FileOkay}} = ();
 723   @{$PDBFilesInfo{FileSize}} = ();
 724   @{$PDBFilesInfo{FileLastModified}} = ();
 725 
 726   FILELIST: for $Index (0 .. $#PDBFilesList) {
 727     $PDBFilesInfo{FileOkay}[$Index] = 0;
 728     $PDBFilesInfo{FileSize}[$Index] = 0;
 729     $PDBFilesInfo{FileLastModified}[$Index] = '';
 730 
 731     $PDBFile = $PDBFilesList[$Index];
 732     if (!(-e $PDBFile)) {
 733       warn "Warning: Ignoring file $PDBFile: It doesn't exist\n";
 734       next FILELIST;
 735     }
 736     if (!CheckFileType($PDBFile, "pdb")) {
 737       warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n";
 738       next FILELIST;
 739     }
 740     if (! open PDBFILE, "$PDBFile") {
 741       warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n";
 742       next FILELIST;
 743     }
 744     close PDBFILE;
 745 
 746     $PDBFilesInfo{FileOkay}[$Index] = 1;
 747     $PDBFilesInfo{FileSize}[$Index] = FileSize($PDBFile);
 748     ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($PDBFile);
 749     $PDBFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
 750   }
 751 }
 752 
 753 
 754 # Setup script usage  and retrieve command line arguments specified using various options...
 755 sub SetupScriptUsage {
 756 
 757   # Retrieve all the options...
 758   %Options = ();
 759   $Options{count} = '';
 760   $Options{detail} = 1;
 761   $Options{residuesmode} = 'Both';
 762 
 763   if (!GetOptions(\%Options, "all|a", "boundingbox|b", "count|c=s", "chains", "detail|d=i", "frequency|f", "mastercheck|m", "header", "help|h", "residues", "residuesmode=s", "residuenumbers", "workingdir|w=s")) {
 764     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 765   }
 766   if ($Options{workingdir}) {
 767     if (! -d $Options{workingdir}) {
 768       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 769     }
 770     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 771   }
 772   if (!IsPositiveInteger($Options{detail})) {
 773     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
 774   }
 775   if ($Options{residuesmode} !~ /^(InChains|All|Both)$/i) {
 776     die "Error: The value specified, $Options{residuesmode}, for option \"--ResiduesMode\" is not valid. Allowed values: InChains, All, or Both\n";
 777   }
 778 }
 779