MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: InfoPDBFiles.pl,v $
   4 # $Date: 2008/02/24 18:13:55 $
   5 # $Revision: 1.21 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileUtil;
  37 use TextUtil;
  38 use PDBFileUtil;
  39 
  40 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  41 
  42 # Autoflush STDOUT
  43 $| = 1;
  44 
  45 # Starting message...
  46 $ScriptName = basename($0);
  47 print "\n$ScriptName: Starting...\n\n";
  48 $StartTime = new Benchmark;
  49 
  50 # Get the options and setup script...
  51 SetupScriptUsage();
  52 if ($Options{help} || @ARGV < 1) {
  53   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  54 }
  55 
  56 my(@PDBFilesList);
  57 @PDBFilesList = ExpandFileNames(\@ARGV, "pdb");
  58 
  59 # Process options...
  60 my(%OptionsInfo);
  61 ProcessOptions();
  62 
  63 # Setup information about input files...
  64 my(%PDBFilesInfo);
  65 RetrievePDBFilesInfo();
  66 
  67 # Process input files..
  68 my($FileIndex, $PDBFile, $FileProcessingMsg);
  69 $FileProcessingMsg = "Processing file";
  70 if (@PDBFilesList > 1) {
  71   print "Processing PDB files...\n";
  72   $FileProcessingMsg = "\n$FileProcessingMsg";
  73 }
  74 
  75 for $FileIndex (0 .. $#PDBFilesList) {
  76   if ($PDBFilesInfo{FileOkay}[$FileIndex]) {
  77     $PDBFile = $PDBFilesList[$FileIndex];
  78     print "$FileProcessingMsg $PDBFile...\n";
  79     ListPDBFileInfo($FileIndex);
  80   }
  81 }
  82 
  83 ListTotalSizeOfFiles();
  84 
  85 print "$ScriptName:Done...\n\n";
  86 
  87 $EndTime = new Benchmark;
  88 $TotalTime = timediff ($EndTime, $StartTime);
  89 print "Total time: ", timestr($TotalTime), "\n";
  90 
  91 ###############################################################################
  92 
  93 # List appropriate information...
  94 sub ListPDBFileInfo {
  95   my($Index) = @_;
  96   my($PDBFile, $PDBRecordLinesRef);
  97 
  98   $PDBFile = $PDBFilesList[$Index];
  99   $PDBRecordLinesRef = ReadPDBFile($PDBFile);
 100 
 101   # Header informaton...
 102   if ($OptionsInfo{ListHeaderInfo}) {
 103     ListHeaderInfo($PDBRecordLinesRef);
 104   }
 105 
 106   # Total number of records...
 107   my($TotalRecordsCount) = scalar @{$PDBRecordLinesRef};
 108   print "\nTotal number of records: $TotalRecordsCount\n";
 109 
 110   # List record type count information...
 111   ListRecordTypesInfo($PDBRecordLinesRef);
 112 
 113   if ($OptionsInfo{CountChains} || $OptionsInfo{CountResiduesInChains} || $OptionsInfo{ResiduesFrequencyInChains}) {
 114     ListChainsAndResiduesInfo($PDBRecordLinesRef);
 115   }
 116   if ($OptionsInfo{CountResiduesAll} || $OptionsInfo{ResiduesFrequencyAll}) {
 117     ListAllResiduesInfo($PDBRecordLinesRef);
 118   }
 119   if ($OptionsInfo{CalculateBoundingBox}) {
 120     ListBoundingBox($PDBRecordLinesRef);
 121   }
 122 
 123   # File size and modification information...
 124   print "\nFile size: ", FormatFileSize($PDBFilesInfo{FileSize}[$Index]), " \n";
 125   print "Last modified: ", $PDBFilesInfo{FileLastModified}[$Index], " \n";
 126 }
 127 
 128 sub ListHeaderInfo {
 129   my($PDBRecordLinesRef) = @_;
 130   my($HeaderRecordLine, $Classification, $DepositionDate, $IDCode);
 131 
 132   ($Classification, $DepositionDate, $IDCode) = (undef) x 3;
 133   $HeaderRecordLine = $PDBRecordLinesRef->[0];
 134   if (IsHeaderRecordType($HeaderRecordLine)) {
 135     ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($HeaderRecordLine);
 136   }
 137 
 138   $Classification = IsEmpty($Classification) ? 'Not available' : $Classification;
 139   $DepositionDate = IsEmpty($DepositionDate) ? 'Not available' : $DepositionDate;
 140   $IDCode = IsEmpty($IDCode) ? 'Not available' : $IDCode;
 141 
 142   print "\nClassification: $Classification\nID: $IDCode\nDeposition date: $DepositionDate\n";
 143 }
 144 
 145 # List record type info...
 146 sub ListRecordTypesInfo {
 147   my($PDBRecordLinesRef) = @_;
 148   my($RecordType, $RecordCount, $RecordTypesCountRef, @RecordTypeCountInfo);
 149 
 150   $RecordTypesCountRef = GetRecordTypesCount($PDBRecordLinesRef);
 151 
 152   @RecordTypeCountInfo = ();
 153   if ($OptionsInfo{CountRecordType} =~ /^All$/i) {
 154     for $RecordType (@{$RecordTypesCountRef->{RecordTypes}}) {
 155       $RecordCount = $RecordTypesCountRef->{Count}{$RecordType};
 156       push @RecordTypeCountInfo, "$RecordType - $RecordCount";
 157     }
 158   }
 159   else {
 160     for $RecordType (@{$OptionsInfo{SpecifiedRecordTypes}}) {
 161       $RecordCount = (exists $RecordTypesCountRef->{Count}{$RecordType}) ? ($RecordTypesCountRef->{Count}{$RecordType}) : 0;
 162       push @RecordTypeCountInfo, "$RecordType - $RecordCount";
 163     }
 164   }
 165   print "Number of individual records: ", JoinWords(\@RecordTypeCountInfo, '; ', 0), "\n";
 166 
 167   if ($OptionsInfo{CheckMasterRecord}) {
 168     CheckMasterRecord($RecordTypesCountRef, $PDBRecordLinesRef);
 169   }
 170 }
 171 
 172 # List information about residues and chains...
 173 sub ListChainsAndResiduesInfo {
 174   my($PDBRecordLinesRef) = @_;
 175   my($ResidueName, $ResidueCount, $ChainCount, $ChainID, $CollectChainResiduesBeyondTER, $ChainsAndResiduesInfoRef);
 176 
 177   $CollectChainResiduesBeyondTER = 1;
 178   $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER);
 179   $ChainCount = @{$ChainsAndResiduesInfoRef->{ChainIDs}};
 180   if ($OptionsInfo{CountChains}) {
 181     print "\nNumber of chains: $ChainCount \n";
 182     my($ChainID, @ChainIDsList);
 183     @ChainIDsList = ();
 184     for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 185       push @ChainIDsList, CleanupChainID($ChainID);
 186     }
 187     print "Chain IDs: ", JoinWords(\@ChainIDsList, ', ', 0),"\n";
 188   }
 189 
 190   if ($OptionsInfo{CountResiduesInChains}) {
 191     my($TotalResiduesCount, $ResidueCountInfo, @ResiduesCountInfo);
 192     @ResiduesCountInfo = ();
 193     $TotalResiduesCount = 0;
 194     for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 195       $ResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
 196       $TotalResiduesCount += $ResidueCount;
 197       $ResidueCountInfo =  "Chain ${ChainID} - ${ResidueCount}";
 198       push @ResiduesCountInfo, $ResidueCountInfo;
 199     }
 200     print "\nNumber of residues in chain(s): ";
 201     if ($ChainCount > 1) {
 202       print "Total - $TotalResiduesCount; ", JoinWords(\@ResiduesCountInfo, '; ', 0),"\n";
 203     }
 204     else {
 205       print "$TotalResiduesCount\n";
 206     }
 207 
 208     # List of residues in each chain...
 209     if ($OptionsInfo{DetailLevel} >= 3) {
 210       print "List of residues in chain(s): \n";
 211       for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 212 	if ($ChainCount > 1) {
 213 	  print "Chain ", CleanupChainID($ChainID), ": ";
 214 	}
 215 	print JoinWords(\@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}, ', ', 0),"\n";
 216       }
 217     }
 218   }
 219   if ($OptionsInfo{ResiduesFrequencyInChains}) {
 220     # Setup a hash using residue count as key for sorting the values...
 221     my(%ResiduesCountToNameMap);
 222     %ResiduesCountToNameMap = ();
 223     @{$ResiduesCountToNameMap{ChainIDs}} = ();
 224     %{$ResiduesCountToNameMap{ResidueNames}} = ();
 225 
 226     for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 227       push @{$ResiduesCountToNameMap{ChainIDs}}, $ChainID;
 228       %{$ResiduesCountToNameMap{ResidueNames}{$ChainID}} = ();
 229 
 230       for $ResidueName (sort keys %{$ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}}) {
 231 	$ResidueCount = $ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}{$ResidueName};
 232 	# Setup count value for each chain...
 233 	if (exists $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount}) {
 234 	  $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount} .= "~${ResidueName}";
 235 	}
 236 	else {
 237 	  $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount} = $ResidueName;
 238 	}
 239       }
 240     }
 241     # Collect data for all the residues in all the chains...
 242     my(%AllResiduesNameToCountMap, %AllResiduesCountToNameMap);
 243     %AllResiduesNameToCountMap = ();
 244     %AllResiduesCountToNameMap = ();
 245     if ($ChainCount > 1) {
 246       for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 247 	for $ResidueName (keys %{$ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}}) {
 248 	  $ResidueCount = $ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}{$ResidueName};
 249 	  if (exists $AllResiduesNameToCountMap{$ResidueName}) {
 250 	    $AllResiduesNameToCountMap{$ResidueName} += $ResidueCount;
 251 	  }
 252 	  else {
 253 	    $AllResiduesNameToCountMap{$ResidueName} = $ResidueCount;
 254 	  }
 255 	}
 256       }
 257       for $ResidueName (keys %AllResiduesNameToCountMap) {
 258 	$ResidueCount = $AllResiduesNameToCountMap{$ResidueName};
 259 	if (exists $AllResiduesCountToNameMap{$ResidueCount}) {
 260 	  $AllResiduesCountToNameMap{$ResidueCount} .= "~${ResidueName}";
 261 	}
 262 	else {
 263 	  $AllResiduesCountToNameMap{$ResidueCount} = $ResidueName;
 264 	}
 265       }
 266     }
 267 
 268     # Setup distribution data for individual chains and the grand total as well...
 269     my($ChainResidueCount, $PercentResidueCount, $TotalResidueCount, $ResidueNames, @ResidueNamesList, %ResiduesFrequencyInfoMap);
 270     @{$ResiduesFrequencyInfoMap{ChainIDs}} = ();
 271     %{$ResiduesFrequencyInfoMap{Frequency}} = ();
 272     %{$ResiduesFrequencyInfoMap{PercentFrequency}} = ();
 273 
 274     @{$ResiduesFrequencyInfoMap{AllChainsFrequency}} = ();
 275     @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}} = ();
 276 
 277     $TotalResidueCount = 0;
 278 
 279     for $ChainID (@{$ResiduesCountToNameMap{ChainIDs}}) {
 280       push @{$ResiduesFrequencyInfoMap{ChainIDs}}, $ChainID;
 281       @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}} = ();
 282       @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}} = ();
 283 
 284       $ChainResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
 285       $TotalResidueCount += $ChainResidueCount;
 286 
 287       for $ResidueCount (sort {$b <=> $a} keys %{$ResiduesCountToNameMap{ResidueNames}{$ChainID}}) {
 288 	$ResidueNames = $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount};
 289 	@ResidueNamesList = split /~/, $ResidueNames;
 290 	for $ResidueName (@ResidueNamesList) {
 291 	  push @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, "${ResidueName} - ${ResidueCount}";
 292 	  $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$ChainResidueCount)*100)) + 0;
 293 	  push @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, "${ResidueName} - ${PercentResidueCount}%";
 294 	}
 295       }
 296     }
 297     if ($ChainCount > 1) {
 298       for $ResidueCount (sort {$b <=> $a} keys %AllResiduesCountToNameMap) {
 299 	$ResidueNames = $AllResiduesCountToNameMap{$ResidueCount};
 300 	@ResidueNamesList = split /~/, $ResidueNames;
 301 	for $ResidueName (@ResidueNamesList) {
 302 	  push @{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, "${ResidueName} - ${ResidueCount}";
 303 	  $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 304 	  push @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, "${ResidueName} - ${PercentResidueCount}%";
 305 	}
 306       }
 307     }
 308 
 309     # List distribution of residues
 310     print "\nDistribution of residues in chain(s): \n";
 311     for $ChainID (@{$ResiduesFrequencyInfoMap{ChainIDs}}) {
 312       if ($ChainCount > 1) {
 313 	print "Chain ", CleanupChainID($ChainID), ": ";
 314       }
 315       print JoinWords(\@{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, '; ', 0), "\n";
 316     }
 317     if ($OptionsInfo{DetailLevel} >= 2) {
 318       print "\nPercent distribution of residues in chain(s): \n";
 319       for $ChainID (@{$ResiduesFrequencyInfoMap{ChainIDs}}) {
 320 	if ($ChainCount > 1) {
 321 	  print "Chain ", CleanupChainID($ChainID), ": ";
 322 	}
 323 	print JoinWords(\@{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, '; ', 0), "\n";
 324       }
 325     }
 326     if ($ChainCount > 1) {
 327       print "\nDistribution of residues across all chains: \n";
 328       print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, '; ', 0), "\n";
 329 
 330       if ($OptionsInfo{DetailLevel} >= 2) {
 331 	print "\nPercent distribution of residues across all chains: \n";
 332 	print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, '; ', 0), "\n";
 333       }
 334     }
 335   }
 336 }
 337 
 338 # List information about all the residues...
 339 sub ListAllResiduesInfo {
 340   my($PDBRecordLinesRef) = @_;
 341   my($TotalResidueCount, $AtomResiduesCount, $HetatmResiduesCount, $ResiduesInfoRef);
 342 
 343   $ResiduesInfoRef = GetAllResidues($PDBRecordLinesRef);
 344   $TotalResidueCount = @{$ResiduesInfoRef->{ResidueNames}};
 345   $AtomResiduesCount = @{$ResiduesInfoRef->{AtomResidueNames}};
 346   $HetatmResiduesCount = @{$ResiduesInfoRef->{HetatmResidueNames}};
 347 
 348   if ($OptionsInfo{CountResiduesAll}) {
 349     print "\nTotal number of residues: Total - $TotalResidueCount; ATOM residues - $AtomResiduesCount; HETATM residues - $HetatmResiduesCount\n";
 350 
 351     if ($OptionsInfo{DetailLevel} >= 3) {
 352       print "List of residues: \n";
 353       if ($AtomResiduesCount) {
 354 	print "ATOM residues: ", JoinWords(\@{$ResiduesInfoRef->{AtomResidueNames}}, ', ', 0), "\n";
 355       }
 356       if ($HetatmResiduesCount) {
 357 	print "HETATM residues: ", JoinWords(\@{$ResiduesInfoRef->{HetatmResidueNames}}, ', ', 0), "\n";
 358       }
 359     }
 360   }
 361 
 362   if ($OptionsInfo{ResiduesFrequencyAll}) {
 363     my($ResidueName, $ResidueCount);
 364 
 365     # Setup a hash using residue count as key for sorting the values...
 366     my(%ResiduesCountToNameMap, %AtomResiduesCountToNameMap, %HetatmResiduesCountToNameMap);
 367     %ResiduesCountToNameMap = ();
 368     %{$ResiduesCountToNameMap{ResidueNames}} = ();
 369 
 370     %AtomResiduesCountToNameMap = ();
 371     %{$AtomResiduesCountToNameMap{ResidueNames}} = ();
 372 
 373     %HetatmResiduesCountToNameMap = ();
 374     %{$HetatmResiduesCountToNameMap{ResidueNames}} = ();
 375 
 376     for $ResidueName (keys %{$ResiduesInfoRef->{ResidueCount}}) {
 377       $ResidueCount = $ResiduesInfoRef->{ResidueCount}{$ResidueName};
 378       if (exists $ResiduesCountToNameMap{ResidueNames}{$ResidueCount}) {
 379 	$ResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}";
 380       }
 381       else {
 382 	$ResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName;
 383       }
 384     }
 385 
 386     if ($OptionsInfo{DetailLevel} >= 1) {
 387       for $ResidueName (keys %{$ResiduesInfoRef->{AtomResidueCount}}) {
 388 	$ResidueCount = $ResiduesInfoRef->{AtomResidueCount}{$ResidueName};
 389 	if (exists $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount}) {
 390 	  $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}";
 391 	}
 392 	else {
 393 	  $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName;
 394 	}
 395       }
 396       for $ResidueName (keys %{$ResiduesInfoRef->{HetatmResidueCount}}) {
 397 	$ResidueCount = $ResiduesInfoRef->{HetatmResidueCount}{$ResidueName};
 398 	if (exists $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount}) {
 399 	  $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}";
 400 	}
 401 	else {
 402 	  $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName;
 403 	}
 404       }
 405     }
 406 
 407     # Setup distribution of residues info...
 408     my($ResidueNames, $PercentResidueCount, @ResidueNamesList, %ResiduesCountInfoMap, %AtomResiduesCountInfoMap, %HetatmResiduesCountInfoMap);
 409 
 410     @{$ResiduesCountInfoMap{Frequency}} = ();
 411     @{$ResiduesCountInfoMap{PercentFrequency}} = ();
 412     for $ResidueCount (sort {$b <=> $a} keys %{$ResiduesCountToNameMap{ResidueNames}}) {
 413       $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 414       $ResidueNames = $ResiduesCountToNameMap{ResidueNames}{$ResidueCount};
 415       @ResidueNamesList = split /~/, $ResidueNames;
 416       for $ResidueName (@ResidueNamesList) {
 417 	push @{$ResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}";
 418 	push @{$ResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}";
 419       }
 420     }
 421     if ($OptionsInfo{DetailLevel} >= 1) {
 422       @{$AtomResiduesCountInfoMap{Frequency}} = ();
 423       @{$AtomResiduesCountInfoMap{PercentFrequency}} = ();
 424       for $ResidueCount (sort {$b <=> $a} keys %{$AtomResiduesCountToNameMap{ResidueNames}}) {
 425 	$PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 426 	$ResidueNames = $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount};
 427 	@ResidueNamesList = split /~/, $ResidueNames;
 428 	for $ResidueName (@ResidueNamesList) {
 429 	  push @{$AtomResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}";
 430 	  push @{$AtomResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}";
 431 	}
 432       }
 433       @{$HetatmResiduesCountInfoMap{Frequency}} = ();
 434       @{$HetatmResiduesCountInfoMap{PercentFrequency}} = ();
 435       for $ResidueCount (sort {$b <=> $a} keys %{$HetatmResiduesCountToNameMap{ResidueNames}}) {
 436 	$PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 437 	$ResidueNames = $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount};
 438 	@ResidueNamesList = split /~/, $ResidueNames;
 439 	for $ResidueName (@ResidueNamesList) {
 440 	  push @{$HetatmResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}";
 441 	  push @{$HetatmResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}";
 442 	}
 443       }
 444     }
 445 
 446     # List distribution of residues
 447     print "\nDistribution of residues: ", JoinWords(\@{$ResiduesCountInfoMap{Frequency}},'; ', 0), "\n";
 448     if ($OptionsInfo{DetailLevel} >= 2) {
 449       print "\nPercent distribution of residues: ", JoinWords(\@{$ResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n";
 450     }
 451 
 452     if ($OptionsInfo{DetailLevel} >= 1) {
 453       print "\nDistribution of ATOM residues: ", JoinWords(\@{$AtomResiduesCountInfoMap{Frequency}},'; ', 0), "\n";
 454       if ($OptionsInfo{DetailLevel} >= 2) {
 455 	print "\nPercent distribution of ATOM residues: ", JoinWords(\@{$AtomResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n";
 456       }
 457 
 458       print "\nDistribution of HETATM residues: ", JoinWords(\@{$HetatmResiduesCountInfoMap{Frequency}},'; ', 0), "\n";
 459       if ($OptionsInfo{DetailLevel} >= 2) {
 460 	print "\nPercent distribution of HETATM residues: ", JoinWords(\@{$HetatmResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n";
 461       }
 462     }
 463   }
 464 }
 465 
 466 # List min/max XYZ coordinates for ATOM/HETATM records...
 467 sub ListBoundingBox {
 468   my($PDBRecordLinesRef) = @_;
 469   my($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax, $XSize, $YSize, $ZSize);
 470 
 471   ($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax) = GetMinMaxCoords($PDBRecordLinesRef);
 472   $XSize = abs($XMax - $XMin);
 473   $YSize = abs($YMax - $YMin);
 474   $ZSize = abs($ZMax - $ZMin);
 475 
 476   $XMin = sprintf("%.3f", $XMin) + 0; $XMax = sprintf("%.3f", $XMax) + 0;
 477   $YMin = sprintf("%.3f", $YMin) + 0; $YMax = sprintf("%.3f", $YMax) + 0;
 478   $ZMin = sprintf("%.3f", $ZMin) + 0; $ZMax = sprintf("%.3f", $ZMax) + 0;
 479 
 480   $XSize = sprintf("%.3f", $XSize) + 0;
 481   $YSize = sprintf("%.3f", $YSize) + 0;
 482   $ZSize = sprintf("%.3f", $ZSize) + 0;
 483 
 484   print "\nBounding box coordinates: <XMin, XMax> - <$XMin, $XMax>; <YMin, YMax> - <$YMin, $YMax>; <ZMin, ZMax> - <$ZMin, $ZMax>;\n";
 485   print "Bounding box size in angstroms: XSize - $XSize; YSize - $YSize; ZSize - $ZSize\n";
 486 
 487 }
 488 
 489 # Check master record counts against actual record counts...
 490 sub CheckMasterRecord {
 491   my($RecordTypesCountRef, $PDBRecordLinesRef) = @_;
 492 
 493   # Get master record information...
 494   my($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = (undef) x 11;
 495   my($RecordLine, $MasterRecordFound);
 496   $MasterRecordFound = 0;
 497 
 498   LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
 499       if (IsMasterRecordType($RecordLine)) {
 500 	($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = ParseMasterRecordLine($RecordLine);
 501 	$MasterRecordFound = 1;
 502 	last LINE;
 503       }
 504   }
 505   if (!$MasterRecordFound) {
 506     print "\nWarning: MASTER record is missing.\n";
 507     return;
 508   }
 509   my(@MasterRecordValidationInfo);
 510   @MasterRecordValidationInfo = ();
 511   $NumOfRemarkRecords += 0;
 512   if (exists($RecordTypesCountRef->{Count}{REMARK}) && $NumOfRemarkRecords != $RecordTypesCountRef->{Count}{REMARK}) {
 513     push @MasterRecordValidationInfo, "Number of REMARK records, $NumOfRemarkRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{REMARK}.";
 514   }
 515   $NumOfHetRecords += 0;
 516   if (exists($RecordTypesCountRef->{Count}{HET}) && $NumOfHetRecords != $RecordTypesCountRef->{Count}{HET}) {
 517     push @MasterRecordValidationInfo, "Number of HET records, $NumOfHetRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{HET}.";
 518   }
 519   $NumOfHelixRecords += 0;
 520   if (exists($RecordTypesCountRef->{Count}{HELIX}) && $NumOfHelixRecords != $RecordTypesCountRef->{Count}{HELIX}) {
 521     push @MasterRecordValidationInfo, "Number of HELIX records, $NumOfHelixRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{HELIX}.";
 522   }
 523   $NumOfSheetRecords += 0;
 524   if (exists($RecordTypesCountRef->{Count}{SHEET}) && $NumOfSheetRecords != $RecordTypesCountRef->{Count}{SHEET}) {
 525     push @MasterRecordValidationInfo, "Number of SHEET records, $NumOfSheetRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SHEET}.";
 526   }
 527   $NumOfTurnRecords += 0;
 528   if (exists($RecordTypesCountRef->{Count}{TURN}) && $NumOfTurnRecords != $RecordTypesCountRef->{Count}{TURN}) {
 529     push @MasterRecordValidationInfo, "Number of TURN records, $NumOfTurnRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{REMARK}.";
 530   }
 531   $NumOfSiteRecords += 0;
 532   if (exists($RecordTypesCountRef->{Count}{SITE}) && $NumOfSiteRecords != $RecordTypesCountRef->{Count}{SITE}) {
 533     push @MasterRecordValidationInfo, "Number of SITE records, $NumOfSiteRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SITE}.";
 534   }
 535 
 536   $NumOfTransformationsRecords += 0;
 537   my($RecordsCount, $ID, $RecordID, $RecordLabel);
 538   $RecordsCount = 0;
 539   for $RecordLabel ('ORIGX', 'SCALE', 'MTRIX') {
 540     for $ID (1 .. 3) {
 541       $RecordID = "${RecordLabel}${ID}";
 542       if (exists $RecordTypesCountRef->{Count}{$RecordID}) {
 543 	$RecordsCount += $RecordTypesCountRef->{Count}{$RecordID};
 544       }
 545     }
 546   }
 547   if ($NumOfTransformationsRecords != $RecordsCount) {
 548     push @MasterRecordValidationInfo, "Number of transformation records (ORIGXn+SCALEn+MTRIXn), $NumOfTransformationsRecords, specified in MASTER record doen't match its explict count, $RecordsCount.";
 549   }
 550 
 551   $RecordsCount = 0;
 552   for $RecordLabel ('ATOM', 'HETATM') {
 553       if (exists $RecordTypesCountRef->{Count}{$RecordLabel}) {
 554 	$RecordsCount += $RecordTypesCountRef->{Count}{$RecordLabel};
 555       }
 556   }
 557   $NumOfAtomAndHetatmRecords += 0;
 558   if ($NumOfAtomAndHetatmRecords != $RecordsCount) {
 559     push @MasterRecordValidationInfo, "Number of ATOM + HETATM records, $NumOfAtomAndHetatmRecords, specified in MASTER record doen't match its explict count, $RecordsCount.";
 560   }
 561   $NumOfTerRecords += 0;
 562   if (exists($RecordTypesCountRef->{Count}{TER}) && $NumOfTerRecords != $RecordTypesCountRef->{Count}{TER}) {
 563     push @MasterRecordValidationInfo, "Number of TER records, $NumOfTerRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{TER}.";
 564   }
 565   $NumOfConectRecords += 0;
 566   if (exists($RecordTypesCountRef->{Count}{CONECT}) && $NumOfConectRecords != $RecordTypesCountRef->{Count}{CONECT}) {
 567     push @MasterRecordValidationInfo, "Number of CONECT records, $NumOfConectRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{CONECT}.";
 568   }
 569   $NumOfSeqresRecords += 0;
 570   if (exists($RecordTypesCountRef->{Count}{SEQRES}) && $NumOfSeqresRecords != $RecordTypesCountRef->{Count}{SEQRES}) {
 571     push @MasterRecordValidationInfo, "Number of SITE records, $NumOfSeqresRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SEQRES}.";
 572   }
 573 
 574   if (@MasterRecordValidationInfo) {
 575     print "\nMASTER record validation: Count mismatches found:\n";
 576     print JoinWords(\@MasterRecordValidationInfo, "\n", 0), "\n";
 577   }
 578   else {
 579     print "\nMASTER record validation: Count values match with the explicit count of the corresponding records.\n";
 580   }
 581 }
 582 
 583 # Total size of all the fiels...
 584 sub ListTotalSizeOfFiles {
 585   my($FileOkayCount, $TotalSize, $Index);
 586 
 587   $FileOkayCount = 0;
 588   $TotalSize = 0;
 589 
 590   for $Index (0 .. $#PDBFilesList) {
 591     if ($PDBFilesInfo{FileOkay}[$Index]) {
 592       $FileOkayCount++;
 593       $TotalSize += $PDBFilesInfo{FileSize}[$Index];
 594     }
 595   }
 596   if ($FileOkayCount > 1) {
 597     print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
 598   }
 599 
 600 }
 601 
 602 # Empty chain IDs are replaced with "None[1-9]". But for displaying purposes, take out any
 603 # numbers from label...
 604 sub CleanupChainID {
 605   my($ChainID) = @_;
 606 
 607   if ($ChainID =~ /^None/i) {
 608     return 'None';
 609   }
 610   return $ChainID;
 611 }
 612 
 613 # Process option values...
 614 sub ProcessOptions {
 615   %OptionsInfo = ();
 616 
 617   # Setup record types to count...
 618   if ($Options{count}) {
 619     $OptionsInfo{CountRecordType} = $Options{count};
 620   }
 621   else {
 622     $OptionsInfo{CountRecordType} = $Options{all} ? 'All' : 'ATOM,HETATM';
 623   }
 624   @{$OptionsInfo{SpecifiedRecordTypes}} =();
 625   if ($OptionsInfo{CountRecordType} !~ /^All$/i) {
 626     my(@RecordTypes);
 627     @RecordTypes = split /\,/, $OptionsInfo{CountRecordType};
 628     push @{$OptionsInfo{SpecifiedRecordTypes}}, @RecordTypes;
 629   }
 630   $OptionsInfo{CountChains} = ($Options{chains} || $Options{all}) ? 1 : 0;
 631   $OptionsInfo{CheckMasterRecord} = ($Options{mastercheck} || $Options{all}) ? 1 : 0;
 632 
 633   # Residue count is the default. So $Options{residues} is simply ignored.
 634   my($CountResidues) = 1;
 635   $OptionsInfo{CountResiduesInChains} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0;
 636   $OptionsInfo{CountResiduesAll} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0;
 637 
 638   $OptionsInfo{ResiduesFrequencyInChains} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0;
 639   $OptionsInfo{ResiduesFrequencyAll} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0;
 640 
 641   $OptionsInfo{CalculateBoundingBox} = ($Options{boundingbox} || $Options{all}) ? 1 : 0;
 642 
 643   $OptionsInfo{ListHeaderInfo} = ($Options{header} || $Options{all}) ? 1 : 0;
 644   $OptionsInfo{DetailLevel} = $Options{detail};
 645 
 646 }
 647 
 648 # Retrieve information about PDB files...
 649 sub RetrievePDBFilesInfo {
 650   my($Index, $PDBFile, $ModifiedTimeString, $ModifiedDateString);
 651 
 652   %PDBFilesInfo = ();
 653   @{$PDBFilesInfo{FileOkay}} = ();
 654   @{$PDBFilesInfo{FileSize}} = ();
 655   @{$PDBFilesInfo{FileLastModified}} = ();
 656 
 657   FILELIST: for $Index (0 .. $#PDBFilesList) {
 658     $PDBFilesInfo{FileOkay}[$Index] = 0;
 659     $PDBFilesInfo{FileSize}[$Index] = 0;
 660     $PDBFilesInfo{FileLastModified}[$Index] = '';
 661 
 662     $PDBFile = $PDBFilesList[$Index];
 663     if (!(-e $PDBFile)) {
 664       warn "Warning: Ignoring file $PDBFile: It doesn't exist\n";
 665       next FILELIST;
 666     }
 667     if (!CheckFileType($PDBFile, "pdb")) {
 668       warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n";
 669       next FILELIST;
 670     }
 671     if (! open PDBFILE, "$PDBFile") {
 672       warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n";
 673       next FILELIST;
 674     }
 675     close PDBFILE;
 676 
 677     $PDBFilesInfo{FileOkay}[$Index] = 1;
 678     $PDBFilesInfo{FileSize}[$Index] = FileSize($PDBFile);
 679     ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($PDBFile);
 680     $PDBFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
 681   }
 682 }
 683 
 684 
 685 # Setup script usage  and retrieve command line arguments specified using various options...
 686 sub SetupScriptUsage {
 687 
 688   # Retrieve all the options...
 689   %Options = ();
 690   $Options{count} = '';
 691   $Options{detail} = 1;
 692   $Options{residuesmode} = 'Both';
 693 
 694   if (!GetOptions(\%Options, "all|a", "boundingbox|b", "count|c=s", "chains", "detail|d=i", "frequency|f", "mastercheck|m", "header", "help|h", "residues", "residuesmode=s", "workingdir|w=s")) {
 695     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 696   }
 697   if ($Options{workingdir}) {
 698     if (! -d $Options{workingdir}) {
 699       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 700     }
 701     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 702   }
 703   if (!IsPositiveInteger($Options{detail})) {
 704     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
 705   }
 706   if ($Options{residuesmode} !~ /^(InChains|All|Both)$/i) {
 707     die "Error: The value specified, $Options{residuesmode}, for option \"--ResiduesMode\" is not valid. Allowed values: InChains, All, or Both\n";
 708   }
 709 }
 710