1 #!/usr/bin/perl -w 2 # 3 # $RCSfile: InfoPDBFiles.pl,v $ 4 # $Date: 2011/12/16 00:03:30 $ 5 # $Revision: 1.30 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2004-2012 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use FindBin; use lib "$FindBin::Bin/../lib"; 31 use Getopt::Long; 32 use File::Basename; 33 use Text::ParseWords; 34 use Benchmark; 35 use FileUtil; 36 use TextUtil; 37 use PDBFileUtil; 38 39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); 40 41 # Autoflush STDOUT 42 $| = 1; 43 44 # Starting message... 45 $ScriptName = basename($0); 46 print "\n$ScriptName: Starting...\n\n"; 47 $StartTime = new Benchmark; 48 49 # Get the options and setup script... 50 SetupScriptUsage(); 51 if ($Options{help} || @ARGV < 1) { 52 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); 53 } 54 55 my(@PDBFilesList); 56 @PDBFilesList = ExpandFileNames(\@ARGV, "pdb"); 57 58 # Process options... 59 print "Processing options...\n"; 60 my(%OptionsInfo); 61 ProcessOptions(); 62 63 # Setup information about input files... 64 my(%PDBFilesInfo); 65 print "Checking input PDB file(s)...\n"; 66 RetrievePDBFilesInfo(); 67 68 # Process input files.. 69 my($FileIndex); 70 if (@PDBFilesList > 1) { 71 print "\nProcessing PDB files...\n"; 72 } 73 for $FileIndex (0 .. $#PDBFilesList) { 74 if ($PDBFilesInfo{FileOkay}[$FileIndex]) { 75 print "\nProcessing file $PDBFilesList[$FileIndex]...\n"; 76 ListPDBFileInfo($FileIndex); 77 } 78 } 79 ListTotalSizeOfFiles(); 80 81 print "\n$ScriptName:Done...\n\n"; 82 83 $EndTime = new Benchmark; 84 $TotalTime = timediff ($EndTime, $StartTime); 85 print "Total time: ", timestr($TotalTime), "\n"; 86 87 ############################################################################### 88 89 # List appropriate information... 90 sub ListPDBFileInfo { 91 my($Index) = @_; 92 my($PDBFile, $PDBRecordLinesRef); 93 94 $PDBFile = $PDBFilesList[$Index]; 95 $PDBRecordLinesRef = ReadPDBFile($PDBFile); 96 97 # Header informaton... 98 if ($OptionsInfo{ListHeaderInfo}) { 99 ListHeaderInfo($PDBRecordLinesRef); 100 } 101 102 # Total number of records... 103 my($TotalRecordsCount) = scalar @{$PDBRecordLinesRef}; 104 print "\nTotal number of records: $TotalRecordsCount\n"; 105 106 # List record type count information... 107 ListRecordTypesInfo($PDBRecordLinesRef); 108 109 if ($OptionsInfo{CountChains} || $OptionsInfo{CountResiduesInChains} || $OptionsInfo{ResiduesFrequencyInChains}) { 110 ListChainsAndResiduesInfo($PDBRecordLinesRef); 111 } 112 if ($OptionsInfo{CountResiduesAll} || $OptionsInfo{ResiduesFrequencyAll}) { 113 ListAllResiduesInfo($PDBRecordLinesRef); 114 } 115 if ($OptionsInfo{ResidueNumbersInfo}) { 116 ListResidueNumbersInfo($PDBRecordLinesRef); 117 } 118 if ($OptionsInfo{CalculateBoundingBox}) { 119 ListBoundingBox($PDBRecordLinesRef); 120 } 121 122 # File size and modification information... 123 print "\nFile size: ", FormatFileSize($PDBFilesInfo{FileSize}[$Index]), " \n"; 124 print "Last modified: ", $PDBFilesInfo{FileLastModified}[$Index], " \n"; 125 } 126 127 sub ListHeaderInfo { 128 my($PDBRecordLinesRef) = @_; 129 my($HeaderRecordLine, $Classification, $DepositionDate, $IDCode); 130 131 ($Classification, $DepositionDate, $IDCode) = (undef) x 3; 132 $HeaderRecordLine = $PDBRecordLinesRef->[0]; 133 if (IsHeaderRecordType($HeaderRecordLine)) { 134 ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($HeaderRecordLine); 135 } 136 137 $Classification = IsEmpty($Classification) ? 'Not available' : $Classification; 138 $DepositionDate = IsEmpty($DepositionDate) ? 'Not available' : $DepositionDate; 139 $IDCode = IsEmpty($IDCode) ? 'Not available' : $IDCode; 140 141 print "\nClassification: $Classification\nID: $IDCode\nDeposition date: $DepositionDate\n"; 142 } 143 144 # List record type info... 145 sub ListRecordTypesInfo { 146 my($PDBRecordLinesRef) = @_; 147 my($RecordType, $RecordCount, $RecordTypesCountRef, @RecordTypeCountInfo); 148 149 $RecordTypesCountRef = GetRecordTypesCount($PDBRecordLinesRef); 150 151 @RecordTypeCountInfo = (); 152 if ($OptionsInfo{CountRecordType} =~ /^All$/i) { 153 for $RecordType (@{$RecordTypesCountRef->{RecordTypes}}) { 154 $RecordCount = $RecordTypesCountRef->{Count}{$RecordType}; 155 push @RecordTypeCountInfo, "$RecordType - $RecordCount"; 156 } 157 } 158 else { 159 for $RecordType (@{$OptionsInfo{SpecifiedRecordTypes}}) { 160 $RecordCount = (exists $RecordTypesCountRef->{Count}{$RecordType}) ? ($RecordTypesCountRef->{Count}{$RecordType}) : 0; 161 push @RecordTypeCountInfo, "$RecordType - $RecordCount"; 162 } 163 } 164 print "Number of individual records: ", JoinWords(\@RecordTypeCountInfo, '; ', 0), "\n"; 165 166 if ($OptionsInfo{CheckMasterRecord}) { 167 CheckMasterRecord($RecordTypesCountRef, $PDBRecordLinesRef); 168 } 169 } 170 171 # List information about residues and chains... 172 sub ListChainsAndResiduesInfo { 173 my($PDBRecordLinesRef) = @_; 174 my($ResidueName, $ResidueCount, $ChainCount, $ChainID, $CollectChainResiduesBeyondTER, $ChainsAndResiduesInfoRef); 175 176 $CollectChainResiduesBeyondTER = 1; 177 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER); 178 $ChainCount = @{$ChainsAndResiduesInfoRef->{ChainIDs}}; 179 if ($OptionsInfo{CountChains}) { 180 print "\nNumber of chains: $ChainCount \n"; 181 my($ChainID, @ChainIDsList); 182 @ChainIDsList = (); 183 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 184 push @ChainIDsList, CleanupChainID($ChainID); 185 } 186 print "Chain IDs: ", JoinWords(\@ChainIDsList, ', ', 0),"\n"; 187 } 188 189 if ($OptionsInfo{CountResiduesInChains}) { 190 my($TotalResiduesCount, $ResidueCountInfo, @ResiduesCountInfo); 191 @ResiduesCountInfo = (); 192 $TotalResiduesCount = 0; 193 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 194 $ResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; 195 $TotalResiduesCount += $ResidueCount; 196 $ResidueCountInfo = "Chain ${ChainID} - ${ResidueCount}"; 197 push @ResiduesCountInfo, $ResidueCountInfo; 198 } 199 print "\nNumber of residues in chain(s): "; 200 if ($ChainCount > 1) { 201 print "Total - $TotalResiduesCount; ", JoinWords(\@ResiduesCountInfo, '; ', 0),"\n"; 202 } 203 else { 204 print "$TotalResiduesCount\n"; 205 } 206 207 # List of residues in each chain... 208 if ($OptionsInfo{DetailLevel} >= 3) { 209 print "List of residues in chain(s): \n"; 210 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 211 if ($ChainCount > 1) { 212 print "Chain ", CleanupChainID($ChainID), ": "; 213 } 214 print JoinWords(\@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}, ', ', 0),"\n"; 215 } 216 } 217 } 218 if ($OptionsInfo{ResiduesFrequencyInChains}) { 219 # Setup a hash using residue count as key for sorting the values... 220 my(%ResiduesCountToNameMap); 221 %ResiduesCountToNameMap = (); 222 @{$ResiduesCountToNameMap{ChainIDs}} = (); 223 %{$ResiduesCountToNameMap{ResidueNames}} = (); 224 225 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 226 push @{$ResiduesCountToNameMap{ChainIDs}}, $ChainID; 227 %{$ResiduesCountToNameMap{ResidueNames}{$ChainID}} = (); 228 229 for $ResidueName (sort keys %{$ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}}) { 230 $ResidueCount = $ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}{$ResidueName}; 231 # Setup count value for each chain... 232 if (exists $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount}) { 233 $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount} .= "~${ResidueName}"; 234 } 235 else { 236 $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount} = $ResidueName; 237 } 238 } 239 } 240 # Collect data for all the residues in all the chains... 241 my(%AllResiduesNameToCountMap, %AllResiduesCountToNameMap); 242 %AllResiduesNameToCountMap = (); 243 %AllResiduesCountToNameMap = (); 244 if ($ChainCount > 1) { 245 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 246 for $ResidueName (keys %{$ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}}) { 247 $ResidueCount = $ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}{$ResidueName}; 248 if (exists $AllResiduesNameToCountMap{$ResidueName}) { 249 $AllResiduesNameToCountMap{$ResidueName} += $ResidueCount; 250 } 251 else { 252 $AllResiduesNameToCountMap{$ResidueName} = $ResidueCount; 253 } 254 } 255 } 256 for $ResidueName (keys %AllResiduesNameToCountMap) { 257 $ResidueCount = $AllResiduesNameToCountMap{$ResidueName}; 258 if (exists $AllResiduesCountToNameMap{$ResidueCount}) { 259 $AllResiduesCountToNameMap{$ResidueCount} .= "~${ResidueName}"; 260 } 261 else { 262 $AllResiduesCountToNameMap{$ResidueCount} = $ResidueName; 263 } 264 } 265 } 266 267 # Setup distribution data for individual chains and the grand total as well... 268 my($ChainResidueCount, $PercentResidueCount, $TotalResidueCount, $ResidueNames, @ResidueNamesList, %ResiduesFrequencyInfoMap); 269 @{$ResiduesFrequencyInfoMap{ChainIDs}} = (); 270 %{$ResiduesFrequencyInfoMap{Frequency}} = (); 271 %{$ResiduesFrequencyInfoMap{PercentFrequency}} = (); 272 273 @{$ResiduesFrequencyInfoMap{AllChainsFrequency}} = (); 274 @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}} = (); 275 276 $TotalResidueCount = 0; 277 278 for $ChainID (@{$ResiduesCountToNameMap{ChainIDs}}) { 279 push @{$ResiduesFrequencyInfoMap{ChainIDs}}, $ChainID; 280 @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}} = (); 281 @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}} = (); 282 283 $ChainResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; 284 $TotalResidueCount += $ChainResidueCount; 285 286 for $ResidueCount (sort {$b <=> $a} keys %{$ResiduesCountToNameMap{ResidueNames}{$ChainID}}) { 287 $ResidueNames = $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount}; 288 @ResidueNamesList = split /~/, $ResidueNames; 289 for $ResidueName (@ResidueNamesList) { 290 push @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, "${ResidueName} - ${ResidueCount}"; 291 $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$ChainResidueCount)*100)) + 0; 292 push @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, "${ResidueName} - ${PercentResidueCount}%"; 293 } 294 } 295 } 296 if ($ChainCount > 1) { 297 for $ResidueCount (sort {$b <=> $a} keys %AllResiduesCountToNameMap) { 298 $ResidueNames = $AllResiduesCountToNameMap{$ResidueCount}; 299 @ResidueNamesList = split /~/, $ResidueNames; 300 for $ResidueName (@ResidueNamesList) { 301 push @{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, "${ResidueName} - ${ResidueCount}"; 302 $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0; 303 push @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, "${ResidueName} - ${PercentResidueCount}%"; 304 } 305 } 306 } 307 308 # List distribution of residues 309 print "\nDistribution of residues in chain(s): \n"; 310 for $ChainID (@{$ResiduesFrequencyInfoMap{ChainIDs}}) { 311 if ($ChainCount > 1) { 312 print "Chain ", CleanupChainID($ChainID), ": "; 313 } 314 print JoinWords(\@{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, '; ', 0), "\n"; 315 } 316 if ($OptionsInfo{DetailLevel} >= 2) { 317 print "\nPercent distribution of residues in chain(s): \n"; 318 for $ChainID (@{$ResiduesFrequencyInfoMap{ChainIDs}}) { 319 if ($ChainCount > 1) { 320 print "Chain ", CleanupChainID($ChainID), ": "; 321 } 322 print JoinWords(\@{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, '; ', 0), "\n"; 323 } 324 } 325 if ($ChainCount > 1) { 326 print "\nDistribution of residues across all chains: \n"; 327 print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, '; ', 0), "\n"; 328 329 if ($OptionsInfo{DetailLevel} >= 2) { 330 print "\nPercent distribution of residues across all chains: \n"; 331 print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, '; ', 0), "\n"; 332 } 333 } 334 } 335 } 336 337 # List information about all the residues... 338 sub ListAllResiduesInfo { 339 my($PDBRecordLinesRef) = @_; 340 my($TotalResidueCount, $AtomResiduesCount, $HetatmResiduesCount, $ResiduesInfoRef); 341 342 $ResiduesInfoRef = GetAllResidues($PDBRecordLinesRef); 343 $TotalResidueCount = @{$ResiduesInfoRef->{ResidueNames}}; 344 $AtomResiduesCount = @{$ResiduesInfoRef->{AtomResidueNames}}; 345 $HetatmResiduesCount = @{$ResiduesInfoRef->{HetatmResidueNames}}; 346 347 if ($OptionsInfo{CountResiduesAll}) { 348 print "\nTotal number of residues: Total - $TotalResidueCount; ATOM residues - $AtomResiduesCount; HETATM residues - $HetatmResiduesCount\n"; 349 350 if ($OptionsInfo{DetailLevel} >= 3) { 351 print "List of residues: \n"; 352 if ($AtomResiduesCount) { 353 print "ATOM residues: ", JoinWords(\@{$ResiduesInfoRef->{AtomResidueNames}}, ', ', 0), "\n"; 354 } 355 if ($HetatmResiduesCount) { 356 print "HETATM residues: ", JoinWords(\@{$ResiduesInfoRef->{HetatmResidueNames}}, ', ', 0), "\n"; 357 } 358 } 359 } 360 361 if ($OptionsInfo{ResiduesFrequencyAll}) { 362 my($ResidueName, $ResidueCount); 363 364 # Setup a hash using residue count as key for sorting the values... 365 my(%ResiduesCountToNameMap, %AtomResiduesCountToNameMap, %HetatmResiduesCountToNameMap); 366 %ResiduesCountToNameMap = (); 367 %{$ResiduesCountToNameMap{ResidueNames}} = (); 368 369 %AtomResiduesCountToNameMap = (); 370 %{$AtomResiduesCountToNameMap{ResidueNames}} = (); 371 372 %HetatmResiduesCountToNameMap = (); 373 %{$HetatmResiduesCountToNameMap{ResidueNames}} = (); 374 375 for $ResidueName (keys %{$ResiduesInfoRef->{ResidueCount}}) { 376 $ResidueCount = $ResiduesInfoRef->{ResidueCount}{$ResidueName}; 377 if (exists $ResiduesCountToNameMap{ResidueNames}{$ResidueCount}) { 378 $ResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}"; 379 } 380 else { 381 $ResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName; 382 } 383 } 384 385 if ($OptionsInfo{DetailLevel} >= 1) { 386 for $ResidueName (keys %{$ResiduesInfoRef->{AtomResidueCount}}) { 387 $ResidueCount = $ResiduesInfoRef->{AtomResidueCount}{$ResidueName}; 388 if (exists $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount}) { 389 $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}"; 390 } 391 else { 392 $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName; 393 } 394 } 395 for $ResidueName (keys %{$ResiduesInfoRef->{HetatmResidueCount}}) { 396 $ResidueCount = $ResiduesInfoRef->{HetatmResidueCount}{$ResidueName}; 397 if (exists $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount}) { 398 $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}"; 399 } 400 else { 401 $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName; 402 } 403 } 404 } 405 406 # Setup distribution of residues info... 407 my($ResidueNames, $PercentResidueCount, @ResidueNamesList, %ResiduesCountInfoMap, %AtomResiduesCountInfoMap, %HetatmResiduesCountInfoMap); 408 409 @{$ResiduesCountInfoMap{Frequency}} = (); 410 @{$ResiduesCountInfoMap{PercentFrequency}} = (); 411 for $ResidueCount (sort {$b <=> $a} keys %{$ResiduesCountToNameMap{ResidueNames}}) { 412 $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0; 413 $ResidueNames = $ResiduesCountToNameMap{ResidueNames}{$ResidueCount}; 414 @ResidueNamesList = split /~/, $ResidueNames; 415 for $ResidueName (@ResidueNamesList) { 416 push @{$ResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}"; 417 push @{$ResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}"; 418 } 419 } 420 if ($OptionsInfo{DetailLevel} >= 1) { 421 @{$AtomResiduesCountInfoMap{Frequency}} = (); 422 @{$AtomResiduesCountInfoMap{PercentFrequency}} = (); 423 for $ResidueCount (sort {$b <=> $a} keys %{$AtomResiduesCountToNameMap{ResidueNames}}) { 424 $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0; 425 $ResidueNames = $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount}; 426 @ResidueNamesList = split /~/, $ResidueNames; 427 for $ResidueName (@ResidueNamesList) { 428 push @{$AtomResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}"; 429 push @{$AtomResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}"; 430 } 431 } 432 @{$HetatmResiduesCountInfoMap{Frequency}} = (); 433 @{$HetatmResiduesCountInfoMap{PercentFrequency}} = (); 434 for $ResidueCount (sort {$b <=> $a} keys %{$HetatmResiduesCountToNameMap{ResidueNames}}) { 435 $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0; 436 $ResidueNames = $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount}; 437 @ResidueNamesList = split /~/, $ResidueNames; 438 for $ResidueName (@ResidueNamesList) { 439 push @{$HetatmResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}"; 440 push @{$HetatmResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}"; 441 } 442 } 443 } 444 445 # List distribution of residues 446 print "\nDistribution of residues: ", JoinWords(\@{$ResiduesCountInfoMap{Frequency}},'; ', 0), "\n"; 447 if ($OptionsInfo{DetailLevel} >= 2) { 448 print "\nPercent distribution of residues: ", JoinWords(\@{$ResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n"; 449 } 450 451 if ($OptionsInfo{DetailLevel} >= 1) { 452 print "\nDistribution of ATOM residues: ", JoinWords(\@{$AtomResiduesCountInfoMap{Frequency}},'; ', 0), "\n"; 453 if ($OptionsInfo{DetailLevel} >= 2) { 454 print "\nPercent distribution of ATOM residues: ", JoinWords(\@{$AtomResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n"; 455 } 456 457 print "\nDistribution of HETATM residues: ", JoinWords(\@{$HetatmResiduesCountInfoMap{Frequency}},'; ', 0), "\n"; 458 if ($OptionsInfo{DetailLevel} >= 2) { 459 print "\nPercent distribution of HETATM residues: ", JoinWords(\@{$HetatmResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n"; 460 } 461 } 462 } 463 } 464 465 # List information about residue numbers for each chain... 466 sub ListResidueNumbersInfo { 467 my($PDBRecordLinesRef) = @_; 468 my($Index, $ResidueCount, $StartResidueNum, $EndResidueNum, $ChainID, $CollectChainResiduesBeyondTER, $ChainsAndResiduesInfoRef, $ResidueNum, $PreviousResidueNum, $ResidueName, $PreviousResidueName, $GapResiduePairsCount, $GapLength, $DescendingOrderResiduePairsCount, @DescendingOrderResiduePairs, @GapResiduePairs); 469 470 $CollectChainResiduesBeyondTER = 0; 471 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER); 472 473 print "\nATOM/HETATM residue numbers information for chains:\n"; 474 475 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 476 print "\nChain ID - ", CleanupChainID($ChainID), ""; 477 478 $ResidueCount = @{$ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}}; 479 480 # Start and end residue numbers... 481 $StartResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[0]; 482 $EndResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$ResidueCount - 1]; 483 print "; Number of residues: $ResidueCount; Start residue number - $StartResidueNum; End residue number - $EndResidueNum\n"; 484 485 # Identify any gaps in residue numbers or non-ascending order residue numbers... 486 $GapResiduePairsCount = 0; 487 $DescendingOrderResiduePairsCount = 0; 488 489 @DescendingOrderResiduePairs = (); 490 @GapResiduePairs = (); 491 492 RESIDUE: for $Index (1 .. ($ResidueCount - 1)) { 493 $ResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$Index]; 494 $PreviousResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$Index - 1]; 495 496 $ResidueName = $ChainsAndResiduesInfoRef->{Residues}{$ChainID}[$Index]; 497 $PreviousResidueName = $ChainsAndResiduesInfoRef->{Residues}{$ChainID}[$Index - 1]; 498 499 if ($ResidueNum == ($PreviousResidueNum + 1)) { 500 # All is good... 501 next RESIDUE; 502 } 503 504 # Are residue in descending order? 505 if ($ResidueNum < $PreviousResidueNum) { 506 $DescendingOrderResiduePairsCount++; 507 push @DescendingOrderResiduePairs, "<${PreviousResidueName}${PreviousResidueNum} - ${ResidueName}${ResidueNum}>"; 508 } 509 510 # Track gaps in residue pairs... 511 $GapResiduePairsCount++; 512 $GapLength = abs($ResidueNum - $PreviousResidueNum) - 1; 513 514 push @GapResiduePairs, "<${PreviousResidueName}${PreviousResidueNum} - ${ResidueName}${ResidueNum}; $GapLength>"; 515 } 516 517 # Print gaps information... 518 print "Gaps in residue numbers: ", $GapResiduePairsCount ? "Yes" : "None"; 519 if ($GapResiduePairsCount) { 520 print "; Number of gap residue number pairs: $GapResiduePairsCount; Gap residue pairs: <StartRes-EndRes; GapLength> - ", JoinWords(\@GapResiduePairs, "; ", 0); 521 } 522 print "\n"; 523 524 # Print descending residue order information... 525 print "Residue numbers in descending order: ", $DescendingOrderResiduePairsCount ? "Yes" : "None"; 526 if ($DescendingOrderResiduePairsCount) { 527 print "; Number of descending residue number pairs: $DescendingOrderResiduePairsCount; Descending residue number pairs: <StartRes-EndRes> ", JoinWords(\@DescendingOrderResiduePairs, "; ", 0); 528 } 529 print "\n"; 530 } 531 } 532 533 # List min/max XYZ coordinates for ATOM/HETATM records... 534 sub ListBoundingBox { 535 my($PDBRecordLinesRef) = @_; 536 my($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax, $XSize, $YSize, $ZSize); 537 538 ($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax) = GetMinMaxCoords($PDBRecordLinesRef); 539 $XSize = abs($XMax - $XMin); 540 $YSize = abs($YMax - $YMin); 541 $ZSize = abs($ZMax - $ZMin); 542 543 $XMin = sprintf("%.3f", $XMin) + 0; $XMax = sprintf("%.3f", $XMax) + 0; 544 $YMin = sprintf("%.3f", $YMin) + 0; $YMax = sprintf("%.3f", $YMax) + 0; 545 $ZMin = sprintf("%.3f", $ZMin) + 0; $ZMax = sprintf("%.3f", $ZMax) + 0; 546 547 $XSize = sprintf("%.3f", $XSize) + 0; 548 $YSize = sprintf("%.3f", $YSize) + 0; 549 $ZSize = sprintf("%.3f", $ZSize) + 0; 550 551 print "\nBounding box coordinates: <XMin, XMax> - <$XMin, $XMax>; <YMin, YMax> - <$YMin, $YMax>; <ZMin, ZMax> - <$ZMin, $ZMax>;\n"; 552 print "Bounding box size in angstroms: XSize - $XSize; YSize - $YSize; ZSize - $ZSize\n"; 553 554 } 555 556 # Check master record counts against actual record counts... 557 sub CheckMasterRecord { 558 my($RecordTypesCountRef, $PDBRecordLinesRef) = @_; 559 560 # Get master record information... 561 my($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = (undef) x 11; 562 my($RecordLine, $MasterRecordFound); 563 $MasterRecordFound = 0; 564 565 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 566 if (IsMasterRecordType($RecordLine)) { 567 ($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = ParseMasterRecordLine($RecordLine); 568 $MasterRecordFound = 1; 569 last LINE; 570 } 571 } 572 if (!$MasterRecordFound) { 573 print "\nWarning: MASTER record is missing.\n"; 574 return; 575 } 576 my(@MasterRecordValidationInfo); 577 @MasterRecordValidationInfo = (); 578 $NumOfRemarkRecords += 0; 579 if (exists($RecordTypesCountRef->{Count}{REMARK}) && $NumOfRemarkRecords != $RecordTypesCountRef->{Count}{REMARK}) { 580 push @MasterRecordValidationInfo, "Number of REMARK records, $NumOfRemarkRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{REMARK}."; 581 } 582 $NumOfHetRecords += 0; 583 if (exists($RecordTypesCountRef->{Count}{HET}) && $NumOfHetRecords != $RecordTypesCountRef->{Count}{HET}) { 584 push @MasterRecordValidationInfo, "Number of HET records, $NumOfHetRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{HET}."; 585 } 586 $NumOfHelixRecords += 0; 587 if (exists($RecordTypesCountRef->{Count}{HELIX}) && $NumOfHelixRecords != $RecordTypesCountRef->{Count}{HELIX}) { 588 push @MasterRecordValidationInfo, "Number of HELIX records, $NumOfHelixRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{HELIX}."; 589 } 590 $NumOfSheetRecords += 0; 591 if (exists($RecordTypesCountRef->{Count}{SHEET}) && $NumOfSheetRecords != $RecordTypesCountRef->{Count}{SHEET}) { 592 push @MasterRecordValidationInfo, "Number of SHEET records, $NumOfSheetRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SHEET}."; 593 } 594 $NumOfTurnRecords += 0; 595 if (exists($RecordTypesCountRef->{Count}{TURN}) && $NumOfTurnRecords != $RecordTypesCountRef->{Count}{TURN}) { 596 push @MasterRecordValidationInfo, "Number of TURN records, $NumOfTurnRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{REMARK}."; 597 } 598 $NumOfSiteRecords += 0; 599 if (exists($RecordTypesCountRef->{Count}{SITE}) && $NumOfSiteRecords != $RecordTypesCountRef->{Count}{SITE}) { 600 push @MasterRecordValidationInfo, "Number of SITE records, $NumOfSiteRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SITE}."; 601 } 602 603 $NumOfTransformationsRecords += 0; 604 my($RecordsCount, $ID, $RecordID, $RecordLabel); 605 $RecordsCount = 0; 606 for $RecordLabel ('ORIGX', 'SCALE', 'MTRIX') { 607 for $ID (1 .. 3) { 608 $RecordID = "${RecordLabel}${ID}"; 609 if (exists $RecordTypesCountRef->{Count}{$RecordID}) { 610 $RecordsCount += $RecordTypesCountRef->{Count}{$RecordID}; 611 } 612 } 613 } 614 if ($NumOfTransformationsRecords != $RecordsCount) { 615 push @MasterRecordValidationInfo, "Number of transformation records (ORIGXn+SCALEn+MTRIXn), $NumOfTransformationsRecords, specified in MASTER record doen't match its explict count, $RecordsCount."; 616 } 617 618 $RecordsCount = 0; 619 for $RecordLabel ('ATOM', 'HETATM') { 620 if (exists $RecordTypesCountRef->{Count}{$RecordLabel}) { 621 $RecordsCount += $RecordTypesCountRef->{Count}{$RecordLabel}; 622 } 623 } 624 $NumOfAtomAndHetatmRecords += 0; 625 if ($NumOfAtomAndHetatmRecords != $RecordsCount) { 626 push @MasterRecordValidationInfo, "Number of ATOM + HETATM records, $NumOfAtomAndHetatmRecords, specified in MASTER record doen't match its explict count, $RecordsCount."; 627 } 628 $NumOfTerRecords += 0; 629 if (exists($RecordTypesCountRef->{Count}{TER}) && $NumOfTerRecords != $RecordTypesCountRef->{Count}{TER}) { 630 push @MasterRecordValidationInfo, "Number of TER records, $NumOfTerRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{TER}."; 631 } 632 $NumOfConectRecords += 0; 633 if (exists($RecordTypesCountRef->{Count}{CONECT}) && $NumOfConectRecords != $RecordTypesCountRef->{Count}{CONECT}) { 634 push @MasterRecordValidationInfo, "Number of CONECT records, $NumOfConectRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{CONECT}."; 635 } 636 $NumOfSeqresRecords += 0; 637 if (exists($RecordTypesCountRef->{Count}{SEQRES}) && $NumOfSeqresRecords != $RecordTypesCountRef->{Count}{SEQRES}) { 638 push @MasterRecordValidationInfo, "Number of SITE records, $NumOfSeqresRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SEQRES}."; 639 } 640 641 if (@MasterRecordValidationInfo) { 642 print "\nMASTER record validation: Count mismatches found:\n"; 643 print JoinWords(\@MasterRecordValidationInfo, "\n", 0), "\n"; 644 } 645 else { 646 print "\nMASTER record validation: Count values match with the explicit count of the corresponding records.\n"; 647 } 648 } 649 650 # Total size of all the files... 651 sub ListTotalSizeOfFiles { 652 my($FileOkayCount, $TotalSize, $Index); 653 654 $FileOkayCount = 0; 655 $TotalSize = 0; 656 657 for $Index (0 .. $#PDBFilesList) { 658 if ($PDBFilesInfo{FileOkay}[$Index]) { 659 $FileOkayCount++; 660 $TotalSize += $PDBFilesInfo{FileSize}[$Index]; 661 } 662 } 663 if ($FileOkayCount > 1) { 664 print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n"; 665 } 666 667 } 668 669 # Empty chain IDs are replaced with "None[1-9]". But for displaying purposes, take out any 670 # numbers from label... 671 sub CleanupChainID { 672 my($ChainID) = @_; 673 674 if ($ChainID =~ /^None/i) { 675 return 'None'; 676 } 677 return $ChainID; 678 } 679 680 # Process option values... 681 sub ProcessOptions { 682 %OptionsInfo = (); 683 684 # Setup record types to count... 685 if ($Options{count}) { 686 $OptionsInfo{CountRecordType} = $Options{count}; 687 } 688 else { 689 $OptionsInfo{CountRecordType} = $Options{all} ? 'All' : 'ATOM,HETATM'; 690 } 691 @{$OptionsInfo{SpecifiedRecordTypes}} =(); 692 if ($OptionsInfo{CountRecordType} !~ /^All$/i) { 693 my(@RecordTypes); 694 @RecordTypes = split /\,/, $OptionsInfo{CountRecordType}; 695 push @{$OptionsInfo{SpecifiedRecordTypes}}, @RecordTypes; 696 } 697 $OptionsInfo{CountChains} = ($Options{chains} || $Options{all}) ? 1 : 0; 698 $OptionsInfo{CheckMasterRecord} = ($Options{mastercheck} || $Options{all}) ? 1 : 0; 699 700 # Residue count is the default. So $Options{residues} is simply ignored. 701 my($CountResidues) = 1; 702 $OptionsInfo{CountResiduesInChains} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0; 703 $OptionsInfo{CountResiduesAll} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0; 704 705 $OptionsInfo{ResiduesFrequencyInChains} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0; 706 $OptionsInfo{ResiduesFrequencyAll} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0; 707 708 $OptionsInfo{ResidueNumbersInfo} = ($Options{residuenumbers} || $Options{all}) ? 1 : 0; 709 710 $OptionsInfo{CalculateBoundingBox} = ($Options{boundingbox} || $Options{all}) ? 1 : 0; 711 712 $OptionsInfo{ListHeaderInfo} = ($Options{header} || $Options{all}) ? 1 : 0; 713 $OptionsInfo{DetailLevel} = $Options{detail}; 714 715 } 716 717 # Retrieve information about PDB files... 718 sub RetrievePDBFilesInfo { 719 my($Index, $PDBFile, $ModifiedTimeString, $ModifiedDateString); 720 721 %PDBFilesInfo = (); 722 @{$PDBFilesInfo{FileOkay}} = (); 723 @{$PDBFilesInfo{FileSize}} = (); 724 @{$PDBFilesInfo{FileLastModified}} = (); 725 726 FILELIST: for $Index (0 .. $#PDBFilesList) { 727 $PDBFilesInfo{FileOkay}[$Index] = 0; 728 $PDBFilesInfo{FileSize}[$Index] = 0; 729 $PDBFilesInfo{FileLastModified}[$Index] = ''; 730 731 $PDBFile = $PDBFilesList[$Index]; 732 if (!(-e $PDBFile)) { 733 warn "Warning: Ignoring file $PDBFile: It doesn't exist\n"; 734 next FILELIST; 735 } 736 if (!CheckFileType($PDBFile, "pdb")) { 737 warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n"; 738 next FILELIST; 739 } 740 if (! open PDBFILE, "$PDBFile") { 741 warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n"; 742 next FILELIST; 743 } 744 close PDBFILE; 745 746 $PDBFilesInfo{FileOkay}[$Index] = 1; 747 $PDBFilesInfo{FileSize}[$Index] = FileSize($PDBFile); 748 ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($PDBFile); 749 $PDBFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString"; 750 } 751 } 752 753 754 # Setup script usage and retrieve command line arguments specified using various options... 755 sub SetupScriptUsage { 756 757 # Retrieve all the options... 758 %Options = (); 759 $Options{count} = ''; 760 $Options{detail} = 1; 761 $Options{residuesmode} = 'Both'; 762 763 if (!GetOptions(\%Options, "all|a", "boundingbox|b", "count|c=s", "chains", "detail|d=i", "frequency|f", "mastercheck|m", "header", "help|h", "residues", "residuesmode=s", "residuenumbers", "workingdir|w=s")) { 764 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 765 } 766 if ($Options{workingdir}) { 767 if (! -d $Options{workingdir}) { 768 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 769 } 770 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 771 } 772 if (!IsPositiveInteger($Options{detail})) { 773 die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n"; 774 } 775 if ($Options{residuesmode} !~ /^(InChains|All|Both)$/i) { 776 die "Error: The value specified, $Options{residuesmode}, for option \"--ResiduesMode\" is not valid. Allowed values: InChains, All, or Both\n"; 777 } 778 } 779