#!/usr/bin/perl -w
#
# $RCSfile: InfoSDFiles.pl,v $
# $Date: 2010/01/03 00:59:52 $
# $Revision: 1.25 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2010 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Benchmark;
use SDFileUtil;
use TextUtil;
use FileUtil;

# File-scoped state. The subroutines defined below read and update several of
# these directly (options, per-file field label bookkeeping and counters), so
# they must stay declared here at file scope.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
my($SDFile, $CmpdCount, $TotalCmpdCount, $OkayFilesCount, $OkayFilesSize,
   $EmptyCtabBlocksCount, $MismatchCtabBlockCount, $ChiralCtabBlockCount,
   $UnknownAtomsCtabBlockCount, $InvalidAtomNumbersCtabBlockCount,
   $SaltsCtabBlockCount, $CtabLinesCount, $ModifiedTimeString,
   $ModifiedDateString, $FileSize, $Index, $ProcessCmpdInfo, $ProcessCmpdData,
   $CmpdString, $PrintCmpdCounterHeader, $ProblematicCmpdData, $CheckData,
   $CountEmptyData, @CmpdLines, @SDFilesList, @FieldLabels, %FieldLabelsMap,
   %NonEmptyFieldValuesCountMap, %EmptyFieldValuesCountMap,
   %NonNumericalFieldValuesCountMap, %NumericalFieldValuesCountMap);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# NOTE(review): ExpandFileNames comes from a project module; presumably it
# expands wild cards for the listed extensions - confirm against FileUtil.
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process all the SD files...
if (@SDFilesList > 1) {
  print "Processing SD files...\n";
}
$TotalCmpdCount = 0;
$OkayFilesCount = 0; $OkayFilesSize = 0;

FILELIST: for $Index (0 .. $#SDFilesList) {
  $SDFile = $SDFilesList[$Index];
  if (@SDFilesList > 1) {
    print "\nProcessing file $SDFile...\n";
  }
  else {
    print "$ScriptName:Processing file $SDFile...\n";
  }
  if (!(-e $SDFile)) {
    warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
    next FILELIST;
  }
  if (!CheckFileType($SDFile, "sd sdf")) {
    warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
    next FILELIST;
  }
  # Three-argument open with a lexical handle: the file name comes from the
  # command line, and the two-argument form would interpret leading/trailing
  # whitespace and mode characters (">", "|") in the name.
  my $SDFileHandle;
  if (!open($SDFileHandle, '<', $SDFile)) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    next FILELIST;
  }

  # Reset all per-file counters and data field bookkeeping...
  $CmpdCount = 0;
  $EmptyCtabBlocksCount = 0;
  $MismatchCtabBlockCount = 0;
  $ChiralCtabBlockCount = 0;
  $UnknownAtomsCtabBlockCount = 0;
  $InvalidAtomNumbersCtabBlockCount = 0;
  $SaltsCtabBlockCount = 0;
  @FieldLabels = ();
  %FieldLabelsMap = ();
  %NonEmptyFieldValuesCountMap = ();
  %EmptyFieldValuesCountMap = ();
  %NonNumericalFieldValuesCountMap = ();
  %NumericalFieldValuesCountMap = ();

  if ($ProcessCmpdInfo) {
    $PrintCmpdCounterHeader = 1;
    while ($CmpdString = ReadCmpdString($SDFileHandle)) {
      $CmpdCount++;
      $ProblematicCmpdData = 0;

      # Progress indicator every 5000 compounds; the header is printed once.
      if ($Options{detail} <= 1 && ($CmpdCount % 5000) == 0) {
        if ($PrintCmpdCounterHeader) {
          $PrintCmpdCounterHeader = 0;
          print "Processing compounds:";
        }
        print "$CmpdCount...";
      }

      @CmpdLines = split "\n", $CmpdString;
      $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);

      if (($Options{all} || $Options{empty}) && $CtabLinesCount <= 0) {
        $EmptyCtabBlocksCount++;
        $ProblematicCmpdData = 1;
      }

      if ($CtabLinesCount > 0) {
        # The fourth line of the compound record is handed to the counts
        # line parser.
        my($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine($CmpdLines[3]);
        my($CtabCountsMatch) = ($CtabLinesCount == ($AtomCount + $BondCount)) ? 1 : 0;

        if (($Options{all} || $Options{mismatch}) && !$CtabCountsMatch) {
          $MismatchCtabBlockCount++;
          $ProblematicCmpdData = 1;
          if ($Options{detail} >= 2) {
            print "\nMismatch found: Ctab lines count: $CtabLinesCount; Atoms count: $AtomCount; Bond count: $BondCount\n";
          }
        }
        if (($Options{all} || $Options{chiral}) && $ChiralFlag == 1) {
          $ChiralCtabBlockCount++;
        }

        # The remaining checks are only run on compounds whose atom/bond
        # block size agrees with the counts line.
        if ($CtabCountsMatch) {
          if ($Options{all} || $Options{unknownatoms}) {
            my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines) = GetUnknownAtoms(\@CmpdLines);
            if ($UnknownAtomCount) {
              $UnknownAtomsCtabBlockCount++;
              $ProblematicCmpdData = 1;
              if ($Options{detail} >= 2) {
                print "\nUnknown atom(s) found: $UnknownAtomCount\nUnknown atom(s) symbols:$UnknownAtoms\nUnknown atom(s) data lines:\n$UnknownAtomLines\n";
              }
            }
          }
          if ($Options{all} || $Options{invalidatomnumbers}) {
            my($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines) = GetInvalidAtomNumbers(\@CmpdLines);
            if ($InvalidAtomNumbersCount) {
              $InvalidAtomNumbersCtabBlockCount++;
              $ProblematicCmpdData = 1;
              if ($Options{detail} >= 2) {
                print "\nInvalid atom number(s) found: $InvalidAtomNumbersCount\nInvalid atom number(s):$InvalidAtomNumbers\nInvalid atom number(s) data lines:\n$InvalidAtomNumberLines\n";
              }
            }
          }
          if ($Options{all} || $Options{salts}) {
            my($FragmentsCount, $Fragments) = GetCmpdFragments(\@CmpdLines);
            if ($FragmentsCount > 1) {
              $SaltsCtabBlockCount++;
              $ProblematicCmpdData = 1;
              if ($Options{detail} >= 2) {
                print "\nSalts found: $FragmentsCount\nSalts atom numbers:\n$Fragments\n";
              }
            }
          }
        }
      }
      if ($ProcessCmpdData) {
        ProcessCmpdData();
      }
      if ($Options{detail} >= 3 && $ProblematicCmpdData) {
        print "\nCompound data:\n$CmpdString\n\n";
      }
    }
    # Terminate the progress line, if one was started...
    if ($Options{detail} <= 1 && !$PrintCmpdCounterHeader) {
      print "\n";
    }
  }
  else {
    # Just count the compounds...
    while (my $Line = <$SDFileHandle>) {
      if ($Line =~ /\$\$\$\$/) {
        $CmpdCount++;
      }
    }
  }
  close $SDFileHandle;

  # Per-file summary...
  $TotalCmpdCount += $CmpdCount;
  print "\nNumber of compounds: $CmpdCount\n";
  if ($Options{all} || $Options{empty}) {
    print "Number of empty atom/bond blocks: $EmptyCtabBlocksCount\n";
  }
  if ($Options{all} || $Options{mismatch}) {
    print "Number of mismatched atom/bond blocks: $MismatchCtabBlockCount\n";
  }
  if ($Options{all} || $Options{unknownatoms}) {
    print "Number of atom blocks with unknown atom labels: $UnknownAtomsCtabBlockCount\n";
  }
  if ($Options{all} || $Options{invalidatomnumbers}) {
    print "Number of bond blocks and atom property blocks with invalid atom numbers: $InvalidAtomNumbersCtabBlockCount\n";
  }
  if ($Options{all} || $Options{salts}) {
    print "Number of atom blocks containing salts: $SaltsCtabBlockCount\n";
  }
  if ($Options{all} || $Options{chiral}) {
    print "Number of chiral atom/bond blocks: $ChiralCtabBlockCount\n";
  }
  if ($ProcessCmpdData) {
    PrintCmpdDataSummary();
  }

  $OkayFilesCount++;
  $FileSize = FileSize($SDFile);
  $OkayFilesSize += $FileSize;
  ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SDFile);
  print "\nFile size: ", FormatFileSize($FileSize), " \n";
  print "Last modified: ${ModifiedTimeString}; $ModifiedDateString \n";

}
if ($OkayFilesCount > 1) {
  print "\nTotal number of compounds in $OkayFilesCount SD files: $TotalCmpdCount\n";
  print "\nTotal size of $OkayFilesCount SD files: ", FormatFileSize($OkayFilesSize), "\n";
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Process compound data header labels for the current compound (@CmpdLines)
# and figure out which ones are present for all the compounds...
#
# Updates the file-scoped @FieldLabels and %FieldLabelsMap (label =>
# "PresentInAll" or "PresentInSome") and, when $CountEmptyData or $CheckData
# is set, the empty/non-empty and numerical/non-numerical value count maps.
# Takes no arguments and its return value is not used by the caller.
sub ProcessCmpdData {
  my(@CmpdFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);

  # Only the very first compound of a file may seed the "PresentInAll" set.
  # Testing @FieldLabels alone is not enough: when the first compound has no
  # data fields, the list is still empty for the second compound, whose labels
  # would then be wrongly reported as present in all compounds.
  my($IsFirstCmpd) = (!@FieldLabels && $CmpdCount <= 1) ? 1 : 0;

  if ($IsFirstCmpd) {
    # Get the initial label set and set up a map...
    @FieldLabels = @CmpdFieldLabels;
    for my $Label (@FieldLabels) {
      $FieldLabelsMap{$Label} = "PresentInAll";
    }
  }
  else {
    # Setup a map for the current labels...
    my(%CmpdFieldLabelsMap) = map { $_ => 1 } @CmpdFieldLabels;

    # Known labels missing from this compound are no longer present in all...
    for my $Label (@FieldLabels) {
      if (!$CmpdFieldLabelsMap{$Label}) {
        $FieldLabelsMap{$Label} = "PresentInSome";
      }
    }
    # Labels seen for the first time can only be present in some compounds...
    for my $Label (@CmpdFieldLabels) {
      if (!$FieldLabelsMap{$Label}) {
        push @FieldLabels, $Label;
        $FieldLabelsMap{$Label} = "PresentInSome";
      }
    }
  }

  if ($CountEmptyData || $CheckData) {
    # Count empty and numerical data field values...
    my(%DataFieldAndValues) = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    for my $Label (keys %DataFieldAndValues) {
      my($Value) = $DataFieldAndValues{$Label};

      if ($CountEmptyData) {
        if (IsNotEmpty($Value)) {
          $NonEmptyFieldValuesCountMap{$Label}++;
        }
        else {
          if ($Options{detail} >= 2) {
            print "Compound record $CmpdCount: Empty data field <$Label>\n";
          }
          $EmptyFieldValuesCountMap{$Label}++;
        }
      }
      if ($CheckData) {
        if (IsNumerical($Value)) {
          $NumericalFieldValuesCountMap{$Label}++;
        }
        else {
          $NonNumericalFieldValuesCountMap{$Label}++;
        }
      }
    }
  }
}

# Print the data field summary for the file just processed: number of fields,
# all labels, the split into labels present in all/some compounds and, when
# requested through $CountEmptyData and $CheckData, the value counts collected
# by ProcessCmpdData. Takes no arguments.
sub PrintCmpdDataSummary {
  if (!@FieldLabels) {
    print "\nNumber of data fields: 0\n";
    return;
  }

  my(@SortedLabels) = sort keys %FieldLabelsMap;
  my(@FieldLabelsPresentInAll) = grep { $FieldLabelsMap{$_} eq "PresentInAll" } @SortedLabels;
  my(@FieldLabelsPresentInSome) = grep { $FieldLabelsMap{$_} eq "PresentInSome" } @SortedLabels;
  my($AllLabelsPresentInAll) = (@FieldLabelsPresentInAll == @FieldLabels) ? 1 : 0;

  print "\nNumber of data fields: ", scalar(@FieldLabels), "\n";
  print "All data field labels: ";
  for my $Label (@SortedLabels) {
    print "<$Label> ";
  }
  print "\n";

  if (!$AllLabelsPresentInAll) {
    print "Data field labels present in all compounds: ";
    for my $Label (@FieldLabelsPresentInAll) {
      print "<$Label> ";
    }
    print "\n";
    print "Data field labels present in some compounds: ";
    for my $Label (@FieldLabelsPresentInSome) {
      print "<$Label> ";
    }
    print "\n";
  }

  # List empty data field values count...
  if ($CountEmptyData) {
    print "\n";
    if ($AllLabelsPresentInAll) {
      PrintDataInformation("Number of non-empty values for data field(s)", \@FieldLabels, \%NonEmptyFieldValuesCountMap);
      PrintDataInformation("Number of empty values for data field(s)", \@FieldLabels, \%EmptyFieldValuesCountMap);
    }
    else {
      PrintDataInformation("Number of non-empty values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, \%NonEmptyFieldValuesCountMap);
      PrintDataInformation("Number of empty values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, \%EmptyFieldValuesCountMap);
      PrintDataInformation("Number of non-empty values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, \%NonEmptyFieldValuesCountMap);
      PrintDataInformation("Number of empty values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, \%EmptyFieldValuesCountMap);
    }
    print "\n";
  }

  # List numerical data values count...
  if ($CheckData) {
    print "\n";
    if ($AllLabelsPresentInAll) {
      PrintDataInformation("Number of non-numerical values for data field(s)", \@FieldLabels, \%NonNumericalFieldValuesCountMap);
      PrintDataInformation("Number of numerical values for data field(s)", \@FieldLabels, \%NumericalFieldValuesCountMap);
    }
    else {
      PrintDataInformation("Number of non-numerical values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, \%NonNumericalFieldValuesCountMap);
      PrintDataInformation("Number of numerical values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, \%NumericalFieldValuesCountMap);
      PrintDataInformation("Number of non-numerical values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, \%NonNumericalFieldValuesCountMap);
      PrintDataInformation("Number of numerical values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, \%NumericalFieldValuesCountMap);
    }
    print "\n";
  }
}

# List data information...
#
# Arguments:
#   $InfoLabel              - text printed in front of the list
#   $DataLabelRef           - reference to an array of data field labels
#   $DataLabelToValueMapRef - reference to a hash mapping label to count
#
# Prints one line of the form "InfoLabel:  <label> - count, <label> - count";
# labels without an entry in the hash are reported with a count of 0.
sub PrintDataInformation {
  my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;

  my(@Entries) = map {
    " <$_> - " . (exists($DataLabelToValueMapRef->{$_}) ? $DataLabelToValueMapRef->{$_} : 0)
  } @{$DataLabelRef};
  my($Line) = join ",", @Entries;

  print "$InfoLabel: $Line\n";
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills the file-scoped %Options, changes to the working directory when one is
# specified, validates the detail level and derives the $ProcessCmpdInfo,
# $ProcessCmpdData, $CountEmptyData and $CheckData flags. Dies on invalid
# options or option values.
sub SetupScriptUsage {

  # Setup default and retrieve all the options...
  %Options = ();
  $Options{detail} = 1;
  if (!GetOptions(\%Options, "all|a", "count|c", "chiral", "datacheck", "detail|d:i", "empty|e", "fields|f", "help|h", "invalidatomnumbers|i", "mismatch|m", "salts|s", "unknownatoms|u", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{detail} <= 0 || $Options{detail} > 3) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Possible values: 1 to 3\n";
  }

  # Any option other than a plain count requires reading compounds one record
  # at a time...
  $ProcessCmpdInfo = ($Options{all} || $Options{chiral} || $Options{empty} || $Options{fields} || $Options{invalidatomnumbers} || $Options{mismatch} || $Options{salts} || $Options{unknownatoms} || $Options{datacheck}) ? 1 : 0;

  # Options which need the data fields of each compound...
  $ProcessCmpdData = ($Options{all} || $Options{fields} || $Options{empty} || $Options{datacheck}) ? 1 : 0;

  $CountEmptyData = ($Options{all} || $Options{empty}) ? 1 : 0;
  $CheckData = ($Options{all} || $Options{datacheck}) ? 1 : 0;
}
