1 #!/usr/bin/perl -w 2 # 3 # $RCSfile: InfoSDFiles.pl,v $ 4 # $Date: 2008/04/19 16:12:20 $ 5 # $Revision: 1.20 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use 5.006; 30 use strict; 31 use FindBin; use lib "$FindBin::Bin/../lib"; 32 use Getopt::Long; 33 use File::Basename; 34 use Benchmark; 35 use SDFileUtil; 36 use TextUtil; 37 use FileUtil; 38 39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); 40 my($SDFile, $CmpdCount, $TotalCmpdCount, $OkayFilesCount, $OkayFilesSize, $EmptyCtabBlocksCount, $MismatchCtabBlockCount, $UnknownAtomsCtabBlockCount, $SaltsCtabBlockCount, $CtabLinesCount, $ModifiedTimeString, $ModifiedDateString, $FileSize, $Index, $ProcessCmpdInfo, $ProcessCmpdData, $CmpdString, $PrintCmpdCounterHeader, $ProblematicCmpdData, $CheckData, $CountEmptyData, @CmpdLines, @SDFilesList, @FieldLabels, %FieldLabelsMap, %NonEmptyFieldValuesCountMap, %EmptyFieldValuesCountMap, %NonNumericalFieldValuesCountMap, %NumericalFieldValuesCountMap); 41 42 # Autoflush STDOUT 43 $| = 1; 44 45 # Starting message... 46 $ScriptName = basename $0; 47 print "\n$ScriptName:Starting...\n\n"; 48 $StartTime = new Benchmark; 49 50 # Get the options and setup script... 51 SetupScriptUsage(); 52 if ($Options{help} || @ARGV < 1) { 53 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); 54 } 55 56 @SDFilesList = (); 57 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd"); 58 59 # Process all the SD files... 60 if (@SDFilesList > 1) { 61 print "Processing SD files...\n"; 62 } 63 $TotalCmpdCount = 0; 64 $OkayFilesCount = 0; $OkayFilesSize = 0; 65 66 FILELIST: for $Index (0 .. $#SDFilesList) { 67 $SDFile = $SDFilesList[$Index]; 68 if (@SDFilesList > 1) { 69 print "\nProcessing file $SDFile...\n"; 70 } 71 else { 72 print "$ScriptName:Processing file $SDFile...\n" 73 } 74 if (!(-e $SDFile)) { 75 warn "Warning: Ignoring file $SDFile: It doesn't exist\n"; 76 next FILELIST; 77 } 78 if (!CheckFileType($SDFile, "sd sdf")) { 79 warn "Warning: Ignoring file $SDFile: It's not a SD file\n"; 80 next FILELIST; 81 } 82 if (!open SDFILE, "$SDFile") { 83 warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n"; 84 next FILELIST; 85 } 86 $CmpdCount = 0; 87 $EmptyCtabBlocksCount = 0; 88 $MismatchCtabBlockCount = 0; 89 $UnknownAtomsCtabBlockCount = 0; 90 $SaltsCtabBlockCount = 0; 91 @FieldLabels = (); 92 %FieldLabelsMap = (); 93 %NonEmptyFieldValuesCountMap = (); 94 %EmptyFieldValuesCountMap = (); 95 %NonNumericalFieldValuesCountMap = (); 96 %NumericalFieldValuesCountMap = (); 97 98 if ($ProcessCmpdInfo) { 99 $PrintCmpdCounterHeader = 1; 100 while ($CmpdString = ReadCmpdString(\*SDFILE)) { 101 $CmpdCount++; 102 $ProblematicCmpdData = 0; 103 if ($Options{detail} <= 1) { 104 if (($CmpdCount % 5000) == 0) { 105 if ($PrintCmpdCounterHeader) { 106 $PrintCmpdCounterHeader = 0; 107 print "Processing compounds:"; 108 } 109 print "$CmpdCount..."; 110 } 111 } 112 @CmpdLines = split "\n", $CmpdString; 113 $CtabLinesCount = GetCtabLinesCount(\@CmpdLines); 114 if ($Options{all} || $Options{empty}) { 115 if ($CtabLinesCount <= 0) { 116 $EmptyCtabBlocksCount++; 117 $ProblematicCmpdData = 1; 118 } 119 } 120 if ($CtabLinesCount > 0) { 121 my ($AtomCount, $BondCount) = ParseCmpdCountsLine($CmpdLines[3]); 122 if ($Options{all} || $Options{mismatch}) { 123 if ($CtabLinesCount != ($AtomCount + $BondCount)) { 124 $MismatchCtabBlockCount++; 125 $ProblematicCmpdData = 1; 126 if ($Options{detail} >= 2) { 127 print "\nMismatch found: Ctab lines count: $CtabLinesCount; Atoms count: $AtomCount; Bond count: $BondCount\n"; 128 } 129 } 130 } 131 if ($CtabLinesCount == ($AtomCount + $BondCount)) { 132 if ($Options{all} || $Options{unknownatoms}) { 133 my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines) = GetUnknownAtoms(\@CmpdLines); 134 if ($UnknownAtomCount) { 135 $UnknownAtomsCtabBlockCount++; 136 $ProblematicCmpdData = 1; 137 if ($Options{detail} >= 2) { 138 print "\nUnknown atom(s) found: $UnknownAtomCount\nUknown atom(s) symbols:$UnknownAtoms\nUknown atom(s) data lines:\n$UnknownAtomLines\n"; 139 } 140 } 141 } 142 if ($Options{all} || $Options{salts}) { 143 my($FragmentsCount, $Fragments) = GetCmpdFragments(\@CmpdLines); 144 if ($FragmentsCount > 1) { 145 $SaltsCtabBlockCount++; 146 $ProblematicCmpdData = 1; 147 if ($Options{detail} >= 2) { 148 print "\nSalts found: $FragmentsCount\nSalts atom numbers:\n$Fragments\n"; 149 } 150 } 151 } 152 } 153 } 154 if ($ProcessCmpdData) { 155 ProcessCmpdData(); 156 } 157 if ($Options{detail} >= 3) { 158 if ($ProblematicCmpdData) { 159 print "\nCompound data:\n$CmpdString\n\n"; 160 } 161 } 162 } 163 if ($Options{detail} <= 1) { 164 if (!$PrintCmpdCounterHeader) { 165 print "\n"; 166 } 167 } 168 } 169 else { 170 # Just count the compounds... 171 while (<SDFILE>) { 172 if (/\$\$\$\$/) { 173 $CmpdCount++; 174 } 175 } 176 } 177 close SDFILE; 178 $TotalCmpdCount += $CmpdCount; 179 print "\nNumber of compounds: $CmpdCount\n"; 180 if ($Options{all} || $Options{empty}) { 181 print "Number of empty atom/bond blocks: $EmptyCtabBlocksCount\n"; 182 } 183 if ($Options{all} || $Options{mismatch}) { 184 print "Number of mismatched atom/bond blocks: $MismatchCtabBlockCount\n"; 185 } 186 if ($Options{all} || $Options{unknownatoms}) { 187 print "Number of atom blocks with unknown atom labels: $UnknownAtomsCtabBlockCount\n"; 188 } 189 if ($Options{all} || $Options{salts}) { 190 print "Number of atom blocks containing salts: $SaltsCtabBlockCount\n"; 191 } 192 if ($ProcessCmpdData) { 193 PrintCmpdDataSummary(); 194 } 195 196 $OkayFilesCount++; 197 $FileSize = FileSize($SDFile); 198 $OkayFilesSize += $FileSize; 199 ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SDFile); 200 print "\nFile size: ", FormatFileSize($FileSize), " \n"; 201 print "Last modified: ${ModifiedTimeString}; $ModifiedDateString \n"; 202 203 } 204 if ($OkayFilesCount > 1) { 205 print "\nTotal number of compounds in $OkayFilesCount SD files: $TotalCmpdCount\n"; 206 print "\nTotal size of $OkayFilesCount SD files: ", FormatFileSize($OkayFilesSize), "\n"; 207 } 208 print "\n$ScriptName:Done...\n\n"; 209 210 $EndTime = new Benchmark; 211 $TotalTime = timediff ($EndTime, $StartTime); 212 print "Total time: ", timestr($TotalTime), "\n"; 213 214 ############################################################################### 215 216 # Process compound data header labels and figure out which ones are present for 217 # all the compounds... 218 sub ProcessCmpdData { 219 my($Label); 220 221 if (@FieldLabels) { 222 my (@CmpdFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines); 223 my(%CmpdFieldLabelsMap) = (); 224 # Setup a map for the current labels... 225 for $Label (@CmpdFieldLabels) { 226 $CmpdFieldLabelsMap{$Label} = "PresentInSome"; 227 } 228 # Check the presence old labels for this compound; otherwise, mark 'em new... 229 for $Label (@FieldLabels) { 230 if (!$CmpdFieldLabelsMap{$Label}) { 231 $FieldLabelsMap{$Label} = "PresentInSome"; 232 } 233 } 234 # Check the presence this compound in the old labels; otherwise, add 'em... 235 for $Label (@CmpdFieldLabels ) { 236 if (!$FieldLabelsMap{$Label}) { 237 # It's a new label... 238 push @FieldLabels, $Label; 239 $FieldLabelsMap{$Label} = "PresentInSome"; 240 } 241 } 242 } 243 else { 244 # Get the initial label set and set up a map... 245 @FieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines); 246 for $Label (@FieldLabels) { 247 $FieldLabelsMap{$Label} = "PresentInAll"; 248 } 249 } 250 if ($CountEmptyData || $CheckData) { 251 # Count empty data field values... 252 my(%DataFieldAndValues, $Label, $Value); 253 254 %DataFieldAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); 255 for $Label (keys %DataFieldAndValues) { 256 $Value = $DataFieldAndValues{$Label}; 257 if ($CountEmptyData) { 258 if (IsNotEmpty($Value)) { 259 if (exists($NonEmptyFieldValuesCountMap{$Label})) { 260 $NonEmptyFieldValuesCountMap{$Label} += 1; 261 } 262 else { 263 $NonEmptyFieldValuesCountMap{$Label} = 1; 264 } 265 } 266 else { 267 if ($Options{detail} >= 2) { 268 print "Compound record $CmpdCount: Empty data field <$Label>\n"; 269 } 270 if (exists($EmptyFieldValuesCountMap{$Label})) { 271 $EmptyFieldValuesCountMap{$Label} += 1; 272 } 273 else { 274 $EmptyFieldValuesCountMap{$Label} = 1; 275 } 276 } 277 } 278 if ($CheckData) { 279 if (IsNumerical($Value)) { 280 if (exists($NumericalFieldValuesCountMap{$Label})) { 281 $NumericalFieldValuesCountMap{$Label} += 1; 282 } 283 else { 284 $NumericalFieldValuesCountMap{$Label} = 1; 285 } 286 } 287 else { 288 if (exists($NonNumericalFieldValuesCountMap{$Label})) { 289 $NonNumericalFieldValuesCountMap{$Label} += 1; 290 } 291 else { 292 $NonNumericalFieldValuesCountMap{$Label} = 1; 293 } 294 } 295 } 296 } 297 } 298 } 299 300 sub PrintCmpdDataSummary { 301 if (@FieldLabels) { 302 my($PresentInAllCount, $Label, @FieldLabelsPresentInSome, @FieldLabelsPresentInAll); 303 304 @FieldLabelsPresentInSome = (); 305 @FieldLabelsPresentInAll = (); 306 307 $PresentInAllCount = 0; 308 print "\nNumber of data fields: ", scalar(@FieldLabels), "\n"; 309 print "All data field labels: "; 310 for $Label (sort keys %FieldLabelsMap) { 311 print "<$Label> "; 312 } 313 print "\n"; 314 for $Label (sort keys %FieldLabelsMap) { 315 if ($FieldLabelsMap{$Label} eq "PresentInAll") { 316 $PresentInAllCount++; 317 push @FieldLabelsPresentInAll, $Label; 318 } 319 } 320 if ($PresentInAllCount != @FieldLabels) { 321 print "Data field labels present in all compounds: "; 322 for $Label (sort keys %FieldLabelsMap) { 323 if ($FieldLabelsMap{$Label} eq "PresentInAll") { 324 print "<$Label> "; 325 } 326 } 327 print "\n"; 328 print "Data field labels present in some compounds: "; 329 for $Label (sort keys %FieldLabelsMap) { 330 if ($FieldLabelsMap{$Label} eq "PresentInSome") { 331 print "<$Label> "; 332 push @FieldLabelsPresentInSome, $Label; 333 } 334 } 335 print "\n"; 336 } 337 # List empty data field values count... 338 if ($CountEmptyData) { 339 print "\n"; 340 if ($PresentInAllCount == @FieldLabels) { 341 PrintDataInformation("Number of non-empty values for data field(s)", \@FieldLabels, \%NonEmptyFieldValuesCountMap); 342 PrintDataInformation("Number of empty values for data field(s)", \@FieldLabels, \%EmptyFieldValuesCountMap); 343 } 344 else { 345 PrintDataInformation("Number of non-empty values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, \%NonEmptyFieldValuesCountMap); 346 PrintDataInformation("Number of empty values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, \%EmptyFieldValuesCountMap); 347 PrintDataInformation("Number of non-empty values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, \%NonEmptyFieldValuesCountMap); 348 PrintDataInformation("Number of empty values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, \%EmptyFieldValuesCountMap); 349 } 350 print "\n"; 351 } 352 # List numerical data values count... 353 if ($CheckData) { 354 print "\n"; 355 if ($PresentInAllCount == @FieldLabels) { 356 PrintDataInformation("Number of non-numerical values for data field(s)", \@FieldLabels, \%NonNumericalFieldValuesCountMap); 357 PrintDataInformation("Number of numerical values for data field(s)", \@FieldLabels, \%NumericalFieldValuesCountMap); 358 } 359 else { 360 PrintDataInformation("Number of non-numerical values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, \%NonNumericalFieldValuesCountMap); 361 PrintDataInformation("Number of numerical values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, \%NumericalFieldValuesCountMap); 362 PrintDataInformation("Number of non-numerical values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, \%NonNumericalFieldValuesCountMap); 363 PrintDataInformation("Number of numerical values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, \%NumericalFieldValuesCountMap); 364 } 365 print "\n"; 366 } 367 } 368 else { 369 print "\nNumber of data fields: 0\n"; 370 } 371 } 372 373 # List data information... 374 sub PrintDataInformation { 375 my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_; 376 my($Line, $Label); 377 378 $Line = ""; 379 for $Label (@{$DataLabelRef}) { 380 $Line .= " <$Label> - " . (exists($DataLabelToValueMapRef->{$Label}) ? $DataLabelToValueMapRef->{$Label} : 0) . ","; 381 } 382 $Line =~ s/\,$//g; 383 print "$InfoLabel: $Line\n"; 384 } 385 386 # Setup script usage and retrieve command line arguments specified using various options... 387 sub SetupScriptUsage { 388 389 # Setup default and retrieve all the options... 390 %Options = (); 391 $Options{detail} = 1; 392 if (!GetOptions(\%Options, "all|a", "count|c", "datacheck", "detail|d:i", "empty|e", "fields|f", "help|h", "mismatch|m", "salts|s", "unknownatoms|u", "workingdir|w=s")) { 393 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 394 } 395 if ($Options{workingdir}) { 396 if (! -d $Options{workingdir}) { 397 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 398 } 399 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 400 } 401 if ($Options{detail} <= 0 || $Options{detail} > 3) { 402 die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Possible values: 1 to 3\n"; 403 } 404 $ProcessCmpdInfo = 0; 405 if ($Options{all} || $Options{empty} || $Options{fields} || $Options{mismatch} || $Options{salts} || $Options{unknownatoms} || $Options{datacheck}) { 406 $ProcessCmpdInfo = 1; 407 } 408 $ProcessCmpdData = 0; 409 if ($Options{all} || $Options{fields} || $Options{empty} || $Options{datacheck}) { 410 $ProcessCmpdData = 1; 411 } 412 $CountEmptyData = ($Options{all} || $Options{empty}) ? 1 : 0; 413 $CheckData = ($Options{all} || $Options{datacheck}) ? 1 : 0; 414 } 415