MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: InfoSDFiles.pl,v $
   4 # $Date: 2008/04/19 16:12:20 $
   5 # $Revision: 1.20 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Benchmark;
  35 use SDFileUtil;
  36 use TextUtil;
  37 use FileUtil;
  38 
  39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  40 my($SDFile, $CmpdCount, $TotalCmpdCount, $OkayFilesCount, $OkayFilesSize, $EmptyCtabBlocksCount, $MismatchCtabBlockCount, $UnknownAtomsCtabBlockCount, $SaltsCtabBlockCount, $CtabLinesCount, $ModifiedTimeString, $ModifiedDateString, $FileSize, $Index, $ProcessCmpdInfo, $ProcessCmpdData, $CmpdString, $PrintCmpdCounterHeader, $ProblematicCmpdData, $CheckData, $CountEmptyData, @CmpdLines, @SDFilesList, @FieldLabels, %FieldLabelsMap, %NonEmptyFieldValuesCountMap, %EmptyFieldValuesCountMap, %NonNumericalFieldValuesCountMap, %NumericalFieldValuesCountMap);
  41 
  42 # Autoflush STDOUT
  43 $| = 1;
  44 
  45 # Starting message...
  46 $ScriptName = basename $0;
  47 print "\n$ScriptName:Starting...\n\n";
  48 $StartTime = new Benchmark;
  49 
  50 # Get the options and setup script...
  51 SetupScriptUsage();
  52 if ($Options{help} || @ARGV < 1) {
  53   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  54 }
  55 
  56 @SDFilesList = ();
  57 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  58 
  59 # Process all the SD files...
  60 if (@SDFilesList > 1) {
  61   print "Processing SD files...\n";
  62 }
  63 $TotalCmpdCount = 0;
  64 $OkayFilesCount = 0; $OkayFilesSize = 0;
  65 
  66 FILELIST: for $Index (0 .. $#SDFilesList) {
  67   $SDFile = $SDFilesList[$Index];
  68   if (@SDFilesList > 1) {
  69     print "\nProcessing file $SDFile...\n";
  70   }
  71   else {
  72     print "$ScriptName:Processing file $SDFile...\n"
  73   }
  74   if (!(-e $SDFile)) {
  75     warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
  76     next FILELIST;
  77   }
  78   if (!CheckFileType($SDFile, "sd sdf")) {
  79     warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
  80     next FILELIST;
  81   }
  82   if (!open SDFILE, "$SDFile") {
  83     warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
  84     next FILELIST;
  85   }
  86   $CmpdCount = 0;
  87   $EmptyCtabBlocksCount = 0;
  88   $MismatchCtabBlockCount = 0;
  89   $UnknownAtomsCtabBlockCount = 0;
  90   $SaltsCtabBlockCount = 0;
  91   @FieldLabels = ();
  92   %FieldLabelsMap = ();
  93   %NonEmptyFieldValuesCountMap = ();
  94   %EmptyFieldValuesCountMap = ();
  95   %NonNumericalFieldValuesCountMap = ();
  96   %NumericalFieldValuesCountMap = ();
  97 
  98   if ($ProcessCmpdInfo) {
  99     $PrintCmpdCounterHeader = 1;
 100     while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 101       $CmpdCount++;
 102       $ProblematicCmpdData = 0;
 103       if ($Options{detail} <= 1) {
 104 	if (($CmpdCount % 5000) == 0) {
 105 	  if ($PrintCmpdCounterHeader) {
 106 	    $PrintCmpdCounterHeader = 0;
 107 	    print "Processing compounds:";
 108 	  }
 109 	  print "$CmpdCount...";
 110 	}
 111       }
 112       @CmpdLines = split "\n", $CmpdString;
 113       $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);
 114       if ($Options{all} || $Options{empty}) {
 115 	if ($CtabLinesCount <= 0) {
 116 	  $EmptyCtabBlocksCount++;
 117 	  $ProblematicCmpdData = 1;
 118 	}
 119       }
 120       if ($CtabLinesCount > 0) {
 121 	my ($AtomCount, $BondCount) = ParseCmpdCountsLine($CmpdLines[3]);
 122 	if ($Options{all} || $Options{mismatch}) {
 123 	  if ($CtabLinesCount != ($AtomCount + $BondCount)) {
 124 	    $MismatchCtabBlockCount++;
 125 	    $ProblematicCmpdData = 1;
 126 	    if ($Options{detail} >= 2) {
 127 	      print "\nMismatch found: Ctab lines count: $CtabLinesCount;  Atoms count: $AtomCount; Bond count: $BondCount\n";
 128 	    }
 129 	  }
 130 	}
 131 	if ($CtabLinesCount == ($AtomCount + $BondCount)) {
 132 	  if ($Options{all} || $Options{unknownatoms}) {
 133 	    my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines) = GetUnknownAtoms(\@CmpdLines);
 134 	    if ($UnknownAtomCount) {
 135 	      $UnknownAtomsCtabBlockCount++;
 136 	      $ProblematicCmpdData = 1;
 137 	      if ($Options{detail} >= 2) {
 138 		print "\nUnknown atom(s) found: $UnknownAtomCount\nUknown atom(s) symbols:$UnknownAtoms\nUknown atom(s) data lines:\n$UnknownAtomLines\n";
 139 	      }
 140 	    }
 141 	  }
 142 	  if ($Options{all} || $Options{salts}) {
 143 	    my($FragmentsCount, $Fragments) = GetCmpdFragments(\@CmpdLines);
 144 	    if ($FragmentsCount > 1) {
 145 	      $SaltsCtabBlockCount++;
 146 	      $ProblematicCmpdData = 1;
 147 	      if ($Options{detail} >= 2) {
 148 		print "\nSalts found: $FragmentsCount\nSalts atom numbers:\n$Fragments\n";
 149 	      }
 150 	    }
 151 	  }
 152 	}
 153       }
 154       if ($ProcessCmpdData) {
 155 	ProcessCmpdData();
 156       }
 157       if ($Options{detail} >= 3) {
 158 	if ($ProblematicCmpdData) {
 159 	  print "\nCompound data:\n$CmpdString\n\n";
 160 	}
 161       }
 162     }
 163     if ($Options{detail} <= 1) {
 164       if (!$PrintCmpdCounterHeader) {
 165 	print "\n";
 166       }
 167     }
 168   }
 169   else {
 170     # Just count the compounds...
 171     while (<SDFILE>) {
 172       if (/\$\$\$\$/) {
 173 	$CmpdCount++;
 174       }
 175     }
 176   }
 177   close SDFILE;
 178   $TotalCmpdCount += $CmpdCount;
 179   print "\nNumber of compounds: $CmpdCount\n";
 180   if ($Options{all} || $Options{empty}) {
 181     print "Number of empty atom/bond blocks: $EmptyCtabBlocksCount\n";
 182   }
 183   if ($Options{all} || $Options{mismatch}) {
 184     print "Number of mismatched atom/bond blocks: $MismatchCtabBlockCount\n";
 185   }
 186   if ($Options{all} || $Options{unknownatoms}) {
 187     print "Number of atom blocks with unknown atom labels: $UnknownAtomsCtabBlockCount\n";
 188   }
 189   if ($Options{all} || $Options{salts}) {
 190     print "Number of atom blocks containing salts: $SaltsCtabBlockCount\n";
 191   }
 192   if ($ProcessCmpdData) {
 193     PrintCmpdDataSummary();
 194   }
 195 
 196   $OkayFilesCount++;
 197   $FileSize = FileSize($SDFile);
 198   $OkayFilesSize += $FileSize;
 199   ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SDFile);
 200   print "\nFile size: ", FormatFileSize($FileSize), " \n";
 201   print "Last modified:  ${ModifiedTimeString}; $ModifiedDateString \n";
 202 
 203 }
 204 if ($OkayFilesCount > 1) {
 205   print "\nTotal number of compounds in  $OkayFilesCount SD files: $TotalCmpdCount\n";
 206     print "\nTotal size of $OkayFilesCount SD files: ", FormatFileSize($OkayFilesSize), "\n";
 207 }
 208 print "\n$ScriptName:Done...\n\n";
 209 
 210 $EndTime = new Benchmark;
 211 $TotalTime = timediff ($EndTime, $StartTime);
 212 print "Total time: ", timestr($TotalTime), "\n";
 213 
 214 ###############################################################################
 215 
 216 # Process compound data header labels and figure out which ones are present for
 217 # all the compounds...
 218 sub ProcessCmpdData {
 219   my($Label);
 220 
 221   if (@FieldLabels) {
 222     my (@CmpdFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);
 223     my(%CmpdFieldLabelsMap) = ();
 224     # Setup a map for the current labels...
 225     for $Label (@CmpdFieldLabels) {
 226       $CmpdFieldLabelsMap{$Label} = "PresentInSome";
 227     }
 228     # Check the presence old labels for this compound; otherwise, mark 'em new...
 229     for $Label (@FieldLabels) {
 230       if (!$CmpdFieldLabelsMap{$Label}) {
 231 	$FieldLabelsMap{$Label} = "PresentInSome";
 232       }
 233     }
 234     # Check the presence this compound in the old labels; otherwise, add 'em...
 235     for $Label (@CmpdFieldLabels ) {
 236       if (!$FieldLabelsMap{$Label}) {
 237 	# It's a new label...
 238 	push @FieldLabels, $Label;
 239 	$FieldLabelsMap{$Label} = "PresentInSome";
 240       }
 241     }
 242   }
 243   else {
 244     # Get the initial label set and set up a map...
 245     @FieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 246     for $Label (@FieldLabels) {
 247       $FieldLabelsMap{$Label} = "PresentInAll";
 248     }
 249   }
 250   if ($CountEmptyData || $CheckData) {
 251     # Count empty data field values...
 252     my(%DataFieldAndValues, $Label, $Value);
 253 
 254     %DataFieldAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 255     for $Label (keys %DataFieldAndValues) {
 256       $Value = $DataFieldAndValues{$Label};
 257       if ($CountEmptyData) {
 258 	if (IsNotEmpty($Value)) {
 259 	  if (exists($NonEmptyFieldValuesCountMap{$Label})) {
 260 	    $NonEmptyFieldValuesCountMap{$Label} += 1;
 261 	  }
 262 	  else {
 263 	    $NonEmptyFieldValuesCountMap{$Label} = 1;
 264 	  }
 265 	}
 266 	else {
 267 	  if ($Options{detail} >= 2) {
 268 	    print "Compound record $CmpdCount: Empty data field <$Label>\n";
 269 	  }
 270 	  if (exists($EmptyFieldValuesCountMap{$Label})) {
 271 	    $EmptyFieldValuesCountMap{$Label} += 1;
 272 	  }
 273 	  else {
 274 	    $EmptyFieldValuesCountMap{$Label} = 1;
 275 	  }
 276 	}
 277       }
 278       if ($CheckData) {
 279 	if (IsNumerical($Value)) {
 280 	  if (exists($NumericalFieldValuesCountMap{$Label})) {
 281 	    $NumericalFieldValuesCountMap{$Label} += 1;
 282 	  }
 283 	  else {
 284 	    $NumericalFieldValuesCountMap{$Label} = 1;
 285 	  }
 286 	}
 287 	else {
 288 	  if (exists($NonNumericalFieldValuesCountMap{$Label})) {
 289 	    $NonNumericalFieldValuesCountMap{$Label} += 1;
 290 	  }
 291 	  else {
 292 	    $NonNumericalFieldValuesCountMap{$Label} = 1;
 293 	  }
 294 	}
 295       }
 296     }
 297   }
 298 }
 299 
 300 sub PrintCmpdDataSummary {
 301   if (@FieldLabels) {
 302     my($PresentInAllCount, $Label, @FieldLabelsPresentInSome, @FieldLabelsPresentInAll);
 303 
 304     @FieldLabelsPresentInSome = ();
 305     @FieldLabelsPresentInAll = ();
 306 
 307     $PresentInAllCount = 0;
 308     print "\nNumber of data fields: ", scalar(@FieldLabels), "\n";
 309     print "All data field labels: ";
 310     for $Label (sort keys %FieldLabelsMap) {
 311       print "<$Label> ";
 312     }
 313     print "\n";
 314     for $Label (sort keys %FieldLabelsMap) {
 315       if ($FieldLabelsMap{$Label} eq "PresentInAll") {
 316 	$PresentInAllCount++;
 317 	push @FieldLabelsPresentInAll, $Label;
 318       }
 319     }
 320     if ($PresentInAllCount != @FieldLabels) {
 321       print "Data field labels present in all compounds: ";
 322       for $Label (sort keys %FieldLabelsMap) {
 323 	if ($FieldLabelsMap{$Label} eq "PresentInAll") {
 324 	  print "<$Label> ";
 325 	}
 326       }
 327       print "\n";
 328       print "Data field labels present in some compounds: ";
 329       for $Label (sort keys %FieldLabelsMap) {
 330 	if ($FieldLabelsMap{$Label} eq "PresentInSome") {
 331 	  print "<$Label> ";
 332 	  push @FieldLabelsPresentInSome, $Label;
 333 	}
 334       }
 335       print "\n";
 336     }
 337     # List empty data field values count...
 338     if ($CountEmptyData) {
 339       print "\n";
 340       if ($PresentInAllCount == @FieldLabels) {
 341 	PrintDataInformation("Number of non-empty values for data field(s)", \@FieldLabels, \%NonEmptyFieldValuesCountMap);
 342 	PrintDataInformation("Number of empty values for data field(s)", \@FieldLabels, \%EmptyFieldValuesCountMap);
 343       }
 344       else {
 345 	PrintDataInformation("Number of non-empty values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, \%NonEmptyFieldValuesCountMap);
 346 	PrintDataInformation("Number of empty values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, \%EmptyFieldValuesCountMap);
 347 	PrintDataInformation("Number of non-empty values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, \%NonEmptyFieldValuesCountMap);
 348 	PrintDataInformation("Number of empty values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, \%EmptyFieldValuesCountMap);
 349       }
 350       print "\n";
 351     }
 352     # List numerical data values count...
 353     if ($CheckData) {
 354       print "\n";
 355       if ($PresentInAllCount == @FieldLabels) {
 356 	PrintDataInformation("Number of non-numerical values for data field(s)", \@FieldLabels, \%NonNumericalFieldValuesCountMap);
 357 	PrintDataInformation("Number of numerical values for data field(s)", \@FieldLabels, \%NumericalFieldValuesCountMap);
 358       }
 359       else {
 360 	PrintDataInformation("Number of non-numerical values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, \%NonNumericalFieldValuesCountMap);
 361 	PrintDataInformation("Number of numerical values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, \%NumericalFieldValuesCountMap);
 362 	PrintDataInformation("Number of non-numerical values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, \%NonNumericalFieldValuesCountMap);
 363 	PrintDataInformation("Number of numerical values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, \%NumericalFieldValuesCountMap);
 364       }
 365       print "\n";
 366     }
 367   }
 368   else {
 369     print "\nNumber of data fields: 0\n";
 370   }
 371 }
 372 
 373 # List data information...
 374 sub PrintDataInformation {
 375   my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;
 376   my($Line, $Label);
 377 
 378   $Line = "";
 379   for $Label (@{$DataLabelRef}) {
 380     $Line .= " <$Label> - " . (exists($DataLabelToValueMapRef->{$Label}) ? $DataLabelToValueMapRef->{$Label} : 0) . ",";
 381   }
 382   $Line =~ s/\,$//g;
 383   print "$InfoLabel: $Line\n";
 384 }
 385 
 386 # Setup script usage  and retrieve command line arguments specified using various options...
 387 sub SetupScriptUsage {
 388 
 389   # Setup default and retrieve all the options...
 390   %Options = ();
 391   $Options{detail} = 1;
 392   if (!GetOptions(\%Options, "all|a", "count|c", "datacheck", "detail|d:i", "empty|e", "fields|f", "help|h", "mismatch|m", "salts|s", "unknownatoms|u", "workingdir|w=s")) {
 393     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 394   }
 395   if ($Options{workingdir}) {
 396     if (! -d $Options{workingdir}) {
 397       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 398     }
 399     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 400   }
 401   if ($Options{detail} <= 0 || $Options{detail} > 3) {
 402     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Possible values: 1 to 3\n";
 403   }
 404   $ProcessCmpdInfo = 0;
 405   if ($Options{all} || $Options{empty} || $Options{fields} || $Options{mismatch} || $Options{salts} || $Options{unknownatoms} || $Options{datacheck}) {
 406     $ProcessCmpdInfo = 1;
 407   }
 408   $ProcessCmpdData = 0;
 409   if ($Options{all} || $Options{fields} || $Options{empty} || $Options{datacheck}) {
 410     $ProcessCmpdData = 1;
 411   }
 412   $CountEmptyData = ($Options{all} || $Options{empty}) ? 1 : 0;
 413   $CheckData = ($Options{all} || $Options{datacheck}) ? 1 : 0;
 414 }
 415