MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: InfoFingerprintsSDFiles.pl,v $
   4 # $Date: 2008/04/19 16:12:20 $
   5 # $Revision: 1.8 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileUtil;
  37 use TextUtil;
  38 use SDFileUtil;
  39 use Fingerprints::FingerprintsBitVector;
  40 
  41 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  42 
  43 # Autoflush STDOUT
  44 $| = 1;
  45 
  46 # Starting message...
  47 $ScriptName = basename($0);
  48 print "\n$ScriptName: Starting...\n\n";
  49 $StartTime = new Benchmark;
  50 
  51 # Get the options and setup script...
  52 SetupScriptUsage();
  53 if ($Options{help} || @ARGV < 1) {
  54   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  55 }
  56 
  57 my(@SDFilesList);
  58 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  59 
  60 # Process options...
  61 my(%OptionsInfo);
  62 ProcessOptions();
  63 
  64 # Setup information about input files...
  65 my(%SDFilesInfo);
  66 print "Checking input SD file(s)...\n";
  67 RetrieveSDFilesInfo();
  68 
  69 # Process input files..
  70 my($FileIndex, $SDFile, $FileProcessingMsg);
  71 $FileProcessingMsg = "Processing file";
  72 if (@SDFilesList > 1) {
  73   print "Processing SD files...\n";
  74   $FileProcessingMsg = "\n$FileProcessingMsg";
  75 }
  76 
  77 for $FileIndex (0 .. $#SDFilesList) {
  78   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  79     $SDFile = $SDFilesList[$FileIndex];
  80     print "$FileProcessingMsg $SDFile...\n";
  81     ListFingerprintsSDFileInfo($FileIndex);
  82   }
  83 }
  84 ListTotalSizeOfFiles();
  85 
  86 print "$ScriptName:Done...\n\n";
  87 
  88 $EndTime = new Benchmark;
  89 $TotalTime = timediff ($EndTime, $StartTime);
  90 print "Total time: ", timestr($TotalTime), "\n";
  91 
  92 ###############################################################################
  93 
  94 # List approptiate information...
  95 #
  96 sub ListFingerprintsSDFileInfo {
  97   my($FileIndex) = @_;
  98   my($SDFile, $CmpdString, $CmpdCount, $ValidDataCmpdCount, $InvalidDataCmpdCount, $MissingDataCmpdCount, $UseSequentialID, $FingerprintsColFound, $FingerprintsColNum, $DetailLevel, $UseInternalFormat, $FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString, $InvalidFingerprintsData, $TotalBitDensity, $BitDensity, $NumOfOnBits, $FingerprintsFieldLabel, $FingerprintsFieldFound, @CmpdLines, %DataFieldLabelsAndValues);
  99 
 100   $SDFile = $SDFilesList[$FileIndex];
 101   open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n";
 102 
 103   $CmpdCount = 0;
 104   $ValidDataCmpdCount = 0;
 105   $InvalidDataCmpdCount = 0;
 106   $MissingDataCmpdCount = 0;
 107   $TotalBitDensity = 0;
 108 
 109   $DetailLevel = $OptionsInfo{DetailLevel};
 110 
 111   $FingerprintsFieldFound = $SDFilesInfo{FingerprintsFieldFound}[$FileIndex];
 112   $FingerprintsFieldLabel = $SDFilesInfo{FingerprintsFieldLabel}[$FileIndex];
 113 
 114   $UseInternalFormat = ($OptionsInfo{FingerprintsFormatMode} =~ /^Internal$/i) ? 1 : 0;
 115 
 116   COMPOUND: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 117     $CmpdCount++;
 118     @CmpdLines = split "\n", $CmpdString;
 119     %DataFieldLabelsAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 120 
 121     if (!$FingerprintsFieldFound) {
 122       # Missing data...
 123       $MissingDataCmpdCount++;
 124       if ($OptionsInfo{CheckFingerprintsData} || $OptionsInfo{CountEmptyFingerprints}) {
 125 	if ($DetailLevel >= 3) {
 126 	  print "Compound number $CmpdCount contains no fingerprints data: $CmpdCount \n";
 127 	}
 128 	elsif ($DetailLevel >= 1) {
 129 	  print "Compound number $CmpdCount contains no fingerprints data...\n";
 130 	}
 131       }
 132       next LINE;
 133     }
 134 
 135     # Setup fingerprints bit vector...
 136     $InvalidFingerprintsData = 0;
 137     if ($UseInternalFormat) {
 138       ($FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString) = $DataFieldLabelsAndValues{$FingerprintsFieldLabel} =~ /^(.*?):(.*?):(.*?):(.*?)$/;
 139       if ($OptionsInfo{CheckFingerprintsData}) {
 140 	if (IsEmpty($FingerprintsType) || IsEmpty($FingerprintsStringType) || IsEmpty($FingerprintsSize) || IsEmpty($FingerprintsString)) {
 141 	  $InvalidFingerprintsData = 1;
 142 	}
 143       }
 144     }
 145     else {
 146       $FingerprintsString = $DataFieldLabelsAndValues{$FingerprintsFieldLabel};
 147       $FingerprintsStringType = $OptionsInfo{FingerprintsString};
 148       if ($OptionsInfo{CheckFingerprintsData} && IsEmpty($FingerprintsString)) {
 149 	$InvalidFingerprintsData = 1;
 150       }
 151     }
 152     if ($InvalidFingerprintsData) {
 153       # InvalidData data...
 154       $InvalidDataCmpdCount++;
 155       if ($DetailLevel >= 3) {
 156 	print "Compound number $CmpdCount contains invalid fingerprints data: $CmpdCount \n";
 157       }
 158       elsif ($DetailLevel >= 1) {
 159 	print "Compound number $CmpdCount contains invalid fingerprints data...\n";
 160       }
 161       next LINE;
 162     }
 163     my($FingerprintsBitVector);
 164 
 165     $FingerprintsBitVector = '';
 166     if ($FingerprintsStringType =~ /^(Hexadecimal|Hex)$/i) {
 167       $FingerprintsBitVector = FingerprintsBitVector::NewFromHexadecimalString($FingerprintsString);
 168     }
 169     elsif ($FingerprintsStringType =~ /^(Binary|Bin)$/i) {
 170       $FingerprintsBitVector = FingerprintsBitVector::NewFromBinaryString($FingerprintsString);
 171     }
 172     elsif ($FingerprintsStringType =~ /^(RawBinary|RawBin)$/i) {
 173       $FingerprintsBitVector = FingerprintsBitVector::NewFromRawBinaryString($FingerprintsString);
 174     }
 175 
 176     $ValidDataCmpdCount++;
 177     if ($OptionsInfo{ListAverageBitDensity} || $OptionsInfo{ListBitDensity}) {
 178       $BitDensity = $FingerprintsBitVector->GetFingerprintsBitDensity();
 179       $TotalBitDensity += $BitDensity;
 180     }
 181 
 182     if ($OptionsInfo{ListFingerprintsType} || $OptionsInfo{ListFingerprintsStringType} || $OptionsInfo{ListFingerprintsSize} || $OptionsInfo{ListBitDensity} || $OptionsInfo{ListOnBits}) {
 183       print "Compound number: $CmpdCount";
 184 
 185       if ($OptionsInfo{ListFingerprintsType} || $OptionsInfo{ListFingerprintsStringType} || $OptionsInfo{ListFingerprintsSize}) {
 186 	if ($UseInternalFormat) {
 187 	  if ($OptionsInfo{ListFingerprintsType}) {
 188 	    print "; FPType: $FingerprintsType";
 189 	  }
 190 	  elsif ($OptionsInfo{ListFingerprintsStringType}) {
 191 	    print "; FPStringType: $FingerprintsStringType";
 192 	  }
 193 	  elsif ($OptionsInfo{ListFingerprintsSize}) {
 194 	    print "; FPSize: $FingerprintsSize";
 195 	  }
 196 	}
 197 	else {
 198 	  print "; FPStringType: $FingerprintsStringType";
 199 	}
 200       }
 201       if ($OptionsInfo{ListBitDensity}) {
 202 	print "; BitDensity: $BitDensity";
 203       }
 204       if ($OptionsInfo{ListOnBits}) {
 205 	$NumOfOnBits = $FingerprintsBitVector->GetNumOfSetBits();
 206 	print "; NumOfOnBits: $NumOfOnBits";
 207       }
 208       print "\n";
 209     }
 210   }
 211   close SDFILE;
 212 
 213   print "\nNumber of compounds: $CmpdCount\n";
 214   print "Number of compounds with valid fingerprints data: $ValidDataCmpdCount\n";
 215   if ($OptionsInfo{CountEmptyFingerprints}) {
 216     print "Number of compounds with missing fingerprints data: $MissingDataCmpdCount\n";
 217     print "Number of compounds with invalid fingerprints data: $InvalidDataCmpdCount\n";
 218   }
 219   if ($OptionsInfo{ListAverageBitDensity} && $ValidDataCmpdCount) {
 220     my($AverageBitDensity);
 221     $AverageBitDensity = $TotalBitDensity/$ValidDataCmpdCount;
 222     $AverageBitDensity = sprintf("%.2f", $AverageBitDensity) + 0;
 223     print "Average bit density: $AverageBitDensity\n";
 224   }
 225 
 226   # File size and modification information...
 227   print "\nFile size: ", FormatFileSize($SDFilesInfo{FileSize}[$FileIndex]), " \n";
 228   print "Last modified: ", $SDFilesInfo{FileLastModified}[$FileIndex], " \n";
 229 }
 230 
 231 # Total size of all the fiels...
 232 sub ListTotalSizeOfFiles {
 233   my($FileOkayCount, $TotalSize, $Index);
 234 
 235   $FileOkayCount = 0;
 236   $TotalSize = 0;
 237 
 238   for $Index (0 .. $#SDFilesList) {
 239     if ($SDFilesList[$Index]) {
 240       $FileOkayCount++;
 241       $TotalSize += $SDFilesInfo{FileSize}[$Index];
 242     }
 243   }
 244   if ($FileOkayCount > 1) {
 245     print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
 246   }
 247 }
 248 
 249 # Retrieve information about SD files...
 250 #
 251 sub RetrieveSDFilesInfo {
 252   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $ModifiedTimeString, $ModifiedDateString, $FingerprintsFieldLabel, $FingerprintsFieldFound);
 253 
 254   %SDFilesInfo = ();
 255   @{$SDFilesInfo{FileOkay}} = ();
 256   @{$SDFilesInfo{FileSize}} = ();
 257   @{$SDFilesInfo{FileLastModified}} = ();
 258   @{$SDFilesInfo{FingerprintsFieldFound}} = ();
 259   @{$SDFilesInfo{FingerprintsFieldLabel}} = ();
 260 
 261   FILELIST: for $Index (0 .. $#SDFilesList) {
 262     $SDFile = $SDFilesList[$Index];
 263 
 264     $SDFilesInfo{FileOkay}[$Index] = 0;
 265     $SDFilesInfo{FileSize}[$Index] = 0;
 266     $SDFilesInfo{FileLastModified}[$Index] = '';
 267     $SDFilesInfo{FingerprintsFieldFound}[$Index] = 0;
 268     $SDFilesInfo{FingerprintsFieldLabel}[$Index] = '';
 269 
 270     $SDFile = $SDFilesList[$Index];
 271     if (!(-e $SDFile)) {
 272       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 273       next FILELIST;
 274     }
 275     if (!CheckFileType($SDFile, "sdf sd")) {
 276       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 277       next FILELIST;
 278     }
 279 
 280     my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);
 281     $FingerprintsFieldLabel = '';
 282     $FingerprintsFieldFound = 0;
 283 
 284     @CmpdLines = ();
 285     open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 286     $CmpdString = ReadCmpdString(\*SDFILE);
 287     close SDFILE;
 288     @CmpdLines = split "\n", $CmpdString;
 289     %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 290 
 291     if ($OptionsInfo{FingerprintsField} !~ /^UseDefault$/i) {
 292       $FingerprintsFieldLabel = $OptionsInfo{FingerprintsField};
 293       if (exists $DataFieldValues{$FingerprintsFieldLabel}) {
 294 	$FingerprintsFieldFound = 1;
 295       }
 296     }
 297     else {
 298       my($DataFieldLabel);
 299       DATAFIELDLABEL: for $DataFieldLabel (keys %DataFieldValues) {
 300 	if ($DataFieldLabel =~ /Fingerprints/i) {
 301 	  $FingerprintsFieldFound = 1;
 302 	  $FingerprintsFieldLabel = $DataFieldLabel;
 303 	  last DATAFIELDLABEL;
 304 	}
 305       }
 306     }
 307     $SDFilesInfo{FileOkay}[$Index] = 1;
 308     $SDFilesInfo{FingerprintsFieldFound}[$Index] = $FingerprintsFieldFound;
 309     $SDFilesInfo{FingerprintsFieldLabel}[$Index] = $FingerprintsFieldLabel;
 310 
 311     $SDFilesInfo{FileSize}[$Index] = FileSize($SDFile);
 312     ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SDFile);
 313     $SDFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
 314   }
 315 }
 316 
 317 # Process option values...
 318 sub ProcessOptions {
 319   %OptionsInfo = ();
 320 
 321   $OptionsInfo{ListAverageBitDensity} = ($Options{all} || $Options{averagebitdensity}) ? 1 :0;
 322   $OptionsInfo{ListBitDensity} = ($Options{all} || $Options{bitdensity}) ? 1 :0;
 323 
 324   # By default, count number of compounds containing fingerprints data...
 325   $Options{CountFingerprints} = 1;
 326   $OptionsInfo{CountEmptyFingerprints} = ($Options{all} || $Options{empty}) ? 1 :0;
 327 
 328   $OptionsInfo{CheckFingerprintsData} = ($Options{all} || $Options{datacheck}) ? 1 :0;
 329   $OptionsInfo{DetailLevel} = $Options{detail};
 330 
 331   if (IsNotEmpty($Options{fingerprintsfield})) {
 332     $OptionsInfo{FingerprintsField} = $Options{fingerprintsfield};
 333   }
 334   else {
 335     $OptionsInfo{FingerprintsField} = 'UseDefault';
 336   }
 337   $OptionsInfo{FingerprintsFormatMode} = $Options{fingerprintsformatmode};
 338   $OptionsInfo{FingerprintsString} = '';
 339   if ($Options{fingerprintsformatmode} =~ /^Specify$/i) {
 340     if (IsEmpty($Options{fingerprintsstring})) {
 341       die "Error: You must specify a value for \"--FingerprintsString\" option in \"Specify\" \"--FingerprintsFormatMode\". \n";
 342     }
 343     if ($Options{fingerprintsstring} !~ /^(Hexadecimal|Binary|RawBinary)$/i) {
 344       die "Error: The value specified, $Options{fingerprintsstring}, for option \"--FingerprintsString\" is not valid. Allowed values: Hexadecimal, Binary, or RawBinary\n";
 345     }
 346     $OptionsInfo{FingerprintsString} = $Options{fingerprintsstring};
 347   }
 348 
 349   $OptionsInfo{ListFingerprintsType} = ($Options{all} || $Options{fingerprintstype}) ? 1 :0;
 350   $OptionsInfo{ListFingerprintsStringType} = ($Options{all} || $Options{fingerprintstringstype}) ? 1 :0;
 351   $OptionsInfo{ListFingerprintsSize} = ($Options{all} || $Options{fingerprintssize}) ? 1 :0;
 352 
 353   $OptionsInfo{ListOnBits} = ($Options{all} || $Options{onbits}) ? 1 :0;
 354 }
 355 
 356 # Setup script usage  and retrieve command line arguments specified using various options...
 357 sub SetupScriptUsage {
 358 
 359   # Retrieve all the options...
 360   %Options = ();
 361 
 362   $Options{detail} = 1;
 363   $Options{fingerprintsformatmode} = 'Internal';
 364 
 365   if (!GetOptions(\%Options, "all|a", "averagebitdensity", "bitdensity", "count", "detail|d=i", "datacheck", "empty|e", "fingerprintsfield=s", "fingerprintsformatmode=s", "fingerprintsstring=s", "fingerprintstype", "fingerprintsstringtype", "fingerprintssize", "help|h",  "onbits", "workingdir|w=s")) {
 366     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 367   }
 368   if ($Options{workingdir}) {
 369     if (! -d $Options{workingdir}) {
 370       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 371     }
 372     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 373   }
 374   if (!IsPositiveInteger($Options{detail})) {
 375     die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n";
 376   }
 377   if ($Options{fingerprintsformatmode} !~ /^(Internal|Specify)$/i) {
 378     die "Error: The value specified, $Options{fingerprintsformatmode}, for option \"--FingerprintsFormatMode\" is not valid. Allowed values: Internal or Specify\n";
 379   }
 380 }
 381