#!/usr/bin/perl -w
#
# $RCSfile: InfoFingerprintsSDFiles.pl,v $
# $Date: 2008/04/19 16:12:20 $
# $Revision: 1.8 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2008 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use 5.006;
use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use Fingerprints::FingerprintsBitVector;

# File level state shared with the subroutines defined later in this script.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct method call instead of the indirect object form "new Benchmark".
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  # Show the usage and stop when help is requested or no file is specified.
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Expand the command line arguments into the list of SD files to examine.
my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
my(%SDFilesInfo);
print "Checking input SD file(s)...\n";
RetrieveSDFilesInfo();

# Process input files..
my($FileIndex, $SDFile, $FileProcessingMsg);
$FileProcessingMsg = "Processing file";
if (@SDFilesList > 1) {
  print "Processing SD files...\n";
  $FileProcessingMsg = "\n$FileProcessingMsg";
}

# Only files which passed the checks in RetrieveSDFilesInfo are processed.
for $FileIndex (0 .. $#SDFilesList) {
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    $SDFile = $SDFilesList[$FileIndex];
    print "$FileProcessingMsg $SDFile...\n";
    ListFingerprintsSDFileInfo($FileIndex);
  }
}
ListTotalSizeOfFiles();

print "$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# List appropriate information about the fingerprints data in one SD file...
#
# Parameter:
#   $FileIndex - index of the file in @SDFilesList and in the %SDFilesInfo arrays.
#
# The file is read one compound at a time. Each compound is counted as having
# missing, invalid or valid fingerprints data; the requested per compound
# values are printed for valid compounds, followed by a per file summary.
# Dies when the file can't be opened.
#
sub ListFingerprintsSDFileInfo {
  my($FileIndex) = @_;
  my($SDFile, $CmpdString, $CmpdCount, $ValidDataCmpdCount, $InvalidDataCmpdCount, $MissingDataCmpdCount, $DetailLevel, $UseInternalFormat, $FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString, $InvalidFingerprintsData, $TotalBitDensity, $BitDensity, $NumOfOnBits, $FingerprintsFieldLabel, $FingerprintsFieldFound, @CmpdLines, %DataFieldLabelsAndValues);

  $SDFile = $SDFilesList[$FileIndex];
  # Three argument open: the file name can never be interpreted as an open mode.
  open(SDFILE, '<', $SDFile) or die "Error: Can't open $SDFile: $! \n";

  $CmpdCount = 0;
  $ValidDataCmpdCount = 0;
  $InvalidDataCmpdCount = 0;
  $MissingDataCmpdCount = 0;
  $TotalBitDensity = 0;

  $DetailLevel = $OptionsInfo{DetailLevel};

  # Field label and its presence as determined from the first compound by
  # RetrieveSDFilesInfo.
  $FingerprintsFieldFound = $SDFilesInfo{FingerprintsFieldFound}[$FileIndex];
  $FingerprintsFieldLabel = $SDFilesInfo{FingerprintsFieldLabel}[$FileIndex];

  $UseInternalFormat = ($OptionsInfo{FingerprintsFormatMode} =~ /^Internal$/i) ? 1 : 0;

  COMPOUND: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
    $CmpdCount++;
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldLabelsAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    # A compound is missing fingerprints data when the file has no fingerprints
    # field at all or when this particular compound doesn't carry it.
    if (!$FingerprintsFieldFound || !exists $DataFieldLabelsAndValues{$FingerprintsFieldLabel}) {
      # Missing data...
      $MissingDataCmpdCount++;
      if ($OptionsInfo{CheckFingerprintsData} || $OptionsInfo{CountEmptyFingerprints}) {
        # NOTE(review): the detailed message repeats the compound number; it may
        # have been meant to show the compound data instead - confirm.
        if ($DetailLevel >= 3) {
          print "Compound number $CmpdCount contains no fingerprints data: $CmpdCount \n";
        }
        elsif ($DetailLevel >= 1) {
          print "Compound number $CmpdCount contains no fingerprints data...\n";
        }
      }
      next COMPOUND;
    }

    # Setup fingerprints bit vector...
    $InvalidFingerprintsData = 0;
    if ($UseInternalFormat) {
      # Internal format: four colon separated parts, type:stringtype:size:string.
      # A failed match leaves all four variables undefined.
      ($FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString) = $DataFieldLabelsAndValues{$FingerprintsFieldLabel} =~ /^(.*?):(.*?):(.*?):(.*?)$/;
      if ($OptionsInfo{CheckFingerprintsData}) {
        if (IsEmpty($FingerprintsType) || IsEmpty($FingerprintsStringType) || IsEmpty($FingerprintsSize) || IsEmpty($FingerprintsString)) {
          $InvalidFingerprintsData = 1;
        }
      }
    }
    else {
      # Specify mode: the field holds only the string; its type comes from the
      # command line.
      $FingerprintsString = $DataFieldLabelsAndValues{$FingerprintsFieldLabel};
      $FingerprintsStringType = $OptionsInfo{FingerprintsString};
      if ($OptionsInfo{CheckFingerprintsData} && IsEmpty($FingerprintsString)) {
        $InvalidFingerprintsData = 1;
      }
    }

    my($FingerprintsBitVector);

    $FingerprintsBitVector = '';
    if (!$InvalidFingerprintsData && defined $FingerprintsStringType) {
      if ($FingerprintsStringType =~ /^(Hexadecimal|Hex)$/i) {
        $FingerprintsBitVector = FingerprintsBitVector::NewFromHexadecimalString($FingerprintsString);
      }
      elsif ($FingerprintsStringType =~ /^(Binary|Bin)$/i) {
        $FingerprintsBitVector = FingerprintsBitVector::NewFromBinaryString($FingerprintsString);
      }
      elsif ($FingerprintsStringType =~ /^(RawBinary|RawBin)$/i) {
        $FingerprintsBitVector = FingerprintsBitVector::NewFromRawBinaryString($FingerprintsString);
      }
    }
    # Without a bit vector object (missing or unrecognized string type) the
    # method calls below would die, so such compounds are treated as invalid.
    if (!ref($FingerprintsBitVector)) {
      $InvalidFingerprintsData = 1;
    }

    if ($InvalidFingerprintsData) {
      # Invalid data...
      $InvalidDataCmpdCount++;
      # NOTE(review): as above, the detailed message repeats the compound number.
      if ($DetailLevel >= 3) {
        print "Compound number $CmpdCount contains invalid fingerprints data: $CmpdCount \n";
      }
      elsif ($DetailLevel >= 1) {
        print "Compound number $CmpdCount contains invalid fingerprints data...\n";
      }
      next COMPOUND;
    }

    $ValidDataCmpdCount++;
    if ($OptionsInfo{ListAverageBitDensity} || $OptionsInfo{ListBitDensity}) {
      $BitDensity = $FingerprintsBitVector->GetFingerprintsBitDensity();
      $TotalBitDensity += $BitDensity;
    }

    if ($OptionsInfo{ListFingerprintsType} || $OptionsInfo{ListFingerprintsStringType} || $OptionsInfo{ListFingerprintsSize} || $OptionsInfo{ListBitDensity} || $OptionsInfo{ListOnBits}) {
      print "Compound number: $CmpdCount";

      if ($OptionsInfo{ListFingerprintsType} || $OptionsInfo{ListFingerprintsStringType} || $OptionsInfo{ListFingerprintsSize}) {
        if ($UseInternalFormat) {
          # The three listing options are independent; print every one requested.
          if ($OptionsInfo{ListFingerprintsType}) {
            print "; FPType: $FingerprintsType";
          }
          if ($OptionsInfo{ListFingerprintsStringType}) {
            print "; FPStringType: $FingerprintsStringType";
          }
          if ($OptionsInfo{ListFingerprintsSize}) {
            print "; FPSize: $FingerprintsSize";
          }
        }
        else {
          # Only the string type is known in Specify mode.
          print "; FPStringType: $FingerprintsStringType";
        }
      }
      if ($OptionsInfo{ListBitDensity}) {
        print "; BitDensity: $BitDensity";
      }
      if ($OptionsInfo{ListOnBits}) {
        $NumOfOnBits = $FingerprintsBitVector->GetNumOfSetBits();
        print "; NumOfOnBits: $NumOfOnBits";
      }
      print "\n";
    }
  }
  close SDFILE;

  print "\nNumber of compounds: $CmpdCount\n";
  print "Number of compounds with valid fingerprints data: $ValidDataCmpdCount\n";
  if ($OptionsInfo{CountEmptyFingerprints}) {
    print "Number of compounds with missing fingerprints data: $MissingDataCmpdCount\n";
    print "Number of compounds with invalid fingerprints data: $InvalidDataCmpdCount\n";
  }
  # Skip the average when no compound had valid data to avoid dividing by zero.
  if ($OptionsInfo{ListAverageBitDensity} && $ValidDataCmpdCount) {
    my($AverageBitDensity);
    $AverageBitDensity = $TotalBitDensity/$ValidDataCmpdCount;
    # Round to two decimals; adding 0 drops trailing zeros from the output.
    $AverageBitDensity = sprintf("%.2f", $AverageBitDensity) + 0;
    print "Average bit density: $AverageBitDensity\n";
  }

  # File size and modification information...
  print "\nFile size: ", FormatFileSize($SDFilesInfo{FileSize}[$FileIndex]), " \n";
  print "Last modified: ", $SDFilesInfo{FileLastModified}[$FileIndex], " \n";
}

# Total size of all the files...
#
# Prints the combined size of the files which passed the input checks; nothing
# is printed unless more than one file was okay.
#
sub ListTotalSizeOfFiles {
  my($FileOkayCount, $TotalSize, $Index);

  $FileOkayCount = 0;
  $TotalSize = 0;

  for $Index (0 .. $#SDFilesList) {
    # Count only the files marked okay; a file name by itself is always true.
    if ($SDFilesInfo{FileOkay}[$Index]) {
      $FileOkayCount++;
      $TotalSize += $SDFilesInfo{FileSize}[$Index];
    }
  }
  if ($FileOkayCount > 1) {
    print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
  }
}

# Retrieve information about SD files...
#
# Fills %SDFilesInfo with one entry per file in @SDFilesList: FileOkay,
# FileSize, FileLastModified, FingerprintsFieldFound and FingerprintsFieldLabel.
# Files which don't exist or aren't SD files are reported with a warning and
# left marked as not okay. The fingerprints field is looked up in the first
# compound of each file. Dies when an existing file can't be opened.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $ModifiedTimeString, $ModifiedDateString, $FingerprintsFieldLabel, $FingerprintsFieldFound);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{FileSize}} = ();
  @{$SDFilesInfo{FileLastModified}} = ();
  @{$SDFilesInfo{FingerprintsFieldFound}} = ();
  @{$SDFilesInfo{FingerprintsFieldLabel}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Defaults stay in place for any file skipped below.
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{FileSize}[$Index] = 0;
    $SDFilesInfo{FileLastModified}[$Index] = '';
    $SDFilesInfo{FingerprintsFieldFound}[$Index] = 0;
    $SDFilesInfo{FingerprintsFieldLabel}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sdf sd")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    my($CmpdString, @CmpdLines, %DataFieldValues);
    $FingerprintsFieldLabel = '';
    $FingerprintsFieldFound = 0;

    # Only the first compound is read to find out which data fields exist.
    open(SDFILE, '<', $SDFile) or die "Error: Couldn't open $SDFile: $! \n";
    $CmpdString = ReadCmpdString(\*SDFILE);
    close SDFILE;
    # Guard against a file without any compound so split never sees undef.
    @CmpdLines = defined($CmpdString) ? split("\n", $CmpdString) : ();
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    if ($OptionsInfo{FingerprintsField} !~ /^UseDefault$/i) {
      # Explicitly specified field label...
      $FingerprintsFieldLabel = $OptionsInfo{FingerprintsField};
      if (exists $DataFieldValues{$FingerprintsFieldLabel}) {
        $FingerprintsFieldFound = 1;
      }
    }
    else {
      # Default: first label containing "Fingerprints". Labels are sorted so the
      # choice doesn't depend on hash ordering when several labels match.
      my($DataFieldLabel);
      DATAFIELDLABEL: for $DataFieldLabel (sort keys %DataFieldValues) {
        if ($DataFieldLabel =~ /Fingerprints/i) {
          $FingerprintsFieldFound = 1;
          $FingerprintsFieldLabel = $DataFieldLabel;
          last DATAFIELDLABEL;
        }
      }
    }
    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{FingerprintsFieldFound}[$Index] = $FingerprintsFieldFound;
    $SDFilesInfo{FingerprintsFieldLabel}[$Index] = $FingerprintsFieldLabel;

    $SDFilesInfo{FileSize}[$Index] = FileSize($SDFile);
    ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SDFile);
    $SDFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
  }
}

# Process option values...
# Translates the raw command line values in %Options into the %OptionsInfo
# settings used by the rest of the script. The "all" option turns on every
# listing, counting and checking flag. Dies when the "Specify" fingerprints
# format mode is used without a valid "--FingerprintsString" value.
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{ListAverageBitDensity} = ($Options{all} || $Options{averagebitdensity}) ? 1 : 0;
  $OptionsInfo{ListBitDensity} = ($Options{all} || $Options{bitdensity}) ? 1 : 0;

  # By default, count number of compounds containing fingerprints data...
  # NOTE(review): this is stored in %Options and not read anywhere in the
  # visible code; kept unchanged - confirm whether %OptionsInfo was intended.
  $Options{CountFingerprints} = 1;
  $OptionsInfo{CountEmptyFingerprints} = ($Options{all} || $Options{empty}) ? 1 : 0;

  $OptionsInfo{CheckFingerprintsData} = ($Options{all} || $Options{datacheck}) ? 1 : 0;
  $OptionsInfo{DetailLevel} = $Options{detail};

  if (IsNotEmpty($Options{fingerprintsfield})) {
    $OptionsInfo{FingerprintsField} = $Options{fingerprintsfield};
  }
  else {
    # No field specified: the field label is detected from the data later.
    $OptionsInfo{FingerprintsField} = 'UseDefault';
  }
  $OptionsInfo{FingerprintsFormatMode} = $Options{fingerprintsformatmode};
  $OptionsInfo{FingerprintsString} = '';
  if ($Options{fingerprintsformatmode} =~ /^Specify$/i) {
    # In Specify mode the string type must be supplied on the command line.
    if (IsEmpty($Options{fingerprintsstring})) {
      die "Error: You must specify a value for \"--FingerprintsString\" option in \"Specify\" \"--FingerprintsFormatMode\". \n";
    }
    if ($Options{fingerprintsstring} !~ /^(Hexadecimal|Binary|RawBinary)$/i) {
      die "Error: The value specified, $Options{fingerprintsstring}, for option \"--FingerprintsString\" is not valid. Allowed values: Hexadecimal, Binary, or RawBinary\n";
    }
    $OptionsInfo{FingerprintsString} = $Options{fingerprintsstring};
  }

  $OptionsInfo{ListFingerprintsType} = ($Options{all} || $Options{fingerprintstype}) ? 1 : 0;
  # The key must match the "fingerprintsstringtype" name given to GetOptions.
  $OptionsInfo{ListFingerprintsStringType} = ($Options{all} || $Options{fingerprintsstringtype}) ? 1 : 0;
  $OptionsInfo{ListFingerprintsSize} = ($Options{all} || $Options{fingerprintssize}) ? 1 : 0;

  $OptionsInfo{ListOnBits} = ($Options{all} || $Options{onbits}) ? 1 : 0;
}

# Setup script usage and retrieve command line arguments specified using various options...
# Parses the command line into %Options, starting from the default values,
# changes into the working directory when one is specified and validates the
# detail level and the fingerprints format mode. Dies on any invalid value.
sub SetupScriptUsage {
  my(@OptionSpecs);

  # Start from the defaults; GetOptions overrides whatever the user specifies.
  %Options = (
    detail => 1,
    fingerprintsformatmode => 'Internal',
  );

  @OptionSpecs = (
    "all|a",
    "averagebitdensity",
    "bitdensity",
    "count",
    "detail|d=i",
    "datacheck",
    "empty|e",
    "fingerprintsfield=s",
    "fingerprintsformatmode=s",
    "fingerprintsstring=s",
    "fingerprintstype",
    "fingerprintsstringtype",
    "fingerprintssize",
    "help|h",
    "onbits",
    "workingdir|w=s",
  );

  # Retrieve all the options...
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch to the working directory before any file names are resolved.
  if ($Options{workingdir}) {
    unless (-d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  unless (IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n";
  }

  unless ($Options{fingerprintsformatmode} =~ /^(Internal|Specify)$/i) {
    die "Error: The value specified, $Options{fingerprintsformatmode}, for option \"--FingerprintsFormatMode\" is not valid. Allowed values: Internal or Specify\n";
  }
}