#!/usr/bin/perl -w
#
# $RCSfile: InfoFingerprintsTextFiles.pl,v $
# $Date: 2008/04/19 16:12:20 $
# $Revision: 1.11 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2008 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use 5.006;
use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use Fingerprints::FingerprintsBitVector;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Process options...
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
my(%TextFilesInfo);
print "Checking input text file(s)...\n";
RetrieveTextFilesInfo();

ProcessColumnsInfo();

# Process input files..
my($FileIndex, $TextFile, $FileProcessingMsg);
$FileProcessingMsg = "Processing file";
if (@TextFilesList > 1) {
  print "Processing text files...\n";
  $FileProcessingMsg = "\n$FileProcessingMsg";
}

for $FileIndex (0 .. $#TextFilesList) {
  if ($TextFilesInfo{FileOkay}[$FileIndex]) {
    $TextFile = $TextFilesList[$FileIndex];
    print "$FileProcessingMsg $TextFile...\n";
    ListFingerprintsTextFileInfo($FileIndex);
  }
}
ListTotalSizeOfFiles();

print "$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# List appropriate information about fingerprints data in one text file...
#
# Parameters:
#   $FileIndex - index of the file in @TextFilesList and in the per-file
#                arrays of %TextFilesInfo.
#
# Reads the file line by line (the first line is treated as the column label
# line and skipped), classifies each data line as valid, invalid or missing
# fingerprints data, optionally prints per line information and finally prints
# a summary for the file. Dies if the file can't be opened.
#
sub ListFingerprintsTextFileInfo {
  my($FileIndex) = @_;
  my($TextFile, $Line, $InDelim, $LineCount, $ValidDataLineCount, $InvalidDataLineCount, $MissingDataLineCount, $FingerprintsColFound, $FingerprintsColNum, $DetailLevel, $UseInternalFormat, $ReportMissingData, $ListTypeInfo, $ListLineInfo, $TotalBitDensity, @LineWords);

  $TextFile = $TextFilesList[$FileIndex];
  open my $TextFileHandle, '<', $TextFile or die "Error: Can't open $TextFile: $! \n";

  $LineCount = 0;
  $ValidDataLineCount = 0;
  $InvalidDataLineCount = 0;
  $MissingDataLineCount = 0;
  $TotalBitDensity = 0;

  $InDelim = $TextFilesInfo{InDelim}[$FileIndex];
  $DetailLevel = $OptionsInfo{DetailLevel};

  $FingerprintsColFound = $TextFilesInfo{FingerprintsColFound}[$FileIndex];
  $FingerprintsColNum = $TextFilesInfo{FingerprintsColNum}[$FileIndex];
  $UseInternalFormat = ($OptionsInfo{FingerprintsFormatMode} =~ /^Internal$/i) ? 1 : 0;

  # Loop invariant option combinations...
  $ReportMissingData = ($OptionsInfo{CheckFingerprintsData} || $OptionsInfo{CountEmptyFingerprints}) ? 1 : 0;
  $ListTypeInfo = ($OptionsInfo{ListFingerprintsType} || $OptionsInfo{ListFingerprintsStringType} || $OptionsInfo{ListFingerprintsSize}) ? 1 : 0;
  $ListLineInfo = ($ListTypeInfo || $OptionsInfo{ListBitDensity} || $OptionsInfo{ListOnBits}) ? 1 : 0;

  # Skip column label line...
  $Line = GetTextLine($TextFileHandle);

  LINE: while ($Line = GetTextLine($TextFileHandle)) {
    $LineCount++;
    @LineWords = quotewords($InDelim, 0, $Line);

    if (!$FingerprintsColFound) {
      # Missing data...
      $MissingDataLineCount++;
      if ($ReportMissingData) {
        _ReportFingerprintsDataProblem($LineCount, $Line, "no fingerprints data", $DetailLevel);
      }
      next LINE;
    }

    # Setup fingerprints bit vector...
    my($FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString, $InvalidFingerprintsData, $FingerprintsBitVector);

    ($FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString, $InvalidFingerprintsData) = _ParseFingerprintsColumnData($LineWords[$FingerprintsColNum], $UseInternalFormat);

    if (!$InvalidFingerprintsData) {
      $FingerprintsBitVector = _NewFingerprintsBitVector($FingerprintsStringType, $FingerprintsString);
      if (!ref $FingerprintsBitVector) {
        # No bit vector could be set up (missing or unknown string type): treat the
        # line as invalid instead of calling methods on a non-object below...
        $InvalidFingerprintsData = 1;
      }
    }

    if ($InvalidFingerprintsData) {
      # Invalid data...
      $InvalidDataLineCount++;
      _ReportFingerprintsDataProblem($LineCount, $Line, "invalid fingerprints data", $DetailLevel);
      next LINE;
    }

    $ValidDataLineCount++;

    my($BitDensity);
    if ($OptionsInfo{ListAverageBitDensity} || $OptionsInfo{ListBitDensity}) {
      $BitDensity = $FingerprintsBitVector->GetFingerprintsBitDensity();
      $TotalBitDensity += $BitDensity;
    }

    next LINE if !$ListLineInfo;

    print "Data line number: $LineCount";

    if ($ListTypeInfo) {
      if ($UseInternalFormat) {
        # Each requested item is listed independently of the others...
        if ($OptionsInfo{ListFingerprintsType}) {
          print "; FPType: $FingerprintsType";
        }
        if ($OptionsInfo{ListFingerprintsStringType}) {
          print "; FPStringType: $FingerprintsStringType";
        }
        if ($OptionsInfo{ListFingerprintsSize}) {
          print "; FPSize: $FingerprintsSize";
        }
      }
      else {
        # Only the user specified string type is available in this mode...
        print "; FPStringType: $FingerprintsStringType";
      }
    }
    if ($OptionsInfo{ListBitDensity}) {
      print "; BitDensity: $BitDensity";
    }
    if ($OptionsInfo{ListOnBits}) {
      my($NumOfOnBits);
      $NumOfOnBits = $FingerprintsBitVector->GetNumOfSetBits();
      print "; NumOfOnBits: $NumOfOnBits";
    }
    print "\n";
  }
  close $TextFileHandle;

  print "\nNumber of lines: $LineCount\n";
  print "Number of columns: $TextFilesInfo{ColCount}[$FileIndex]\n";
  print "Column labels: ", JoinWords(\@{$TextFilesInfo{ColLabels}[$FileIndex]}, ", ", 1), "\n";
  print "Number of lines with valid fingerprints data: $ValidDataLineCount\n";
  if ($OptionsInfo{CountEmptyFingerprints}) {
    print "Number of lines with missing fingerprints data: $MissingDataLineCount\n";
    print "Number of lines with invalid fingerprints data: $InvalidDataLineCount\n";
  }
  if ($OptionsInfo{ListAverageBitDensity} && $ValidDataLineCount) {
    my($AverageBitDensity);
    $AverageBitDensity = $TotalBitDensity/$ValidDataLineCount;
    # Round to two decimal places; adding 0 drops trailing zeros...
    $AverageBitDensity = sprintf("%.2f", $AverageBitDensity) + 0;
    print "Average bit density: $AverageBitDensity\n";
  }

  # File size and modification information...
  print "\nFile size: ", FormatFileSize($TextFilesInfo{FileSize}[$FileIndex]), " \n";
  print "Last modified: ", $TextFilesInfo{FileLastModified}[$FileIndex], " \n";
}

# Print a message about a data line with missing or invalid fingerprints data...
#
# Parameters:
#   $LineCount   - data line number used in the message.
#   $Line        - line text; printed only at detail level 3 or higher.
#   $Problem     - problem description inserted after "contains".
#   $DetailLevel - nothing is printed for detail levels below 1.
#
sub _ReportFingerprintsDataProblem {
  my($LineCount, $Line, $Problem, $DetailLevel) = @_;

  if ($DetailLevel >= 3) {
    print "Line number $LineCount contains $Problem: $Line \n";
  }
  elsif ($DetailLevel >= 1) {
    print "Line number $LineCount contains $Problem...\n";
  }
}

# Split the value of the fingerprints column into its components...
#
# Parameters:
#   $FingerprintsData  - value of the fingerprints column; may be undefined for
#                        lines having fewer columns.
#   $UseInternalFormat - true: value is "Type:StringType:Size:String"; false:
#                        value is the fingerprints string itself and its string
#                        type comes from $OptionsInfo{FingerprintsString}.
#
# Returns: ($Type, $StringType, $Size, $String, $InvalidData). $Type and $Size
# are undefined in non-internal mode; all four components are undefined in
# internal mode when the value doesn't have the expected form. $InvalidData is
# set only when $OptionsInfo{CheckFingerprintsData} is on and a required
# component is empty.
#
sub _ParseFingerprintsColumnData {
  my($FingerprintsData, $UseInternalFormat) = @_;
  my($Type, $StringType, $Size, $String, $InvalidData);

  $FingerprintsData = '' if !defined $FingerprintsData;
  $InvalidData = 0;

  if ($UseInternalFormat) {
    ($Type, $StringType, $Size, $String) = $FingerprintsData =~ /^(.*?):(.*?):(.*?):(.*?)$/;
    if ($OptionsInfo{CheckFingerprintsData}) {
      if (IsEmpty($Type) || IsEmpty($StringType) || IsEmpty($Size) || IsEmpty($String)) {
        $InvalidData = 1;
      }
    }
  }
  else {
    $String = $FingerprintsData;
    $StringType = $OptionsInfo{FingerprintsString};
    if ($OptionsInfo{CheckFingerprintsData} && IsEmpty($String)) {
      $InvalidData = 1;
    }
  }
  return ($Type, $StringType, $Size, $String, $InvalidData);
}

# Set up a fingerprints bit vector from a fingerprints string...
#
# Parameters:
#   $StringType - Hexadecimal/Hex, Binary/Bin or RawBinary/RawBin (case
#                 insensitive); may be undefined.
#   $String     - fingerprints string passed to the matching
#                 FingerprintsBitVector::NewFrom... function.
#
# Returns: whatever the matching NewFrom... function returns, or nothing for
# an undefined or unrecognized string type.
#
sub _NewFingerprintsBitVector {
  my($StringType, $String) = @_;

  return if !defined $StringType;

  if ($StringType =~ /^(Hexadecimal|Hex)$/i) {
    return FingerprintsBitVector::NewFromHexadecimalString($String);
  }
  if ($StringType =~ /^(Binary|Bin)$/i) {
    return FingerprintsBitVector::NewFromBinaryString($String);
  }
  if ($StringType =~ /^(RawBinary|RawBin)$/i) {
    return FingerprintsBitVector::NewFromRawBinaryString($String);
  }
  return;
}

# Total size of all the files...
#
# Prints the combined size of the files marked okay in %TextFilesInfo; nothing
# is printed unless more than one file is okay.
#
sub ListTotalSizeOfFiles {
  my($FileOkayCount, $TotalSize, $Index);

  $FileOkayCount = 0;
  $TotalSize = 0;

  for $Index (0 .. $#TextFilesList) {
    # Count only files which passed the checks; a file name is always true and
    # can't be used to tell okay files from ignored ones...
    if ($TextFilesInfo{FileOkay}[$Index]) {
      $FileOkayCount++;
      $TotalSize += $TextFilesInfo{FileSize}[$Index];
    }
  }
  if ($FileOkayCount > 1) {
    print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
  }
}

# Make sure the specified columns exists in text files...
#
# For each file, sets $TextFilesInfo{FingerprintsColFound}[$Index] to 1 or 0
# and $TextFilesInfo{FingerprintsColNum}[$Index] to the zero based column
# number of the fingerprints column ('' when not found).
#
sub ProcessColumnsInfo {
  my($Index, $ColLabel, $ColFound, $FingerprintsCol, $FingerprintsColNum);

  @{$TextFilesInfo{FingerprintsColFound}} = ();
  @{$TextFilesInfo{FingerprintsColNum}} = ();

  $FingerprintsCol = $OptionsInfo{FingerprintsCol};

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFilesInfo{FingerprintsColFound}[$Index] = 0;
    $TextFilesInfo{FingerprintsColNum}[$Index] = '';

    $ColFound = 0;
    $FingerprintsColNum = '';

    if ($FingerprintsCol =~ /^UseDefault$/i) {
      # First column containing the word Fingerprints in its label...
      COLLABEL: for $ColLabel (@{$TextFilesInfo{ColLabels}[$Index]}) {
        if ($ColLabel =~ /Fingerprints/i) {
          $ColFound = 1;
          $FingerprintsColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
          last COLLABEL;
        }
      }
    }
    elsif ($OptionsInfo{ColMode} =~ /^ColNum$/i) {
      # Is it a valid column number? Specified column numbers start from 1...
      if ($FingerprintsCol <= $TextFilesInfo{ColCount}[$Index]) {
        $ColFound = 1;
        $FingerprintsColNum = $FingerprintsCol - 1;
      }
    }
    elsif ($OptionsInfo{ColMode} =~ /^ColLabel$/i) {
      # Does this column exist?
      if (exists $TextFilesInfo{ColLabelToNumMap}[$Index]{$FingerprintsCol}) {
        $ColFound = 1;
        $FingerprintsColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$FingerprintsCol};
      }
    }

    if ($ColFound) {
      $TextFilesInfo{FingerprintsColFound}[$Index] = 1;
      $TextFilesInfo{FingerprintsColNum}[$Index] = $FingerprintsColNum;
    }
  }
}

# Retrieve information about text files...
#
# Resets %TextFilesInfo and fills in, for each file in @TextFilesList: FileOkay,
# FileSize, FileLastModified, InDelim, ColCount, ColLabels and ColLabelToNumMap.
# Files which don't exist, aren't csv/tsv files or can't be opened are reported
# with a warning and left with FileOkay set to 0.
#
sub RetrieveTextFilesInfo {
  my($TextFile, $Index, $FileDir, $FileExt, $FileName, $InDelim, $Line, $ColNum, $ModifiedTimeString, $ModifiedDateString, @ColLabels);

  %TextFilesInfo = ();
  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{FileSize}} = ();
  @{$TextFilesInfo{FileLastModified}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    # Defaults used for files which end up being ignored...
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{FileSize}[$Index] = 0;
    $TextFilesInfo{FileLastModified}[$Index] = '';
    $TextFilesInfo{ColCount}[$Index] = 0;
    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();
    $TextFilesInfo{InDelim}[$Index] = "";

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a text file\n";
      next FILELIST;
    }

    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);

    # Tab is always the delimiter for tsv files; otherwise use specified delimiter...
    $InDelim = ($FileExt =~ /^tsv$/i) ? "\t" : $OptionsInfo{InDelim};

    my($TextFileHandle);
    if (!open($TextFileHandle, '<', $TextFile)) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    $Line = GetTextLine($TextFileHandle);
    close $TextFileHandle;

    # No column labels are available for a file without any text lines...
    $Line = '' if !defined $Line;
    @ColLabels = quotewords($InDelim, 0, $Line);

    $TextFilesInfo{FileOkay}[$Index] = 1;

    $TextFilesInfo{FileSize}[$Index] = FileSize($TextFile);
    ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($TextFile);
    $TextFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";

    $TextFilesInfo{InDelim}[$Index] = $InDelim;

    $TextFilesInfo{ColCount}[$Index] = scalar @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabels[$ColNum]} = $ColNum;
    }
  }
}

# Process option values...
#
# Translates the command line options collected in %Options into %OptionsInfo
# used by the rest of the script. Dies with an error message for an invalid
# "--FingerprintsCol" or "--FingerprintsString" value.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{ListAverageBitDensity} = ($Options{all} || $Options{averagebitdensity}) ? 1 : 0;
  $OptionsInfo{ListBitDensity} = ($Options{all} || $Options{bitdensity}) ? 1 : 0;

  # By default, count number of rows containing fingerprints data...
  # (This value is not read anywhere in the visible code.)
  $Options{CountFingerprints} = 1;
  $OptionsInfo{CountEmptyFingerprints} = ($Options{all} || $Options{empty}) ? 1 : 0;

  $OptionsInfo{ColMode} = $Options{colmode};
  if (IsNotEmpty($Options{fingerprintscol})) {
    if ($Options{colmode} =~ /^ColNum$/i) {
      if (!IsPositiveInteger($Options{fingerprintscol})) {
        die "Error: Column value, $Options{fingerprintscol}, specified using \"--FingerprintsCol\" is not valid: Allowed integer values: > 0.\n";
      }
    }
    $OptionsInfo{FingerprintsCol} = $Options{fingerprintscol};
  }
  else {
    $OptionsInfo{FingerprintsCol} = 'UseDefault';
  }

  $OptionsInfo{CheckFingerprintsData} = ($Options{all} || $Options{datacheck}) ? 1 : 0;
  $OptionsInfo{DetailLevel} = $Options{detail};

  $OptionsInfo{FingerprintsFormatMode} = $Options{fingerprintsformatmode};
  $OptionsInfo{FingerprintsString} = '';
  if ($Options{fingerprintsformatmode} =~ /^Specify$/i) {
    if (IsEmpty($Options{fingerprintsstring})) {
      die "Error: You must specify a value for \"--FingerprintsString\" option in \"Specify\" \"--FingerprintsFormatMode\". \n";
    }
    if ($Options{fingerprintsstring} !~ /^(Hexadecimal|Binary|RawBinary)$/i) {
      die "Error: The value specified, $Options{fingerprintsstring}, for option \"--FingerprintsString\" is not valid. Allowed values: Hexadecimal, Binary, or RawBinary\n";
    }
    $OptionsInfo{FingerprintsString} = $Options{fingerprintsstring};
  }

  $OptionsInfo{ListFingerprintsType} = ($Options{all} || $Options{fingerprintstype}) ? 1 : 0;
  # Key must match the "fingerprintsstringtype" option name passed to GetOptions...
  $OptionsInfo{ListFingerprintsStringType} = ($Options{all} || $Options{fingerprintsstringtype}) ? 1 : 0;
  $OptionsInfo{ListFingerprintsSize} = ($Options{all} || $Options{fingerprintssize}) ? 1 : 0;

  $OptionsInfo{InDelim} = ($Options{indelim} =~ /semicolon/i) ? "\;" : "\,";

  $OptionsInfo{ListOnBits} = ($Options{all} || $Options{onbits}) ? 1 : 0;
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options with default values, overrides them using command line
# options, changes to the working directory when one is specified and dies with
# an error message for any invalid option value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{colmode} = 'colnum';
  $Options{detail} = 1;
  $Options{fingerprintsformatmode} = 'Internal';
  $Options{indelim} = 'comma';

  my(@OptionsSpecification) = (
    "all|a", "averagebitdensity", "bitdensity", "count", "colmode|c=s",
    "detail|d=i", "datacheck", "empty|e", "fingerprintscol=s",
    "fingerprintsformatmode=s", "fingerprintsstring=s", "fingerprintstype",
    "fingerprintsstringtype", "fingerprintssize", "help|h", "indelim=s",
    "onbits", "workingdir|w=s",
  );
  if (!GetOptions(\%Options, @OptionsSpecification)) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{colmode} !~ /^(ColNum|ColLabel)$/i) {
    die "Error: The value specified, $Options{colmode}, for option \"-c, --ColMode\" is not valid. Allowed values: ColNum, or ColLabel\n";
  }
  if (!IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{fingerprintsformatmode} !~ /^(Internal|Specify)$/i) {
    die "Error: The value specified, $Options{fingerprintsformatmode}, for option \"--FingerprintsFormatMode\" is not valid. Allowed values: Internal or Specify\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--InDelim\" is not valid. Allowed values: comma, or semicolon\n";
  }
}
