MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: InfoFingerprintsTextFiles.pl,v $
   4 # $Date: 2008/04/19 16:12:20 $
   5 # $Revision: 1.11 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileUtil;
  37 use TextUtil;
  38 use Fingerprints::FingerprintsBitVector;
  39 
  40 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  41 
  42 # Autoflush STDOUT
  43 $| = 1;
  44 
  45 # Starting message...
  46 $ScriptName = basename($0);
  47 print "\n$ScriptName: Starting...\n\n";
  48 $StartTime = new Benchmark;
  49 
  50 # Get the options and setup script...
  51 SetupScriptUsage();
  52 if ($Options{help} || @ARGV < 1) {
  53   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  54 }
  55 
  56 my(@TextFilesList);
  57 @TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");
  58 
  59 # Process options...
  60 my(%OptionsInfo);
  61 ProcessOptions();
  62 
  63 # Setup information about input files...
  64 my(%TextFilesInfo);
  65 print "Checking input text file(s)...\n";
  66 RetrieveTextFilesInfo();
  67 
  68 ProcessColumnsInfo();
  69 
  70 # Process input files..
  71 my($FileIndex, $TextFile, $FileProcessingMsg);
  72 $FileProcessingMsg = "Processing file";
  73 if (@TextFilesList > 1) {
  74   print "Processing text files...\n";
  75   $FileProcessingMsg = "\n$FileProcessingMsg";
  76 }
  77 
  78 for $FileIndex (0 .. $#TextFilesList) {
  79   if ($TextFilesInfo{FileOkay}[$FileIndex]) {
  80     $TextFile = $TextFilesList[$FileIndex];
  81     print "$FileProcessingMsg $TextFile...\n";
  82     ListFingerprintsTextFileInfo($FileIndex);
  83   }
  84 }
  85 ListTotalSizeOfFiles();
  86 
  87 print "$ScriptName:Done...\n\n";
  88 
  89 $EndTime = new Benchmark;
  90 $TotalTime = timediff ($EndTime, $StartTime);
  91 print "Total time: ", timestr($TotalTime), "\n";
  92 
  93 ###############################################################################
  94 
   95 # List appropriate information...
  96 #
  97 sub ListFingerprintsTextFileInfo {
  98   my($FileIndex) = @_;
  99   my($TextFile, $Line, $InDelim, $LineCount, $ValidDataLineCount, $InvalidDataLineCount, $MissingDataLineCount, $UseSequentialID, $FingerprintsColFound, $FingerprintsColNum, $DetailLevel, $UseInternalFormat, $FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString, $InvalidFingerprintsData, $TotalBitDensity, $BitDensity, $NumOfOnBits, @LineWords);
 100 
 101   $TextFile = $TextFilesList[$FileIndex];
 102   open TEXTFILE, "$TextFile" or die "Error: Can't open $TextFile: $! \n";
 103 
 104   $LineCount = 0;
 105   $ValidDataLineCount = 0;
 106   $InvalidDataLineCount = 0;
 107   $MissingDataLineCount = 0;
 108   $TotalBitDensity = 0;
 109 
 110   $InDelim = $TextFilesInfo{InDelim}[$FileIndex];
 111   $DetailLevel = $OptionsInfo{DetailLevel};
 112 
 113   $FingerprintsColFound = $TextFilesInfo{FingerprintsColFound}[$FileIndex];
 114   $FingerprintsColNum = $TextFilesInfo{FingerprintsColNum}[$FileIndex];
 115   $UseInternalFormat = ($OptionsInfo{FingerprintsFormatMode} =~ /^Internal$/i) ? 1 : 0;
 116 
 117   # Skip column label line...
 118   $Line = GetTextLine(\*TEXTFILE);
 119 
 120   LINE: while ($Line = GetTextLine(\*TEXTFILE)) {
 121     $LineCount++;
 122     @LineWords = quotewords($InDelim, 0, $Line);
 123 
 124     if (!$FingerprintsColFound) {
 125       # Missing data...
 126       $MissingDataLineCount++;
 127       if ($OptionsInfo{CheckFingerprintsData} || $OptionsInfo{CountEmptyFingerprints}) {
 128 	if ($DetailLevel >= 3) {
 129 	  print "Line number $LineCount contains no fingerprints data: $Line \n";
 130 	}
 131 	elsif ($DetailLevel >= 1) {
 132 	  print "Line number $LineCount contains no fingerprints data...\n";
 133 	}
 134       }
 135       next LINE;
 136     }
 137 
 138     # Setup fingerprints bit vector...
 139     $InvalidFingerprintsData = 0;
 140     if ($UseInternalFormat) {
 141       ($FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString) = $LineWords[$FingerprintsColNum] =~ /^(.*?):(.*?):(.*?):(.*?)$/;
 142       if ($OptionsInfo{CheckFingerprintsData}) {
 143 	if (IsEmpty($FingerprintsType) || IsEmpty($FingerprintsStringType) || IsEmpty($FingerprintsSize) || IsEmpty($FingerprintsString)) {
 144 	  $InvalidFingerprintsData = 1;
 145 	}
 146       }
 147     }
 148     else {
 149       $FingerprintsString = $LineWords[$FingerprintsColNum];
 150       $FingerprintsStringType = $OptionsInfo{FingerprintsString};
 151       if ($OptionsInfo{CheckFingerprintsData} && IsEmpty($FingerprintsString)) {
 152 	$InvalidFingerprintsData = 1;
 153       }
 154     }
 155     if ($InvalidFingerprintsData) {
 156       # InvalidData data...
 157       $InvalidDataLineCount++;
 158       if ($DetailLevel >= 3) {
 159 	print "Line number $LineCount contains invalid fingerprints data: $Line \n";
 160       }
 161       elsif ($DetailLevel >= 1) {
 162 	print "Line number $LineCount contains invalid fingerprints data...\n";
 163       }
 164       next LINE;
 165     }
 166     my($FingerprintsBitVector);
 167 
 168     $FingerprintsBitVector = '';
 169     if ($FingerprintsStringType =~ /^(Hexadecimal|Hex)$/i) {
 170       $FingerprintsBitVector = FingerprintsBitVector::NewFromHexadecimalString($FingerprintsString);
 171     }
 172     elsif ($FingerprintsStringType =~ /^(Binary|Bin)$/i) {
 173       $FingerprintsBitVector = FingerprintsBitVector::NewFromBinaryString($FingerprintsString);
 174     }
 175     elsif ($FingerprintsStringType =~ /^(RawBinary|RawBin)$/i) {
 176       $FingerprintsBitVector = FingerprintsBitVector::NewFromRawBinaryString($FingerprintsString);
 177     }
 178 
 179     $ValidDataLineCount++;
 180     if ($OptionsInfo{ListAverageBitDensity} || $OptionsInfo{ListBitDensity}) {
 181       $BitDensity = $FingerprintsBitVector->GetFingerprintsBitDensity();
 182       $TotalBitDensity += $BitDensity;
 183     }
 184 
 185     if ($OptionsInfo{ListFingerprintsType} || $OptionsInfo{ListFingerprintsStringType} || $OptionsInfo{ListFingerprintsSize} || $OptionsInfo{ListBitDensity} || $OptionsInfo{ListOnBits}) {
 186       print "Data line number: $LineCount";
 187 
 188       if ($OptionsInfo{ListFingerprintsType} || $OptionsInfo{ListFingerprintsStringType} || $OptionsInfo{ListFingerprintsSize}) {
 189 	if ($UseInternalFormat) {
 190 	  if ($OptionsInfo{ListFingerprintsType}) {
 191 	    print "; FPType: $FingerprintsType";
 192 	  }
 193 	  elsif ($OptionsInfo{ListFingerprintsStringType}) {
 194 	    print "; FPStringType: $FingerprintsStringType";
 195 	  }
 196 	  elsif ($OptionsInfo{ListFingerprintsSize}) {
 197 	    print "; FPSize: $FingerprintsSize";
 198 	  }
 199 	}
 200 	else {
 201 	  print "; FPStringType: $FingerprintsStringType";
 202 	}
 203       }
 204       if ($OptionsInfo{ListBitDensity}) {
 205 	print "; BitDensity: $BitDensity";
 206       }
 207       if ($OptionsInfo{ListOnBits}) {
 208 	$NumOfOnBits = $FingerprintsBitVector->GetNumOfSetBits();
 209 	print "; NumOfOnBits: $NumOfOnBits";
 210       }
 211       print "\n";
 212     }
 213   }
 214   close TEXTFILE;
 215 
 216   print "\nNumber of lines: $LineCount\n";
 217   print "Number of columns: $TextFilesInfo{ColCount}[$FileIndex]\n";
 218   print "Column labels: ", JoinWords(\@{$TextFilesInfo{ColLabels}[$FileIndex]}, ", ", 1), "\n";
 219   print "Number of lines with valid fingerprints data: $ValidDataLineCount\n";
 220   if ($OptionsInfo{CountEmptyFingerprints}) {
 221     print "Number of lines with missing fingerprints data: $MissingDataLineCount\n";
 222     print "Number of lines with invalid fingerprints data: $InvalidDataLineCount\n";
 223   }
 224   if ($OptionsInfo{ListAverageBitDensity} && $ValidDataLineCount) {
 225     my($AverageBitDensity);
 226     $AverageBitDensity = $TotalBitDensity/$ValidDataLineCount;
 227     $AverageBitDensity = sprintf("%.2f", $AverageBitDensity) + 0;
 228     print "Average bit density: $AverageBitDensity\n";
 229   }
 230 
 231   # File size and modification information...
 232   print "\nFile size: ", FormatFileSize($TextFilesInfo{FileSize}[$FileIndex]), " \n";
 233   print "Last modified: ", $TextFilesInfo{FileLastModified}[$FileIndex], " \n";
 234 }
 235 
  236 # Total size of all the files...
 237 sub ListTotalSizeOfFiles {
 238   my($FileOkayCount, $TotalSize, $Index);
 239 
 240   $FileOkayCount = 0;
 241   $TotalSize = 0;
 242 
 243   for $Index (0 .. $#TextFilesList) {
 244     if ($TextFilesList[$Index]) {
 245       $FileOkayCount++;
 246       $TotalSize += $TextFilesInfo{FileSize}[$Index];
 247     }
 248   }
 249   if ($FileOkayCount > 1) {
 250     print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
 251   }
 252 }
 253 
  254 # Make sure the specified column exists in text files...
 255 sub ProcessColumnsInfo {
 256   my($Index, $TextFile, $ColLabel, $ColFound, $FingerprintsCol, $FingerprintsColNum);
 257 
 258   @{$TextFilesInfo{FingerprintsColFound}} = ();
 259   @{$TextFilesInfo{FingerprintsColNum}} = ();
 260 
 261   FILELIST: for $Index (0 .. $#TextFilesList) {
 262     $TextFile = $TextFilesList[$Index];
 263 
 264     $TextFilesInfo{FingerprintsColFound}[$Index] = 0;
 265     $TextFilesInfo{FingerprintsColNum}[$Index] = '';
 266 
 267     # FingerprintsCol...
 268     $FingerprintsColNum = '';
 269     $FingerprintsCol = $OptionsInfo{FingerprintsCol};
 270 
 271     $ColFound = 0;
 272     if ($FingerprintsCol =~ /^UseDefault$/i) {
 273       # First column containing the word Fingerprints in its label...
 274       COLLABEL: for $ColLabel (@{$TextFilesInfo{ColLabels}[$Index]}) {
 275 	if ($ColLabel =~ /Fingerprints/i) {
 276 	  $ColFound = 1;
 277 	  $FingerprintsColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
 278 	  last COLLABEL;
 279 	}
 280       }
 281     }
 282     else {
 283       if ($OptionsInfo{ColMode} =~ /^ColNum$/i) {
 284 	# Is it a valid column number...
 285 	if ($FingerprintsCol <= $TextFilesInfo{ColCount}[$Index]) {
 286 	  $ColFound = 1;
 287 	  $FingerprintsColNum = $FingerprintsCol - 1;
 288 	}
 289       }
 290       elsif ($OptionsInfo{ColMode} =~ /^ColLabel$/i) {
 291 	# Does this column exists?
 292 	if (exists $TextFilesInfo{ColLabelToNumMap}[$Index]{$FingerprintsCol}) {
 293 	  $ColFound = 1;
 294 	  $FingerprintsColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$FingerprintsCol};
 295 	}
 296       }
 297     }
 298     if ($ColFound) {
 299       $TextFilesInfo{FingerprintsColFound}[$Index] = 1;
 300       $TextFilesInfo{FingerprintsColNum}[$Index] = $FingerprintsColNum;
 301     }
 302   }
 303 }
 304 
 305 # Retrieve information about text files...
 306 #
 307 sub RetrieveTextFilesInfo {
 308   my($TextFile, $Index, $FileDir, $FileExt, $FileName, $InDelim, $Line, $ColNum, $ColLabel, $ModifiedTimeString, $ModifiedDateString, @ColLabels);
 309 
 310   %TextFilesInfo = ();
 311   @{$TextFilesInfo{FileOkay}} = ();
 312   @{$TextFilesInfo{FileSize}} = ();
 313   @{$TextFilesInfo{FileLastModified}} = ();
 314   @{$TextFilesInfo{ColCount}} = ();
 315   @{$TextFilesInfo{ColLabels}} = ();
 316   @{$TextFilesInfo{ColLabelToNumMap}} = ();
 317   @{$TextFilesInfo{InDelim}} = ();
 318 
 319   FILELIST: for $Index (0 .. $#TextFilesList) {
 320     $TextFile = $TextFilesList[$Index];
 321 
 322     $TextFilesInfo{FileOkay}[$Index] = 0;
 323     $TextFilesInfo{FileSize}[$Index] = 0;
 324     $TextFilesInfo{FileLastModified}[$Index] = '';
 325     $TextFilesInfo{ColCount}[$Index] = 0;
 326     @{$TextFilesInfo{ColLabels}[$Index]} = ();
 327     %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();
 328     $TextFilesInfo{InDelim}[$Index] = "";
 329 
 330     $TextFile = $TextFilesList[$Index];
 331     if (!(-e $TextFile)) {
 332       warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
 333       next FILELIST;
 334     }
 335     if (!CheckFileType($TextFile, "csv tsv")) {
 336       warn "Warning: Ignoring file $TextFile: It's not a text file\n";
 337       next FILELIST;
 338     }
 339 
 340     $FileDir = ""; $FileName = ""; $FileExt = "";
 341     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 342 
 343     if ($FileExt =~ /^tsv$/i) {
 344       $InDelim = "\t";
 345     }
 346     else {
 347       $InDelim = $OptionsInfo{InDelim};
 348     }
 349 
 350     if (!open TEXTFILE, "$TextFile") {
 351       warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
 352       next FILELIST;
 353     }
 354     $Line = GetTextLine(\*TEXTFILE);
 355     @ColLabels = quotewords($InDelim, 0, $Line);
 356     close TEXTFILE;
 357 
 358     $TextFilesInfo{FileOkay}[$Index] = 1;
 359 
 360     $TextFilesInfo{FileSize}[$Index] = FileSize($TextFile);
 361     ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($TextFile);
 362     $TextFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
 363 
 364     $TextFilesInfo{InDelim}[$Index] = $InDelim;
 365 
 366     $TextFilesInfo{ColCount}[$Index] = scalar @ColLabels;
 367     push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
 368     for $ColNum (0 .. $#ColLabels) {
 369       $ColLabel = $ColLabels[$ColNum];
 370       $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
 371     }
 372   }
 373 }
 374 
 375 # Process option values...
 376 sub ProcessOptions {
 377   %OptionsInfo = ();
 378 
 379   $OptionsInfo{ListAverageBitDensity} = ($Options{all} || $Options{averagebitdensity}) ? 1 :0;
 380   $OptionsInfo{ListBitDensity} = ($Options{all} || $Options{bitdensity}) ? 1 :0;
 381 
 382   # By default, count number of rows containing fingerprints data...
 383   $Options{CountFingerprints} = 1;
 384   $OptionsInfo{CountEmptyFingerprints} = ($Options{all} || $Options{empty}) ? 1 :0;
 385 
 386   $OptionsInfo{ColMode} = $Options{colmode};
 387   if (IsNotEmpty($Options{fingerprintscol})) {
 388     if ($Options{colmode} =~ /^ColNum$/i) {
 389       if (!IsPositiveInteger($Options{fingerprintscol})) {
 390 	die "Error: Column value, $Options{fingerprintscol}, specified using \"--FingerprintsCol\" is not valid: Allowed integer values: > 0.\n";
 391       }
 392     }
 393     $OptionsInfo{FingerprintsCol} = $Options{fingerprintscol};
 394   }
 395   else {
 396     $OptionsInfo{FingerprintsCol} = 'UseDefault';
 397   }
 398 
 399   $OptionsInfo{CheckFingerprintsData} = ($Options{all} || $Options{datacheck}) ? 1 :0;
 400   $OptionsInfo{DetailLevel} = $Options{detail};
 401 
 402   $OptionsInfo{FingerprintsFormatMode} = $Options{fingerprintsformatmode};
 403   $OptionsInfo{FingerprintsString} = '';
 404   if ($Options{fingerprintsformatmode} =~ /^Specify$/i) {
 405     if (IsEmpty($Options{fingerprintsstring})) {
 406       die "Error: You must specify a value for \"--FingerprintsString\" option in \"Specify\" \"--FingerprintsFormatMode\". \n";
 407     }
 408     if ($Options{fingerprintsstring} !~ /^(Hexadecimal|Binary|RawBinary)$/i) {
 409       die "Error: The value specified, $Options{fingerprintsstring}, for option \"--FingerprintsString\" is not valid. Allowed values: Hexadecimal, Binary, or RawBinary\n";
 410     }
 411     $OptionsInfo{FingerprintsString} = $Options{fingerprintsstring};
 412   }
 413 
 414   $OptionsInfo{ListFingerprintsType} = ($Options{all} || $Options{fingerprintstype}) ? 1 :0;
 415   $OptionsInfo{ListFingerprintsStringType} = ($Options{all} || $Options{fingerprintstringstype}) ? 1 :0;
 416   $OptionsInfo{ListFingerprintsSize} = ($Options{all} || $Options{fingerprintssize}) ? 1 :0;
 417 
 418   $OptionsInfo{InDelim} = ($Options{indelim} =~ /semicolon/i) ? "\;" : "\,";
 419 
 420   $OptionsInfo{ListOnBits} = ($Options{all} || $Options{onbits}) ? 1 :0;
 421 }
 422 
 423 # Setup script usage  and retrieve command line arguments specified using various options...
 424 sub SetupScriptUsage {
 425 
 426   # Retrieve all the options...
 427   %Options = ();
 428 
 429   $Options{colmode} = 'colnum';
 430   $Options{detail} = 1;
 431   $Options{fingerprintsformatmode} = 'Internal';
 432   $Options{indelim} = 'comma';
 433 
 434   if (!GetOptions(\%Options, "all|a", "averagebitdensity", "bitdensity", "count", "colmode|c=s", "detail|d=i", "datacheck", "empty|e", "fingerprintscol=s", "fingerprintsformatmode=s", "fingerprintsstring=s", "fingerprintstype", "fingerprintsstringtype", "fingerprintssize", "help|h", "indelim=s", "onbits", "workingdir|w=s")) {
 435     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 436   }
 437   if ($Options{workingdir}) {
 438     if (! -d $Options{workingdir}) {
 439       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 440     }
 441     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 442   }
 443   if ($Options{colmode} !~ /^(ColNum|ColLabel)$/i) {
 444     die "Error: The value specified, $Options{colmode}, for option \"-c, --ColMode\" is not valid. Allowed values: ColNum, or ColLabel\n";
 445   }
 446   if (!IsPositiveInteger($Options{detail})) {
 447     die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n";
 448   }
 449   if ($Options{fingerprintsformatmode} !~ /^(Internal|Specify)$/i) {
 450     die "Error: The value specified, $Options{fingerprintsformatmode}, for option \"--FingerprintsFormatMode\" is not valid. Allowed values: Internal or Specify\n";
 451   }
 452   if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 453     die "Error: The value specified, $Options{indelim}, for option \"--InDelim\" is not valid. Allowed values: comma, or semicolon\n";
 454   }
 455 }
 456