MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: InfoTextFiles.pl,v $
   4 # $Date: 2008/01/30 21:44:48 $
   5 # $Revision: 1.16 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileUtil;
  37 use TextUtil;
  38 
  39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  40 
  41 # Autoflush STDOUT
  42 $| = 1;
  43 
  44 # Starting message...
  45 $ScriptName = basename($0);
  46 print "\n$ScriptName: Starting...\n\n";
  47 $StartTime = new Benchmark;
  48 
  49 # Get the options and setup script...
  50 SetupScriptUsage();
  51 if ($Options{help} || @ARGV < 1) {
  52   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  53 }
  54 
  55 my(@TextFilesList);
  56 @TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");
  57 
  58 my($DetailLevel, $ParseLines, $CountEmpty, $CheckData, $CheckNumericalData, @SpecifiedNumericalDataCols);
  59 ProcessOptions();
  60 
  61 print "Checking input text file(s)...\n";
  62 my(@TextFilesOkay, @TextFilesColCount, @TextFilesSize, @TextFilesLastModified, @TextFilesColLabels, @TextFilesColLabelToNumMap, @TextFilesInDelim);
  63 RetrieveTextFilesInfo();
  64 
  65 my(@TextFilesNumericalDataColNums, @TextFilesNumericalDataColLabels);
  66 ProcessColumnsInfo();
  67 
  68 # Generate output files...
  69 my($Index, $TextFile);
  70 if (@TextFilesList > 1) {
  71   print "Processing text files...\n";
  72 }
  73 for $Index (0 .. $#TextFilesList) {
  74   if ($TextFilesOkay[$Index]) {
  75     $TextFile = $TextFilesList[$Index];
  76     if (@TextFilesList > 1) {
  77       print "\nProcessing file $TextFile...\n";
  78     }
  79     else {
  80       print "Processing file $TextFile...\n"
  81     }
  82     ListTextFileInfo($Index);
  83   }
  84 }
  85 
  86 ListTotalSizeOfFiles();
  87 
  88 print "$ScriptName:Done...\n\n";
  89 
  90 $EndTime = new Benchmark;
  91 $TotalTime = timediff ($EndTime, $StartTime);
  92 print "Total time: ", timestr($TotalTime), "\n";
  93 
  94 ###############################################################################
  95 
  96 # List appropriate information...
  97 sub ListTextFileInfo {
  98   my($Index) = @_;
  99   my($TextFile,  $Line, $InDelim, $LineCount, $EmptyLinesCount, $EmptyColDataLinesCount, $GreaterThanMaxColLinesCount, $Label, $Value, $ColNum, $EmptyColValueFound, $PrintTextLine, $NonNumericalDataFound, @ColLabels, @LineWords, %EmptyColValuesCountMap, %NonEmptyColValuesCountMap, %SpecifiedNonNumericalColValuesCountMap, %NonNumericalColValuesCountMap, %NumericalColValuesCountMap,);
 100 
 101   $TextFile = $TextFilesList[$Index];
 102   $InDelim = $TextFilesInDelim[$Index];
 103   @ColLabels = @{$TextFilesColLabels[$Index]};
 104 
 105   open TEXTFILE, "$TextFile" or die "Error: Can't open $TextFile: $! \n";
 106 
 107   $LineCount = 0;
 108   $EmptyLinesCount = 0;
 109   $EmptyColDataLinesCount = 0;
 110   $GreaterThanMaxColLinesCount = 0;
 111 
 112   %EmptyColValuesCountMap = ();
 113   %NonEmptyColValuesCountMap = ();
 114   %SpecifiedNonNumericalColValuesCountMap = ();
 115   %NonNumericalColValuesCountMap = ();
 116   %NumericalColValuesCountMap = ();
 117 
 118   if ($ParseLines) {
 119     # Skip over column labels from old file...
 120     if (<TEXTFILE>) {
 121       $LineCount++;
 122       LINE: while ($Line = <TEXTFILE>) {
 123 	$LineCount++;
 124 	$PrintTextLine = 0;
 125 	$Line =~ s/(\r\n)|(\r)|\n//g;
 126 	@LineWords = quotewords($InDelim, 0, $Line);
 127 	if ($CountEmpty) {
 128 	  # Count lines with no data...
 129 	  if (!@LineWords) {
 130 	    $EmptyLinesCount++;
 131 	    if ($DetailLevel >= 2) {
 132 	      print "Line number $LineCount is empty...\n";
 133 	    }
 134 	    next LINE;
 135 	  }
 136 	  # Count lines with empty data for some columns...
 137 	  $EmptyColValueFound = 0;
 138 	  VALUE: for $Value (@LineWords) {
 139 	      if (!IsNotEmpty($Value)) {
 140 		$EmptyColValueFound = 1;
 141 		next VALUE;
 142 	      }
 143 	  }
 144 	  if ($EmptyColValueFound) {
 145 	    $EmptyColDataLinesCount++;
 146 	    if ($DetailLevel >= 2) {
 147 	      print "Line number $LineCount contains empty column value(s)...\n";
 148 	    }
 149 	    $PrintTextLine = ($DetailLevel >= 3) ? 1 : 0;
 150 	  }
 151 	  # Count lines with columns greater than the column label line...
 152 	  if (@LineWords > @ColLabels) {
 153 	    $GreaterThanMaxColLinesCount++;
 154 	    if ($DetailLevel >= 2) {
 155 	      print "Line number $LineCount contains more than ", scalar(@ColLabels), " columns...\n";
 156 	    }
 157 	    $PrintTextLine = ($DetailLevel >= 3) ? 1 : 0;
 158 	  }
 159 	  # Count empty values for each coulmn...
 160 	  for $ColNum (0 .. $#LineWords) {
 161 	    if ($ColNum < @ColLabels) {
 162 	      $Label = $ColLabels[$ColNum];
 163 	      if (IsNotEmpty($LineWords[$ColNum])) {
 164 		if (exists($NonEmptyColValuesCountMap{$Label})) {
 165 		  $NonEmptyColValuesCountMap{$Label} += 1;
 166 		}
 167 		else {
 168 		  $NonEmptyColValuesCountMap{$Label} = 1;
 169 		}
 170 	      }
 171 	      else {
 172 		$PrintTextLine = ($DetailLevel >= 3) ? 1 : 0;
 173 		if (exists($EmptyColValuesCountMap{$Label})) {
 174 		  $EmptyColValuesCountMap{$Label} += 1;
 175 		}
 176 		else {
 177 		  $EmptyColValuesCountMap{$Label} = 1;
 178 		}
 179 	      }
 180 	    }
 181 	  }
 182 	}
 183 	if ($CheckData) {
 184 	  for $ColNum (0 .. $#LineWords) {
 185 	    if ($ColNum < @ColLabels) {
 186 	      if (IsNumerical($LineWords[$ColNum])) {
 187 		$Label = $ColLabels[$ColNum];
 188 		if (exists($NumericalColValuesCountMap{$Label})) {
 189 		  $NumericalColValuesCountMap{$Label} += 1;
 190 		}
 191 		else {
 192 		  $NumericalColValuesCountMap{$Label} = 1;
 193 		}
 194 	      }
 195 	      else {
 196 		$Label = $ColLabels[$ColNum];
 197 		if (IsNotEmpty($LineWords[$ColNum])) {
 198 		  if (exists($NonNumericalColValuesCountMap{$Label})) {
 199 		    $NonNumericalColValuesCountMap{$Label} += 1;
 200 		  }
 201 		  else {
 202 		    $NonNumericalColValuesCountMap{$Label} = 1;
 203 		  }
 204 		}
 205 	      }
 206 	    }
 207 	  }
 208 	}
 209 	if ($CheckNumericalData) {
 210 	  $NonNumericalDataFound = 0;
 211 	  for $ColNum (@{$TextFilesNumericalDataColNums[$Index]}) {
 212 	    if ($ColNum < @LineWords) {
 213 	      if (!IsNumerical($LineWords[$ColNum])) {
 214 		$NonNumericalDataFound = 1;
 215 		$Label = $ColLabels[$ColNum];
 216 		if (exists($SpecifiedNonNumericalColValuesCountMap{$Label})) {
 217 		  $SpecifiedNonNumericalColValuesCountMap{$Label} += 1;
 218 		}
 219 		else {
 220 		  $SpecifiedNonNumericalColValuesCountMap{$Label} = 1;
 221 		}
 222 	      }
 223 	    }
 224 	  }
 225 	  if ($NonNumericalDataFound) {
 226 	    $PrintTextLine = ($DetailLevel >= 3) ? 1 : 0;
 227 	    if ($DetailLevel >=2 ) {
 228 	      print "Line number $LineCount contains non-numerical data for some specified column(s)...\n";
 229 	    }
 230 	  }
 231 	}
 232 	if ($PrintTextLine) {
 233 	  print "Line $LineCount: $Line\n\n";
 234 	}
 235       }
 236     }
 237   }
 238   else {
 239     while (<TEXTFILE>) {
 240       $LineCount++;
 241     }
 242   }
 243   close TEXTFILE;
 244   print "\nNumber of lines: $LineCount\n";
 245   print "Number of columns: $TextFilesColCount[$Index]\n";
 246   print "Column labels: ", JoinWords(\@ColLabels, ", ", 1), "\n";
 247   if ($CountEmpty) {
 248     print "\nNumber of lines with no data: $EmptyLinesCount\n";
 249     print "Number of lines with some missing column data: $EmptyColDataLinesCount\n";
 250     print "Number of lines containing greater than ", scalar(@ColLabels), " columns: $GreaterThanMaxColLinesCount\n";
 251     PrintDataInformation("Number of non-empty values for each column(s)", \@ColLabels, \%NonEmptyColValuesCountMap);
 252     PrintDataInformation("Number of empty values for each column(s)", \@ColLabels, \%EmptyColValuesCountMap);
 253   }
 254   if ($CheckData) {
 255     print "\n";
 256     PrintDataInformation("Number of non-numerical data values for each column(s)", \@ColLabels, \%NonNumericalColValuesCountMap);
 257     PrintDataInformation("Number of numerical data values for each column(s)", \@ColLabels, \%NumericalColValuesCountMap);
 258     print "\n";
 259   }
 260   if ($CheckNumericalData && @{$TextFilesNumericalDataColLabels[$Index]}) {
 261     PrintDataInformation("Number of non-numerical data values for each column(s)", \@{$TextFilesNumericalDataColLabels[$Index]}, \%SpecifiedNonNumericalColValuesCountMap);
 262   }
 263 
 264   # File size and modification information...
 265   print "\nFile size: ", FormatFileSize($TextFilesSize[$Index]), " \n";
 266   print "Last modified: ", $TextFilesLastModified[$Index], " \n";
 267 }
 268 
 269 # Total size of all the fiels...
 270 sub ListTotalSizeOfFiles {
 271   my($FileOkayCount, $TotalSize, $Index);
 272 
 273   $FileOkayCount = 0;
 274   $TotalSize = 0;
 275 
 276   for $Index (0 .. $#TextFilesList) {
 277     if ($TextFilesOkay[$Index]) {
 278       $FileOkayCount++;
 279       $TotalSize += $TextFilesSize[$Index];
 280     }
 281   }
 282   if ($FileOkayCount > 1) {
 283     print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
 284   }
 285 }
 286 
 287 
 288 # List data information...
 289 sub PrintDataInformation {
 290   my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;
 291   my($Line, $Label);
 292 
 293   $Line = "";
 294   for $Label (@{$DataLabelRef}) {
 295     $Line .= " \"$Label\" - " . (exists($DataLabelToValueMapRef->{$Label}) ? $DataLabelToValueMapRef->{$Label} : 0) . ",";
 296   }
 297   $Line =~ s/\,$//g;
 298   print "$InfoLabel: $Line\n";
 299 }
 300 
 301 # Process option values...
 302 sub ProcessOptions {
 303   $DetailLevel = $Options{detail};
 304 
 305   $ParseLines = ($Options{all} || $Options{empty} || $Options{numericaldatacols}) ? 1 : 0;
 306   $CountEmpty = ($Options{all} || $Options{empty}) ? 1 : 0;
 307   $CheckData = ($Options{all} || $Options{datacheck}) ? 1 : 0;
 308   $CheckNumericalData = ($Options{all} || $Options{numericaldatacols}) ? 1 : 0;
 309 
 310   @SpecifiedNumericalDataCols = ();
 311   if ($Options{numericaldatacols}) {
 312     @SpecifiedNumericalDataCols = split ",", $Options{numericaldatacols};
 313     if ($Options{mode} =~ /^colnum$/i) {
 314       my($ColNum);
 315       for $ColNum (@SpecifiedNumericalDataCols) {
 316 	if (!IsPositiveInteger($ColNum)) {
 317 	  die "Error: Invalid value $ColNum specified using \"--numericaldatacols\" option: Allowed values: > 0\n";
 318 	}
 319       }
 320     }
 321   }
 322 
 323 }
 324 
 325 # Retrieve information about input text files...
 326 sub RetrieveTextFilesInfo {
 327   my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels,  $ColNum, $ColLabel, $ModifiedTimeString, $ModifiedDateString);
 328 
 329   @TextFilesOkay = ();
 330   @TextFilesColCount = (); @TextFilesColLabels = ();
 331   @TextFilesColLabelToNumMap = ();
 332   @TextFilesInDelim = ();
 333   @TextFilesSize = ();
 334   @TextFilesLastModified = ();
 335 
 336  FILELIST: for $Index (0 .. $#TextFilesList) {
 337     $TextFile = $TextFilesList[$Index];
 338     $TextFilesOkay[$Index] = 0;
 339     $TextFilesColCount[$Index] = 0;
 340     $TextFilesInDelim[$Index] = "";
 341     $TextFilesSize[$Index] = 0;
 342     $TextFilesLastModified[$Index] = '';
 343     @{$TextFilesColLabels[$Index]} = ();
 344     %{$TextFilesColLabelToNumMap[$Index]} = ();
 345     if (!(-e $TextFile)) {
 346       warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
 347       next FILELIST;
 348     }
 349     if (!CheckFileType($TextFile, "csv tsv")) {
 350       warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
 351       next FILELIST;
 352     }
 353     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 354     if ($FileExt =~ /^tsv$/i) {
 355       $InDelim = "\t";
 356     }
 357     else {
 358       $InDelim = "\,";
 359       if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 360 	warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
 361 	next FILELIST;
 362       }
 363       if ($Options{indelim} =~ /^semicolon$/i) {
 364 	$InDelim = "\;";
 365       }
 366     }
 367 
 368     if (!open TEXTFILE, "$TextFile") {
 369       warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
 370       next FILELIST;
 371     }
 372 
 373     $Line = GetTextLine(\*TEXTFILE);
 374     @ColLabels = quotewords($InDelim, 0, $Line);
 375     close TEXTFILE;
 376 
 377     $TextFilesOkay[$Index] = 1;
 378     $TextFilesInDelim[$Index] = $InDelim;
 379 
 380     $TextFilesColCount[$Index] = @ColLabels;
 381     push @{$TextFilesColLabels[$Index]}, @ColLabels;
 382     for $ColNum (0 .. $#ColLabels) {
 383       $ColLabel = $ColLabels[$ColNum];
 384       $TextFilesColLabelToNumMap[$Index]{$ColLabel} = $ColNum;
 385     }
 386     $TextFilesSize[$Index] = FileSize($TextFile);
 387     ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($TextFile);
 388     $TextFilesLastModified[$Index] = "$ModifiedTimeString; $ModifiedDateString";
 389   }
 390 
 391 }
 392 
 393 # Make sure specified numerical data columns are okay...
 394 sub ProcessColumnsInfo {
 395   my($Index, $TextFile);
 396 
 397   @TextFilesNumericalDataColNums = ();
 398   @TextFilesNumericalDataColLabels = ();
 399  FILELIST: for $Index (0 .. $#TextFilesList) {
 400     $TextFile = $TextFilesList[$Index];
 401     @{$TextFilesNumericalDataColNums[$Index]} = ();
 402     @{$TextFilesNumericalDataColLabels[$Index]} = ();
 403 
 404     if ($TextFilesOkay[$Index]) {
 405       my($SpecifiedColNum, $ColNum, $ColLabel, @SpecifiedColNums, @SpecifiedColLabels);
 406       @SpecifiedColNums = ();
 407       if ($Options{mode} =~ /^colnum$/i) {
 408 	for $SpecifiedColNum (@SpecifiedNumericalDataCols) {
 409 	  if ($SpecifiedColNum <= $TextFilesColCount[$Index]) {
 410 	    $ColNum = $SpecifiedColNum - 1;
 411 	    push @SpecifiedColNums, $ColNum;
 412 	    push @SpecifiedColLabels, $TextFilesColLabels[$Index][$ColNum];
 413 	  }
 414 	}
 415       }
 416       else {
 417 	for $ColLabel (@SpecifiedNumericalDataCols) {
 418 	  if (exists($TextFilesColLabelToNumMap[$Index]{$ColLabel})) {
 419 	    $ColNum = $TextFilesColLabelToNumMap[$Index]{$ColLabel};
 420 	    push @SpecifiedColNums, $ColNum;
 421 	    push @SpecifiedColLabels, $ColLabel;
 422 	  }
 423 	}
 424       }
 425       if (@SpecifiedColNums) {
 426 	push @{$TextFilesNumericalDataColNums[$Index]}, @SpecifiedColNums;
 427 	push @{$TextFilesNumericalDataColLabels[$Index]}, @SpecifiedColLabels;
 428       }
 429     }
 430   }
 431 }
 432 
 433 
 434 # Setup script usage  and retrieve command line arguments specified using various options...
 435 sub SetupScriptUsage {
 436 
 437   # Retrieve all the options...
 438   %Options = ();
 439   $Options{detail} = 1;
 440   $Options{mode} = "colnum";
 441   $Options{indelim} = "comma";
 442   if (!GetOptions(\%Options, "all|a", "count|c", "datacheck", "detail|d=i", "empty|e", "help|h", "indelim=s", "mode|m=s", "numericaldatacols|n=s", "workingdir|w=s")) {
 443     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 444   }
 445   if ($Options{workingdir}) {
 446     if (! -d $Options{workingdir}) {
 447       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 448     }
 449     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 450   }
 451   if ($Options{mode} !~ /^(colnum|collabel)$/i) {
 452     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum or collabel\n";
 453   }
 454   if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 455     die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
 456   }
 457   if (!IsPositiveInteger($Options{detail})) {
 458     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
 459   }
 460 }
 461