MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: SortTextFiles.pl,v $
   4 # $Date: 2008/01/30 21:45:03 $
   5 # $Revision: 1.23 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileUtil;
  37 use TextUtil;
  38 
  39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  40 
  41 # Autoflush STDOUT
  42 $| = 1;
  43 
  44 # Starting message...
  45 $ScriptName = basename($0);
  46 print "\n$ScriptName: Starting...\n\n";
  47 $StartTime = new Benchmark;
  48 
  49 # Get the options and setup script...
  50 SetupScriptUsage();
  51 if ($Options{help} || @ARGV < 1) {
  52   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  53 }
  54 
  55 my(@TextFilesList);
  56 @TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");
  57 
  58 my($DetailLevel, $OutDelim, $OutQuote, $SpecifiedKeyCol);
  59 ProcessOptions();
  60 
  61 print "Checking input text file(s)...\n";
  62 my(@TextFilesOkay, @TextFilesColCount, @TextFilesColLabels, @TextFilesColLabelToNumMap, @TextFilesInDelim, @TextFilesOutFile);
  63 RetrieveTextFilesInfo();
  64 
  65 my(@TextFilesKeyColNum);
  66 ProcessColumnsInfo();
  67 
  68 # Generate output files...
  69 my($Index, $TextFile);
  70 if (@TextFilesList > 1) {
  71   print "Processing text files...\n";
  72 }
  73 for $Index (0 .. $#TextFilesList) {
  74   if ($TextFilesOkay[$Index]) {
  75     $TextFile = $TextFilesList[$Index];
  76     if (@TextFilesList > 1) {
  77       print "\nProcessing file $TextFile...\n";
  78     }
  79     else {
  80       print "Processing file $TextFile...\n"
  81     }
  82     SortTextFile($Index);
  83   }
  84 }
  85 
  86 print "$ScriptName:Done...\n\n";
  87 
  88 $EndTime = new Benchmark;
  89 $TotalTime = timediff ($EndTime, $StartTime);
  90 print "Total time: ", timestr($TotalTime), "\n";
  91 
  92 ###############################################################################
  93 
  94 # Process option values...
  95 sub ProcessOptions {
  96   $DetailLevel = $Options{detail};
  97 
  98   $OutDelim = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
  99   $OutQuote = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
 100 
 101   $SpecifiedKeyCol = "";
 102   if (defined $Options{key}) {
 103     $SpecifiedKeyCol = $Options{key};
 104     if ($Options{mode} =~ /^colnum$/i) {
 105       if (!IsPositiveInteger($SpecifiedKeyCol)) {
 106 	die "Error: Invalid value $Options{key} specified using \"-k --key\" option: Allowed values: > 0\n";
 107       }
 108     }
 109   }
 110 }
 111 
 112 # Retrieve information about input text files...
 113 sub RetrieveTextFilesInfo {
 114   my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $OutFileRoot,  $OutFile, $ColNum, $ColLabel);
 115 
 116   @TextFilesOkay = ();
 117   @TextFilesColCount = (); @TextFilesColLabels = ();
 118   @TextFilesColLabelToNumMap = ();
 119   @TextFilesInDelim = ();
 120   @TextFilesOutFile = ();
 121 
 122  FILELIST: for $Index (0 .. $#TextFilesList) {
 123     $TextFile = $TextFilesList[$Index];
 124     $TextFilesOkay[$Index] = 0;
 125     $TextFilesColCount[$Index] = 0;
 126     $TextFilesInDelim[$Index] = "";
 127     $TextFilesOutFile[$Index] = "";
 128     @{$TextFilesColLabels[$Index]} = ();
 129     %{$TextFilesColLabelToNumMap[$Index]} = ();
 130     if (!(-e $TextFile)) {
 131       warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
 132       next FILELIST;
 133     }
 134     if (!CheckFileType($TextFile, "csv tsv")) {
 135       warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
 136       next FILELIST;
 137     }
 138     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 139     if ($FileExt =~ /^tsv$/i) {
 140       $InDelim = "\t";
 141     }
 142     else {
 143       $InDelim = "\,";
 144       if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 145 	warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
 146 	next FILELIST;
 147       }
 148       if ($Options{indelim} =~ /^semicolon$/i) {
 149 	$InDelim = "\;";
 150       }
 151     }
 152 
 153     if (!open TEXTFILE, "$TextFile") {
 154       warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
 155       next FILELIST;
 156     }
 157 
 158     $Line = GetTextLine(\*TEXTFILE);
 159     @ColLabels = quotewords($InDelim, 0, $Line);
 160     close TEXTFILE;
 161 
 162     $FileDir = ""; $FileName = ""; $FileExt = "";
 163     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 164     $FileExt = "csv";
 165     if ($Options{outdelim} =~ /^tab$/i) {
 166       $FileExt = "tsv";
 167     }
 168     if ($Options{root} && (@TextFilesList == 1)) {
 169       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 170       if ($RootFileName && $RootFileExt) {
 171 	$FileName = $RootFileName;
 172       }
 173       else {
 174 	$FileName = $Options{root};
 175       }
 176       $OutFileRoot = $FileName;
 177     }
 178     else {
 179       $OutFileRoot = $FileName . "SortedByColumn";
 180     }
 181 
 182     $OutFile = $OutFileRoot . ".$FileExt";
 183     if (lc($OutFile) eq lc($TextFile)) {
 184       warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n";
 185       next FILELIST;
 186     }
 187     if (!$Options{overwrite}) {
 188       if (-e $OutFile) {
 189 	warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
 190 	next FILELIST;
 191       }
 192     }
 193 
 194     $TextFilesOkay[$Index] = 1;
 195     $TextFilesInDelim[$Index] = $InDelim;
 196     $TextFilesOutFile[$Index] = "$OutFile";
 197 
 198     $TextFilesColCount[$Index] = @ColLabels;
 199     push @{$TextFilesColLabels[$Index]}, @ColLabels;
 200     for $ColNum (0 .. $#ColLabels) {
 201       $ColLabel = $ColLabels[$ColNum];
 202       $TextFilesColLabelToNumMap[$Index]{$ColLabel} = $ColNum;
 203     }
 204   }
 205 
 206 }
 207 
 208 # Make sure specified key column are okay...
 209 sub ProcessColumnsInfo {
 210   my($Index, $TextFile);
 211 
 212   @TextFilesKeyColNum = ();
 213  FILELIST: for $Index (0 .. $#TextFilesList) {
 214     $TextFile = $TextFilesList[$Index];
 215     $TextFilesKeyColNum[$Index] = 0;
 216 
 217     if ($TextFilesOkay[$Index]) {
 218       my($KeyColNum, $KeyColValid);
 219 
 220       $KeyColNum = 0;
 221       $KeyColValid = 1;
 222       if ($SpecifiedKeyCol) {
 223 	if ($Options{mode} =~ /^colnum$/i) {
 224 	  if ($SpecifiedKeyCol <= $TextFilesColCount[$Index]) {
 225 	    $KeyColNum = $SpecifiedKeyCol - 1;
 226 	  }
 227 	  else {
 228 	    $KeyColValid = 0;
 229 	  }
 230 	}
 231 	else {
 232 	  if (exists($TextFilesColLabelToNumMap[$Index]{$SpecifiedKeyCol})) {
 233 	    $KeyColNum =  $TextFilesColLabelToNumMap[$Index]{$SpecifiedKeyCol};
 234 	  }
 235 	  else {
 236 	    $KeyColValid = 0;
 237 	  }
 238 	}
 239       }
 240       if ($KeyColValid) {
 241 	$TextFilesKeyColNum[$Index] = $KeyColNum;
 242       }
 243       else {
 244 	warn "Warning: Ignoring file $TextFile: Column key specified, $SpecifiedKeyCol, using \"k --key\" option doesn't exist\n";
 245 	$TextFilesOkay[$Index] = 0;
 246       }
 247     }
 248   }
 249 }
 250 
 251 # Sort it out...
 252 sub SortTextFile {
 253   my($Index) = @_;
 254   my($TextFile, $NewTextFile, $KeyCol, $Line, $KeyColValue, $InDelim, @ColLabels, @LineWords);
 255 
 256   $TextFile = $TextFilesList[$Index];
 257   $InDelim = $TextFilesInDelim[$Index];
 258   $NewTextFile = $TextFilesOutFile[$Index];
 259   $KeyCol = $TextFilesKeyColNum[$Index];
 260   @ColLabels = @{$TextFilesColLabels[$Index]};
 261 
 262   print "Generating new Text file $NewTextFile...\n";
 263   open NEWTEXTFILE, ">$NewTextFile" or die "Error: Couldn't open $NewTextFile: $! \n";
 264   open TEXTFILE, "$TextFile" or die "Error: Can't open $TextFile: $! \n";
 265 
 266   # Skip over column labels from old file...
 267   $Line = GetTextLine(\*TEXTFILE);
 268 
 269   # Add column lablels in new file...
 270   $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
 271   print NEWTEXTFILE "$Line\n";
 272 
 273   # Go over all rows and store the lines using key value as hash...
 274   my(%KeyToLinesMap, @InvalidDataLines, $LineCount);
 275 
 276   %KeyToLinesMap = ();
 277   @InvalidDataLines = ();
 278   $LineCount = 1;
 279   TEXTLINE: while ($Line = GetTextLine(\*TEXTFILE)) {
 280     @LineWords = quotewords($InDelim, 0, $Line);
 281     $LineCount++;
 282     if ($KeyCol < @LineWords) {
 283       $KeyColValue = $LineWords[$KeyCol];
 284       if (!IsNotEmpty($KeyColValue)) {
 285 	$Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
 286 	push @InvalidDataLines, $Line;
 287 	if ($DetailLevel >= 3 ) {
 288 	  print "Ignoring line $LineCount: Contains empty value for key column $ColLabels[$KeyCol]: $Line\n";
 289 	}
 290 	elsif ($DetailLevel >= 2) {
 291 	  print "Ignoring line $LineCount: Contains empty value for key column $ColLabels[$KeyCol]...\n";
 292 	}
 293 	next TEXTLINE;
 294       }
 295       if ($Options{keydata} =~ /^numeric$/i) {
 296 	if (!IsFloat($KeyColValue)) {
 297 	  $Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
 298 	  push @InvalidDataLines, $Line;
 299 	  if ($DetailLevel >= 3 ) {
 300 	    print "Line number $LineCount: Contains non-numerical value for key column $ColLabels[$KeyCol]: $Line\n";
 301 	  }
 302 	  elsif ($DetailLevel >= 2) {
 303 	    print "Line number $LineCount: Contains non-numerical value for key column $ColLabels[$KeyCol]...\n";
 304 	  }
 305 	  next TEXTLINE;
 306 	}
 307       }
 308       if (exists($KeyToLinesMap{$KeyColValue})) {
 309 	# Append to existing line...
 310 	$Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
 311 	$KeyToLinesMap{$KeyColValue} .= "\n" . $Line;
 312       }
 313       else {
 314 	$Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
 315 	$KeyToLinesMap{$KeyColValue} = $Line;
 316       }
 317     }
 318   }
 319   if ($Options{sort} =~ /^ascending$/i) {
 320     if ($Options{keydata} =~ /^alphanumeric$/i) {
 321       for $KeyColValue (sort { lc($a) cmp lc($b) } keys %KeyToLinesMap ) {
 322 	print NEWTEXTFILE "$KeyToLinesMap{$KeyColValue}\n";
 323       }
 324     }
 325     else {
 326       for $KeyColValue (sort { $a <=> $b } keys %KeyToLinesMap ) {
 327 	print NEWTEXTFILE "$KeyToLinesMap{$KeyColValue}\n";
 328       }
 329     }
 330   }
 331   else {
 332     if ($Options{keydata} =~ /^alphanumeric$/i) {
 333       for $KeyColValue (sort { lc($b) cmp lc($a) } keys %KeyToLinesMap ) {
 334 	print NEWTEXTFILE "$KeyToLinesMap{$KeyColValue}\n";
 335       }
 336     }
 337     else {
 338       for $KeyColValue (sort { $b <=> $a } keys %KeyToLinesMap ) {
 339 	print NEWTEXTFILE "$KeyToLinesMap{$KeyColValue}\n";
 340       }
 341     }
 342   }
 343   # Write out the lines with invalid data...
 344   if (@InvalidDataLines) {
 345     print "Placing ", scalar(@InvalidDataLines)," line(s) with invalid column key data at the end...\n";
 346     for $Line (@InvalidDataLines) {
 347       print NEWTEXTFILE "$Line\n";
 348     }
 349   }
 350   close NEWTEXTFILE;
 351   close TEXTFILE;
 352 
 353 }
 354 
 355 # Setup script usage  and retrieve command line arguments specified using various options...
 356 sub SetupScriptUsage {
 357 
 358   # Retrieve all the options...
 359   %Options = ();
 360   $Options{detail} = 1;
 361   $Options{mode} = "colnum";
 362   $Options{sort} = "ascending";
 363   $Options{keydata} = "numeric";
 364   $Options{indelim} = "comma";
 365   $Options{outdelim} = "comma";
 366   $Options{quote} = "yes";
 367   if (!GetOptions(\%Options, "detail|d=i", "help|h", "indelim=s", "key|k=s", "keydata=s", "mode|m=s", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "sort|s=s", "workingdir|w=s")) {
 368     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 369   }
 370   if ($Options{workingdir}) {
 371     if (! -d $Options{workingdir}) {
 372       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 373     }
 374     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 375   }
 376   if ($Options{mode} !~ /^(colnum|collabel)$/i) {
 377     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum or collabel\n";
 378   }
 379   if ($Options{keydata} !~ /^(numeric|alphanumeric)$/i) {
 380     die "Error: The value specified, $Options{keydata}, for option \"--keydata\" is not valid. Allowed values: numeric or alphanumeric\n";
 381   }
 382   if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 383     die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
 384   }
 385   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 386     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 387   }
 388   if ($Options{quote} !~ /^(yes|no)$/i) {
 389     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 390   }
 391   if ($Options{sort} !~ /^(ascending|descending)$/i) {
 392     die "Error: The value specified, $Options{sort}, for option \"-s --sort\" is not valid. Allowed values: ascending or descending\n";
 393   }
 394   if (!IsPositiveInteger($Options{detail})) {
 395     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
 396   }
 397 }
 398