MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: MergeTextFilesWithSD.pl,v $
   4 # $Date: 2008/01/30 21:44:48 $
   5 # $Revision: 1.22 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileHandle;
  37 use SDFileUtil;
  38 use FileUtil;
  39 use TextUtil;
  40 
  41 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  42 my(@InputFilesList, $TextFile, @TextFilesList, $NewTextFile, $SDFile, $NewSDFile, $Index,  $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, @ColValues, @KeyValues, $ColLabel, $ColNum, $ColIndex, $Values, @Words, @LineWords);
  43 
  44 # Autoflush STDOUT
  45 $| = 1;
  46 
  47 # Starting message...
  48 $ScriptName = basename $0;
  49 print "\n$ScriptName:Starting...\n\n";
  50 $StartTime = new Benchmark;
  51 
  52 # Get the options and setup script...
  53 SetupScriptUsage();
  54 if ($Options{help} || @ARGV < 1) {
  55   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  56 }
  57 
  58 @InputFilesList = ExpandFileNames(\@ARGV, "csv tsv");
  59 
  60 if (@InputFilesList == 1) {
  61   die "Error: Specify more than one Text file.\n";
  62 }
  63 $SDFile = $InputFilesList[0];
  64 @TextFilesList  = ();
  65 for $Index (1 .. $#InputFilesList) {
  66   push @TextFilesList, $InputFilesList[$Index];
  67 }
  68 if ($Options{root}) {
  69   $FileDir = ""; $FileName = ""; $FileExt = "";
  70   ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});
  71   if ($FileName && $FileExt) {
  72     $NewSDFile = $FileName;
  73   }
  74   else {
  75       $NewSDFile =  $Options{root};
  76   }
  77 }
  78 else {
  79   $FileDir = ""; $FileName = ""; $FileExt = "";
  80   ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
  81   $NewSDFile = $FileName;
  82   ($FileDir, $FileName, $FileExt) = ParseFileName($TextFilesList[0]);
  83   $NewSDFile = $NewSDFile . "MergedWith" . $FileName . "1To" . @TextFilesList;
  84 }
  85 $NewSDFile = $NewSDFile . ".sdf";
  86 if (!$Options{overwrite}) {
  87   if (-e $NewSDFile) {
  88     die "Error: The file $NewSDFile already exists.\n";
  89   }
  90 }
  91 if ($Options{root}) {
  92   if (lc($NewSDFile) eq lc($SDFile)) {
  93     die "Error: Output filename, $NewSDFile, is similar to a input file name.\nSpecify a different name using \"-r --root\" option or use default name.\n";
  94   }
  95 }
  96 if ($Options{columns}) {
  97   @ColValues = split ";", $Options{columns};
  98   if (@ColValues != @TextFilesList) {
  99     die "Error: Invalid number of values specified by \"-c --columns\" option: it must be equal to number of input text files.\n";
 100   }
 101   for $Index (0 .. $#ColValues) {
 102     if (!length($ColValues[$Index])) {
 103       die "Error: Invalid value specified by \"-c --columns\" option: empty values are not allowed.\n";
 104     }
 105   }
 106 }
 107 if ($Options{keys}) {
 108   @KeyValues = split ";", $Options{keys};
 109   if (@KeyValues != @TextFilesList) {
 110     die "Error: Invalid number of values specified by \"-k --keys\" option: it must be equal to number of input text files.\n";
 111   }
 112   for $Index (0 .. $#KeyValues) {
 113     if (!length($KeyValues[$Index])) {
 114       die "Error: Invalid value specified by \"-k --keys\" option: empty values are not allowed.\n";
 115     }
 116   }
 117 }
 118 
 119 print "Processing various options and checking input text files...\n";
 120 
 121 # Process SD file related options
 122 ProcessSDFileInfo();
 123 
 124 # Collect column information for all the text files...
 125 my(@TextFilesColCount, @TextFilesInDelim, @TextFilesColLabels, @TextFilesColLabelToNumMap);
 126 RetrieveTextFilesInfo();
 127 
 128 # Collect values specified using "-c --columns" option and map 'em to colnum...
 129 my(@TextFilesColSpecified, @TextFilesColToMerge, @TextFilesColToMergeLabels, @TextFilesColToMergeNumToLabelMap);
 130 ProcessColumnsOption();
 131 
 132 # Collect values specified using "-k --keys" option and map 'em to colnum...
 133 my(@TextFilesKeysSpecified, @TextFilesKeysToUse, $Key);
 134 if ($Options{keys}) {
 135   ProcessKeysOption();
 136 }
 137 
 138 print "Generating new SD file $NewSDFile...\n";
 139 open NEWSDFILE, ">$NewSDFile" or die "Error: Couldn't open $NewSDFile: $! \n";
 140 
 141 #Open up all the files and skip label lines for text files...
 142 open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 143 my(@TextFilesHandleList) = ();
 144 for $Index (0 .. $#TextFilesList) {
 145   $TextFilesHandleList[$Index] = new FileHandle;
 146   $TextFile = $TextFilesList[$Index];
 147   open $TextFilesHandleList[$Index], "$TextFile" or die "Error: Couldn't open $TextFile: $! \n";
 148   $Line = GetTextLine($TextFilesHandleList[$Index]);
 149 }
 150 
 151 if ($Options{keys}) {
 152   MergeTextColumnValuesUsingKeys();
 153 }
 154 else {
 155   MergeTextColumnValues();
 156 }
 157 
 158 #close up all the files...
 159 close NEWSDFILE;
 160 close SDFILE;
 161 for $Index (0 .. $#TextFilesList) {
 162   close $TextFilesHandleList[$Index];
 163 }
 164 print "$ScriptName:Done...\n\n";
 165 
 166 $EndTime = new Benchmark;
 167 $TotalTime = timediff ($EndTime, $StartTime);
 168 print "Total time: ", timestr($TotalTime), "\n";
 169 
 170 ###############################################################################
 171 
 172 # Merge the specified text columns into SD file...
 173 sub MergeTextColumnValues {
 174   my($Value, $CmpdString);
 175   while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 176     $CmpdString =~ s/\$\$\$\$//g;
 177     print NEWSDFILE "$CmpdString";
 178     @ColLabels = (); @ColValues = ();
 179     # Merge coulmn values from other text files...
 180     for $Index (0 .. $#TextFilesList) {
 181       push @ColLabels, @{$TextFilesColToMergeLabels[$Index]};
 182       $InDelim = $TextFilesInDelim[$Index];
 183       if ($Line = GetTextLine($TextFilesHandleList[$Index])) {
 184 	@LineWords = quotewords($InDelim, 0, $Line);
 185 	for $ColNum (@{$TextFilesColToMerge[$Index]}) {
 186 	  $Value = ($ColNum < @LineWords) ? $LineWords[$ColNum] : "";
 187 	  push @ColValues, $Value;
 188 	}
 189       }
 190     }
 191     for $ColIndex (0 .. $#ColLabels) {
 192       print NEWSDFILE "> <$ColLabels[$ColIndex]>\n$ColValues[$ColIndex]\n\n";
 193     }
 194     print NEWSDFILE "\$\$\$\$\n";
 195   }
 196 }
 197 
 198 # Merge the specified text columns into SD file using keys...
 199 sub MergeTextColumnValuesUsingKeys {
 200   my(@CmpdLines, $CmpdString, $Value, $KeyColNum, $KeyColValue, @TextFilesKeysToLinesMap, %DataFieldValues);
 201 
 202   @TextFilesKeysToLinesMap = ();
 203   # Retrieve text lines from all the text files...
 204   for $Index (0 .. $#TextFilesList) {
 205     $InDelim = $TextFilesInDelim[$Index];
 206     %{$TextFilesKeysToLinesMap[$Index]} = ();
 207     $KeyColNum = $TextFilesKeysToUse[$Index];
 208     while ($Line = GetTextLine($TextFilesHandleList[$Index])) {
 209       @LineWords = quotewords($InDelim, 0, $Line);
 210       if ($KeyColNum < @LineWords) {
 211 	$KeyColValue = $LineWords[$KeyColNum];
 212 	if (length($KeyColValue)) {
 213 	  if (exists($TextFilesKeysToLinesMap[$Index]{$KeyColValue})) {
 214 	    warn "Warning: Ignoring line, $Line, in text file $TextFilesList[$Index]: Column key value, $KeyColValue, already exists\n";
 215 	  }
 216 	  else {
 217 	    @{$TextFilesKeysToLinesMap[$Index]{$KeyColValue}} = ();
 218 	    push @{$TextFilesKeysToLinesMap[$Index]{$KeyColValue}}, @LineWords;
 219 	  }
 220 	}
 221       }
 222     }
 223   }
 224   while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 225     @CmpdLines = split "\n", $CmpdString;
 226     %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 227     if (exists($DataFieldValues{$Options{sdkey}})) {
 228       @ColLabels = (); @ColValues = ();
 229       $CmpdString =~ s/\$\$\$\$//g;
 230       print NEWSDFILE "$CmpdString";
 231 
 232       $KeyColValue = $DataFieldValues{$Options{sdkey}};
 233 
 234       # Merge coulmn values from other text files...
 235       for $Index (0 .. $#TextFilesList) {
 236 	push @ColLabels, @{$TextFilesColToMergeLabels[$Index]};
 237 	@LineWords = ();
 238 	if (exists($TextFilesKeysToLinesMap[$Index]{$KeyColValue})) {
 239 	  push @LineWords, @{$TextFilesKeysToLinesMap[$Index]{$KeyColValue}};
 240 	}
 241 	for $ColNum (@{$TextFilesColToMerge[$Index]}) {
 242 	  $Value = ($ColNum < @LineWords) ? $LineWords[$ColNum] : "";
 243 	  push @ColValues, $Value;
 244 	}
 245       }
 246       for $ColIndex (0 .. $#ColLabels) {
 247 	$Value = (($ColIndex < @ColValues) && IsNotEmpty($ColValues[$ColIndex]) ) ? $ColValues[$ColIndex] : "";
 248 	print NEWSDFILE "> <$ColLabels[$ColIndex]>\n$Value\n\n";
 249       }
 250       print NEWSDFILE "\$\$\$\$\n";
 251     }
 252   }
 253 }
 254 
 255 # Process specified columns...
 256 sub ProcessColumnsOption {
 257   @TextFilesColSpecified = (); @TextFilesColToMerge = (); @TextFilesColToMergeLabels = ();
 258   @TextFilesColToMergeNumToLabelMap = ();
 259   for $Index (0 .. $#TextFilesList) {
 260     $Values = "all";
 261     if ($Options{columns}) {
 262       $Values = $ColValues[$Index]
 263     }
 264     @{$TextFilesColSpecified[$Index]} = ();
 265     if ($Values =~ /all/i) {
 266       if ($Options{mode} =~ /^colnum$/i) {
 267 	for $ColNum (1 .. $TextFilesColCount[$Index]) {
 268 	  push @{$TextFilesColSpecified[$Index]}, $ColNum;
 269 	}
 270       } else {
 271 	push @{$TextFilesColSpecified[$Index]}, @{$TextFilesColLabels[$Index]};
 272       }
 273     } else {
 274       @Words = split ",", $Values;
 275       push @{$TextFilesColSpecified[$Index]}, @Words;
 276     }
 277     @{$TextFilesColToMerge[$Index]} = ();
 278     %{$TextFilesColToMergeNumToLabelMap[$Index]} = ();
 279     if ($Options{mode} =~ /^collabel$/i) {
 280       for $ColIndex (0 .. $#{$TextFilesColSpecified[$Index]}) {
 281 	$ColLabel = $TextFilesColSpecified[$Index][$ColIndex];
 282 	if (exists($TextFilesColLabelToNumMap[$Index]{$ColLabel})) {
 283 	  $ColNum = $TextFilesColLabelToNumMap[$Index]{$ColLabel};
 284 	  push @{$TextFilesColToMerge[$Index]}, $ColNum;
 285 	  $TextFilesColToMergeNumToLabelMap[$Index]{$ColNum} = $ColLabel;
 286 	} else {
 287 	  warn "Warning: Ignoring value, $ColLabel, specified using \"-c --column\" option: column name doesn't exist in  $TextFilesList[$Index]  \n";
 288 	}
 289       }
 290     }
 291     else {
 292       for $ColIndex (0 .. $#{$TextFilesColSpecified[$Index]}) {
 293 	$ColNum = $TextFilesColSpecified[$Index][$ColIndex];
 294 	# Make sure it's a numeric value...
 295 	if (!IsPositiveInteger($ColNum)) {
 296 	  warn "Warning: Ignoring value, $ColNum, specified using \"-c --column\" option:  Allowed integer values: > 0\n";
 297 	}
 298 	else {
 299 	  if ($ColNum > 0 && $ColNum <= $TextFilesColCount[$Index]) {
 300 	    $ColNum -= 1;
 301 	    push @{$TextFilesColToMerge[$Index]}, $ColNum;
 302 	    $TextFilesColToMergeNumToLabelMap[$Index]{$ColNum} = $TextFilesColLabels[$Index][$ColNum];
 303 	  }
 304 	  else {
 305 	    warn "Warning: Ignoring value, $ColNum, specified using \"-c --column\" option: column number doesn't exist in  $TextFilesList[$Index]  \n";
 306 	  }
 307 	}
 308       }
 309     }
 310     my (@TextFilesColToMergeSorted) = sort @{$TextFilesColToMerge[$Index]};
 311     @{$TextFilesColToMerge[$Index]} = ();
 312     push @{$TextFilesColToMerge[$Index]}, @TextFilesColToMergeSorted;
 313     # Set up the labels...
 314     @{$TextFilesColToMergeLabels[$Index]} = ();
 315     for $ColNum (@TextFilesColToMergeSorted) {
 316       push @{$TextFilesColToMergeLabels[$Index]}, $TextFilesColToMergeNumToLabelMap[$Index]{$ColNum};
 317     }
 318   }
 319 }
 320 
 321 # Process specified keys....
 322 sub ProcessKeysOption {
 323   @TextFilesKeysSpecified = (); @TextFilesKeysToUse = ();
 324   for $Index (0 .. $#TextFilesList) {
 325     $Key = $KeyValues[$Index];
 326     $TextFilesKeysSpecified[$Index] = $Key;
 327     $TextFilesKeysToUse[$Index] = -1;
 328     if ($Options{mode} =~ /^collabel$/i) {
 329       $ColLabel = $Key;
 330       if (exists($TextFilesColLabelToNumMap[$Index]{$ColLabel})) {
 331 	$TextFilesKeysToUse[$Index] =  $TextFilesColLabelToNumMap[$Index]{$ColLabel};
 332       }
 333       else {
 334 	warn "Warning: Ignoring value, $ColLabel, specified using \"-k --keys\" option: column name doesn't exist in  $TextFilesList[$Index]  \n";
 335       }
 336     }
 337     else {
 338       $ColNum = $Key;
 339       if (!IsPositiveInteger($ColNum)) {
 340 	warn "Warning: Ignoring value, $ColNum, specified using \"-k --keys\" option: Allowed integer values: > 0  \n";
 341       }
 342       else {
 343 	if ($ColNum > 0 && $ColNum <= $TextFilesColCount[$Index]) {
 344 	  $TextFilesKeysToUse[$Index] = $ColNum - 1;
 345 	}
 346 	else {
 347 	  warn "Warning: Ignoring value, $ColNum, specified using \"-k --keys\" option: column number doesn't exist in  $TextFilesList[$Index]  \n";
 348 	}
 349       }
 350     }
 351   }
 352   # Modify columns to merge list to make sure the columns identified by key are taken off the list
 353   my(@TextFilesColToMergeFiltered, @TextFilesColToMergeLabelsFiltered);
 354   for $Index (0 .. $#TextFilesList) {
 355     @TextFilesColToMergeFiltered = ();
 356     @TextFilesColToMergeLabelsFiltered = ();
 357     for $ColNum (@{$TextFilesColToMerge[$Index]}) {
 358       if ($TextFilesKeysToUse[$Index] != $ColNum) {
 359 	push @TextFilesColToMergeFiltered, $ColNum;
 360 	push @TextFilesColToMergeLabelsFiltered, $TextFilesColToMergeNumToLabelMap[$Index]{$ColNum};
 361       }
 362     }
 363     @{$TextFilesColToMerge[$Index]} = ();
 364     push @{$TextFilesColToMerge[$Index]}, @TextFilesColToMergeFiltered;
 365     @{$TextFilesColToMergeLabels[$Index]} = ();
 366     push @{$TextFilesColToMergeLabels[$Index]}, @TextFilesColToMergeLabelsFiltered;
 367   }
 368 }
 369 sub ProcessSDFileInfo {
 370   if (!CheckFileType($SDFile, "sd sdf")) {
 371     die "Error: Invalid first file $SDFile: It's not a SD file\n";
 372   }
 373   if (!(-e $SDFile)) {
 374     die "Error: SDFile $SDFile doesn't exist\n";
 375   }
 376 }
 377 
 378 # Retrieve information about input text files...
 379 sub RetrieveTextFilesInfo {
 380   my($TextFilesErrorCount) = 0;
 381 
 382   @TextFilesColCount = (); @TextFilesInDelim = (); @TextFilesColLabels = ();
 383   @TextFilesColLabelToNumMap = ();
 384  FILELIST: for $Index (0 .. $#TextFilesList) {
 385     $TextFile = $TextFilesList[$Index];
 386     $TextFilesColCount[$Index] = 0;
 387     @{$TextFilesColLabels[$Index]} = ();
 388     %{$TextFilesColLabelToNumMap[$Index]} = ();
 389     if (!(-e $TextFile)) {
 390       print "File $TextFile doesn't exist\n";
 391       $TextFilesErrorCount++;
 392       next FILELIST;
 393     }
 394     if (!CheckFileType($TextFile, "csv tsv")) {
 395       print "Problematic file $TextFile: It's not a csv or tsv file\n";
 396       $TextFilesErrorCount++;
 397       next FILELIST;
 398     }
 399     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 400     if ($FileExt =~ /^tsv$/i) {
 401       $InDelim = "\t";
 402     }
 403     else {
 404       $InDelim = "\,";
 405       if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 406 	warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
 407 	$TextFilesErrorCount++;
 408 	next FILELIST;
 409       }
 410       if ($Options{indelim} =~ /^semicolon$/i) {
 411 	$InDelim = "\;";
 412       }
 413     }
 414     if (!open TEXTFILE, "$TextFile") {
 415       print "Problematic file $TextFile: Couldn't open it: $! \n";
 416       $TextFilesErrorCount++;
 417       next FILELIST;
 418     }
 419     $Line = GetTextLine(\*TEXTFILE);
 420     @ColLabels = quotewords($InDelim, 0, $Line);
 421     $TextFilesInDelim[$Index] = $InDelim;
 422     $TextFilesColCount[$Index] = @ColLabels;
 423     push @{$TextFilesColLabels[$Index]}, @ColLabels;
 424     for $ColNum (0 .. $#ColLabels) {
 425       $ColLabel = $ColLabels[$ColNum];
 426       $TextFilesColLabelToNumMap[$Index]{$ColLabel} = $ColNum;
 427     }
 428     close TEXTFILE;
 429   }
 430   if ($TextFilesErrorCount) {
 431     die "Error: Problems with input text file(s)...\n";
 432   }
 433 }
 434 
 435 # Setup script usage  and retrieve command line arguments specified using various options...
 436 sub SetupScriptUsage {
 437 
 438   # Retrieve all the options...
 439   %Options = ();
 440   $Options{mode} = "colnum";
 441   $Options{indelim} = "comma";
 442   if (!GetOptions(\%Options, "help|h", "indelim=s", "columns|c=s", "keys|k=s", "mode|m=s", "overwrite|o", "root|r=s", "sdkey|s=s", "workingdir|w=s")) {
 443     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 444   }
 445   if ($Options{workingdir}) {
 446     if (! -d $Options{workingdir}) {
 447       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 448     }
 449     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 450   }
 451   if ($Options{mode} !~ /(^(colnum|collabel)$)/i) {
 452     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum, or collabel\n";
 453   }
 454   if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 455     die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
 456   }
 457   if ($Options{sdkey} && !$Options{keys}) {
 458     die "Error: The option \"-s --sdkey\" can't be specified without the \"-k --keys\" option.\n";
 459   }
 460   elsif (!$Options{sdkey} && $Options{keys}) {
 461     die "Error: The option \"-k --keys\" can't be specified without the \"-s --sdkey\" option.\n";
 462   }
 463 }
 464