MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: MergeTextFilesWithSD.pl,v $
   4 # $Date: 2010/06/23 20:59:29 $
   5 # $Revision: 1.27 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2010 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileHandle;
  36 use SDFileUtil;
  37 use FileUtil;
  38 use TextUtil;
  39 
  # File-scoped state. These lexicals are shared with the subroutines defined
  # further down in this file, so they must stay declared at file scope.
  my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  my(@InputFilesList, $TextFile, @TextFilesList, $NewTextFile, $SDFile, $NewSDFile, $Index,  $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, @ColValues, @KeyValues, $ColLabel, $ColNum, $ColIndex, $Values, @Words, @LineWords);

  # Autoflush STDOUT
  $| = 1;

  # Starting message...
  $ScriptName = basename($0);
  print "\n$ScriptName:Starting...\n\n";
  $StartTime = Benchmark->new;

  # Get the options and setup script...
  SetupScriptUsage();
  if ($Options{help} || @ARGV < 1) {
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  }

  @InputFilesList = ExpandFileNames(\@ARGV, "csv tsv");

  # The first file is the SD file; at least one text file must follow it.
  if (@InputFilesList < 2) {
    die "Error: Specify more than one Text file.\n";
  }
  $SDFile = $InputFilesList[0];
  @TextFilesList = @InputFilesList[1 .. $#InputFilesList];

  # Work out the name of the output SD file: either from "-r --root" or from
  # the names of the SD file and the first text file.
  if ($Options{root}) {
    ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});
    $NewSDFile = ($FileName && $FileExt) ? $FileName : $Options{root};
  }
  else {
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    $NewSDFile = $FileName;
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFilesList[0]);
    $NewSDFile = $NewSDFile . "MergedWith" . $FileName . "1To" . scalar(@TextFilesList);
  }
  $NewSDFile = $NewSDFile . ".sdf";

  if (!$Options{overwrite} && -e $NewSDFile) {
    die "Error: The file $NewSDFile already exists.\n";
  }
  if ($Options{root} && lc($NewSDFile) eq lc($SDFile)) {
    die "Error: Output filename, $NewSDFile, is similar to a input file name.\nSpecify a different name using \"-r --root\" option or use default name.\n";
  }

  # "-c --columns" and "-k --keys" take one ";"-delimited value per text file.
  if ($Options{columns}) {
    @ColValues = split ";", $Options{columns};
    if (@ColValues != @TextFilesList) {
      die "Error: Invalid number of values specified by \"-c --columns\" option: it must be equal to number of input text files.\n";
    }
    for $Index (0 .. $#ColValues) {
      if (!length($ColValues[$Index])) {
        die "Error: Invalid value specified by \"-c --columns\" option: empty values are not allowed.\n";
      }
    }
  }
  if ($Options{keys}) {
    @KeyValues = split ";", $Options{keys};
    if (@KeyValues != @TextFilesList) {
      die "Error: Invalid number of values specified by \"-k --keys\" option: it must be equal to number of input text files.\n";
    }
    for $Index (0 .. $#KeyValues) {
      if (!length($KeyValues[$Index])) {
        die "Error: Invalid value specified by \"-k --keys\" option: empty values are not allowed.\n";
      }
    }
  }

  print "Processing various options and checking input text files...\n";

  # Process SD file related options
  ProcessSDFileInfo();

  # Collect column information for all the text files...
  my(@TextFilesColCount, @TextFilesInDelim, @TextFilesColLabels, @TextFilesColLabelToNumMap);
  RetrieveTextFilesInfo();

  # Collect values specified using "-c --columns" option and map 'em to colnum...
  my(@TextFilesColSpecified, @TextFilesColToMerge, @TextFilesColToMergeLabels, @TextFilesColToMergeNumToLabelMap);
  ProcessColumnsOption();

  # Collect values specified using "-k --keys" option and map 'em to colnum...
  my(@TextFilesKeysSpecified, @TextFilesKeysToUse, $Key);
  if ($Options{keys}) {
    ProcessKeysOption();
  }

  print "Generating new SD file $NewSDFile...\n";

  # Three-argument open: the file names come from the command line and must
  # never be interpreted as an open mode or a pipe. The bareword handle names
  # are kept because the merge subroutines refer to SDFILE and NEWSDFILE.
  open NEWSDFILE, ">", $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";

  # Open up all the files and skip label lines for text files...
  open SDFILE, "<", $SDFile or die "Error: Couldn't open $SDFile: $! \n";
  my(@TextFilesHandleList) = ();
  for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];
    $TextFilesHandleList[$Index] = FileHandle->new;
    open $TextFilesHandleList[$Index], "<", $TextFile or die "Error: Couldn't open $TextFile: $! \n";
    # Read and discard the column label line.
    $Line = GetTextLine($TextFilesHandleList[$Index]);
  }

  if ($Options{keys}) {
    MergeTextColumnValuesUsingKeys();
  }
  else {
    MergeTextColumnValues();
  }

  # Close up all the files. Buffered write errors only surface at close, so
  # the output handle is checked.
  close NEWSDFILE or die "Error: Couldn't close $NewSDFile: $! \n";
  close SDFILE;
  for $Index (0 .. $#TextFilesList) {
    close $TextFilesHandleList[$Index];
  }
  print "$ScriptName:Done...\n\n";

  $EndTime = Benchmark->new;
  $TotalTime = timediff ($EndTime, $StartTime);
  print "Total time: ", timestr($TotalTime), "\n";
 168 
 169 ###############################################################################
 170 
 # Merge the specified text columns into SD file by position: the Nth data
 # line of every text file is appended, as SD data fields, to the Nth compound
 # read from SDFILE, and the result is written to NEWSDFILE.
 #
 # Every selected column always produces exactly one value. A text file that
 # has run out of lines, or a line that is too short, contributes empty values,
 # so labels and values stay aligned across all the text files.
 sub MergeTextColumnValues {
   my($CmpdString);

   while ($CmpdString = ReadCmpdString(\*SDFILE)) {
     # Drop the record terminator; it is written again after the new fields.
     $CmpdString =~ s/\$\$\$\$//g;
     print NEWSDFILE "$CmpdString";

     my(@Labels, @Values);
     # Merge column values from other text files...
     for my $FileIndex (0 .. $#TextFilesList) {
       push @Labels, @{$TextFilesColToMergeLabels[$FileIndex]};

       # Test for defined and non-empty rather than plain truth, so that a
       # data line consisting of "0" is not mistaken for end of file.
       my $TextLine = GetTextLine($TextFilesHandleList[$FileIndex]);
       my @Fields = ();
       if (defined($TextLine) && length($TextLine)) {
         @Fields = quotewords($TextFilesInDelim[$FileIndex], 0, $TextLine);
       }

       for my $FieldNum (@{$TextFilesColToMerge[$FileIndex]}) {
         my $FieldValue = ($FieldNum < @Fields) ? $Fields[$FieldNum] : "";
         # quotewords can hand back undef for an empty field.
         push @Values, (defined($FieldValue) ? $FieldValue : "");
       }
     }

     for my $LabelIndex (0 .. $#Labels) {
       print NEWSDFILE "> <$Labels[$LabelIndex]>\n$Values[$LabelIndex]\n\n";
     }
     print NEWSDFILE "\$\$\$\$\n";
   }
 }
 196 
 # Merge the specified text columns into SD file using keys...
 #
 # First every text file is read completely and indexed by the value found in
 # its key column (the first line seen for a key wins; later duplicates are
 # reported and ignored). Then each compound read from SDFILE that carries the
 # "-s --sdkey" data field is written to NEWSDFILE followed by the columns of
 # the matching line from each text file; a text file without a matching line
 # contributes empty values. Compounds without the SD key field are not written.
 sub MergeTextColumnValuesUsingKeys {
   my(@CmpdLines, $CmpdString, $Value, $KeyColNum, $KeyColValue, @TextFilesKeysToLinesMap, %DataFieldValues);

   @TextFilesKeysToLinesMap = ();
   # Retrieve text lines from all the text files...
   FILE: for my $FileIndex (0 .. $#TextFilesList) {
     $TextFilesKeysToLinesMap[$FileIndex] = {};
     $KeyColNum = $TextFilesKeysToUse[$FileIndex];

     # A negative index marks a key that was rejected while processing the
     # "-k --keys" option. Using it would index from the end of each line and
     # silently key on the last column, so such a file is left unindexed.
     next FILE if ($KeyColNum < 0);

     my $FileDelim = $TextFilesInDelim[$FileIndex];
     LINE: while (1) {
       my $TextLine = GetTextLine($TextFilesHandleList[$FileIndex]);
       # Defined and non-empty, not plain truth: a "0" line is still data.
       last LINE unless (defined($TextLine) && length($TextLine));

       my @Fields = quotewords($FileDelim, 0, $TextLine);
       next LINE unless ($KeyColNum < @Fields);

       $KeyColValue = $Fields[$KeyColNum];
       next LINE unless (defined($KeyColValue) && length($KeyColValue));

       if (exists($TextFilesKeysToLinesMap[$FileIndex]{$KeyColValue})) {
         warn "Warning: Ignoring line, $TextLine, in text file $TextFilesList[$FileIndex]: Column key value, $KeyColValue, already exists\n";
       }
       else {
         $TextFilesKeysToLinesMap[$FileIndex]{$KeyColValue} = [@Fields];
       }
     }
   }

   CMPD: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
     @CmpdLines = split "\n", $CmpdString;
     %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
     next CMPD unless (exists($DataFieldValues{$Options{sdkey}}));

     # Drop the record terminator; it is written again after the new fields.
     $CmpdString =~ s/\$\$\$\$//g;
     print NEWSDFILE "$CmpdString";

     $KeyColValue = $DataFieldValues{$Options{sdkey}};

     my(@Labels, @Values);
     # Merge column values from other text files...
     for my $FileIndex (0 .. $#TextFilesList) {
       push @Labels, @{$TextFilesColToMergeLabels[$FileIndex]};

       my @Fields = ();
       if (exists($TextFilesKeysToLinesMap[$FileIndex]{$KeyColValue})) {
         @Fields = @{$TextFilesKeysToLinesMap[$FileIndex]{$KeyColValue}};
       }
       for my $FieldNum (@{$TextFilesColToMerge[$FileIndex]}) {
         push @Values, (($FieldNum < @Fields) ? $Fields[$FieldNum] : "");
       }
     }

     for my $LabelIndex (0 .. $#Labels) {
       $Value = (($LabelIndex < @Values) && IsNotEmpty($Values[$LabelIndex])) ? $Values[$LabelIndex] : "";
       print NEWSDFILE "> <$Labels[$LabelIndex]>\n$Value\n\n";
     }
     print NEWSDFILE "\$\$\$\$\n";
   }
 }
 253 
 # Process specified columns...
 #
 # For every text file, turn the "-c --columns" value (or the default "all")
 # into:
 #   @TextFilesColSpecified            - the values as specified
 #   @TextFilesColToMerge              - zero-based column indices, ascending
 #   @TextFilesColToMergeLabels        - labels in the same order as the indices
 #   @TextFilesColToMergeNumToLabelMap - index to label lookup
 # Values that do not name an existing column are reported and ignored.
 sub ProcessColumnsOption {
   @TextFilesColSpecified = (); @TextFilesColToMerge = (); @TextFilesColToMergeLabels = ();
   @TextFilesColToMergeNumToLabelMap = ();

   my $UseLabels = ($Options{mode} =~ /^collabel$/i) ? 1 : 0;

   for my $FileIndex (0 .. $#TextFilesList) {
     my $ColSpec = $Options{columns} ? $ColValues[$FileIndex] : "all";

     my @Specified;
     # Anchored on purpose: a value such as the label "Overall" merely
     # contains "all" and must not select every column.
     if ($ColSpec =~ /^\s*all\s*$/i) {
       @Specified = $UseLabels ? @{$TextFilesColLabels[$FileIndex]} : (1 .. $TextFilesColCount[$FileIndex]);
     }
     else {
       @Specified = split ",", $ColSpec;
     }
     $TextFilesColSpecified[$FileIndex] = [@Specified];

     my(@ColsToMerge, %NumToLabel);
     for my $Spec (@Specified) {
       if ($UseLabels) {
         if (exists($TextFilesColLabelToNumMap[$FileIndex]{$Spec})) {
           my $FieldNum = $TextFilesColLabelToNumMap[$FileIndex]{$Spec};
           push @ColsToMerge, $FieldNum;
           $NumToLabel{$FieldNum} = $Spec;
         }
         else {
           warn "Warning: Ignoring value, $Spec, specified using \"-c --column\" option: column name doesn't exist in  $TextFilesList[$FileIndex]  \n";
         }
       }
       elsif (!IsPositiveInteger($Spec)) {
         # Make sure it's a numeric value...
         warn "Warning: Ignoring value, $Spec, specified using \"-c --column\" option:  Allowed integer values: > 0\n";
       }
       elsif ($Spec > 0 && $Spec <= $TextFilesColCount[$FileIndex]) {
         # Column numbers on the command line start at 1; indices start at 0.
         my $FieldNum = $Spec - 1;
         push @ColsToMerge, $FieldNum;
         $NumToLabel{$FieldNum} = $TextFilesColLabels[$FileIndex][$FieldNum];
       }
       else {
         warn "Warning: Ignoring value, $Spec, specified using \"-c --column\" option: column number doesn't exist in  $TextFilesList[$FileIndex]  \n";
       }
     }

     # Numeric sort: the default string sort would put column 10 before 2.
     my @Sorted = sort { $a <=> $b } @ColsToMerge;

     $TextFilesColToMerge[$FileIndex] = [@Sorted];
     $TextFilesColToMergeNumToLabelMap[$FileIndex] = {%NumToLabel};
     # Set up the labels...
     $TextFilesColToMergeLabels[$FileIndex] = [ map { $NumToLabel{$_} } @Sorted ];
   }
 }
 319 
 # Process specified keys....
 #
 # For every text file, record the "-k --keys" value in @TextFilesKeysSpecified
 # and its zero-based column index in @TextFilesKeysToUse (-1 when the value
 # does not name an existing column; a warning is issued in that case). The key
 # column is then removed from the file's list of columns to merge and from the
 # matching list of labels.
 sub ProcessKeysOption {
   @TextFilesKeysSpecified = (); @TextFilesKeysToUse = ();

   my $ByLabel = ($Options{mode} =~ /^collabel$/i);

   for my $FileIdx (0 .. $#TextFilesList) {
     my $KeySpec = $KeyValues[$FileIdx];
     my $KeyColIdx = -1;

     if ($ByLabel) {
       if (exists($TextFilesColLabelToNumMap[$FileIdx]{$KeySpec})) {
         $KeyColIdx = $TextFilesColLabelToNumMap[$FileIdx]{$KeySpec};
       }
       else {
         warn "Warning: Ignoring value, $KeySpec, specified using \"-k --keys\" option: column name doesn't exist in  $TextFilesList[$FileIdx]  \n";
       }
     }
     elsif (!IsPositiveInteger($KeySpec)) {
       warn "Warning: Ignoring value, $KeySpec, specified using \"-k --keys\" option: Allowed integer values: > 0  \n";
     }
     elsif ($KeySpec > 0 && $KeySpec <= $TextFilesColCount[$FileIdx]) {
       # Command line column numbers start at 1.
       $KeyColIdx = $KeySpec - 1;
     }
     else {
       warn "Warning: Ignoring value, $KeySpec, specified using \"-k --keys\" option: column number doesn't exist in  $TextFilesList[$FileIdx]  \n";
     }

     $TextFilesKeysSpecified[$FileIdx] = $KeySpec;
     $TextFilesKeysToUse[$FileIdx] = $KeyColIdx;

     # Modify columns to merge list to make sure the column identified by
     # the key is taken off the list, along with its label.
     my @KeptCols = grep { $KeyColIdx != $_ } @{$TextFilesColToMerge[$FileIdx]};
     my @KeptLabels = map { $TextFilesColToMergeNumToLabelMap[$FileIdx]{$_} } @KeptCols;

     @{$TextFilesColToMerge[$FileIdx]} = @KeptCols;
     @{$TextFilesColToMergeLabels[$FileIdx]} = @KeptLabels;
   }
 }
 # Make sure the first input file is usable as the SD file: it has to pass the
 # "sd sdf" file type check and it has to exist. Dies otherwise.
 sub ProcessSDFileInfo {
   die "Error: Invalid first file $SDFile: It's not a SD file\n"
     unless CheckFileType($SDFile, "sd sdf");

   die "Error: SDFile $SDFile doesn't exist\n"
     unless (-e $SDFile);
 }
 376 
 # Retrieve information about input text files...
 #
 # For every text file this records the input delimiter, the number of columns,
 # the column labels taken from the first line, and a label to zero-based
 # column index map (@TextFilesInDelim, @TextFilesColCount, @TextFilesColLabels
 # and @TextFilesColLabelToNumMap). Problems are reported per file; the script
 # dies after all the files have been checked if any of them had a problem.
 sub RetrieveTextFilesInfo {
   my($TextFilesErrorCount) = 0;

   @TextFilesColCount = (); @TextFilesInDelim = (); @TextFilesColLabels = ();
   @TextFilesColLabelToNumMap = ();

  FILELIST: for my $FileIndex (0 .. $#TextFilesList) {
     my $CurrentFile = $TextFilesList[$FileIndex];

     $TextFilesColCount[$FileIndex] = 0;
     $TextFilesColLabels[$FileIndex] = [];
     $TextFilesColLabelToNumMap[$FileIndex] = {};

     if (!(-e $CurrentFile)) {
       print "File $CurrentFile doesn't exist\n";
       $TextFilesErrorCount++;
       next FILELIST;
     }
     if (!CheckFileType($CurrentFile, "csv tsv")) {
       print "Problematic file $CurrentFile: It's not a csv or tsv file\n";
       $TextFilesErrorCount++;
       next FILELIST;
     }

     # tsv files are always tab delimited; for csv files the delimiter comes
     # from the "--indelim" option.
     my($Dir, $Name, $Ext) = ParseFileName($CurrentFile);
     my $FileDelim;
     if ($Ext =~ /^tsv$/i) {
       $FileDelim = "\t";
     }
     else {
       if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
         warn "Warning: Ignoring file $CurrentFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
         $TextFilesErrorCount++;
         next FILELIST;
       }
       $FileDelim = ($Options{indelim} =~ /^semicolon$/i) ? "\;" : "\,";
     }

     # Three-argument open with a lexical handle: the name comes from the
     # command line and must never be read as an open mode or a pipe.
     my $TextFileHandle;
     if (!open $TextFileHandle, "<", $CurrentFile) {
       print "Problematic file $CurrentFile: Couldn't open it: $! \n";
       $TextFilesErrorCount++;
       next FILELIST;
     }
     my $LabelLine = GetTextLine($TextFileHandle);
     close $TextFileHandle;

     # An empty file has no label line, and quotewords can hand back undef for
     # an empty label; both are normalised so nothing below sees undef.
     my @Labels = ();
     if (defined($LabelLine) && length($LabelLine)) {
       @Labels = map { defined($_) ? $_ : "" } quotewords($FileDelim, 0, $LabelLine);
     }

     $TextFilesInDelim[$FileIndex] = $FileDelim;
     $TextFilesColCount[$FileIndex] = scalar(@Labels);
     push @{$TextFilesColLabels[$FileIndex]}, @Labels;
     for my $LabelNum (0 .. $#Labels) {
       $TextFilesColLabelToNumMap[$FileIndex]{$Labels[$LabelNum]} = $LabelNum;
     }
   }

   if ($TextFilesErrorCount) {
     die "Error: Problems with input text file(s)...\n";
   }
 }
 433 
 # Setup script usage and retrieve command line arguments specified using
 # various options...
 #
 # Fills %Options with the defaults (mode "colnum", indelim "comma") overlaid
 # by the command line, changes to the working directory when one is given,
 # and dies on any invalid value or invalid combination of options.
 sub SetupScriptUsage {
   %Options = (
     mode    => "colnum",
     indelim => "comma",
   );

   my @OptionSpecs = ("help|h", "indelim=s", "columns|c=s", "keys|k=s", "mode|m=s", "overwrite|o", "root|r=s", "sdkey|s=s", "workingdir|w=s");
   GetOptions(\%Options, @OptionSpecs)
     or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

   if ($Options{workingdir}) {
     die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"
       unless (-d $Options{workingdir});
     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
   }

   die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum, or collabel\n"
     unless ($Options{mode} =~ /^(colnum|collabel)$/i);

   die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n"
     unless ($Options{indelim} =~ /^(comma|semicolon)$/i);

   # "-s --sdkey" and "-k --keys" only make sense together.
   my $HaveSDKey = $Options{sdkey} ? 1 : 0;
   my $HaveKeys = $Options{keys} ? 1 : 0;
   if ($HaveSDKey && !$HaveKeys) {
     die "Error: The option \"-s --sdkey\" can't be specified without the \"-k --keys\" option.\n";
   }
   if ($HaveKeys && !$HaveSDKey) {
     die "Error: The option \"-k --keys\" can't be specified without the \"-s --sdkey\" option.\n";
   }
 }
 463