#!/usr/bin/perl -w
#
# $RCSfile: MergeTextFilesWithSD.pl,v $
# $Date: 2010/06/23 20:59:29 $
# $Revision: 1.27 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2010 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileHandle;
use SDFileUtil;
use FileUtil;
use TextUtil;

# File-scoped state shared between the main flow and the subroutines below.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
my(@InputFilesList, $TextFile, @TextFilesList, $NewTextFile, $SDFile, $NewSDFile, $Index, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, @ColValues, @KeyValues, $ColLabel, $ColNum, $ColIndex, $Values, @Words, @LineWords);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

@InputFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# The first file is the SD file; at least one text file must follow it.
if (@InputFilesList < 2) {
  die "Error: Specify more than one Text file.\n";
}
$SDFile = $InputFilesList[0];
@TextFilesList = ();
for $Index (1 .. $#InputFilesList) {
  push @TextFilesList, $InputFilesList[$Index];
}

# Work out the name of the SD file to generate: either from "-r --root" or
# from the names of the SD file and the first text file.
if ($Options{root}) {
  $FileDir = ""; $FileName = ""; $FileExt = "";
  ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});
  if ($FileName && $FileExt) {
    $NewSDFile = $FileName;
  }
  else {
    $NewSDFile = $Options{root};
  }
}
else {
  $FileDir = ""; $FileName = ""; $FileExt = "";
  ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
  $NewSDFile = $FileName;
  ($FileDir, $FileName, $FileExt) = ParseFileName($TextFilesList[0]);
  $NewSDFile = $NewSDFile . "MergedWith" . $FileName . "1To" . scalar(@TextFilesList);
}
$NewSDFile = $NewSDFile . ".sdf";
if (!$Options{overwrite}) {
  if (-e $NewSDFile) {
    die "Error: The file $NewSDFile already exists.\n";
  }
}
if ($Options{root}) {
  if (lc($NewSDFile) eq lc($SDFile)) {
    die "Error: Output filename, $NewSDFile, is similar to a input file name.\nSpecify a different name using \"-r --root\" option or use default name.\n";
  }
}

# "-c --columns" and "-k --keys" carry one semicolon delimited entry per text file.
if ($Options{columns}) {
  @ColValues = split ";", $Options{columns};
  if (@ColValues != @TextFilesList) {
    die "Error: Invalid number of values specified by \"-c --columns\" option: it must be equal to number of input text files.\n";
  }
  for $Index (0 .. $#ColValues) {
    if (!length($ColValues[$Index])) {
      die "Error: Invalid value specified by \"-c --columns\" option: empty values are not allowed.\n";
    }
  }
}
if ($Options{keys}) {
  @KeyValues = split ";", $Options{keys};
  if (@KeyValues != @TextFilesList) {
    die "Error: Invalid number of values specified by \"-k --keys\" option: it must be equal to number of input text files.\n";
  }
  for $Index (0 .. $#KeyValues) {
    if (!length($KeyValues[$Index])) {
      die "Error: Invalid value specified by \"-k --keys\" option: empty values are not allowed.\n";
    }
  }
}

print "Processing various options and checking input text files...\n";

# Process SD file related options
ProcessSDFileInfo();

# Collect column information for all the text files...
my(@TextFilesColCount, @TextFilesInDelim, @TextFilesColLabels, @TextFilesColLabelToNumMap);
RetrieveTextFilesInfo();

# Collect values specified using "-c --columns" option and map 'em to colnum...
my(@TextFilesColSpecified, @TextFilesColToMerge, @TextFilesColToMergeLabels, @TextFilesColToMergeNumToLabelMap);
ProcessColumnsOption();

# Collect values specified using "-k --keys" option and map 'em to colnum...
my(@TextFilesKeysSpecified, @TextFilesKeysToUse, $Key);
if ($Options{keys}) {
  ProcessKeysOption();
}

print "Generating new SD file $NewSDFile...\n";
open NEWSDFILE, ">", $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";

# Open up all the files and skip label lines for text files...
open SDFILE, "<", $SDFile or die "Error: Couldn't open $SDFile: $! \n";
my(@TextFilesHandleList) = ();
for $Index (0 .. $#TextFilesList) {
  $TextFilesHandleList[$Index] = FileHandle->new;
  $TextFile = $TextFilesList[$Index];
  open $TextFilesHandleList[$Index], "<", $TextFile or die "Error: Couldn't open $TextFile: $! \n";
  # Read and discard the column label line.
  $Line = GetTextLine($TextFilesHandleList[$Index]);
}

if ($Options{keys}) {
  MergeTextColumnValuesUsingKeys();
}
else {
  MergeTextColumnValues();
}

# Close up all the files. The output handle is checked because buffered
# write errors only surface at close time.
close NEWSDFILE or die "Error: Couldn't close $NewSDFile: $! \n";
close SDFILE;
for $Index (0 .. $#TextFilesList) {
  close $TextFilesHandleList[$Index];
}
print "$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Merge the specified text columns into SD file...
#
# Compounds and text lines are paired by position: for every compound read
# from SDFILE, the next line of each text file supplies the values written as
# data fields after the compound. Reads the file-scoped @TextFilesColToMerge,
# @TextFilesColToMergeLabels, @TextFilesInDelim and @TextFilesHandleList and
# writes to NEWSDFILE.
sub MergeTextColumnValues {
  my($Value, $CmpdString);

  while ($CmpdString = ReadCmpdString(\*SDFILE)) {
    # Drop the record terminator; it's written again after the new data fields.
    $CmpdString =~ s/\$\$\$\$//g;
    print NEWSDFILE "$CmpdString";

    @ColLabels = (); @ColValues = ();
    # Merge column values from other text files...
    for $Index (0 .. $#TextFilesList) {
      push @ColLabels, @{$TextFilesColToMergeLabels[$Index]};
      $InDelim = $TextFilesInDelim[$Index];

      # A text file which has run out of lines contributes empty values, so
      # that @ColValues stays aligned with @ColLabels across all text files.
      @LineWords = ();
      if ($Line = GetTextLine($TextFilesHandleList[$Index])) {
        @LineWords = quotewords($InDelim, 0, $Line);
      }
      for $ColNum (@{$TextFilesColToMerge[$Index]}) {
        $Value = ($ColNum < @LineWords && defined($LineWords[$ColNum])) ? $LineWords[$ColNum] : "";
        push @ColValues, $Value;
      }
    }
    for $ColIndex (0 .. $#ColLabels) {
      print NEWSDFILE "> <$ColLabels[$ColIndex]>\n$ColValues[$ColIndex]\n\n";
    }
    print NEWSDFILE "\$\$\$\$\n";
  }
}

# Merge the specified text columns into SD file using keys...
#
# All remaining lines of each text file are first indexed by the value found
# in its key column. Each compound which has the "-s --sdkey" data field is
# then written to NEWSDFILE followed by the column values of the text lines
# whose key matches that field; compounds without the field are not written.
sub MergeTextColumnValuesUsingKeys {
  my(@CmpdLines, $CmpdString, $Value, $KeyColNum, $KeyColValue, @TextFilesKeysToLinesMap, %DataFieldValues);

  @TextFilesKeysToLinesMap = ();
  # Retrieve text lines from all the text files...
  FILE: for $Index (0 .. $#TextFilesList) {
    $InDelim = $TextFilesInDelim[$Index];
    $TextFilesKeysToLinesMap[$Index] = {};
    $KeyColNum = $TextFilesKeysToUse[$Index];

    # ProcessKeysOption leaves -1 for a key it warned about and ignored. Used
    # as an array index, -1 would silently select the last column as the key,
    # so such a file contributes no lines at all.
    if ($KeyColNum < 0) {
      next FILE;
    }
    while ($Line = GetTextLine($TextFilesHandleList[$Index])) {
      @LineWords = quotewords($InDelim, 0, $Line);
      if ($KeyColNum < @LineWords) {
        $KeyColValue = $LineWords[$KeyColNum];
        if (defined($KeyColValue) && length($KeyColValue)) {
          if (exists($TextFilesKeysToLinesMap[$Index]{$KeyColValue})) {
            # First line seen for a key wins.
            warn "Warning: Ignoring line, $Line, in text file $TextFilesList[$Index]: Column key value, $KeyColValue, already exists\n";
          }
          else {
            $TextFilesKeysToLinesMap[$Index]{$KeyColValue} = [@LineWords];
          }
        }
      }
    }
  }

  while ($CmpdString = ReadCmpdString(\*SDFILE)) {
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    if (exists($DataFieldValues{$Options{sdkey}})) {
      @ColLabels = (); @ColValues = ();
      # Drop the record terminator; it's written again after the new data fields.
      $CmpdString =~ s/\$\$\$\$//g;
      print NEWSDFILE "$CmpdString";

      $KeyColValue = $DataFieldValues{$Options{sdkey}};

      # Merge column values from other text files...
      for $Index (0 .. $#TextFilesList) {
        push @ColLabels, @{$TextFilesColToMergeLabels[$Index]};
        @LineWords = ();
        if (exists($TextFilesKeysToLinesMap[$Index]{$KeyColValue})) {
          push @LineWords, @{$TextFilesKeysToLinesMap[$Index]{$KeyColValue}};
        }
        # No matching line leaves @LineWords empty and yields empty values.
        for $ColNum (@{$TextFilesColToMerge[$Index]}) {
          $Value = ($ColNum < @LineWords) ? $LineWords[$ColNum] : "";
          push @ColValues, $Value;
        }
      }
      for $ColIndex (0 .. $#ColLabels) {
        $Value = (($ColIndex < @ColValues) && IsNotEmpty($ColValues[$ColIndex]) ) ? $ColValues[$ColIndex] : "";
        print NEWSDFILE "> <$ColLabels[$ColIndex]>\n$Value\n\n";
      }
      print NEWSDFILE "\$\$\$\$\n";
    }
  }
}

# Process specified columns...
#
# For each text file, turns its "-c --columns" entry (or "all" when the option
# isn't used) into a list of zero based column numbers sorted in ascending
# order, @TextFilesColToMerge, along with the matching labels,
# @TextFilesColToMergeLabels. Entries which don't correspond to a column are
# skipped with a warning.
sub ProcessColumnsOption {
  @TextFilesColSpecified = (); @TextFilesColToMerge = (); @TextFilesColToMergeLabels = ();
  @TextFilesColToMergeNumToLabelMap = ();

  for $Index (0 .. $#TextFilesList) {
    $Values = "all";
    if ($Options{columns}) {
      $Values = $ColValues[$Index];
    }

    @{$TextFilesColSpecified[$Index]} = ();
    # Anchored so that only the literal value "all" selects every column; a
    # column label which merely contains "all" is treated as a label.
    if ($Values =~ /^all$/i) {
      if ($Options{mode} =~ /^colnum$/i) {
        for $ColNum (1 .. $TextFilesColCount[$Index]) {
          push @{$TextFilesColSpecified[$Index]}, $ColNum;
        }
      }
      else {
        push @{$TextFilesColSpecified[$Index]}, @{$TextFilesColLabels[$Index]};
      }
    }
    else {
      @Words = split ",", $Values;
      push @{$TextFilesColSpecified[$Index]}, @Words;
    }

    @{$TextFilesColToMerge[$Index]} = ();
    %{$TextFilesColToMergeNumToLabelMap[$Index]} = ();
    if ($Options{mode} =~ /^collabel$/i) {
      # Column labels: map each one to its zero based column number.
      for $ColIndex (0 .. $#{$TextFilesColSpecified[$Index]}) {
        $ColLabel = $TextFilesColSpecified[$Index][$ColIndex];
        if (exists($TextFilesColLabelToNumMap[$Index]{$ColLabel})) {
          $ColNum = $TextFilesColLabelToNumMap[$Index]{$ColLabel};
          push @{$TextFilesColToMerge[$Index]}, $ColNum;
          $TextFilesColToMergeNumToLabelMap[$Index]{$ColNum} = $ColLabel;
        }
        else {
          warn "Warning: Ignoring value, $ColLabel, specified using \"-c --column\" option: column name doesn't exist in $TextFilesList[$Index]  \n";
        }
      }
    }
    else {
      # Column numbers: one based on the command line, zero based internally.
      for $ColIndex (0 .. $#{$TextFilesColSpecified[$Index]}) {
        $ColNum = $TextFilesColSpecified[$Index][$ColIndex];
        # Make sure it's a numeric value...
        if (!IsPositiveInteger($ColNum)) {
          warn "Warning: Ignoring value, $ColNum, specified using \"-c --column\" option:  Allowed integer values: > 0\n";
        }
        else {
          if ($ColNum > 0 && $ColNum <= $TextFilesColCount[$Index]) {
            $ColNum -= 1;
            push @{$TextFilesColToMerge[$Index]}, $ColNum;
            $TextFilesColToMergeNumToLabelMap[$Index]{$ColNum} = $TextFilesColLabels[$Index][$ColNum];
          }
          else {
            warn "Warning: Ignoring value, $ColNum, specified using \"-c --column\" option:  column number doesn't exist in $TextFilesList[$Index]  \n";
          }
        }
      }
    }

    # Column numbers must be compared numerically: the default string sort
    # would place column 10 ahead of column 2.
    my (@TextFilesColToMergeSorted) = sort { $a <=> $b } @{$TextFilesColToMerge[$Index]};
    @{$TextFilesColToMerge[$Index]} = ();
    push @{$TextFilesColToMerge[$Index]}, @TextFilesColToMergeSorted;

    # Set up the labels...
    @{$TextFilesColToMergeLabels[$Index]} = ();
    for $ColNum (@TextFilesColToMergeSorted) {
      push @{$TextFilesColToMergeLabels[$Index]}, $TextFilesColToMergeNumToLabelMap[$Index]{$ColNum};
    }
  }
}

# Process specified keys....
#
# For each text file, maps its "-k --keys" entry to a zero based column number
# in @TextFilesKeysToUse; -1 is stored for an entry which is ignored with a
# warning. The key column is then taken off the list of columns to merge.
sub ProcessKeysOption {
  @TextFilesKeysSpecified = (); @TextFilesKeysToUse = ();

  for $Index (0 .. $#TextFilesList) {
    $Key = $KeyValues[$Index];
    $TextFilesKeysSpecified[$Index] = $Key;
    $TextFilesKeysToUse[$Index] = -1;

    if ($Options{mode} =~ /^collabel$/i) {
      $ColLabel = $Key;
      if (exists($TextFilesColLabelToNumMap[$Index]{$ColLabel})) {
        $TextFilesKeysToUse[$Index] = $TextFilesColLabelToNumMap[$Index]{$ColLabel};
      }
      else {
        warn "Warning: Ignoring value, $ColLabel, specified using \"-k --keys\" option: column name doesn't exist in $TextFilesList[$Index]  \n";
      }
    }
    else {
      $ColNum = $Key;
      if (!IsPositiveInteger($ColNum)) {
        warn "Warning: Ignoring value, $ColNum, specified using \"-k --keys\" option:  Allowed integer values: > 0  \n";
      }
      else {
        if ($ColNum > 0 && $ColNum <= $TextFilesColCount[$Index]) {
          $TextFilesKeysToUse[$Index] = $ColNum - 1;
        }
        else {
          warn "Warning: Ignoring value, $ColNum, specified using \"-k --keys\" option:  column number doesn't exist in $TextFilesList[$Index]  \n";
        }
      }
    }
  }

  # Modify columns to merge list to make sure the columns identified by key are taken off the list
  my(@TextFilesColToMergeFiltered, @TextFilesColToMergeLabelsFiltered);
  for $Index (0 .. $#TextFilesList) {
    @TextFilesColToMergeFiltered = ();
    @TextFilesColToMergeLabelsFiltered = ();
    for $ColNum (@{$TextFilesColToMerge[$Index]}) {
      if ($TextFilesKeysToUse[$Index] != $ColNum) {
        push @TextFilesColToMergeFiltered, $ColNum;
        push @TextFilesColToMergeLabelsFiltered, $TextFilesColToMergeNumToLabelMap[$Index]{$ColNum};
      }
    }
    @{$TextFilesColToMerge[$Index]} = ();
    push @{$TextFilesColToMerge[$Index]}, @TextFilesColToMergeFiltered;
    @{$TextFilesColToMergeLabels[$Index]} = ();
    push @{$TextFilesColToMergeLabels[$Index]}, @TextFilesColToMergeLabelsFiltered;
  }
}

# Check the first input file: it must pass the "sd sdf" file type check and
# exist; the script dies otherwise.
sub ProcessSDFileInfo {
  if (!CheckFileType($SDFile, "sd sdf")) {
    die "Error: Invalid first file $SDFile: It's not a SD file\n";
  }
  if (!(-e $SDFile)) {
    die "Error: SDFile $SDFile doesn't exist\n";
  }
}

# Retrieve information about input text files...
#
# For each text file, records its delimiter, column count, column labels and
# label to zero based column number map, all taken from its first line.
# Problems are reported for every file before the script dies, so that all of
# them are seen in one run.
sub RetrieveTextFilesInfo {
  my($TextFilesErrorCount) = 0;

  @TextFilesColCount = (); @TextFilesInDelim = (); @TextFilesColLabels = ();
  @TextFilesColLabelToNumMap = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];
    $TextFilesColCount[$Index] = 0;
    @{$TextFilesColLabels[$Index]} = ();
    %{$TextFilesColLabelToNumMap[$Index]} = ();

    if (!(-e $TextFile)) {
      print "File $TextFile doesn't exist\n";
      $TextFilesErrorCount++;
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      print "Problematic file $TextFile: It's not a csv or tsv file\n";
      $TextFilesErrorCount++;
      next FILELIST;
    }

    # Files with tsv extension are tab delimited; for all others the
    # delimiter comes from the "--indelim" option.
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
        $TextFilesErrorCount++;
        next FILELIST;
      }
      if ($Options{indelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    if (!open TEXTFILE, "<", $TextFile) {
      print "Problematic file $TextFile: Couldn't open it: $! \n";
      $TextFilesErrorCount++;
      next FILELIST;
    }

    # The first line holds the column labels.
    $Line = GetTextLine(\*TEXTFILE);
    @ColLabels = quotewords($InDelim, 0, $Line);
    $TextFilesInDelim[$Index] = $InDelim;
    $TextFilesColCount[$Index] = @ColLabels;
    push @{$TextFilesColLabels[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesColLabelToNumMap[$Index]{$ColLabel} = $ColNum;
    }
    close TEXTFILE;
  }

  if ($TextFilesErrorCount) {
    die "Error: Problems with input text file(s)...\n";
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options with the defaults (mode: colnum; indelim: comma) and the
# values from the command line, changes into the "-w --workingdir" directory
# when one is given, and dies on invalid option values or combinations.
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{mode} = "colnum";
  $Options{indelim} = "comma";
  if (!GetOptions(\%Options, "help|h", "indelim=s", "columns|c=s", "keys|k=s", "mode|m=s", "overwrite|o", "root|r=s", "sdkey|s=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{mode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum, or collabel\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  # "-s --sdkey" and "-k --keys" are only meaningful together.
  if ($Options{sdkey} && !$Options{keys}) {
    die "Error: The option \"-s --sdkey\" can't be specified without the \"-k --keys\" option.\n";
  }
  elsif (!$Options{sdkey} && $Options{keys}) {
    die "Error: The option \"-k --keys\" can't be specified without the \"-s --sdkey\" option.\n";
  }
}
