#!/usr/bin/perl -w
#
# $RCSfile: MergeTextFilesWithSD.pl,v $
# $Date: 2008/01/30 21:44:48 $
# $Revision: 1.22 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2008 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use 5.006;
use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileHandle;
use SDFileUtil;
use FileUtil;
use TextUtil;

# Script-wide state: script name, command line options and timing information...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# File-scope scratch variables shared by the main script and the subroutines
# defined in this file. They are declared here, ahead of any subroutine, so
# that every subroutine refers to the same variables.
my(@InputFilesList, $TextFile, @TextFilesList, $NewTextFile, $SDFile, $NewSDFile,
   $Index, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, @ColValues,
   @KeyValues, $ColLabel, $ColNum, $ColIndex, $Values, @Words, @LineWords);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

@InputFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# The first file is the SD file and at least one text file must follow it. Using
# "< 2" instead of "== 1" also rejects an empty list of expanded file names.
if (@InputFilesList < 2) {
  die "Error: Specify more than one Text file.\n";
}
$SDFile = $InputFilesList[0];
@TextFilesList = @InputFilesList[1 .. $#InputFilesList];

# Set up the name of the new SD file...
if ($Options{root}) {
  $FileDir = ""; $FileName = ""; $FileExt = "";
  ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});
  if ($FileName && $FileExt) {
    $NewSDFile = $FileName;
  }
  else {
    $NewSDFile = $Options{root};
  }
}
else {
  $FileDir = ""; $FileName = ""; $FileExt = "";
  ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
  $NewSDFile = $FileName;
  ($FileDir, $FileName, $FileExt) = ParseFileName($TextFilesList[0]);
  $NewSDFile = $NewSDFile . "MergedWith" . $FileName . "1To" . scalar(@TextFilesList);
}
$NewSDFile = $NewSDFile . ".sdf";

if (!$Options{overwrite}) {
  if (-e $NewSDFile) {
    die "Error: The file $NewSDFile already exists.\n";
  }
}
if ($Options{root}) {
  if (lc($NewSDFile) eq lc($SDFile)) {
    die "Error: Output filename, $NewSDFile, is similar to a input file name.\nSpecify a different name using \"-r --root\" option or use default name.\n";
  }
}

# Validate the per-file value lists: one non-empty value for each text file...
if ($Options{columns}) {
  @ColValues = split ";", $Options{columns};
  if (@ColValues != @TextFilesList) {
    die "Error: Invalid number of values specified by \"-c --columns\" option: it must be equal to number of input text files.\n";
  }
  for $Index (0 .. $#ColValues) {
    if (!length($ColValues[$Index])) {
      die "Error: Invalid value specified by \"-c --columns\" option: empty values are not allowed.\n";
    }
  }
}
if ($Options{keys}) {
  @KeyValues = split ";", $Options{keys};
  if (@KeyValues != @TextFilesList) {
    die "Error: Invalid number of values specified by \"-k --keys\" option: it must be equal to number of input text files.\n";
  }
  for $Index (0 .. $#KeyValues) {
    if (!length($KeyValues[$Index])) {
      die "Error: Invalid value specified by \"-k --keys\" option: empty values are not allowed.\n";
    }
  }
}

print "Processing various options and checking input text files...\n";

# Process SD file related options
ProcessSDFileInfo();

# Collect column information for all the text files...
my(@TextFilesColCount, @TextFilesInDelim, @TextFilesColLabels, @TextFilesColLabelToNumMap);
RetrieveTextFilesInfo();

# Collect values specified using "-c --columns" option and map 'em to colnum...
my(@TextFilesColSpecified, @TextFilesColToMerge, @TextFilesColToMergeLabels, @TextFilesColToMergeNumToLabelMap);
ProcessColumnsOption();

# Collect values specified using "-k --keys" option and map 'em to colnum...
my(@TextFilesKeysSpecified, @TextFilesKeysToUse, $Key);
if ($Options{keys}) {
  ProcessKeysOption();
}

print "Generating new SD file $NewSDFile...\n";
# Three-argument open: file names come from the command line and must never be
# interpreted as open modes or pipes.
open NEWSDFILE, ">", $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";

# Open up all the files and skip label lines for text files...
open SDFILE, "<", $SDFile or die "Error: Couldn't open $SDFile: $! \n";
my(@TextFilesHandleList) = ();
for $Index (0 .. $#TextFilesList) {
  $TextFilesHandleList[$Index] = FileHandle->new;
  $TextFile = $TextFilesList[$Index];
  open $TextFilesHandleList[$Index], "<", $TextFile or die "Error: Couldn't open $TextFile: $! \n";
  # Read and discard the column label line...
  $Line = GetTextLine($TextFilesHandleList[$Index]);
}

if ($Options{keys}) {
  MergeTextColumnValuesUsingKeys();
}
else {
  MergeTextColumnValues();
}

# Close up all the files. Buffered write errors surface at close time, so the
# output handle is checked...
close NEWSDFILE or die "Error: Couldn't close $NewSDFile: $! \n";
close SDFILE;
for $Index (0 .. $#TextFilesList) {
  close $TextFilesHandleList[$Index];
}
print "$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Merge the specified text columns into SD file...
#
# Each compound read from SDFILE is paired with the next line of every text
# file. The compound is written to NEWSDFILE followed by one data field for
# each column to merge.
#
# A text file that has run out of lines contributes empty values for its
# columns, so labels and values stay aligned across all the text files.
#
sub MergeTextColumnValues {
  my($Value, $CmpdString);

  while ($CmpdString = ReadCmpdString(\*SDFILE)) {
    # Take out the record terminator; it's written back after the new fields...
    $CmpdString =~ s/\$\$\$\$//g;
    print NEWSDFILE "$CmpdString";

    @ColLabels = (); @ColValues = ();
    # Merge column values from other text files...
    for $Index (0 .. $#TextFilesList) {
      push @ColLabels, @{$TextFilesColToMergeLabels[$Index]};
      $InDelim = $TextFilesInDelim[$Index];

      @LineWords = ();
      if ($Line = GetTextLine($TextFilesHandleList[$Index])) {
        @LineWords = quotewords($InDelim, 0, $Line);
      }
      # Always push exactly one value per label: missing lines, short lines and
      # undefined trailing fields all become empty values.
      for $ColNum (@{$TextFilesColToMerge[$Index]}) {
        $Value = ($ColNum < @LineWords && defined($LineWords[$ColNum])) ? $LineWords[$ColNum] : "";
        push @ColValues, $Value;
      }
    }
    for $ColIndex (0 .. $#ColLabels) {
      print NEWSDFILE "> <$ColLabels[$ColIndex]>\n$ColValues[$ColIndex]\n\n";
    }
    print NEWSDFILE "\$\$\$\$\n";
  }
}

# Merge the specified text columns into SD file using keys...
#
# All the lines of each text file are first indexed by the value of its key
# column; the first line seen for a key value wins and later duplicates are
# ignored with a warning. Compounds are then read from SDFILE and matched
# against the index using the value of the data field named by "-s --sdkey".
#
# Compounds without the SD key data field are not written to NEWSDFILE.
# Compounds without a matching text line get empty values.
#
sub MergeTextColumnValuesUsingKeys {
  my(@CmpdLines, $CmpdString, $Value, $KeyColNum, $KeyColValue, @TextFilesKeysToLinesMap, %DataFieldValues);

  @TextFilesKeysToLinesMap = ();
  # Retrieve text lines from all the text files...
  TEXTFILE: for $Index (0 .. $#TextFilesList) {
    $InDelim = $TextFilesInDelim[$Index];
    %{$TextFilesKeysToLinesMap[$Index]} = ();
    $KeyColNum = $TextFilesKeysToUse[$Index];

    # A negative column number marks a key that was rejected during options
    # processing. Used as an array index it would silently select the last
    # column, so no lines are indexed for this file.
    if ($KeyColNum < 0) {
      next TEXTFILE;
    }

    while ($Line = GetTextLine($TextFilesHandleList[$Index])) {
      @LineWords = quotewords($InDelim, 0, $Line);
      if ($KeyColNum < @LineWords) {
        $KeyColValue = $LineWords[$KeyColNum];
        if (defined($KeyColValue) && length($KeyColValue)) {
          if (exists($TextFilesKeysToLinesMap[$Index]{$KeyColValue})) {
            warn "Warning: Ignoring line, $Line, in text file $TextFilesList[$Index]: Column key value, $KeyColValue, already exists\n";
          }
          else {
            $TextFilesKeysToLinesMap[$Index]{$KeyColValue} = [@LineWords];
          }
        }
      }
    }
  }

  while ($CmpdString = ReadCmpdString(\*SDFILE)) {
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    if (exists($DataFieldValues{$Options{sdkey}})) {
      @ColLabels = (); @ColValues = ();
      $CmpdString =~ s/\$\$\$\$//g;
      print NEWSDFILE "$CmpdString";

      $KeyColValue = $DataFieldValues{$Options{sdkey}};

      # Merge column values from other text files...
      for $Index (0 .. $#TextFilesList) {
        push @ColLabels, @{$TextFilesColToMergeLabels[$Index]};
        @LineWords = ();
        if (exists($TextFilesKeysToLinesMap[$Index]{$KeyColValue})) {
          push @LineWords, @{$TextFilesKeysToLinesMap[$Index]{$KeyColValue}};
        }
        for $ColNum (@{$TextFilesColToMerge[$Index]}) {
          $Value = ($ColNum < @LineWords) ? $LineWords[$ColNum] : "";
          push @ColValues, $Value;
        }
      }
      for $ColIndex (0 .. $#ColLabels) {
        $Value = (($ColIndex < @ColValues) && IsNotEmpty($ColValues[$ColIndex]) ) ? $ColValues[$ColIndex] : "";
        print NEWSDFILE "> <$ColLabels[$ColIndex]>\n$Value\n\n";
      }
      print NEWSDFILE "\$\$\$\$\n";
    }
  }
}

# Process specified columns...
#
# For each text file, work out which columns are to be merged and set up:
#
#   @TextFilesColSpecified            - values as specified, or all the columns
#   @TextFilesColToMerge              - zero-based column numbers, in ascending order
#   @TextFilesColToMergeLabels        - labels in the same order as the column numbers
#   @TextFilesColToMergeNumToLabelMap - zero-based column number to label map
#
# Values which don't correspond to a column are ignored with a warning.
#
sub ProcessColumnsOption {
  @TextFilesColSpecified = (); @TextFilesColToMerge = (); @TextFilesColToMergeLabels = ();
  @TextFilesColToMergeNumToLabelMap = ();

  for $Index (0 .. $#TextFilesList) {
    $Values = "all";
    if ($Options{columns}) {
      $Values = $ColValues[$Index];
    }

    @{$TextFilesColSpecified[$Index]} = ();
    # The match is anchored: an unanchored /all/ would also fire for column
    # labels which merely contain "all", such as "Overall".
    if ($Values =~ /^all$/i) {
      if ($Options{mode} =~ /^colnum$/i) {
        for $ColNum (1 .. $TextFilesColCount[$Index]) {
          push @{$TextFilesColSpecified[$Index]}, $ColNum;
        }
      }
      else {
        push @{$TextFilesColSpecified[$Index]}, @{$TextFilesColLabels[$Index]};
      }
    }
    else {
      @Words = split ",", $Values;
      push @{$TextFilesColSpecified[$Index]}, @Words;
    }

    @{$TextFilesColToMerge[$Index]} = ();
    %{$TextFilesColToMergeNumToLabelMap[$Index]} = ();
    if ($Options{mode} =~ /^collabel$/i) {
      for $ColIndex (0 .. $#{$TextFilesColSpecified[$Index]}) {
        $ColLabel = $TextFilesColSpecified[$Index][$ColIndex];
        if (exists($TextFilesColLabelToNumMap[$Index]{$ColLabel})) {
          $ColNum = $TextFilesColLabelToNumMap[$Index]{$ColLabel};
          push @{$TextFilesColToMerge[$Index]}, $ColNum;
          $TextFilesColToMergeNumToLabelMap[$Index]{$ColNum} = $ColLabel;
        }
        else {
          warn "Warning: Ignoring value, $ColLabel, specified using \"-c --column\" option: column name doesn't exist in $TextFilesList[$Index] \n";
        }
      }
    }
    else {
      for $ColIndex (0 .. $#{$TextFilesColSpecified[$Index]}) {
        $ColNum = $TextFilesColSpecified[$Index][$ColIndex];
        # Make sure it's a numeric value...
        if (!IsPositiveInteger($ColNum)) {
          warn "Warning: Ignoring value, $ColNum, specified using \"-c --column\" option: Allowed integer values: > 0\n";
        }
        else {
          if ($ColNum > 0 && $ColNum <= $TextFilesColCount[$Index]) {
            # Column numbers on the command line start from 1...
            $ColNum -= 1;
            push @{$TextFilesColToMerge[$Index]}, $ColNum;
            $TextFilesColToMergeNumToLabelMap[$Index]{$ColNum} = $TextFilesColLabels[$Index][$ColNum];
          }
          else {
            warn "Warning: Ignoring value, $ColNum, specified using \"-c --column\" option: column number doesn't exist in $TextFilesList[$Index] \n";
          }
        }
      }
    }

    # Column numbers must be compared numerically: the default string sort
    # would place column 10 ahead of column 2.
    my (@TextFilesColToMergeSorted) = sort { $a <=> $b } @{$TextFilesColToMerge[$Index]};
    @{$TextFilesColToMerge[$Index]} = ();
    push @{$TextFilesColToMerge[$Index]}, @TextFilesColToMergeSorted;

    # Set up the labels...
    @{$TextFilesColToMergeLabels[$Index]} = ();
    for $ColNum (@TextFilesColToMergeSorted) {
      push @{$TextFilesColToMergeLabels[$Index]}, $TextFilesColToMergeNumToLabelMap[$Index]{$ColNum};
    }
  }
}

# Process specified keys....
#
# Map the key specified for each text file to a zero-based column number in
# @TextFilesKeysToUse; -1 is stored for a key which doesn't correspond to a
# column. Key columns are then taken off the lists of columns to merge.
#
sub ProcessKeysOption {
  @TextFilesKeysSpecified = (); @TextFilesKeysToUse = ();

  for $Index (0 .. $#TextFilesList) {
    $Key = $KeyValues[$Index];
    $TextFilesKeysSpecified[$Index] = $Key;
    $TextFilesKeysToUse[$Index] = -1;

    if ($Options{mode} =~ /^collabel$/i) {
      $ColLabel = $Key;
      if (exists($TextFilesColLabelToNumMap[$Index]{$ColLabel})) {
        $TextFilesKeysToUse[$Index] = $TextFilesColLabelToNumMap[$Index]{$ColLabel};
      }
      else {
        warn "Warning: Ignoring value, $ColLabel, specified using \"-k --keys\" option: column name doesn't exist in $TextFilesList[$Index] \n";
      }
    }
    else {
      $ColNum = $Key;
      if (!IsPositiveInteger($ColNum)) {
        warn "Warning: Ignoring value, $ColNum, specified using \"-k --keys\" option: Allowed integer values: > 0 \n";
      }
      else {
        if ($ColNum > 0 && $ColNum <= $TextFilesColCount[$Index]) {
          $TextFilesKeysToUse[$Index] = $ColNum - 1;
        }
        else {
          warn "Warning: Ignoring value, $ColNum, specified using \"-k --keys\" option: column number doesn't exist in $TextFilesList[$Index] \n";
        }
      }
    }
  }

  # Modify columns to merge list to make sure the columns identified by key are taken off the list
  my(@TextFilesColToMergeFiltered, @TextFilesColToMergeLabelsFiltered);
  for $Index (0 .. $#TextFilesList) {
    @TextFilesColToMergeFiltered = ();
    @TextFilesColToMergeLabelsFiltered = ();
    for $ColNum (@{$TextFilesColToMerge[$Index]}) {
      if ($TextFilesKeysToUse[$Index] != $ColNum) {
        push @TextFilesColToMergeFiltered, $ColNum;
        push @TextFilesColToMergeLabelsFiltered, $TextFilesColToMergeNumToLabelMap[$Index]{$ColNum};
      }
    }
    @{$TextFilesColToMerge[$Index]} = ();
    push @{$TextFilesColToMerge[$Index]}, @TextFilesColToMergeFiltered;
    @{$TextFilesColToMergeLabels[$Index]} = ();
    push @{$TextFilesColToMergeLabels[$Index]}, @TextFilesColToMergeLabelsFiltered;
  }
}

# Make sure the first input file is an existing SD file...
sub ProcessSDFileInfo {
  if (!CheckFileType($SDFile, "sd sdf")) {
    die "Error: Invalid first file $SDFile: It's not a SD file\n";
  }
  if (!(-e $SDFile)) {
    die "Error: SDFile $SDFile doesn't exist\n";
  }
}

# Retrieve information about input text files...
#
# For each text file, the delimiter, column count, column labels and the label
# to zero-based column number map are collected from its first line. Problems
# are reported for every file before the script dies.
#
sub RetrieveTextFilesInfo {
  my($TextFilesErrorCount) = 0;

  @TextFilesColCount = (); @TextFilesInDelim = (); @TextFilesColLabels = ();
  @TextFilesColLabelToNumMap = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];
    $TextFilesColCount[$Index] = 0;
    @{$TextFilesColLabels[$Index]} = ();
    %{$TextFilesColLabelToNumMap[$Index]} = ();

    if (!(-e $TextFile)) {
      print "File $TextFile doesn't exist\n";
      $TextFilesErrorCount++;
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      print "Problematic file $TextFile: It's not a csv or tsv file\n";
      $TextFilesErrorCount++;
      next FILELIST;
    }

    # Tab for tsv files; comma or semicolon, as specified, for csv files...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = ",";
      if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
        $TextFilesErrorCount++;
        next FILELIST;
      }
      if ($Options{indelim} =~ /^semicolon$/i) {
        $InDelim = ";";
      }
    }

    # Three-argument open: the file name is never interpreted as a mode or a pipe...
    if (!open TEXTFILE, "<", $TextFile) {
      print "Problematic file $TextFile: Couldn't open it: $! \n";
      $TextFilesErrorCount++;
      next FILELIST;
    }

    # The first line holds the column labels; a file without one has no columns...
    $Line = GetTextLine(\*TEXTFILE);
    @ColLabels = defined($Line) ? quotewords($InDelim, 0, $Line) : ();
    $TextFilesInDelim[$Index] = $InDelim;
    $TextFilesColCount[$Index] = @ColLabels;
    push @{$TextFilesColLabels[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesColLabelToNumMap[$Index]{$ColLabel} = $ColNum;
    }
    close TEXTFILE;
  }

  if ($TextFilesErrorCount) {
    die "Error: Problems with input text file(s)...\n";
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Defaults are "colnum" for mode and "comma" for indelim. The script dies for
# invalid option values, and when only one of "-k --keys" and "-s --sdkey" is
# specified. The working directory is changed when "-w --workingdir" is used.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{mode} = "colnum";
  $Options{indelim} = "comma";
  if (!GetOptions(\%Options, "help|h", "indelim=s", "columns|c=s", "keys|k=s", "mode|m=s", "overwrite|o", "root|r=s", "sdkey|s=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{mode} !~ /(^(colnum|collabel)$)/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum, or collabel\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  # The keys and sdkey options are only meaningful together...
  if ($Options{sdkey} && !$Options{keys}) {
    die "Error: The option \"-s --sdkey\" can't be specified without the \"-k --keys\" option.\n";
  }
  elsif (!$Options{sdkey} && $Options{keys}) {
    die "Error: The option \"-k --keys\" can't be specified without the \"-s --sdkey\" option.\n";
  }
}
