#!/usr/bin/perl -w
#
# $RCSfile: ExtractFromTextFiles.pl,v $
# $Date: 2008/01/30 21:44:46 $
# $Revision: 1.26 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2008 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use 5.006;
use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use FileHandle;
use Benchmark;
use FileUtil;
use TextUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

$StartTime = Benchmark->new;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

my($OutDelim, $OutQuote, $SpecifiedCategoryCol, $SpecifiedRowsMode, @SpecifiedColumns, @SpecifiedRowValues);
ProcessOptions();

# Collect column information for all the text files...
print "Checking input text file(s)...\n";
my(@TextFilesOkay, @TextFilesColCount, @TextFilesColLabels, @TextFilesColLabelToNumMap, @TextFilesInDelim, @TextFilesOutFile, @TextFilesOutFileExt, @TextFilesCategoryOutFileRoot);
RetrieveTextFilesInfo();

# Make sure the specified columns exists in text files...
my(@TextFilesCategoryColNum, @TextFilesColNumsToExtract);
ProcessColumnsInfo();

# Process specified rows info...
my(@TextFilesRowValues);
ProcessRowsInfo();

# Generate output files...
my($Index, $TextFile);
if (@TextFilesList > 1) {
  print "Processing text files...\n";
}
for $Index (0 .. $#TextFilesList) {
  # Files that failed any of the checks above are silently skipped here; a
  # warning has already been issued for them.
  next unless $TextFilesOkay[$Index];

  $TextFile = $TextFilesList[$Index];
  if (@TextFilesList > 1) {
    print "\nProcessing file $TextFile...\n";
  }
  else {
    print "Processing file $TextFile...\n";
  }
  if ($Options{mode} =~ /^categories$/i) {
    ExtractCategoryData($Index);
  }
  elsif ($Options{mode} =~ /^rows$/i) {
    ExtractRowsData($Index);
  }
  else {
    ExtractColumnData($Index);
  }
}

print "$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate category files...
#
# For the input file at position $Index in @TextFilesList, group the data lines
# by the value found in the category column and write:
#
#   o A summary file ($TextFilesOutFile[$Index]) holding one "Category,Count"
#     line per distinct category value, sorted case-insensitively.
#   o One file per category containing the column labels followed by all the
#     lines which belong to that category.
#
# Lines which don't have a value in the category column are collected under
# the empty category name. Dies if any file can't be opened or written.
#
sub ExtractCategoryData {
  my($Index) = @_;

  my $TextFile = $TextFilesList[$Index];
  my $NewTextFile = $TextFilesOutFile[$Index];
  my $CategoryCol = $TextFilesCategoryColNum[$Index];
  my $InDelim = $TextFilesInDelim[$Index];
  my @ColLabels = @{$TextFilesColLabels[$Index]};

  # Collect category data...
  my(%CategoriesNameToCountMap, %CategoriesNameToLinesMap);
  open my $TextFileHandle, '<', $TextFile or die "Couldn't open $TextFile: $! \n";
  # Skip label line...
  my $LabelLine = <$TextFileHandle>;

  while (my $Line = GetTextLine($TextFileHandle)) {
    my @LineWords = quotewords($InDelim, 0, $Line);
    # $CategoryCol is a zero based index: it must be strictly less than the
    # number of words for the value to exist.
    my $CategoryName = ($CategoryCol < @LineWords && defined $LineWords[$CategoryCol]) ? $LineWords[$CategoryCol] : "";
    $CategoriesNameToCountMap{$CategoryName}++;
    push @{$CategoriesNameToLinesMap{$CategoryName}}, $Line;
  }
  close $TextFileHandle;

  my @SortedCategoryNames = sort { lc($a) cmp lc($b) } keys %CategoriesNameToCountMap;

  # Setup file names for individual category files...
  # NOTE(review): spaces are stripped from the names, so two categories which
  # differ only by spaces end up with the same file name - confirm whether
  # that needs to be handled.
  my(%CategoriesNameToFileNameMap);
  for my $CategoryName (@SortedCategoryNames) {
    my $CategoryFile = $TextFilesCategoryOutFileRoot[$Index] . "$CategoryName" . ".$TextFilesOutFileExt[$Index]";
    $CategoryFile =~ s/ //g;
    $CategoriesNameToFileNameMap{$CategoryName} = $CategoryFile;
  }

  # Write out summary file...
  print "Generating file $NewTextFile...\n";
  open my $NewTextFileHandle, '>', $NewTextFile or die "Couldn't open $NewTextFile: $! \n";
  # Write out column labels...
  my @SummaryLabels = ("Category", "Count");
  my $SummaryLine = JoinWords(\@SummaryLabels, $OutDelim, $OutQuote);
  print {$NewTextFileHandle} "$SummaryLine\n";

  # Write out the category names and count...
  for my $CategoryName (@SortedCategoryNames) {
    my @SummaryWords = ("$CategoryName", "$CategoriesNameToCountMap{$CategoryName}");
    $SummaryLine = JoinWords(\@SummaryWords, $OutDelim, $OutQuote);
    print {$NewTextFileHandle} "$SummaryLine\n";
  }
  close $NewTextFileHandle or die "Couldn't close $NewTextFile: $! \n";

  # Write out a file for each category...
  # Each file is opened only while it's being written so that the number of
  # simultaneously open handles doesn't grow with the number of categories.
  my $ColLabelLine = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
  print "\nGenerating text files for each category...\n";
  for my $CategoryName (@SortedCategoryNames) {
    my $CategoryFile = $CategoriesNameToFileNameMap{$CategoryName};
    print "Generating file $CategoryFile...\n";
    open my $CategoryFileHandle, '>', $CategoryFile or die "Couldn't open $CategoryFile: $! \n";
    print {$CategoryFileHandle} "$ColLabelLine\n";
    for my $CategoryLine (@{$CategoriesNameToLinesMap{$CategoryName}}) {
      my @LineWords = quotewords($InDelim, 0, $CategoryLine);
      my $OutLine = JoinWords(\@LineWords, $OutDelim, $OutQuote);
      print {$CategoryFileHandle} "$OutLine\n";
    }
    close $CategoryFileHandle or die "Couldn't close $CategoryFile: $! \n";
  }
}

# Extract data for specific columns...
#
# Copy the columns listed in $TextFilesColNumsToExtract[$Index] (zero based
# column numbers) from the input file at position $Index into its output
# file. The first output line holds the labels of the extracted columns; a
# missing or undefined value in a data line is written out as an empty string.
#
sub ExtractColumnData {
  my($Index) = @_;

  my $TextFile = $TextFilesList[$Index];
  my $NewTextFile = $TextFilesOutFile[$Index];
  my $InDelim = $TextFilesInDelim[$Index];
  my @ColNumsToExtract = @{$TextFilesColNumsToExtract[$Index]};

  print "Generating file $NewTextFile...\n";
  open my $TextFileHandle, '<', $TextFile or die "Couldn't open $TextFile: $! \n";
  open my $NewTextFileHandle, '>', $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

  # Skip label line: labels come from the information collected earlier...
  my $LabelLine = <$TextFileHandle>;

  # Write out column labels...
  my @ColLabels = map { $TextFilesColLabels[$Index][$_] } @ColNumsToExtract;
  my $ColLabelLine = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
  print {$NewTextFileHandle} "$ColLabelLine\n";

  while (my $Line = GetTextLine($TextFileHandle)) {
    my @LineWords = quotewords($InDelim, 0, $Line);
    my @ColValues = map { ($_ < @LineWords && defined $LineWords[$_]) ? $LineWords[$_] : "" } @ColNumsToExtract;
    my $ColValuesLine = JoinWords(\@ColValues, $OutDelim, $OutQuote);
    print {$NewTextFileHandle} "$ColValuesLine\n";
  }
  close $NewTextFileHandle or die "Couldn't close $NewTextFile: $! \n";
  close $TextFileHandle;
}

# Extract data for specific rows...
#
# Write the column labels of the input file at position $Index to its output
# file and then hand both file handles to the extraction routine matching
# $SpecifiedRowsMode, which writes out the selected rows.
#
sub ExtractRowsData {
  my($Index) = @_;

  my $TextFile = $TextFilesList[$Index];
  my $NewTextFile = $TextFilesOutFile[$Index];

  print "Generating file $NewTextFile...\n";
  open my $TextFileHandle, '<', $TextFile or die "Couldn't open $TextFile: $! \n";
  open my $NewTextFileHandle, '>', $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

  # Write out column labels...
  my $LabelLine = <$TextFileHandle>;
  my @ColLabels = @{$TextFilesColLabels[$Index]};
  my $ColLabelLine = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
  print {$NewTextFileHandle} "$ColLabelLine\n";

  # Rows mode name (lower case) to extraction routine...
  my %RowsModeToHandlerMap = (
    rowsbycolvalue      => \&ExtractRowsByColValue,
    rowsbycolvaluelist  => \&ExtractRowsByColValueList,
    rowsbycolvaluerange => \&ExtractRowsByColValueRange,
    rowbymincolvalue    => \&ExtractRowByMinOrMaxColValue,
    rowbymaxcolvalue    => \&ExtractRowByMinOrMaxColValue,
    rownums             => \&ExtractRowsByRowNums,
    rownumrange         => \&ExtractRowsByRowNumRange,
  );
  my $Handler = $RowsModeToHandlerMap{lc $SpecifiedRowsMode};
  if ($Handler) {
    $Handler->($Index, $TextFileHandle, $NewTextFileHandle);
  }

  close $NewTextFileHandle or die "Couldn't close $NewTextFile: $! \n";
  close $TextFileHandle;
}

# Extract rows by column value...
#
# Write out every line which passes all the (column number, value, criterion)
# triplets stored in $TextFilesRowValues[$Index]. The criteria are:
#
#   le - numerical value in the column must be <= the specified value
#   ge - numerical value in the column must be >= the specified value
#   eq - value in the column must be the same string as the specified value
#
# Lines which don't contain one of the columns are skipped.
#
sub ExtractRowsByColValue {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my $Delimiter = $TextFilesInDelim[$Index];
  my @Triplets = @{$TextFilesRowValues[$Index]};

  ROW: while (my $Row = GetTextLine($TextFileRef)) {
    my @Fields = quotewords($Delimiter, 0, $Row);

    my $Position = 0;
    while ($Position < @Triplets) {
      my($ColNum, $Wanted, $Test) = @Triplets[$Position, $Position + 1, $Position + 2];
      $Position += 3;

      next ROW if $ColNum > $#Fields;

      my $Actual = $Fields[$ColNum];
      if ($Test =~ /^le$/i) {
        next ROW if $Actual > $Wanted;
      }
      elsif ($Test =~ /^ge$/i) {
        next ROW if $Actual < $Wanted;
      }
      elsif ($Test =~ /^eq$/i) {
        next ROW if $Actual ne $Wanted;
      }
    }

    # All the criteria are satisfied: write it out...
    my $OutRow = JoinWords(\@Fields, $OutDelim, $OutQuote);
    print {$NewTextFileRef} "$OutRow\n";
  }
}

# Extract rows by column value list...
#
# $TextFilesRowValues[$Index] holds a column number followed by the list of
# acceptable values: a line is written out when its value in that column is
# one of the listed values. Lines without the column are skipped.
#
sub ExtractRowsByColValueList {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my $Delimiter = $TextFilesInDelim[$Index];
  my($ColNum, @WantedValues) = @{$TextFilesRowValues[$Index]};

  # Setup a lookup for the wanted values...
  my %IsWantedValue = map { ($_ => 1) } @WantedValues;

  while (my $Row = GetTextLine($TextFileRef)) {
    my @Fields = quotewords($Delimiter, 0, $Row);
    next if $ColNum > $#Fields;
    next unless exists $IsWantedValue{$Fields[$ColNum]};

    my $OutRow = JoinWords(\@Fields, $OutDelim, $OutQuote);
    print {$NewTextFileRef} "$OutRow\n";
  }
}

# Extract row by minimum or maximum column value...
#
# Scan all the lines and write out the single line holding the largest
# ($SpecifiedRowsMode is rowbymaxcolvalue) or the smallest (any other mode)
# numerical value in the column given by the first entry of
# $TextFilesRowValues[$Index]. The first line encountered wins a tie.
#
sub ExtractRowByMinOrMaxColValue {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my $Delimiter = $TextFilesInDelim[$Index];
  my $ColNum = $TextFilesRowValues[$Index][0];
  my $WantMax = ($SpecifiedRowsMode =~ /^rowbymaxcolvalue$/i) ? 1 : 0;

  my($BestValue, $BestRow, $HaveBest) = ('', '', 0);
  while (my $Row = GetTextLine($TextFileRef)) {
    my @Fields = quotewords($Delimiter, 0, $Row);
    next if $ColNum > $#Fields;

    my $Candidate = $Fields[$ColNum];
    if (!$HaveBest) {
      # First usable line becomes the initial reference...
      ($HaveBest, $BestValue, $BestRow) = (1, $Candidate, $Row);
      next;
    }

    my $IsBetter = $WantMax ? ($Candidate > $BestValue) : ($Candidate < $BestValue);
    if ($IsBetter) {
      ($BestValue, $BestRow) = ($Candidate, $Row);
    }
  }

  if ($BestRow) {
    my @Fields = quotewords($Delimiter, 0, $BestRow);
    my $OutRow = JoinWords(\@Fields, $OutDelim, $OutQuote);
    print {$NewTextFileRef} "$OutRow\n";
  }
}

# Extract rows by column value range...
#
# $TextFilesRowValues[$Index] holds a column number followed by a minimum and
# a maximum value: a line is written out when the numerical value in that
# column lies within the inclusive range. Lines without the column are skipped.
#
sub ExtractRowsByColValueRange {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my $InDelim = $TextFilesInDelim[$Index];
  my($ColNum, $MinValue, $MaxValue) = @{$TextFilesRowValues[$Index]}[0 .. 2];

  LINE: while (my $Line = GetTextLine($TextFileRef)) {
    my @LineWords = quotewords($InDelim, 0, $Line);
    next LINE if $ColNum > $#LineWords;

    my $ColValue = $LineWords[$ColNum];
    if ($ColValue >= $MinValue && $ColValue <= $MaxValue) {
      my $OutLine = JoinWords(\@LineWords, $OutDelim, $OutQuote);
      print {$NewTextFileRef} "$OutLine\n";
    }
  }
}

# Extract rows by row number range...
#
# $TextFilesRowValues[$Index] holds the first and the last row number to write
# out (inclusive). The row counter starts at 1 and is incremented before each
# comparison, so the first line read here counts as row 2; presumably row 1 is
# the label line already consumed by the caller. Reading stops as soon as the
# last requested row has been passed.
#
sub ExtractRowsByRowNumRange {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my $InDelim = $TextFilesInDelim[$Index];
  my($MinRowNum, $MaxRowNum) = @{$TextFilesRowValues[$Index]}[0, 1];

  my $RowCount = 1;
  LINE: while (my $Line = GetTextLine($TextFileRef)) {
    $RowCount++;
    last LINE if $RowCount > $MaxRowNum;
    # Lines in front of the range don't need to be parsed at all...
    next LINE if $RowCount < $MinRowNum;

    my @LineWords = quotewords($InDelim, 0, $Line);
    my $OutLine = JoinWords(\@LineWords, $OutDelim, $OutQuote);
    print {$NewTextFileRef} "$OutLine\n";
  }
}

# Extract rows by row numbers...
#
# $TextFilesRowValues[$Index] holds the list of row numbers to write out. Rows
# are counted the same way as in ExtractRowsByRowNumRange: the first line read
# here counts as row 2. Reading stops once the largest requested row number
# has been passed.
#
sub ExtractRowsByRowNums {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my $InDelim = $TextFilesInDelim[$Index];

  # Setup a row nums map and find the largest row number...
  my %RowNumMap = ();
  my $MaxRowNum = $TextFilesRowValues[$Index][0];
  for my $RowNum (@{$TextFilesRowValues[$Index]}) {
    $MaxRowNum = $RowNum if $RowNum > $MaxRowNum;
    $RowNumMap{$RowNum} = $RowNum;
  }

  my $RowCount = 1;
  LINE: while (my $Line = GetTextLine($TextFileRef)) {
    $RowCount++;
    last LINE if $RowCount > $MaxRowNum;
    next LINE unless exists $RowNumMap{$RowCount};

    my @LineWords = quotewords($InDelim, 0, $Line);
    my $OutLine = JoinWords(\@LineWords, $OutDelim, $OutQuote);
    print {$NewTextFileRef} "$OutLine\n";
  }
}

# Process option values...
#
# Validate the values collected in %Options and set up the file level
# variables used during extraction: $SpecifiedCategoryCol, @SpecifiedColumns,
# $OutDelim, $OutQuote, $SpecifiedRowsMode and @SpecifiedRowValues. Dies with
# an error message on the first invalid value.
#
# The "--rows" values are validated according to "--rowsmode" irrespective of
# the value of "--mode". Without any "--rows" value, rownums mode defaults to
# row number 1 and all other modes are treated as an error.
#
sub ProcessOptions {
  my $ColNumMode = ($Options{colmode} =~ /^colnum$/i) ? 1 : 0;
  my $RowsMode = $Options{rowsmode};

  $SpecifiedCategoryCol = "";
  if (defined $Options{categorycol}) {
    my(@SpecifiedValues) = split ",", $Options{categorycol};
    if (@SpecifiedValues != 1) {
      die "Error: Invalid number of values, ",scalar(@SpecifiedValues), " using \"--categorycol\" option: Only one value is allowed.\n";
    }
    $SpecifiedCategoryCol = $SpecifiedValues[0];
    if ($ColNumMode && !IsPositiveInteger($SpecifiedCategoryCol)) {
      die "Error: Category column value, $SpecifiedCategoryCol, specified using \"--categorycol\" is not valid. Allowed integer values: > 0.\n";
    }
  }

  @SpecifiedColumns = ();
  if (defined $Options{columns}) {
    my(@SpecifiedValues) = split ",", $Options{columns};
    if ($ColNumMode) {
      for my $ColValue (@SpecifiedValues) {
        if (!IsPositiveInteger($ColValue)) {
          die "Error: Column value, $ColValue, specified using \"--columns\" is not valid: Allowed integer values: > 0.\n";
        }
      }
    }
    push @SpecifiedColumns, @SpecifiedValues;
  }

  $OutDelim = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
  $OutQuote = ($Options{quote} =~ /^yes$/i) ? 1 : 0;

  # Process any specified rows values...
  $SpecifiedRowsMode = $RowsMode;
  @SpecifiedRowValues = defined $Options{rows} ? split(",", $Options{rows}) : ();
  if (!@SpecifiedRowValues) {
    # A value without any usable entries (for example "" or ",") is handled
    # the same way as a missing "--rows" option.
    if ($RowsMode !~ /^rownums$/i) {
      die "Error: Specify value for \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\n";
    }
    push @SpecifiedRowValues, "1";
  }

  # Common part of the messages about wrong number of values...
  my $InvalidCountMsg = "Error: Invalid number of values, " . scalar(@SpecifiedRowValues) . ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\n";

  # Die unless a row or column id is a positive integer...
  my $CheckPositiveInteger = sub {
    my($SpecifiedID) = @_;
    if (!IsPositiveInteger($SpecifiedID)) {
      die "Error: Rows value, $SpecifiedID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
    }
  };

  # Make sure specified values are okay...
  if ($RowsMode =~ /^rowsbycolvalue$/i) {
    if (@SpecifiedRowValues % 3) {
      die $InvalidCountMsg, "It must contain triplets.\n";
    }
    # Triplet format: colid,value,criteria. Criterion: le,ge,eq
    for (my $ValueIndex = 0; $ValueIndex < @SpecifiedRowValues; $ValueIndex += 3) {
      my($ColID, $Value, $Criterion) = @SpecifiedRowValues[$ValueIndex .. $ValueIndex + 2];
      # Report the triplet in the same order as it's specified...
      if ($ColNumMode && !IsPositiveInteger($ColID)) {
        die "Error: Invalid column id, $ColID, specified in triplet, \"$ColID,$Value,$Criterion\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
      }
      if ($Criterion !~ /^(eq|le|ge)$/i) {
        die "Error: Invalid criterion value, $Criterion, specified in triplet, \"$ColID,$Value,$Criterion\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed values: le, ge, or eq.\n";
      }
    }
  }
  elsif ($RowsMode =~ /^rowsbycolvaluelist$/i) {
    $CheckPositiveInteger->($SpecifiedRowValues[0]) if $ColNumMode;
    if (@SpecifiedRowValues == 1) {
      die $InvalidCountMsg, "It must contain more than one value\n";
    }
  }
  elsif ($RowsMode =~ /^rowsbycolvaluerange$/i) {
    if (@SpecifiedRowValues != 3) {
      die $InvalidCountMsg, "It must contain three values\n";
    }
    $CheckPositiveInteger->($SpecifiedRowValues[0]) if $ColNumMode;
    if ($SpecifiedRowValues[1] >= $SpecifiedRowValues[2]) {
      die "Error: Invalid value triplet - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: second value < third value\n";
    }
  }
  elsif ($RowsMode =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) {
    if (@SpecifiedRowValues != 1) {
      die $InvalidCountMsg, "Only one value is allowed.\n";
    }
    $CheckPositiveInteger->($SpecifiedRowValues[0]) if $ColNumMode;
  }
  elsif ($RowsMode =~ /^rownums$/i) {
    for my $SpecifiedRowID (@SpecifiedRowValues) {
      $CheckPositiveInteger->($SpecifiedRowID);
    }
  }
  elsif ($RowsMode =~ /^rownumrange$/i) {
    if (@SpecifiedRowValues != 2) {
      die $InvalidCountMsg, "It must contain only two values.\n";
    }
    for my $SpecifiedRowID (@SpecifiedRowValues) {
      $CheckPositiveInteger->($SpecifiedRowID);
    }
    if ($SpecifiedRowValues[0] >= $SpecifiedRowValues[1]) {
      die "Error: Invalid value pair - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: First value < second value\n";
    }
  }
}

# Retrieve information about input text files...
#
# For each file in @TextFilesList, figure out the input delimiter, read the
# column labels from the first line and set up the output file names. The
# results are stored in the @TextFiles* arrays at the position of the file.
# A file which doesn't exist, isn't a csv/tsv file, can't be opened or whose
# output file would clash with an existing file is marked as not okay in
# @TextFilesOkay after issuing a warning.
#
sub RetrieveTextFilesInfo {
  @TextFilesOkay = ();
  @TextFilesColCount = (); @TextFilesColLabels = ();
  @TextFilesColLabelToNumMap = ();
  @TextFilesInDelim = ();
  @TextFilesOutFile = (); @TextFilesOutFileExt = (); @TextFilesCategoryOutFileRoot = ();

  FILELIST: for my $Index (0 .. $#TextFilesList) {
    my $TextFile = $TextFilesList[$Index];

    $TextFilesOkay[$Index] = 0;
    $TextFilesColCount[$Index] = 0;
    $TextFilesInDelim[$Index] = "";
    $TextFilesOutFile[$Index] = "";
    $TextFilesOutFileExt[$Index] = "";
    $TextFilesCategoryOutFileRoot[$Index] = "";
    @{$TextFilesColLabels[$Index]} = ();
    %{$TextFilesColLabelToNumMap[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # Input delimiter: always tab for tsv files; "--indelim" for csv files...
    my($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    my $InDelim;
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      if (!($Options{indelim} =~ /^(comma|semicolon)$/i)) {
        warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      $InDelim = ($Options{indelim} =~ /^semicolon$/i) ? "\;" : "\,";
    }

    # Column labels come from the first line...
    my $TextFileHandle;
    if (!open $TextFileHandle, '<', $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    my $Line = GetTextLine($TextFileHandle);
    my @ColLabels = defined $Line ? quotewords($InDelim, 0, $Line) : ();
    close $TextFileHandle;

    # Setup output file names...
    my $OutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";
    my $OutFileRoot;
    if ($Options{root} && (@TextFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      $FileName = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = $FileName;
      $OutFileRoot .= ($Options{mode} =~ /^categories$/i) ? "CategoriesSummary" : (($Options{mode} =~ /^rows$/i) ? "ExtractedRows" : "ExtractedColumns");
    }
    my $CategoryOutFileRoot = "$FileName" . "Category";

    my $OutFile = $OutFileRoot . ".$OutFileExt";
    if (lc($OutFile) eq lc($TextFile)) {
      warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n";
      next FILELIST;
    }
    if (!$Options{overwrite} && -e $OutFile) {
      warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
      next FILELIST;
    }

    $TextFilesOkay[$Index] = 1;
    $TextFilesInDelim[$Index] = $InDelim;
    $TextFilesCategoryOutFileRoot[$Index] = "$CategoryOutFileRoot";
    $TextFilesOutFile[$Index] = "$OutFile";
    $TextFilesOutFileExt[$Index] = "$OutFileExt";

    $TextFilesColCount[$Index] = @ColLabels;
    push @{$TextFilesColLabels[$Index]}, @ColLabels;
    # Map each label to its zero based column number...
    for my $ColNum (0 .. $#ColLabels) {
      $TextFilesColLabelToNumMap[$Index]{$ColLabels[$ColNum]} = $ColNum;
    }
  }
}

# Make sure the specified columns exists in text files...
#
# For each usable file, translate the columns specified by "--categorycol"
# (categories mode) or "--columns" (columns mode) into zero based column
# numbers stored in @TextFilesCategoryColNum and @TextFilesColNumsToExtract.
# Without any specified column, the first column is used. A file for which no
# specified column can be found is marked as not okay after a warning.
#
sub ProcessColumnsInfo {
  my $ColNumMode = ($Options{colmode} =~ /^colnum$/i) ? 1 : 0;

  @TextFilesCategoryColNum = ();
  @TextFilesColNumsToExtract = ();

  FILELIST: for my $Index (0 .. $#TextFilesList) {
    my $TextFile = $TextFilesList[$Index];

    $TextFilesCategoryColNum[$Index] = 0;
    @{$TextFilesColNumsToExtract[$Index]} = ();

    next FILELIST unless $TextFilesOkay[$Index];

    if ($Options{mode} =~ /^categories$/i) {
      my $CategoryColNum = 0;
      my $CategoryColValid = 1;
      # Test the length and not the truth of the value: "0" is a legitimate
      # column label.
      if (defined $SpecifiedCategoryCol && length $SpecifiedCategoryCol) {
        if ($ColNumMode) {
          if ($SpecifiedCategoryCol <= $TextFilesColCount[$Index]) {
            $CategoryColNum = $SpecifiedCategoryCol - 1;
          }
          else {
            $CategoryColValid = 0;
          }
        }
        elsif (exists($TextFilesColLabelToNumMap[$Index]{$SpecifiedCategoryCol})) {
          $CategoryColNum = $TextFilesColLabelToNumMap[$Index]{$SpecifiedCategoryCol};
        }
        else {
          $CategoryColValid = 0;
        }
      }
      if ($CategoryColValid) {
        $TextFilesCategoryColNum[$Index] = $CategoryColNum;
      }
      else {
        warn "Warning: Ignoring file $TextFile: Category column specified, $SpecifiedCategoryCol, using \"--categorycol\" option doesn't exist\n";
        $TextFilesOkay[$Index] = 0;
      }
    }
    elsif ($Options{mode} =~ /^columns$/i) {
      my @ColNumsToExtract = ();
      if (!@SpecifiedColumns) {
        # Default is the first column...
        push @ColNumsToExtract, 0;
      }
      elsif ($ColNumMode) {
        # Columns which are out of range for this file are dropped...
        for my $SpecifiedColNum (@SpecifiedColumns) {
          if ($SpecifiedColNum >= 1 && $SpecifiedColNum <= $TextFilesColCount[$Index]) {
            push @ColNumsToExtract, $SpecifiedColNum - 1;
          }
        }
      }
      else {
        # Labels which don't exist in this file are dropped...
        for my $ColLabel (@SpecifiedColumns) {
          if (exists($TextFilesColLabelToNumMap[$Index]{$ColLabel})) {
            push @ColNumsToExtract, $TextFilesColLabelToNumMap[$Index]{$ColLabel};
          }
        }
      }
      if (@ColNumsToExtract) {
        push @{$TextFilesColNumsToExtract[$Index]}, @ColNumsToExtract;
      }
      else {
        warn "Warning: Ignoring file $TextFile: None of the columns specified, @SpecifiedColumns, using \"--columns\" option exist\n";
        $TextFilesOkay[$Index] = 0;
      }
    }
  }
}

# Translate a column id specified using "--rows" option into a zero based
# column number for the file at position $Index. The id is a column label in
# collabel mode and a column number starting from 1 otherwise. Returns undef
# when the file doesn't contain the column.
sub ResolveSpecifiedRowsColumnID {
  my($Index, $ColID) = @_;

  if ($Options{colmode} =~ /^collabel$/i) {
    return exists $TextFilesColLabelToNumMap[$Index]{$ColID} ? $TextFilesColLabelToNumMap[$Index]{$ColID} : undef;
  }
  return ($ColID >= 1 && $ColID <= $TextFilesColCount[$Index]) ? $ColID - 1 : undef;
}

# Process specified rows info...
#
# In rows mode, set up $TextFilesRowValues[$Index] for each usable file from
# @SpecifiedRowValues: column ids are replaced by zero based column numbers
# and all other values are copied as they are. A file ending up without any
# row values is marked as not okay after a warning.
#
sub ProcessRowsInfo {
  @TextFilesRowValues = ();

  FILELIST: for my $Index (0 .. $#TextFilesList) {
    my $TextFile = $TextFilesList[$Index];
    @{$TextFilesRowValues[$Index]} = ();

    next FILELIST if $Options{mode} !~ /^rows$/i;
    next FILELIST if !$TextFilesOkay[$Index];

    my($ColID, $ColNum);
    my @RowValues = ();
    if ($Options{rowsmode} =~ /^rowsbycolvalue$/i) {
      # NOTE(review): a triplet whose column doesn't exist in this file is
      # dropped and the remaining triplets are still applied - confirm that
      # this is the intended behavior.
      for (my $ValueIndex = 0; $ValueIndex < @SpecifiedRowValues; $ValueIndex += 3) {
        $ColID = $SpecifiedRowValues[$ValueIndex];
        my $Value = $SpecifiedRowValues[$ValueIndex + 1];
        my $Criterion = $SpecifiedRowValues[$ValueIndex + 2];

        $ColNum = ResolveSpecifiedRowsColumnID($Index, $ColID);
        if (defined $ColNum) {
          push @RowValues, ($ColNum, $Value, $Criterion);
        }
      }
    }
    elsif ($Options{rowsmode} =~ /^(rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue)$/i) {
      # Process column id...
      $ColID = $SpecifiedRowValues[0];
      $ColNum = ResolveSpecifiedRowsColumnID($Index, $ColID);
      if (defined $ColNum) {
        # Column number followed by rest of the specified values...
        push @RowValues, $ColNum, @SpecifiedRowValues[1 .. $#SpecifiedRowValues];
      }
    }
    elsif ($Options{rowsmode} =~ /^(rownums|rownumrange)$/i) {
      push @RowValues, @SpecifiedRowValues;
    }

    if (@RowValues) {
      push @{$TextFilesRowValues[$Index]}, @RowValues;
    }
    else {
      warn "Warning: Ignoring file $TextFile: Column specified, $ColID, using \"--rows\" option doesn't exist\n";
      $TextFilesOkay[$Index] = 0;
    }
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Fill %Options with the default values, overlay the values specified on the
# command line, change to the working directory when one is specified and make
# sure the values of all the enumerated options are valid. Dies with an error
# message on the first problem found.
#
sub SetupScriptUsage {

  # Setup default and retrieve all the options...
  %Options = ();
  $Options{colmode} = "colnum";
  $Options{indelim} = "comma";
  $Options{mode} = "columns";
  $Options{outdelim} = "comma";
  $Options{quote} = "yes";
  $Options{rowsmode} = "rownums";

  if (!GetOptions(\%Options, "categorycol=s", "columns=s", "colmode|c=s", "help|h", "indelim=s", "mode|m=s", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "rows=s", "rowsmode=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    # Low precedence "or" is needed here: "||" would bind to the directory
    # name and a failed chdir would go unnoticed.
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{mode} !~ /(^(columns|rows|categories)$)/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: columns, rows or categories \n";
  }
  if ($Options{colmode} !~ /(^(colnum|collabel)$)/i) {
    die "Error: The value specified, $Options{colmode}, for option \"--colmode\" is not valid. Allowed values: colnum or collabel \n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{rowsmode} !~ /^(rowsbycolvalue|rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue|rownums|rownumrange)$/i) {
    die "Error: The value specified, $Options{rowsmode}, for option \"--rowsmode\" is not valid. Allowed values: rowsbycolvalue, rowsbycolvaluelist, rowsbycolvaluerange, rowbymincolvalue, rowbymaxcolvalue, rownums, rownumrange\n";
  }
}