#!/usr/bin/perl -w
#
# $RCSfile: ExtractFromTextFiles.pl,v $
# $Date: 2010/06/23 20:59:29 $
# $Revision: 1.32 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2010 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use FileHandle;
use Benchmark;
use FileUtil;
use TextUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

$StartTime = Benchmark->new;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

my($OutDelim, $OutQuote, $SpecifiedCategoryCol, $SpecifiedRowsMode, @SpecifiedColumns, @SpecifiedRowValues);
ProcessOptions();

# Collect column information for all the text files...
print "Checking input text file(s)...\n";
my(@TextFilesOkay, @TextFilesColCount, @TextFilesColLabels, @TextFilesColLabelToNumMap, @TextFilesInDelim, @TextFilesOutFile, @TextFilesOutFileExt, @TextFilesCategoryOutFileRoot);
RetrieveTextFilesInfo();

# Make sure the specified columns exists in text files...
my(@TextFilesCategoryColNum, @TextFilesColNumsToExtract);
ProcessColumnsInfo();

# Process specified rows info...
my(@TextFilesRowValues);
ProcessRowsInfo();

# Generate output files...
my($Index, $TextFile);
if (@TextFilesList > 1) {
  print "Processing text files...\n";
}
FILE: for $Index (0 .. $#TextFilesList) {
  # Files which failed any of the earlier checks are silently skipped here; a
  # warning has already been issued for them...
  next FILE if !$TextFilesOkay[$Index];

  $TextFile = $TextFilesList[$Index];
  if (@TextFilesList > 1) {
    print "\nProcessing file $TextFile...\n";
  }
  else {
    print "Processing file $TextFile...\n";
  }

  if ($Options{mode} =~ /^categories$/i) {
    ExtractCategoryData($Index);
  }
  elsif ($Options{mode} =~ /^rows$/i) {
    ExtractRowsData($Index);
  }
  else {
    ExtractColumnData($Index);
  }
}

print "$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate category files...
#
# Groups the data rows of the text file at position $Index in @TextFilesList
# by the value found in its category column and generates:
#
#   o A summary file containing each category name along with its row count
#   o A file for each category containing column labels and the rows which
#     belong to the category
#
# Rows which are too short to contain the category column are collected under
# an empty category name.
#
sub ExtractCategoryData {
  my($Index) = @_;
  my($TextFile, $CategoryCol, $NewTextFile, $InDelim, @ColLabels);

  $TextFile = $TextFilesList[$Index];
  $NewTextFile = $TextFilesOutFile[$Index];
  $CategoryCol = $TextFilesCategoryColNum[$Index];
  $InDelim = $TextFilesInDelim[$Index];
  @ColLabels = @{$TextFilesColLabels[$Index]};

  my($Line, @LineWords, $CategoryName, $CategoryCount, %CategoriesNameToLinesMap);

  # Collect category data...
  open my $TextFileHandle, '<', $TextFile or die "Couldn't open $TextFile: $! \n";
  # Skip label line...
  readline($TextFileHandle);

  %CategoriesNameToLinesMap = ();
  while ($Line = GetTextLine($TextFileHandle)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    # Valid column indices are 0 .. $#LineWords; anything else maps to an empty name...
    $CategoryName = ($CategoryCol < @LineWords && defined $LineWords[$CategoryCol]) ? $LineWords[$CategoryCol] : "";
    # Keep the row in its final output format to avoid parsing it again later on...
    push @{$CategoriesNameToLinesMap{$CategoryName}}, JoinWords(\@LineWords, $OutDelim, $OutQuote);
  }
  close $TextFileHandle;

  my(@SortedCategoryNames);
  @SortedCategoryNames = sort { lc($a) cmp lc($b) } keys %CategoriesNameToLinesMap;

  # Write out summary file...
  print "Generating file $NewTextFile...\n";
  open my $NewTextFileHandle, '>', $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

  # Write out column labels...
  @LineWords = ("Category","Count");
  $Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
  print $NewTextFileHandle "$Line\n";

  # Write out the category names and count...
  for $CategoryName (@SortedCategoryNames) {
    $CategoryCount = scalar @{$CategoriesNameToLinesMap{$CategoryName}};
    @LineWords = ("$CategoryName","$CategoryCount");
    $Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
    print $NewTextFileHandle "$Line\n";
  }
  close $NewTextFileHandle or die "Couldn't close $NewTextFile: $! \n";

  # Write out a file for each category. Each file is opened just before it's
  # written and closed right after to keep number of open files to a minimum...
  my($ColLabelLine, $CategoryFile);
  $ColLabelLine = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
  print "\nGenerating text files for each category...\n";
  for $CategoryName (@SortedCategoryNames) {
    $CategoryFile = $TextFilesCategoryOutFileRoot[$Index] . "$CategoryName" . ".$TextFilesOutFileExt[$Index]";
    # Spaces in category names are not carried over into file names...
    $CategoryFile =~ s/ //g;

    print "Generating file $CategoryFile...\n";
    open my $CategoryFileHandle, '>', $CategoryFile or die "Couldn't open $CategoryFile: $! \n";
    print $CategoryFileHandle "$ColLabelLine\n";
    for $Line (@{$CategoriesNameToLinesMap{$CategoryName}}) {
      print $CategoryFileHandle "$Line\n";
    }
    close $CategoryFileHandle or die "Couldn't close $CategoryFile: $! \n";
  }
}

# Extract data for specific columns...
#
# Writes out a new text file containing only the columns listed in
# @{$TextFilesColNumsToExtract[$Index]} for the text file at position $Index
# in @TextFilesList. An empty value is written out for any column missing
# from a data row.
#
sub ExtractColumnData {
  my($Index) = @_;
  my($TextFile, @ColNumsToExtract, $NewTextFile, $InDelim);

  $TextFile = $TextFilesList[$Index];
  $NewTextFile = $TextFilesOutFile[$Index];
  $InDelim = $TextFilesInDelim[$Index];
  @ColNumsToExtract = @{$TextFilesColNumsToExtract[$Index]};

  print "Generating file $NewTextFile...\n";
  open my $TextFileHandle, '<', $TextFile or die "Couldn't open $TextFile: $! \n";
  open my $NewTextFileHandle, '>', $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

  # Skip label line; the labels collected earlier are used instead...
  readline($TextFileHandle);

  # Write out column labels...
  my($Line, @LineWords, @ColLabels, $ColLabelLine, @ColValues, $ColValuesLine);
  @ColLabels = @{$TextFilesColLabels[$Index]}[@ColNumsToExtract];
  $ColLabelLine = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
  print $NewTextFileHandle "$ColLabelLine\n";

  # Write out column values...
  while ($Line = GetTextLine($TextFileHandle)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    @ColValues = map { ($_ < @LineWords && defined $LineWords[$_]) ? $LineWords[$_] : "" } @ColNumsToExtract;
    $ColValuesLine = JoinWords(\@ColValues, $OutDelim, $OutQuote);
    print $NewTextFileHandle "$ColValuesLine\n";
  }
  close $NewTextFileHandle or die "Couldn't close $NewTextFile: $! \n";
  close $TextFileHandle;
}

# Extract data for specific rows...
#
# Writes out column labels for the text file at position $Index in
# @TextFilesList into a new text file and hands both file handles over to
# the row extraction function corresponding to the value of $SpecifiedRowsMode.
#
sub ExtractRowsData {
  my($Index) = @_;
  my($TextFile, $NewTextFile);

  $TextFile = $TextFilesList[$Index];
  $NewTextFile = $TextFilesOutFile[$Index];

  print "Generating file $NewTextFile...\n";
  open my $TextFileHandle, '<', $TextFile or die "Couldn't open $TextFile: $! \n";
  open my $NewTextFileHandle, '>', $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

  my($Line, @ColLabels);

  # Skip label line and write out the column labels collected earlier...
  readline($TextFileHandle);
  @ColLabels = @{$TextFilesColLabels[$Index]};
  $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
  print $NewTextFileHandle "$Line\n";

  if ($SpecifiedRowsMode =~ /^rowsbycolvalue$/i) {
    ExtractRowsByColValue($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rowsbycolvaluelist$/i) {
    ExtractRowsByColValueList($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rowsbycolvaluerange$/i) {
    ExtractRowsByColValueRange($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) {
    ExtractRowByMinOrMaxColValue($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rownums$/i) {
    ExtractRowsByRowNums($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rownumrange$/i) {
    ExtractRowsByRowNumRange($Index, $TextFileHandle, $NewTextFileHandle);
  }

  close $NewTextFileHandle or die "Couldn't close $NewTextFile: $! \n";
  close $TextFileHandle;
}

# Extract rows by column value...
sub ExtractRowsByColValue {
  # Copies data rows from $TextFileRef to $NewTextFileRef, keeping only those
  # rows which satisfy all the column number, value and criterion triplets
  # stored in @{$TextFilesRowValues[$Index]}. Rows which don't contain a
  # column named by any of the triplets are dropped.
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my $FieldDelim = $TextFilesInDelim[$Index];

  LINE: while (my $DataLine = GetTextLine($TextFileRef)) {
    my @Fields = quotewords($FieldDelim, 0, $DataLine);

    # Walk through the triplets; the first one which fails rejects the row...
    my $TripletStart = 0;
    while ($TripletStart < @{$TextFilesRowValues[$Index]}) {
      my($ColNum, $ColValue, $Criterion) = @{$TextFilesRowValues[$Index]}[$TripletStart .. $TripletStart + 2];
      $TripletStart += 3;

      next LINE if $ColNum > $#Fields;

      my $Value = $Fields[$ColNum];
      if ($Criterion =~ /^le$/i) {
        next LINE if $Value > $ColValue;
      }
      elsif ($Criterion =~ /^ge$/i) {
        next LINE if $Value < $ColValue;
      }
      elsif ($Criterion =~ /^eq$/i) {
        next LINE if $Value ne $ColValue;
      }
    }

    # Write it out...
    my $OutLine = JoinWords(\@Fields, $OutDelim, $OutQuote);
    print $NewTextFileRef "$OutLine\n";
  }
}

# Extract rows by column value list...
#
# Copies data rows from $TextFileRef to $NewTextFileRef whenever the value in
# the column given by the first entry of @{$TextFilesRowValues[$Index]}
# matches, as a string, any one of the remaining entries.
#
sub ExtractRowsByColValueList {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my $FieldDelim = $TextFilesInDelim[$Index];

  # First entry is the column number; the rest are the values to look for...
  my @WantedValues = @{$TextFilesRowValues[$Index]};
  my $ColNum = shift @WantedValues;

  # Setup a col value map...
  my %IsWantedValue = ();
  @IsWantedValue{@WantedValues} = @WantedValues;

  LINE: while (my $DataLine = GetTextLine($TextFileRef)) {
    my @Fields = quotewords($FieldDelim, 0, $DataLine);
    next LINE if $ColNum > $#Fields;
    next LINE unless exists $IsWantedValue{$Fields[$ColNum]};

    my $OutLine = JoinWords(\@Fields, $OutDelim, $OutQuote);
    print $NewTextFileRef "$OutLine\n";
  }
}

# Extract row by minimum or maximum column value...
#
# Scans data rows from $TextFileRef and writes to $NewTextFileRef the single
# row holding the numerically largest value, for a $SpecifiedRowsMode of
# rowbymaxcolvalue, or the smallest value, for any other mode, in the column
# given by the first entry of @{$TextFilesRowValues[$Index]}. For equal
# values, the row encountered first is retained.
#
sub ExtractRowByMinOrMaxColValue {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my $FieldDelim = $TextFilesInDelim[$Index];
  my $ColNum = $TextFilesRowValues[$Index][0];
  my $LookingForMax = ($SpecifiedRowsMode =~ /^rowbymaxcolvalue$/i) ? 1 : 0;

  my $BestLine = '';
  my $BestValue = '';
  my $HaveCandidate = 0;

  LINE: while (my $DataLine = GetTextLine($TextFileRef)) {
    my @Fields = quotewords($FieldDelim, 0, $DataLine);
    next LINE if $ColNum > $#Fields;

    my $IsBetter;
    if (!$HaveCandidate) {
      # The first row containing the column is always the initial candidate...
      $HaveCandidate = 1;
      $IsBetter = 1;
    }
    elsif ($LookingForMax) {
      $IsBetter = $Fields[$ColNum] > $BestValue;
    }
    else {
      $IsBetter = $Fields[$ColNum] < $BestValue;
    }

    if ($IsBetter) {
      $BestValue = $Fields[$ColNum];
      $BestLine = $DataLine;
    }
  }

  if ($BestLine) {
    my @Fields = quotewords($FieldDelim, 0, $BestLine);
    my $OutLine = JoinWords(\@Fields, $OutDelim, $OutQuote);
    print $NewTextFileRef "$OutLine\n";
  }
}

# Extract rows by column value range...
sub ExtractRowsByColValueRange {
  # Copies data rows from $TextFileRef to $NewTextFileRef whenever the value
  # in the specified column lies within the specified range, both ends
  # included. Column number, minimum value and maximum value are retrieved
  # from the first three entries of @{$TextFilesRowValues[$Index]}; the
  # comparison is numerical. Rows which don't contain the column are dropped.
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my($Line, $ColNum, $ColValue, $MinValue, $MaxValue, $InDelim, @LineWords);
  $InDelim = $TextFilesInDelim[$Index];
  ($ColNum, $MinValue, $MaxValue) = @{$TextFilesRowValues[$Index]}[0 .. 2];

  LINE: while ($Line = GetTextLine($TextFileRef)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    if ($ColNum > $#LineWords) {
      next LINE;
    }
    $ColValue = $LineWords[$ColNum];
    if ($ColValue >= $MinValue && $ColValue <= $MaxValue) {
      $Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
      print $NewTextFileRef "$Line\n";
    }
  }
}

# Extract rows by row number range...
#
# Copies data rows from $TextFileRef to $NewTextFileRef whenever their row
# number lies between the first and second entries of
# @{$TextFilesRowValues[$Index]}, both ends included. The row counter starts
# at 1 and is incremented before a data row is tested; consequently, the first
# data row read from $TextFileRef corresponds to row number 2.
#
sub ExtractRowsByRowNumRange {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my($Line, $MinRowNum, $MaxRowNum, $RowCount, $InDelim, @LineWords);
  $InDelim = $TextFilesInDelim[$Index];
  ($MinRowNum, $MaxRowNum) = @{$TextFilesRowValues[$Index]}[0 .. 1];

  $RowCount = 1;
  LINE: while ($Line = GetTextLine($TextFileRef)) {
    $RowCount++;
    if ($RowCount >= $MinRowNum && $RowCount <= $MaxRowNum) {
      @LineWords = quotewords($InDelim, 0, $Line);
      $Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
      print $NewTextFileRef "$Line\n";
    }
    elsif ($RowCount > $MaxRowNum) {
      # Nothing left to extract...
      last LINE;
    }
  }
}

# Extract rows by row numbers...
#
# Copies data rows from $TextFileRef to $NewTextFileRef whenever their row
# number is listed in @{$TextFilesRowValues[$Index]}. The row counter starts
# at 1 and is incremented before a data row is tested; consequently, the first
# data row read from $TextFileRef corresponds to row number 2.
#
sub ExtractRowsByRowNums {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;

  my($Line, $RowNum, $MaxRowNum, $RowCount, $InDelim, %RowNumMap, @LineWords);
  $InDelim = $TextFilesInDelim[$Index];

  # Setup a row nums map and track the largest row number to stop reading early...
  %RowNumMap = ();
  $MaxRowNum = $TextFilesRowValues[$Index][0];
  for $RowNum (@{$TextFilesRowValues[$Index]}) {
    if ($RowNum > $MaxRowNum) {
      $MaxRowNum = $RowNum;
    }
    $RowNumMap{$RowNum} = $RowNum;
  }

  $RowCount = 1;
  LINE: while ($Line = GetTextLine($TextFileRef)) {
    $RowCount++;
    if (exists $RowNumMap{$RowCount}) {
      @LineWords = quotewords($InDelim, 0, $Line);
      $Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
      print $NewTextFileRef "$Line\n";
    }
    elsif ($RowCount > $MaxRowNum) {
      last LINE;
    }
  }
}

# Process option values...
#
# Validates the values specified for "--categorycol", "--columns" and "--rows"
# options against the values of "--colmode" and "--rowsmode" options and sets
# up the file level variables $SpecifiedCategoryCol, @SpecifiedColumns,
# $OutDelim, $OutQuote, $SpecifiedRowsMode and @SpecifiedRowValues. It dies
# with an error message for any invalid value.
#
sub ProcessOptions {
  my($ColModeIsColNum);

  # Column identifiers must be positive integers only for colnum mode...
  $ColModeIsColNum = ($Options{colmode} =~ /^colnum$/i) ? 1 : 0;

  $SpecifiedCategoryCol = "";
  if (defined $Options{categorycol}) {
    my(@SpecifiedValues) = split ",", $Options{categorycol};
    if (@SpecifiedValues != 1) {
      die "Error: Invalid number of values, ",scalar(@SpecifiedValues), " using \"--categorycol\" option: Only one value is allowed.\n";
    }
    $SpecifiedCategoryCol = $SpecifiedValues[0];
    if ($ColModeIsColNum && !IsPositiveInteger($SpecifiedCategoryCol)) {
      die "Error: Category column value, $SpecifiedCategoryCol, specified using \"--categorycol\" is not valid. Allowed integer values: > 0.\n";
    }
  }

  @SpecifiedColumns = ();
  if (defined $Options{columns}) {
    my(@SpecifiedValues) = split ",", $Options{columns};
    if ($ColModeIsColNum) {
      my($ColValue);
      for $ColValue (@SpecifiedValues) {
        if (!IsPositiveInteger($ColValue)) {
          die "Error: Column value, $ColValue, specified using \"--columns\" is not valid: Allowed integer values: > 0.\n";
        }
      }
    }
    push @SpecifiedColumns, @SpecifiedValues;
  }

  $OutDelim = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
  $OutQuote = ($Options{quote} =~ /^yes$/i) ? 1 : 0;

  # Process any specified rows values...
  @SpecifiedRowValues = ();
  $SpecifiedRowsMode = $Options{rowsmode};
  if (defined $Options{rows}) {
    (@SpecifiedRowValues) = split ",", $Options{rows};
  }
  else {
    # Only rownums mode has a default value for rows...
    if ($Options{rowsmode} !~ /^rownums$/i) {
      die "Error: Specify value for \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\n";
    }
    push @SpecifiedRowValues, "1";
  }

  my($SpecifiedColID, $SpecifiedRowID);

  # Make sure specified values are okay...
  if ($Options{rowsmode} =~ /^rowsbycolvalue$/i) {
    # An empty list is rejected as well: there must be at least one complete triplet...
    if (!@SpecifiedRowValues || @SpecifiedRowValues % 3) {
      die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain triplets.\n";
    }
    # Triplet format: colid,value,criteria. Criterion: le,ge,eq
    my($Index, $ColID, $Criterion, $Value);
    for ($Index = 0; $Index < @SpecifiedRowValues; $Index = $Index + 3) {
      ($ColID, $Value, $Criterion) = @SpecifiedRowValues[$Index .. $Index + 2];
      # Triplets are reported back in the same order as specified: colid,value,criterion...
      if ($ColModeIsColNum && !IsPositiveInteger($ColID)) {
        die "Error: Invalid column id, $ColID, specified in triplet, \"$ColID,$Value,$Criterion\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
      }
      if ($Criterion !~ /^(eq|le|ge)$/i) {
        die "Error: Invalid criterion value, $Criterion, specified in triplet, \"$ColID,$Value,$Criterion\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed values: le, ge, or eq.\n";
      }
    }
  }
  elsif ($Options{rowsmode} =~ /^rowsbycolvaluelist$/i) {
    ($SpecifiedColID) = $SpecifiedRowValues[0];
    if ($ColModeIsColNum && !IsPositiveInteger($SpecifiedColID)) {
      die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
    }
    if (@SpecifiedRowValues == 1) {
      die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain more than one value\n";
    }
  }
  elsif ($Options{rowsmode} =~ /^rowsbycolvaluerange$/i) {
    if (@SpecifiedRowValues != 3) {
      die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain three values\n";
    }
    ($SpecifiedColID) = $SpecifiedRowValues[0];
    if ($ColModeIsColNum && !IsPositiveInteger($SpecifiedColID)) {
      die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
    }
    if ($SpecifiedRowValues[1] >= $SpecifiedRowValues[2]) {
      die "Error: Invalid value triplet - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: second value < third value\n";
    }
  }
  elsif ($Options{rowsmode} =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) {
    if (@SpecifiedRowValues != 1) {
      die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nOnly one value is allowed.\n";
    }
    ($SpecifiedColID) = $SpecifiedRowValues[0];
    if ($ColModeIsColNum && !IsPositiveInteger($SpecifiedColID)) {
      die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
    }
  }
  elsif ($Options{rowsmode} =~ /^rownums$/i) {
    for $SpecifiedRowID (@SpecifiedRowValues) {
      if (!IsPositiveInteger($SpecifiedRowID)) {
        die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
      }
    }
  }
  elsif ($Options{rowsmode} =~ /^rownumrange$/i) {
    if (@SpecifiedRowValues != 2) {
      die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain only two values.\n";
    }
    for $SpecifiedRowID (@SpecifiedRowValues) {
      if (!IsPositiveInteger($SpecifiedRowID)) {
        die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
      }
    }
    if ($SpecifiedRowValues[0] >= $SpecifiedRowValues[1]) {
      die "Error: Invalid value pair - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: First value < second value\n";
    }
  }
}

# Retrieve information about input text files...
#
# For each file in @TextFilesList, checks the file, figures out its input
# delimiter, reads its column labels and sets up its output file names. The
# results are stored, by file position, in @TextFilesOkay, @TextFilesColCount,
# @TextFilesColLabels, @TextFilesColLabelToNumMap, @TextFilesInDelim,
# @TextFilesOutFile, @TextFilesOutFileExt and @TextFilesCategoryOutFileRoot.
# A file failing any check is reported by a warning and left marked as not
# okay.
#
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $OutFileRoot, $CategoryOutFileRoot, $OutFile, $ColNum, $ColLabel);

  @TextFilesOkay = ();
  @TextFilesColCount = (); @TextFilesColLabels = ();
  @TextFilesColLabelToNumMap = ();
  @TextFilesInDelim = ();
  @TextFilesOutFile = (); @TextFilesOutFileExt = (); @TextFilesCategoryOutFileRoot = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    # Start out with the file marked as not okay along with empty information...
    $TextFilesOkay[$Index] = 0;
    $TextFilesColCount[$Index] = 0;
    $TextFilesInDelim[$Index] = "";
    $TextFilesOutFile[$Index] = "";
    $TextFilesOutFileExt[$Index] = "";
    $TextFilesCategoryOutFileRoot[$Index] = "";
    @{$TextFilesColLabels[$Index]} = ();
    %{$TextFilesColLabelToNumMap[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # Input delimiter is always a tab for tsv files; "--indelim" applies only to csv files...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if (!($Options{indelim} =~ /^(comma|semicolon)$/i)) {
        warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($Options{indelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    # Retrieve column labels from the first line...
    my($TextFileHandle);
    if (!open $TextFileHandle, '<', $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    $Line = GetTextLine($TextFileHandle);
    @ColLabels = quotewords($InDelim, 0, $Line);
    close $TextFileHandle;

    # Extension of output files is determined by the output delimiter...
    $FileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";

    # A specified root is used only when there is just one input file...
    if ($Options{root} && (@TextFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $Options{root};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = $FileName;
      $OutFileRoot .= ($Options{mode} =~ /^categories$/i) ? "CategoriesSummary" : (($Options{mode} =~ /^rows$/i) ? "ExtractedRows" : "ExtractedColumns");
    }
    $CategoryOutFileRoot = "$FileName" . "Category";

    $OutFile = $OutFileRoot . ".$FileExt";
    if (lc($OutFile) eq lc($TextFile)) {
      warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n";
      next FILELIST;
    }
    if (!$Options{overwrite}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
        next FILELIST;
      }
    }

    $TextFilesOkay[$Index] = 1;
    $TextFilesInDelim[$Index] = $InDelim;
    $TextFilesCategoryOutFileRoot[$Index] = "$CategoryOutFileRoot";
    $TextFilesOutFile[$Index] = "$OutFile";
    $TextFilesOutFileExt[$Index] = "$FileExt";

    # Column labels are mapped to column numbers starting from 0...
    $TextFilesColCount[$Index] = @ColLabels;
    push @{$TextFilesColLabels[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesColLabelToNumMap[$Index]{$ColLabel} = $ColNum;
    }
  }
}

# Make sure the specified columns exists in text files...
sub ProcessColumnsInfo {
  # For each okay file in @TextFilesList, maps the category column, for
  # categories mode, or the columns to extract, for columns mode, onto column
  # numbers starting from 0 and stores them in @TextFilesCategoryColNum and
  # @TextFilesColNumsToExtract. First column is used by default. A file is
  # marked as not okay, along with a warning, when its category column or all
  # of its specified columns are missing.
  my($Index, @ColNumsToExtract, $TextFile);

  @TextFilesCategoryColNum = ();
  @TextFilesColNumsToExtract = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    $TextFilesCategoryColNum[$Index] = 0;
    @{$TextFilesColNumsToExtract[$Index]} = ();

    if (!$TextFilesOkay[$Index]) {
      next FILELIST;
    }

    if ($Options{mode} =~ /^categories$/i) {
      my($CategoryColNum, $CategoryColValid);

      $CategoryColNum = 0;
      $CategoryColValid = 1;
      if ($SpecifiedCategoryCol) {
        if ($Options{colmode} =~ /^colnum$/i) {
          # Specified column numbers start from 1...
          if ($SpecifiedCategoryCol <= $TextFilesColCount[$Index]) {
            $CategoryColNum = $SpecifiedCategoryCol - 1;
          }
          else {
            $CategoryColValid = 0;
          }
        }
        else {
          if (exists($TextFilesColLabelToNumMap[$Index]{$SpecifiedCategoryCol})) {
            $CategoryColNum = $TextFilesColLabelToNumMap[$Index]{$SpecifiedCategoryCol};
          }
          else {
            $CategoryColValid = 0;
          }
        }
      }
      if ($CategoryColValid) {
        $TextFilesCategoryColNum[$Index] = $CategoryColNum;
      }
      else {
        warn "Warning: Ignoring file $TextFile: Category column specified, $SpecifiedCategoryCol, using \"--categorycol\" option doesn't exist\n";
        $TextFilesOkay[$Index] = 0;
      }
    }
    elsif ($Options{mode} =~ /^columns$/i) {
      my($SpecifiedColNum, $ColNum);

      $ColNum = 0;
      @ColNumsToExtract = ();
      if (@SpecifiedColumns) {
        # Specified columns which don't exist in the file are left out...
        if ($Options{colmode} =~ /^colnum$/i) {
          for $SpecifiedColNum (@SpecifiedColumns) {
            if ($SpecifiedColNum >=1 && $SpecifiedColNum <= $TextFilesColCount[$Index]) {
              $ColNum = $SpecifiedColNum - 1;
              push @ColNumsToExtract, $ColNum;
            }
          }
        }
        else {
          my($ColLabel);
          for $ColLabel (@SpecifiedColumns) {
            if (exists($TextFilesColLabelToNumMap[$Index]{$ColLabel})) {
              push @ColNumsToExtract, $TextFilesColLabelToNumMap[$Index]{$ColLabel};
            }
          }
        }
      }
      else {
        push @ColNumsToExtract, $ColNum;
      }
      if (@ColNumsToExtract) {
        push @{$TextFilesColNumsToExtract[$Index]}, @ColNumsToExtract;
      }
      else {
        warn "Warning: Ignoring file $TextFile: None of the columns specified, @SpecifiedColumns, using \"--columns\" option exist\n";
        $TextFilesOkay[$Index] = 0;
      }
    }
  }
}

# Process specified rows info...
#
# For rows mode, converts @SpecifiedRowValues into the values used during row
# extraction for each okay file in @TextFilesList and stores them in
# @TextFilesRowValues. Any column identifier is replaced by its column number
# starting from 0; all other values are passed through. A file is marked as
# not okay, along with a warning, when none of the column identifiers could
# be mapped to a column in the file.
#
sub ProcessRowsInfo {
  my($Index, $TextFile, $ColID, $ColIDOkay, $Value, $Criterion, $ColNum, @RowValues);

  @TextFilesRowValues = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];
    @{$TextFilesRowValues[$Index]} = ();

    if ($Options{mode} !~ /^rows$/i) {
      next FILELIST;
    }
    if (!$TextFilesOkay[$Index]) {
      next FILELIST;
    }

    @RowValues = ();
    if ($Options{rowsmode} =~ /^rowsbycolvalue$/i) {
      # Triplets whose column doesn't exist in the file are left out...
      my($ValueIndex);
      for ($ValueIndex = 0; $ValueIndex < @SpecifiedRowValues; $ValueIndex = $ValueIndex + 3) {
        ($ColID, $Value, $Criterion) = @SpecifiedRowValues[$ValueIndex .. $ValueIndex + 2];

        $ColIDOkay = 0;
        if ($Options{colmode} =~ /^collabel$/i) {
          if (exists $TextFilesColLabelToNumMap[$Index]{$ColID}) {
            $ColIDOkay = 1;
            $ColNum = $TextFilesColLabelToNumMap[$Index]{$ColID};
          }
        }
        else {
          if ($ColID >=1 && $ColID <= $TextFilesColCount[$Index]) {
            $ColNum = $ColID - 1;
            $ColIDOkay = 1;
          }
        }
        if ($ColIDOkay) {
          push @RowValues, ($ColNum, $Value, $Criterion);
        }
      }
    }
    elsif ($Options{rowsmode} =~ /^(rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue)$/i) {
      # Process column id...
      $ColID = $SpecifiedRowValues[0];
      $ColIDOkay = 0;
      if ($Options{colmode} =~ /^collabel$/i) {
        if (exists $TextFilesColLabelToNumMap[$Index]{$ColID}) {
          $ColIDOkay = 1;
          $ColNum = $TextFilesColLabelToNumMap[$Index]{$ColID};
        }
      }
      else {
        if ($ColID >=1 && $ColID <= $TextFilesColCount[$Index]) {
          $ColIDOkay = 1;
          $ColNum = $ColID - 1;
        }
      }
      if ($ColIDOkay) {
        # Column number followed by rest of the specified values. A separate
        # index is used to leave the file index of the enclosing loop alone...
        push @RowValues, $ColNum;
        push @RowValues, @SpecifiedRowValues[1 .. $#SpecifiedRowValues];
      }
    }
    elsif ($Options{rowsmode} =~ /^(rownums|rownumrange)$/i) {
      push @RowValues, @SpecifiedRowValues;
    }

    if (@RowValues) {
      push @{$TextFilesRowValues[$Index]}, @RowValues;
    }
    else {
      warn "Warning: Ignoring file $TextFile: Column specified, $ColID, using \"--rows\" option doesn't exist\n";
      $TextFilesOkay[$Index] = 0;
    }
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Initializes %Options with default values, retrieves command line options
# using GetOptions, changes to the working directory, if specified, and
# validates the values of those options which allow only a fixed set of
# values. It dies with an error message for any invalid value.
#
sub SetupScriptUsage {

  # Setup default and retrieve all the options...
  %Options = ();
  $Options{colmode} = "colnum";
  $Options{indelim} = "comma";
  $Options{mode} = "columns";
  $Options{outdelim} = "comma";
  $Options{quote} = "yes";
  $Options{rowsmode} = "rownums";

  if (!GetOptions(\%Options, "categorycol=s", "columns=s", "colmode|c=s", "help|h", "indelim=s", "mode|m=s", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "rows=s", "rowsmode=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    # Low precedence "or" is needed here: "||" would bind to the directory
    # name instead of the result of chdir and a failure would go unnoticed...
    chdir($Options{workingdir}) or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{mode} !~ /^(columns|rows|categories)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: columns, rows or categories \n";
  }
  if ($Options{colmode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{colmode}, for option \"--colmode\" is not valid. Allowed values: colnum or collabel \n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{rowsmode} !~ /^(rowsbycolvalue|rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue|rownums|rownumrange)$/i) {
    die "Error: The value specified, $Options{rowsmode}, for option \"--rowsmode\" is not valid. Allowed values: rowsbycolvalue, rowsbycolvaluelist, rowsbycolvaluerange, rowbymincolvalue, rowbymaxcolvalue, rownums, rownumrange\n";
  }
}