#!/usr/bin/perl -w
#
# $RCSfile: InfoTextFiles.pl,v $
# $Date: 2008/01/30 21:44:48 $
# $Revision: 1.16 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2008 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use 5.006;
use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;

# File-scope state: %Options is filled in by SetupScriptUsage() and $ScriptName
# is used in its error messages; the Benchmark objects time the whole run.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct method call instead of the ambiguous indirect form "new Benchmark"...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Input files named on the command line (csv and tsv only)...
my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Processing flags derived from the command line options by ProcessOptions()...
my($DetailLevel, $ParseLines, $CountEmpty, $CheckData, $CheckNumericalData, @SpecifiedNumericalDataCols);
ProcessOptions();

# Per-file arrays, all indexed in parallel with @TextFilesList and filled in by
# RetrieveTextFilesInfo()...
print "Checking input text file(s)...\n";
my(@TextFilesOkay, @TextFilesColCount, @TextFilesSize, @TextFilesLastModified, @TextFilesColLabels, @TextFilesColLabelToNumMap, @TextFilesInDelim);
RetrieveTextFilesInfo();

# Per-file numerical data columns, filled in by ProcessColumnsInfo()...
my(@TextFilesNumericalDataColNums, @TextFilesNumericalDataColLabels);
ProcessColumnsInfo();

# List information about each usable file on STDOUT...
my($Index, $TextFile);
if (@TextFilesList > 1) {
  print "Processing text files...\n";
}
for $Index (0 .. $#TextFilesList) {
  # Files rejected by RetrieveTextFilesInfo() have already been warned about...
  next unless $TextFilesOkay[$Index];

  $TextFile = $TextFilesList[$Index];
  if (@TextFilesList > 1) {
    print "\nProcessing file $TextFile...\n";
  }
  else {
    print "Processing file $TextFile...\n";
  }
  ListTextFileInfo($Index);
}

ListTotalSizeOfFiles();

print "$ScriptName:Done...\n\n";

# Direct method call instead of the ambiguous indirect form "new Benchmark"...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# List appropriate information...
# ListTextFileInfo: print the information report for one input text file.
#
# Parameters:
#   $Index - position of the file in @TextFilesList; the same index selects the
#            file's entries in the parallel per-file arrays (@TextFilesInDelim,
#            @TextFilesColLabels, @TextFilesColCount, @TextFilesSize, ...).
#
# Always prints the number of lines, the number of columns, the column labels,
# the file size and the last modification time. When $ParseLines is set, every
# line after the column label line is split on the file's delimiter and, as
# requested by $CountEmpty, $CheckData and $CheckNumericalData, empty and
# numerical/non-numerical values are tallied per column label. $DetailLevel >= 2
# reports each offending line number and $DetailLevel >= 3 echoes the line.
#
# Dies when the file can't be opened. Returns nothing useful.
sub ListTextFileInfo {
  my($Index) = @_;
  my($TextFile, $TextFileHandle, $Line, $InDelim, $LineCount, $EmptyLinesCount, $EmptyColDataLinesCount, $GreaterThanMaxColLinesCount, $Label, $Value, $ColNum, $LastLabeledColNum, $EmptyColValueFound, $PrintOffendingLine, $PrintTextLine, $NonNumericalDataFound, @ColLabels, @LineWords, %EmptyColValuesCountMap, %NonEmptyColValuesCountMap, %SpecifiedNonNumericalColValuesCountMap, %NonNumericalColValuesCountMap, %NumericalColValuesCountMap);

  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInDelim[$Index];
  @ColLabels = @{$TextFilesColLabels[$Index]};

  # Three-argument open with a lexical handle: the file name is never
  # interpreted as an open mode or a pipe, and the handle is private to this sub...
  open($TextFileHandle, "<", $TextFile) or die "Error: Can't open $TextFile: $! \n";

  $LineCount = 0;
  $EmptyLinesCount = 0;
  $EmptyColDataLinesCount = 0;
  $GreaterThanMaxColLinesCount = 0;

  %EmptyColValuesCountMap = ();
  %NonEmptyColValuesCountMap = ();
  %SpecifiedNonNumericalColValuesCountMap = ();
  %NonNumericalColValuesCountMap = ();
  %NumericalColValuesCountMap = ();

  # Offending lines are echoed only at detail level 3 and above...
  $PrintOffendingLine = ($DetailLevel >= 3) ? 1 : 0;

  if ($ParseLines) {
    # The first line holds the column labels: count it but don't analyze it.
    # Test definedness, not truth, so a final unterminated "0" line still counts...
    if (defined($Line = <$TextFileHandle>)) {
      $LineCount++;
      LINE: while ($Line = <$TextFileHandle>) {
        $LineCount++;
        $PrintTextLine = 0;
        $Line =~ s/(\r\n)|(\r)|\n//g;
        @LineWords = quotewords($InDelim, 0, $Line);

        # Per-column tallies are kept only for columns which have a label...
        $LastLabeledColNum = ($#LineWords < $#ColLabels) ? $#LineWords : $#ColLabels;

        if ($CountEmpty) {
          # Count lines with no data...
          if (!@LineWords) {
            $EmptyLinesCount++;
            if ($DetailLevel >= 2) {
              print "Line number $LineCount is empty...\n";
            }
            next LINE;
          }
          # Count lines with empty data for some columns...
          $EmptyColValueFound = 0;
          VALUE: for $Value (@LineWords) {
            if (!IsNotEmpty($Value)) {
              $EmptyColValueFound = 1;
              last VALUE;
            }
          }
          if ($EmptyColValueFound) {
            $EmptyColDataLinesCount++;
            if ($DetailLevel >= 2) {
              print "Line number $LineCount contains empty column value(s)...\n";
            }
            $PrintTextLine = $PrintOffendingLine;
          }
          # Count lines with columns greater than the column label line...
          if (@LineWords > @ColLabels) {
            $GreaterThanMaxColLinesCount++;
            if ($DetailLevel >= 2) {
              print "Line number $LineCount contains more than ", scalar(@ColLabels), " columns...\n";
            }
            $PrintTextLine = $PrintOffendingLine;
          }
          # Count empty and non-empty values for each column...
          for $ColNum (0 .. $LastLabeledColNum) {
            $Label = $ColLabels[$ColNum];
            if (IsNotEmpty($LineWords[$ColNum])) {
              $NonEmptyColValuesCountMap{$Label}++;
            }
            else {
              $PrintTextLine = $PrintOffendingLine;
              $EmptyColValuesCountMap{$Label}++;
            }
          }
        }
        if ($CheckData) {
          # Classify each labeled value as numerical or non-numerical; empty
          # values are counted in neither map...
          for $ColNum (0 .. $LastLabeledColNum) {
            $Label = $ColLabels[$ColNum];
            if (IsNumerical($LineWords[$ColNum])) {
              $NumericalColValuesCountMap{$Label}++;
            }
            elsif (IsNotEmpty($LineWords[$ColNum])) {
              $NonNumericalColValuesCountMap{$Label}++;
            }
          }
        }
        if ($CheckNumericalData) {
          # Only the columns the user declared as numerical are checked here...
          $NonNumericalDataFound = 0;
          for $ColNum (@{$TextFilesNumericalDataColNums[$Index]}) {
            next unless $ColNum < @LineWords;
            if (!IsNumerical($LineWords[$ColNum])) {
              $NonNumericalDataFound = 1;
              $Label = $ColLabels[$ColNum];
              $SpecifiedNonNumericalColValuesCountMap{$Label}++;
            }
          }
          if ($NonNumericalDataFound) {
            $PrintTextLine = $PrintOffendingLine;
            if ($DetailLevel >= 2) {
              print "Line number $LineCount contains non-numerical data for some specified column(s)...\n";
            }
          }
        }
        if ($PrintTextLine) {
          print "Line $LineCount: $Line\n\n";
        }
      }
    }
  }
  else {
    # No content checks requested: just count the lines...
    while (<$TextFileHandle>) {
      $LineCount++;
    }
  }
  close $TextFileHandle;

  print "\nNumber of lines: $LineCount\n";
  print "Number of columns: $TextFilesColCount[$Index]\n";
  print "Column labels: ", JoinWords(\@ColLabels, ", ", 1), "\n";
  if ($CountEmpty) {
    print "\nNumber of lines with no data: $EmptyLinesCount\n";
    print "Number of lines with some missing column data: $EmptyColDataLinesCount\n";
    print "Number of lines containing greater than ", scalar(@ColLabels), " columns: $GreaterThanMaxColLinesCount\n";
    PrintDataInformation("Number of non-empty values for each column(s)", \@ColLabels, \%NonEmptyColValuesCountMap);
    PrintDataInformation("Number of empty values for each column(s)", \@ColLabels, \%EmptyColValuesCountMap);
  }
  if ($CheckData) {
    print "\n";
    PrintDataInformation("Number of non-numerical data values for each column(s)", \@ColLabels, \%NonNumericalColValuesCountMap);
    PrintDataInformation("Number of numerical data values for each column(s)", \@ColLabels, \%NumericalColValuesCountMap);
    print "\n";
  }
  if ($CheckNumericalData && @{$TextFilesNumericalDataColLabels[$Index]}) {
    PrintDataInformation("Number of non-numerical data values for each column(s)", $TextFilesNumericalDataColLabels[$Index], \%SpecifiedNonNumericalColValuesCountMap);
  }

  # File size and modification information...
  print "\nFile size: ", FormatFileSize($TextFilesSize[$Index]), " \n";
  print "Last modified: ", $TextFilesLastModified[$Index], " \n";
}

# Total size of all the files...
# ListTotalSizeOfFiles: print the combined size of all usable input files.
#
# Sums @TextFilesSize over the entries flagged in @TextFilesOkay. Nothing is
# printed unless more than one file is usable, as a single file's size has
# already been listed on its own.
sub ListTotalSizeOfFiles {
  my($FileOkayCount, $TotalSize, $Index);

  $FileOkayCount = 0;
  $TotalSize = 0;

  for $Index (0 .. $#TextFilesList) {
    next unless $TextFilesOkay[$Index];
    $FileOkayCount++;
    $TotalSize += $TextFilesSize[$Index];
  }
  if ($FileOkayCount > 1) {
    print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
  }
}


# List data information...
#
# PrintDataInformation: print one line of per-column counts.
#
# Parameters:
#   $InfoLabel              - text printed in front of the counts.
#   $DataLabelRef           - reference to the array of column labels, which
#                             fixes the order of the entries.
#   $DataLabelToValueMapRef - reference to a hash mapping label to count;
#                             labels without an entry are reported as 0.
#
# Output format: <InfoLabel>: "<label>" - <count>, "<label>" - <count>, ...
sub PrintDataInformation {
  my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;
  my($Line, $Label, @Entries);

  @Entries = ();
  for $Label (@{$DataLabelRef}) {
    push @Entries, " \"$Label\" - " . (exists($DataLabelToValueMapRef->{$Label}) ? $DataLabelToValueMapRef->{$Label} : 0);
  }
  # Each entry carries its own leading space, so a bare comma is the separator...
  $Line = join(",", @Entries);
  print "$InfoLabel: $Line\n";
}

# Process option values...
#
# ProcessOptions: translate %Options into the file-scope processing flags
# ($DetailLevel, $ParseLines, $CountEmpty, $CheckData, $CheckNumericalData) and
# the list @SpecifiedNumericalDataCols.
#
# Dies when "--mode colnum" is in effect and a value given to
# "--numericaldatacols" is not a positive integer.
sub ProcessOptions {
  $DetailLevel = $Options{detail};

  # Every content check needs the data lines to be read; "--datacheck" has to be
  # listed as well, otherwise using it alone reports zero counts for all columns...
  $ParseLines = ($Options{all} || $Options{empty} || $Options{datacheck} || $Options{numericaldatacols}) ? 1 : 0;
  $CountEmpty = ($Options{all} || $Options{empty}) ? 1 : 0;
  $CheckData = ($Options{all} || $Options{datacheck}) ? 1 : 0;
  $CheckNumericalData = ($Options{all} || $Options{numericaldatacols}) ? 1 : 0;

  @SpecifiedNumericalDataCols = ();
  if ($Options{numericaldatacols}) {
    @SpecifiedNumericalDataCols = split ",", $Options{numericaldatacols};
    # Column labels can't be validated until the files have been read; column
    # numbers can be checked right away...
    if ($Options{mode} =~ /^colnum$/i) {
      my($ColNum);
      for $ColNum (@SpecifiedNumericalDataCols) {
        if (!IsPositiveInteger($ColNum)) {
          die "Error: Invalid value $ColNum specified using \"--numericaldatacols\" option: Allowed values: > 0\n";
        }
      }
    }
  }

}

# Retrieve information about input text files...
# RetrieveTextFilesInfo: fill in the per-file arrays for every input file.
#
# For each entry of @TextFilesList this sets, at the same index:
#   @TextFilesOkay             - 1 when the file is usable, 0 otherwise.
#   @TextFilesInDelim          - tab for tsv files; comma or semicolon for csv
#                                files according to the "--indelim" option.
#   @TextFilesColCount         - number of column labels on the first line.
#   @TextFilesColLabels        - reference to the array of column labels.
#   @TextFilesColLabelToNumMap - reference to a hash: label to 0-based column.
#   @TextFilesSize, @TextFilesLastModified - size and modification time/date.
#
# Files which don't exist, aren't csv/tsv or can't be opened are skipped with
# a warning and keep their "not okay" defaults.
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, $ColNum, $ColLabel, $ModifiedTimeString, $ModifiedDateString, @ColLabels);

  @TextFilesOkay = ();
  @TextFilesColCount = ();
  @TextFilesColLabels = ();
  @TextFilesColLabelToNumMap = ();
  @TextFilesInDelim = ();
  @TextFilesSize = ();
  @TextFilesLastModified = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    # Defaults describing an unusable file...
    $TextFilesOkay[$Index] = 0;
    $TextFilesColCount[$Index] = 0;
    $TextFilesInDelim[$Index] = "";
    $TextFilesSize[$Index] = 0;
    $TextFilesLastModified[$Index] = '';
    $TextFilesColLabels[$Index] = [];
    $TextFilesColLabelToNumMap[$Index] = {};

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # The delimiter follows from the file extension and, for csv files, from
    # the "--indelim" option...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($Options{indelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode or a pipe...
    my($TextFileHandle);
    if (!open($TextFileHandle, "<", $TextFile)) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    # Only the first line, holding the column labels, is needed here. A lexical
    # handle is a glob reference, just like the \*TEXTFILE passed previously...
    $Line = GetTextLine($TextFileHandle);
    # NOTE(review): what GetTextLine returns for an empty file isn't visible
    # here; an undefined line is treated as "no columns"...
    @ColLabels = defined($Line) ? quotewords($InDelim, 0, $Line) : ();
    close $TextFileHandle;

    $TextFilesOkay[$Index] = 1;
    $TextFilesInDelim[$Index] = $InDelim;

    $TextFilesColCount[$Index] = @ColLabels;
    push @{$TextFilesColLabels[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesColLabelToNumMap[$Index]{$ColLabel} = $ColNum;
    }
    $TextFilesSize[$Index] = FileSize($TextFile);
    ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($TextFile);
    $TextFilesLastModified[$Index] = "$ModifiedTimeString; $ModifiedDateString";
  }

}

# Make sure specified numerical data columns are okay...
#
# ProcessColumnsInfo: resolve @SpecifiedNumericalDataCols for each usable file.
#
# Fills @TextFilesNumericalDataColNums (0-based column numbers) and
# @TextFilesNumericalDataColLabels (matching labels), both indexed like
# @TextFilesList. In "colnum" mode the specified values are 1-based column
# numbers; otherwise they are column labels. Values which don't correspond to a
# column of a file are silently left out for that file.
sub ProcessColumnsInfo {
  my($Index);

  @TextFilesNumericalDataColNums = ();
  @TextFilesNumericalDataColLabels = ();

  for $Index (0 .. $#TextFilesList) {
    # Every file gets an entry, possibly empty, so callers can dereference it...
    $TextFilesNumericalDataColNums[$Index] = [];
    $TextFilesNumericalDataColLabels[$Index] = [];

    next unless $TextFilesOkay[$Index];

    my($SpecifiedColNum, $ColNum, $ColLabel, @SpecifiedColNums, @SpecifiedColLabels);
    @SpecifiedColNums = ();
    @SpecifiedColLabels = ();

    if ($Options{mode} =~ /^colnum$/i) {
      for $SpecifiedColNum (@SpecifiedNumericalDataCols) {
        next unless $SpecifiedColNum <= $TextFilesColCount[$Index];
        # Command line column numbers start at 1; array indices at 0...
        $ColNum = $SpecifiedColNum - 1;
        push @SpecifiedColNums, $ColNum;
        push @SpecifiedColLabels, $TextFilesColLabels[$Index][$ColNum];
      }
    }
    else {
      for $ColLabel (@SpecifiedNumericalDataCols) {
        next unless exists($TextFilesColLabelToNumMap[$Index]{$ColLabel});
        $ColNum = $TextFilesColLabelToNumMap[$Index]{$ColLabel};
        push @SpecifiedColNums, $ColNum;
        push @SpecifiedColLabels, $ColLabel;
      }
    }

    if (@SpecifiedColNums) {
      push @{$TextFilesNumericalDataColNums[$Index]}, @SpecifiedColNums;
      push @{$TextFilesNumericalDataColLabels[$Index]}, @SpecifiedColLabels;
    }
  }
}


# Setup script usage and retrieve command line arguments specified using various options...
#
# SetupScriptUsage: parse the command line into %Options and validate it.
#
# Defaults: detail level 1, mode "colnum", input delimiter "comma". When
# "--workingdir" is given, the script changes into that directory. Dies on
# unknown options, a working directory which doesn't exist or can't be entered,
# or invalid values for "--mode", "--indelim" and "--detail".
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{detail} = 1;
  $Options{mode} = "colnum";
  $Options{indelim} = "comma";

  my(@OptionSpecs) = ("all|a", "count|c", "datacheck", "detail|d=i", "empty|e", "help|h", "indelim=s", "mode|m=s", "numericaldatacols|n=s", "workingdir|w=s");
  if (!GetOptions(\%Options, @OptionSpecs)) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{mode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum or collabel\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if (!IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
  }
}