#!/usr/bin/perl -w
#
# $RCSfile: AnalyzeTextFilesData.pl,v $
# $Date: 2008/02/14 00:19:54 $
# $Revision: 1.23 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2008 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#
use 5.006;
use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use StatisticsUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# File-scope option values; they are assigned by ProcessOptions and read by the
# analysis subroutines, so they must stay declared ahead of those definitions.
my($DetailLevel, $OutDelim, $OutQuote, $Precision, $CheckData, $KLargest, $KSmallest, $TrimFraction, $AllColumnPairs, $NumOfBins, @BinRange, @SpecifiedColumns, @SpecifiedColumnPairs, $FileNameMode, @SpecifiedStatisticalFunctions, %SpecifiedStatisticalFunctionsMap);
print "Processing options...\n";
ProcessOptions();

# Collect column information for all the text files...
print "Checking input text file(s)...\n";
my(@TextFilesOkay, @TextFilesColCount, @TextFilesColLabels, @TextFilesColLabelToNumMap, @TextFilesInDelim, @TextFilesOutFileRoot, @TextFilesOutFileExt);
RetrieveTextFilesInfo();

# Make sure the specified columns exist in text files...
my(@TextFilesColNumsToAnalyze, @TextFilesColPairs1ToAnalyze, @TextFilesColPairs2ToAnalyze, @TextFilesUniqueColNumsToAnalyze);
ProcessColumnsInfo();

# Generate output files; files flagged as not okay during the checks above
# are skipped silently here (a warning was already issued for them)...
my($Index, $TextFile);
if (@TextFilesList > 1) {
    print "Processing text files...\n";
}
for $Index (0 .. $#TextFilesList) {
    if ($TextFilesOkay[$Index]) {
        $TextFile = $TextFilesList[$Index];
        if (@TextFilesList > 1) {
            print "\nProcessing file $TextFile...\n";
        }
        else {
            print "Processing file $TextFile...\n";
        }
        AnalyzeTextFile($Index);
    }
}

print "$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Analyze data...
# Read the data rows of one input file, collect the values of every column that
# takes part in any analysis, and hand them to the analysis routines selected
# through %SpecifiedStatisticalFunctionsMap.
#
# Parameters:
#   $Index - position of the file in @TextFilesList and the parallel
#            @TextFiles* arrays.
#
# Values that fail the IsNumerical check (only performed when $CheckData is set)
# are left out of the collected column data and reported according to
# $DetailLevel.
sub AnalyzeTextFile {
    my($Index) = @_;
    my($TextFile, $Line, $InDelim, $ColNum, $Value, @LineWords, @ColNumsToAnalyze, %ColValuesToAnalyzeMap);

    $TextFile = $TextFilesList[$Index];
    $InDelim = $TextFilesInDelim[$Index];
    @ColNumsToAnalyze = @{$TextFilesUniqueColNumsToAnalyze[$Index]};
    %ColValuesToAnalyzeMap = ();
    for $ColNum (@ColNumsToAnalyze) {
        @{$ColValuesToAnalyzeMap{$ColNum}} = ();
    }

    my($LineCount, $InvalidLineCount, @InvalidColLabels);

    # Three-argument open: the file name is never interpreted as a mode or a pipe.
    open my $TextFileHandle, '<', $TextFile or die "Error: Can't open $TextFile: $! \n";

    # Skip over column labels line in text file and collect appropriate column data
    # for analysis...
    $Line = GetTextLine($TextFileHandle);
    $LineCount = 1;
    $InvalidLineCount = 0;
    # NOTE(review): this loop ends on the first false value returned by
    # GetTextLine, so a data line consisting of just "0" would also end it;
    # confirm GetTextLine's end-of-file contract before switching to defined().
    while ($Line = GetTextLine($TextFileHandle)) {
        $LineCount++;
        @LineWords = quotewords($InDelim, 0, $Line);
        @InvalidColLabels = ();
        COLNUM: for $ColNum (@ColNumsToAnalyze) {
            $Value = $LineWords[$ColNum];
            if ($CheckData) {
                if (!IsNumerical($Value)) {
                    push @InvalidColLabels, $TextFilesColLabels[$Index][$ColNum];
                    next COLNUM;
                }
            }
            push @{$ColValuesToAnalyzeMap{$ColNum}}, $Value;
        }
        if (@InvalidColLabels) {
            $InvalidLineCount++;
            if ($DetailLevel >= 4) {
                print "Line number $LineCount contains ", scalar(@InvalidColLabels), " non-numerical or empty value(s) for column(s) - ", JoinWords(\@InvalidColLabels, ", ", 0), " - to be analyzed: $Line \n";
            }
            elsif ($DetailLevel >= 3) {
                print "Line number $LineCount contains ", scalar(@InvalidColLabels), " non-numerical or empty value(s) for column(s) - ", JoinWords(\@InvalidColLabels, ", ", 0), " - to be analyzed...\n";
            }
            elsif ($DetailLevel >= 2) {
                print "Line number $LineCount contains ", scalar(@InvalidColLabels), " non-numerical or empty value(s) for columns to be analyzed...\n";
            }
        }
    }
    if ($InvalidLineCount && ($DetailLevel >= 1)) {
        print "Non-numerical or empty data present in $InvalidLineCount line(s)...\n";
    }
    close $TextFileHandle;

    # Perform the analysis...
    my(@SpecifiedFunctionNames, $SpecifiedFunction);
    @SpecifiedFunctionNames = ();

    # Functions that write their own output files are dispatched separately
    # below; everything else is tabulated by PerformAnalysis.
    for $SpecifiedFunction (@SpecifiedStatisticalFunctions) {
        if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
            push @SpecifiedFunctionNames, $SpecifiedStatisticalFunctionsMap{lc($SpecifiedFunction)};
        }
    }
    if (@SpecifiedFunctionNames) {
        PerformAnalysis($Index, \@SpecifiedFunctionNames, \%ColValuesToAnalyzeMap);
    }
    if (exists($SpecifiedStatisticalFunctionsMap{covariance}) || exists($SpecifiedStatisticalFunctionsMap{correlation}) || exists($SpecifiedStatisticalFunctionsMap{rsquare})) {
        if ($AllColumnPairs) {
            PerformMatrixAnalysis($Index, \%ColValuesToAnalyzeMap);
        }
        else {
            # Perform pairwise analysis for specified columns and write out calculated values - correlation
            # rsquare, or covariance - in the same file.
            PerformColumnPairAnalysis($Index, \%ColValuesToAnalyzeMap);
        }
    }
    if (exists($SpecifiedStatisticalFunctionsMap{standardscores}) || exists($SpecifiedStatisticalFunctionsMap{standardscoresn})) {
        PerformStandardScoresAnalysis($Index, \%ColValuesToAnalyzeMap);
    }
    if (exists($SpecifiedStatisticalFunctionsMap{frequency})) {
        PerformFrequencyAnalysis($Index, \%ColValuesToAnalyzeMap);
    }
}

# Calculate values for various statistical functions...
#
# Writes one output file per input file: a header row ("ColumnID" followed by
# one label per function) and then one row per analyzed column.
#
# Parameters:
#   $Index                     - position of the file in the @TextFiles* arrays.
#   $SpecifiedFunctionNamesRef - reference to the list of function names to call.
#   $ColValuesToAnalyzeMapRef  - reference to a hash mapping column number to a
#                                reference to that column's collected values.
#
# Dies when the output file cannot be opened or closed.
sub PerformAnalysis {
    my($Index, $SpecifiedFunctionNamesRef, $ColValuesToAnalyzeMapRef) = @_;
    my($NewTextFile, $Line, $SpecifiedFunction, $Label, @ColLabels, @ColNumsToAnalyze);

    $NewTextFile = $TextFilesOutFileRoot[$Index] . $FileNameMode . "." . $TextFilesOutFileExt[$Index];

    print "Generating new text file $NewTextFile...\n";
    open my $NewTextFileHandle, '>', $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

    # Write out column labels...
    @ColLabels = ();
    push @ColLabels, "ColumnID";
    for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
        $Label = $SpecifiedFunction;
        if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
            # Prefix the ordinal of the requested rank and drop every "K"
            # from the function name to form the label.
            my($KthValue);
            $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $KLargest : $KSmallest;
            $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
            $Label =~ s/K//g;
        }
        elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
            $Label = "${SpecifiedFunction}($TrimFraction)";
        }
        push @ColLabels, $Label;
    }
    $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
    print $NewTextFileHandle "$Line\n";

    # Go over each column to be analyzed...
    @ColNumsToAnalyze = @{$TextFilesColNumsToAnalyze[$Index]};

    my($ColValuesRef, $ColNum, $Value, @RowValues, %CalculatedValues);
    %CalculatedValues = ();
    for $ColNum (@ColNumsToAnalyze) {
        @RowValues = ();
        # Setup column id...
        push @RowValues, $TextFilesColLabels[$Index][$ColNum];
        $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
        FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
            # The statistical functions are invoked through their name
            # strings, which needs symbolic references; only that stricture
            # is relaxed, and only inside this loop body.
            no strict 'refs';

            $Value = "";
            if (!@{$ColValuesRef}) {
                # Invalid column values...
                push @RowValues, $Value;
                next FUNCTIONNAME;
            }
            if ($SpecifiedFunction =~ /^Count$/i) {
                $Value = @{$ColValuesRef};
            }
            elsif ($SpecifiedFunction =~ /^KLargest$/i) {
                $Value = $SpecifiedFunction->($ColValuesRef, $KLargest);
            }
            elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
                $Value = $SpecifiedFunction->($ColValuesRef, $KSmallest);
            }
            elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
                # Cached per column because StandardError reuses it.
                if (exists($CalculatedValues{$ColNum}{StandardDeviation})) {
                    $Value = $CalculatedValues{$ColNum}{StandardDeviation};
                }
                else {
                    $Value = $SpecifiedFunction->($ColValuesRef);
                    $CalculatedValues{$ColNum}{StandardDeviation} = $Value;
                }
            }
            elsif ($SpecifiedFunction =~ /^StandardError$/i) {
                if (!exists($CalculatedValues{$ColNum}{StandardDeviation})) {
                    $Value = StandardDeviation($ColValuesRef);
                    $CalculatedValues{$ColNum}{StandardDeviation} = $Value;
                }
                if (defined $CalculatedValues{$ColNum}{StandardDeviation}) {
                    # NOTE(review): the column values are flattened into the
                    # argument list after the standard deviation; confirm this
                    # matches StandardError's signature in StatisticsUtil (a
                    # value count may be what it expects).
                    $Value = $SpecifiedFunction->($CalculatedValues{$ColNum}{StandardDeviation}, @{$ColValuesToAnalyzeMapRef->{$ColNum}});
                }
            }
            elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
                $Value = $SpecifiedFunction->($ColValuesRef, $TrimFraction);
            }
            else {
                $Value = $SpecifiedFunction->($ColValuesRef);
            }
            # Format the output value. And add zero to get rid of trailing zeros...
            $Value = (defined($Value) && length($Value)) ? (sprintf("%.${Precision}f", $Value) + 0) : "";
            push @RowValues, $Value;
        }
        $Line = JoinWords(\@RowValues, $OutDelim, $OutQuote);
        print $NewTextFileHandle "$Line\n";
    }
    # Buffered write errors only surface at close, so check it.
    close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
}

# Calculate covariance, correlation, rsquare for specified column pairs....
# Writes <root>ColumnPairsAnalysis.<ext> with one row per requested column pair.
# The Correlation column is written whenever correlation or rsquare was
# requested; RSquare and Covariance columns are written only when requested.
#
# Parameters:
#   $Index                    - position of the file in the @TextFiles* arrays.
#   $ColValuesToAnalyzeMapRef - reference to a hash mapping column number to a
#                               reference to that column's collected values.
#
# A pair whose two columns hold a different number of collected values gets
# empty cells and a warning. Dies when the output file cannot be opened or
# closed.
sub PerformColumnPairAnalysis {
    my($Index, $ColValuesToAnalyzeMapRef) = @_;
    my($NewTextFile, @ColLabels, $Line, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

    $CalculateCorrelation = exists($SpecifiedStatisticalFunctionsMap{correlation}) ? 1 : 0;
    $CalculateRSquare = exists($SpecifiedStatisticalFunctionsMap{rsquare}) ? 1 : 0;
    $CalculateCovariance = exists($SpecifiedStatisticalFunctionsMap{covariance}) ? 1 : 0;

    $NewTextFile = $TextFilesOutFileRoot[$Index] . "ColumnPairsAnalysis." . $TextFilesOutFileExt[$Index];
    print "Generating new text file $NewTextFile...\n";
    open my $NewTextFileHandle, '>', $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

    # Write out the column labels...
    @ColLabels = ();
    push @ColLabels, ("ColumnID1", "ColumnID2");
    if ($CalculateCorrelation || $CalculateRSquare) {
        push @ColLabels, "Correlation";
        if ($CalculateRSquare) {
            push @ColLabels, "RSquare";
        }
    }
    if ($CalculateCovariance) {
        push @ColLabels, "Covariance";
    }
    $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
    print $NewTextFileHandle "$Line\n";

    # Go over each column pair...
    my($CorrelationValue, $RSquareValue, $CovarianceValue, $ColIndex, $ColNum1, $ColNum2, $ColValuesRef1, $ColValuesRef2, @ColPairs1ToAnalyze, @ColPairs2ToAnalyze, @RowValues, $Value);

    @ColPairs1ToAnalyze = @{$TextFilesColPairs1ToAnalyze[$Index]};
    @ColPairs2ToAnalyze = @{$TextFilesColPairs2ToAnalyze[$Index]};
    for $ColIndex (0 .. $#ColPairs1ToAnalyze) {
        @RowValues = ();
        $ColNum1 = $ColPairs1ToAnalyze[$ColIndex];
        $ColNum2 = $ColPairs2ToAnalyze[$ColIndex];
        $ColValuesRef1 = \@{$ColValuesToAnalyzeMapRef->{$ColNum1}};
        $ColValuesRef2 = \@{$ColValuesToAnalyzeMapRef->{$ColNum2}};

        # Setup column ids...
        push @RowValues, $TextFilesColLabels[$Index][$ColNum1];
        push @RowValues, $TextFilesColLabels[$Index][$ColNum2];

        if (@$ColValuesRef1 != @$ColValuesRef2) {
            # Print a warning and keep the row aligned with the header by
            # writing one empty cell per value column...
            warn "Warning: Skipping analysis for column pair $TextFilesColLabels[$Index][$ColNum1], $TextFilesColLabels[$Index][$ColNum2]: Number of valid data values must be same.\n";
            if ($CalculateCorrelation || $CalculateRSquare) {
                push @RowValues, "";
                if ($CalculateRSquare) {
                    push @RowValues, "";
                }
            }
            if ($CalculateCovariance) {
                push @RowValues, "";
            }
        }
        else {
            # Calculate appropriate value...
            if ($CalculateCorrelation || $CalculateRSquare) {
                $CorrelationValue = Correlation($ColValuesRef1, $ColValuesRef2);
                $Value = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.${Precision}f", $CorrelationValue) + 0) : "";
                push @RowValues, $Value;
                if ($CalculateRSquare) {
                    # Square the unrounded correlation, then format.
                    $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
                    $Value = (length($RSquareValue)) ? (sprintf("%.${Precision}f", $RSquareValue) + 0) : "";
                    push @RowValues, $Value;
                }
            }
            if ($CalculateCovariance) {
                $CovarianceValue = Covariance($ColValuesRef1, $ColValuesRef2);
                $Value = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.${Precision}f", $CovarianceValue) + 0) : "";
                push @RowValues, $Value;
            }
        }
        $Line = JoinWords(\@RowValues, $OutDelim, $OutQuote);
        print $NewTextFileHandle "$Line\n";
    }
    # Buffered write errors only surface at close, so check it.
    close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
}

# Generate histogram numbers...
# Writes one <root><ColumnLabel>FrequencyAnalysis.<ext> file per analyzed
# column, holding a "Bins"/"Frequency" header and one row per bin sorted by
# numeric bin value.
#
# Parameters:
#   $Index                    - position of the file in the @TextFiles* arrays.
#   $ColValuesToAnalyzeMapRef - reference to a hash mapping column number to a
#                               reference to that column's collected values.
#
# Bins come from @BinRange when it is non-empty, otherwise from $NumOfBins.
# A column without collected values produces a file containing only the header.
# Dies when an output file cannot be opened or closed.
sub PerformFrequencyAnalysis {
    my($Index, $ColValuesToAnalyzeMapRef) = @_;
    my($NewTextFile, @ColLabels, @RowValues, $Line, $ColNum, @ColNumsToAnalyze, $ColValuesRef, $BinValue, $FrequencyValue, $Value, %FrequencyMap);

    @ColNumsToAnalyze = @{$TextFilesColNumsToAnalyze[$Index]};
    for $ColNum (@ColNumsToAnalyze) {
        $NewTextFile = $TextFilesOutFileRoot[$Index] . $TextFilesColLabels[$Index][$ColNum] . "FrequencyAnalysis." . $TextFilesOutFileExt[$Index];
        print "Generating new text file $NewTextFile...\n";
        # The file name embeds a column label taken from the input data, so
        # the three-argument form matters: the name is never parsed as a mode.
        open my $NewTextFileHandle, '>', $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

        # Write out the column labels...
        @ColLabels = ();
        push @ColLabels, ("Bins", "Frequency");
        $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
        print $NewTextFileHandle "$Line\n";

        # Calculate and write out frequency values...
        %FrequencyMap = ();
        $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
        if (@$ColValuesRef) {
            if (@BinRange) {
                %FrequencyMap = Frequency($ColValuesRef, \@BinRange);
            }
            else {
                %FrequencyMap = Frequency($ColValuesRef, $NumOfBins);
            }
        }
        for $BinValue (sort { $a <=> $b } keys %FrequencyMap) {
            $FrequencyValue = $FrequencyMap{$BinValue};

            @RowValues = ();
            $Value = (length($BinValue)) ? (sprintf("%.${Precision}f", $BinValue) + 0) : "";
            push @RowValues, $Value;
            $Value = (length($FrequencyValue)) ? (sprintf("%.${Precision}f", $FrequencyValue) + 0) : "";
            push @RowValues, $Value;

            $Line = JoinWords(\@RowValues, $OutDelim, $OutQuote);
            print $NewTextFileHandle "$Line\n";
        }
        # Buffered write errors only surface at close, so check it.
        close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
    }
}

# Calculate covariance, correlation/rsquare matrices....
# Writes full square matrices over all columns of the file into
# <root>CorrelationMatrix.<ext>, <root>RSquareMatrix.<ext> and
# <root>CovarianceMatrix.<ext>. The correlation matrix is written whenever
# correlation or rsquare was requested; the other two only when requested.
#
# Parameters:
#   $Index                    - position of the file in the @TextFiles* arrays.
#   $ColValuesToAnalyzeMapRef - reference to a hash mapping column number to a
#                               reference to that column's collected values.
#
# Dies when an output file cannot be opened or closed.
sub PerformMatrixAnalysis {
    my($Index, $ColValuesToAnalyzeMapRef) = @_;
    my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

    $CalculateCorrelation = exists($SpecifiedStatisticalFunctionsMap{correlation}) ? 1 : 0;
    $CalculateRSquare = exists($SpecifiedStatisticalFunctionsMap{rsquare}) ? 1 : 0;
    $CalculateCovariance = exists($SpecifiedStatisticalFunctionsMap{covariance}) ? 1 : 0;

    $CorrelationTextFile = $TextFilesOutFileRoot[$Index] . "CorrelationMatrix." . $TextFilesOutFileExt[$Index];
    $RSquareTextFile = $TextFilesOutFileRoot[$Index] . "RSquareMatrix." . $TextFilesOutFileExt[$Index];
    $CovarianceTextFile = $TextFilesOutFileRoot[$Index] . "CovarianceMatrix." . $TextFilesOutFileExt[$Index];

    # Announce exactly the files that are opened below, each under its own name.
    my(@NewTextFiles);
    @NewTextFiles = ();
    if ($CalculateCorrelation || $CalculateRSquare) {
        push @NewTextFiles, $CorrelationTextFile;
        if ($CalculateRSquare) {
            push @NewTextFiles, $RSquareTextFile;
        }
    }
    if ($CalculateCovariance) {
        push @NewTextFiles, $CovarianceTextFile;
    }
    if (@NewTextFiles > 1) {
        print "Generating new text files ", join(", ", @NewTextFiles), " ...\n";
    }
    else {
        print "Generating new text file ", join(", ", @NewTextFiles), " ...\n";
    }

    my($CorrelationHandle, $RSquareHandle, $CovarianceHandle);
    if ($CalculateCorrelation || $CalculateRSquare) {
        open $CorrelationHandle, '>', $CorrelationTextFile or die "Error: Can't open $CorrelationTextFile: $! \n";
        if ($CalculateRSquare) {
            open $RSquareHandle, '>', $RSquareTextFile or die "Error: Can't open $RSquareTextFile: $! \n";
        }
    }
    if ($CalculateCovariance) {
        open $CovarianceHandle, '>', $CovarianceTextFile or die "Error: Can't open $CovarianceTextFile: $! \n";
    }

    my($Line, $Value, $CorrelationValue, $RSquareValue, $CovarianceValue, $ColNum, $ColNum1, $ColNum2, $ColValuesRef1, $ColValuesRef2, @ColLabels, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues);

    # Write out the column labels; the first cell is empty because every data
    # row starts with its own column label...
    @ColLabels = ();
    push @ColLabels, "";
    for $ColNum (0 .. ($TextFilesColCount[$Index] - 1)) {
        push @ColLabels, $TextFilesColLabels[$Index][$ColNum];
    }
    $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
    if ($CalculateCorrelation || $CalculateRSquare) {
        print $CorrelationHandle "$Line\n";
        if ($CalculateRSquare) {
            print $RSquareHandle "$Line\n";
        }
    }
    if ($CalculateCovariance) {
        print $CovarianceHandle "$Line\n";
    }

    # Due to symmetric nature of these matrices, only one half needs to be
    # calculated. So, just calculate the lower half and copy it to upper half...
    my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap);

    %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = ();
    for $ColNum1 (0 .. ($TextFilesColCount[$Index] - 1)) {
        $ColValuesRef1 = \@{$ColValuesToAnalyzeMapRef->{$ColNum1}};
        for $ColNum2 (0 .. $ColNum1) {
            $ColValuesRef2 = \@{$ColValuesToAnalyzeMapRef->{$ColNum2}};
            if ($CalculateCorrelation || $CalculateRSquare) {
                my($RawCorrelation, $HasCorrelation);
                $RawCorrelation = Correlation($ColValuesRef1, $ColValuesRef2);
                $HasCorrelation = (defined($RawCorrelation) && length($RawCorrelation)) ? 1 : 0;

                $CorrelationValue = $HasCorrelation ? (sprintf("%.${Precision}f", $RawCorrelation) + 0) : "";
                $CorrelationMatrixMap{$ColNum1}{$ColNum2} = $CorrelationValue;
                if ($ColNum1 != $ColNum2) {
                    $CorrelationMatrixMap{$ColNum2}{$ColNum1} = $CorrelationValue;
                }
                if ($CalculateRSquare) {
                    # Square the unrounded correlation and round once, the
                    # same way PerformColumnPairAnalysis does.
                    $RSquareValue = $HasCorrelation ? (sprintf("%.${Precision}f", $RawCorrelation ** 2) + 0) : "";
                    $RSquareMatrixMap{$ColNum1}{$ColNum2} = $RSquareValue;
                    if ($ColNum1 != $ColNum2) {
                        $RSquareMatrixMap{$ColNum2}{$ColNum1} = $RSquareValue;
                    }
                }
            }
            if ($CalculateCovariance) {
                $CovarianceValue = Covariance($ColValuesRef1, $ColValuesRef2);
                $CovarianceValue = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.${Precision}f", $CovarianceValue) + 0) : "";
                $CovarianceMatrixMap{$ColNum1}{$ColNum2} = $CovarianceValue;
                if ($ColNum1 != $ColNum2) {
                    $CovarianceMatrixMap{$ColNum2}{$ColNum1} = $CovarianceValue;
                }
            }
        }
    }

    # Write out the matrices...
    for $ColNum1 (0 .. ($TextFilesColCount[$Index] - 1)) {
        @CorrelationRowValues = ();
        @RSquareRowValues = ();
        @CovarianceRowValues = ();
        if ($CalculateCorrelation || $CalculateRSquare) {
            push @CorrelationRowValues, $TextFilesColLabels[$Index][$ColNum1];
            if ($CalculateRSquare) {
                push @RSquareRowValues, $TextFilesColLabels[$Index][$ColNum1];
            }
        }
        if ($CalculateCovariance) {
            push @CovarianceRowValues, $TextFilesColLabels[$Index][$ColNum1];
        }
        for $ColNum2 (0 .. ($TextFilesColCount[$Index] - 1)) {
            if ($CalculateCorrelation || $CalculateRSquare) {
                push @CorrelationRowValues, $CorrelationMatrixMap{$ColNum1}{$ColNum2};
                if ($CalculateRSquare) {
                    push @RSquareRowValues, $RSquareMatrixMap{$ColNum1}{$ColNum2};
                }
            }
            if ($CalculateCovariance) {
                push @CovarianceRowValues, $CovarianceMatrixMap{$ColNum1}{$ColNum2};
            }
        }
        if ($CalculateCorrelation || $CalculateRSquare) {
            $Line = JoinWords(\@CorrelationRowValues, $OutDelim, $OutQuote);
            print $CorrelationHandle "$Line\n";
            if ($CalculateRSquare) {
                $Line = JoinWords(\@RSquareRowValues, $OutDelim, $OutQuote);
                print $RSquareHandle "$Line\n";
            }
        }
        if ($CalculateCovariance) {
            $Line = JoinWords(\@CovarianceRowValues, $OutDelim, $OutQuote);
            print $CovarianceHandle "$Line\n";
        }
    }
    # Buffered write errors only surface at close, so check each one.
    if ($CalculateCorrelation || $CalculateRSquare) {
        close $CorrelationHandle or die "Error: Can't close $CorrelationTextFile: $! \n";
        if ($CalculateRSquare) {
            close $RSquareHandle or die "Error: Can't close $RSquareTextFile: $! \n";
        }
    }
    if ($CalculateCovariance) {
        close $CovarianceHandle or die "Error: Can't close $CovarianceTextFile: $! \n";
    }
}

# Calculate standard scores...
#
# Writes <root>StandardScores.<ext> with one row per data line of the input
# file and, for every analyzed column, a StandardScores and/or StandardScoresN
# cell depending on which functions were requested.
#
# Parameters:
#   $Index                    - position of the file in the @TextFiles* arrays.
#   $ColValuesToAnalyzeMapRef - reference to a hash mapping column number to a
#                               reference to that column's collected values.
#
# A cell is left empty when the value fails the numeric check or when the
# column's deviation is undefined, empty or zero. Dies when a file cannot be
# opened or the output file cannot be closed.
sub PerformStandardScoresAnalysis {
    my($Index, $ColValuesToAnalyzeMapRef) = @_;
    my($StandardScores, $StandardScoresN, $NewTextFile, @ColLabels, $Label, $NewLine);

    $StandardScores = exists($SpecifiedStatisticalFunctionsMap{standardscores}) ? 1 : 0;
    $StandardScoresN = exists($SpecifiedStatisticalFunctionsMap{standardscoresn}) ? 1 : 0;

    $NewTextFile = $TextFilesOutFileRoot[$Index] . "StandardScores." . $TextFilesOutFileExt[$Index];
    print "Generating new text file $NewTextFile...\n";
    open my $NewTextFileHandle, '>', $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

    my($ColValuesRef, $ColNum, @ColNumsToAnalyze);
    # Write out column labels...
    @ColLabels = ();
    @ColNumsToAnalyze = @{$TextFilesColNumsToAnalyze[$Index]};
    for $ColNum (@ColNumsToAnalyze) {
        $Label = $TextFilesColLabels[$Index][$ColNum];
        if ($StandardScores) {
            push @ColLabels, "${Label}(StandardScores)";
        }
        if ($StandardScoresN) {
            push @ColLabels, "${Label}(StandardScoresN)";
        }
    }
    $NewLine = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
    print $NewTextFileHandle "$NewLine\n";

    # Go over each column to be analyzed and calculate standard deviation
    # and mean values...
    my(%StandardDeviationMap, %StandardDeviationNMap, %MeanMap);
    %StandardDeviationMap = ();
    %StandardDeviationNMap = ();
    %MeanMap = ();
    for $ColNum (@ColNumsToAnalyze) {
        $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
        if (!exists($MeanMap{$ColNum})) {
            $MeanMap{$ColNum} = Mean($ColValuesRef);
        }
        if ($StandardScores) {
            if (!exists($StandardDeviationMap{$ColNum})) {
                $StandardDeviationMap{$ColNum} = StandardDeviation($ColValuesRef);
            }
        }
        if ($StandardScoresN) {
            if (!exists($StandardDeviationNMap{$ColNum})) {
                $StandardDeviationNMap{$ColNum} = StandardDeviationN($ColValuesRef);
            }
        }
    }

    # Go over each row and calculate standard scores for each column using
    # (x[i] - mean) / StandardDeviation for StandardScores and
    # (x[i] - mean) / StandardDeviationN for StandardScoresN; write out the
    # calculated values as well...
    my($TextFile, $InDelim, $Line, $Value, $ValueOkay, $ScoreValue, $Deviation, @RowValues, @LineWords);
    $TextFile = $TextFilesList[$Index];
    $InDelim = $TextFilesInDelim[$Index];

    open my $TextFileHandle, '<', $TextFile or die "Error: Can't open $TextFile: $! \n";
    # Skip the column labels line.
    $Line = GetTextLine($TextFileHandle);
    # NOTE(review): this loop ends on the first false value returned by
    # GetTextLine, so a data line consisting of just "0" would also end it;
    # confirm GetTextLine's end-of-file contract before switching to defined().
    while ($Line = GetTextLine($TextFileHandle)) {
        @LineWords = quotewords($InDelim, 0, $Line);
        @RowValues = ();
        COLNUM: for $ColNum (@ColNumsToAnalyze) {
            $Value = $LineWords[$ColNum];
            $ValueOkay = ($CheckData && !IsNumerical($Value)) ? 0 : 1;
            if ($StandardScores) {
                # A zero, empty or undefined deviation (constant or empty
                # column) has no defined score; write an empty cell instead
                # of dying on a division by zero.
                $Deviation = $StandardDeviationMap{$ColNum};
                $ScoreValue = ($ValueOkay && defined($Deviation) && length($Deviation) && $Deviation != 0) ? (($Value - $MeanMap{$ColNum}) / $Deviation) : "";
                $ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.${Precision}f", $ScoreValue) + 0) : "";
                push @RowValues, $ScoreValue;
            }
            if ($StandardScoresN) {
                $Deviation = $StandardDeviationNMap{$ColNum};
                $ScoreValue = ($ValueOkay && defined($Deviation) && length($Deviation) && $Deviation != 0) ? (($Value - $MeanMap{$ColNum}) / $Deviation) : "";
                $ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.${Precision}f", $ScoreValue) + 0) : "";
                push @RowValues, $ScoreValue;
            }
        }
        $NewLine = JoinWords(\@RowValues, $OutDelim, $OutQuote);
        print $NewTextFileHandle "$NewLine\n";
    }
    close $TextFileHandle;
    # Buffered write errors only surface at close, so check it.
    close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
}

# Make sure the specified columns exists in text files...
#
# For every usable input file this fills, at the file's index:
#   @TextFilesColNumsToAnalyze       - zero-based column numbers to analyze,
#   @TextFilesColPairs1ToAnalyze and
#   @TextFilesColPairs2ToAnalyze     - first and second member of each pair,
#   @TextFilesUniqueColNumsToAnalyze - every column number used by either.
# A file is marked not okay in @TextFilesOkay, with a warning, when none of the
# specified columns exist in it or when a frequency output file already exists
# and --overwrite was not given.
sub ProcessColumnsInfo {
    my($Index, $TextFile, $ColNum, $NewColNum, $ColIndex, @ColNumsToAnalyze, %UniqueColNumsToAnalyzeMap);

    @TextFilesColNumsToAnalyze = ();
    @TextFilesColPairs1ToAnalyze = ();
    @TextFilesColPairs2ToAnalyze = ();
    @TextFilesUniqueColNumsToAnalyze = ();
    FILELIST: for $Index (0 .. $#TextFilesList) {
        $TextFile = $TextFilesList[$Index];

        @{$TextFilesColNumsToAnalyze[$Index]} = ();
        @{$TextFilesColPairs1ToAnalyze[$Index]} = ();
        @{$TextFilesColPairs2ToAnalyze[$Index]} = ();
        @{$TextFilesUniqueColNumsToAnalyze[$Index]} = ();

        %UniqueColNumsToAnalyzeMap = ();

        if ($TextFilesOkay[$Index]) {
            @ColNumsToAnalyze = ();
            if (@SpecifiedColumns) {
                if ($Options{colmode} =~ /^colnum$/i) {
                    # Column numbers are specified one-based; out-of-range
                    # numbers are ignored.
                    for $ColNum (@SpecifiedColumns) {
                        if ($ColNum >= 1 && $ColNum <= $TextFilesColCount[$Index]) {
                            $NewColNum = $ColNum - 1;
                            push @ColNumsToAnalyze, $NewColNum;
                        }
                    }
                }
                else {
                    # Labels that are not present in this file are ignored.
                    my($ColLabel);
                    for $ColLabel (@SpecifiedColumns) {
                        if (exists($TextFilesColLabelToNumMap[$Index]{$ColLabel})) {
                            push @ColNumsToAnalyze, $TextFilesColLabelToNumMap[$Index]{$ColLabel};
                        }
                    }
                }
            }
            elsif (defined $Options{columns} && $Options{columns} =~ /^All$/i) {
                for $ColNum (0 .. ($TextFilesColCount[$Index] - 1)) {
                    push @ColNumsToAnalyze, $ColNum;
                }
            }
            else {
                # Fall back to the first column.
                push @ColNumsToAnalyze, 0;
            }
            if (@ColNumsToAnalyze) {
                push @{$TextFilesColNumsToAnalyze[$Index]}, @ColNumsToAnalyze;
                # Set up unique columns map as well...
                for $ColNum (@ColNumsToAnalyze) {
                    if (!exists $UniqueColNumsToAnalyzeMap{$ColNum}) {
                        $UniqueColNumsToAnalyzeMap{$ColNum} = $ColNum;
                    }
                }
            }
            else {
                warn "Warning: Ignoring file $TextFile: None of the columns specified, @SpecifiedColumns, using \"--columns\" option exist.\n";
                $TextFilesOkay[$Index] = 0;
                next FILELIST;
            }
            if (!$Options{overwrite} && exists($SpecifiedStatisticalFunctionsMap{frequency})) {
                # Make sure specific frequency files don't exist...
                my($FrequencyFile);
                for $ColNum (@ColNumsToAnalyze) {
                    $FrequencyFile = $TextFilesOutFileRoot[$Index] . $TextFilesColLabels[$Index][$ColNum] . "FrequencyAnalysis." . $TextFilesOutFileExt[$Index];
                    if (-e $FrequencyFile) {
                        warn "Warning: Ignoring file $TextFile: The file $FrequencyFile already exists.\n";
                        $TextFilesOkay[$Index] = 0;
                        next FILELIST;
                    }
                }
            }
            # Setup specified column pairs...
            if (exists $SpecifiedStatisticalFunctionsMap{correlation} || exists $SpecifiedStatisticalFunctionsMap{covariance} || exists $SpecifiedStatisticalFunctionsMap{rsquare}) {
                my(@ColPairsToAnalyze, $ColNum1, $ColNum2);
                if (@SpecifiedColumnPairs) {
                    # Make sure both columns exist...
                    if ($Options{colmode} =~ /^colnum$/i) {
                        for ($ColIndex = 0; (($ColIndex + 1) < @SpecifiedColumnPairs); $ColIndex += 2) {
                            $ColNum1 = $SpecifiedColumnPairs[$ColIndex];
                            $ColNum2 = $SpecifiedColumnPairs[$ColIndex + 1];
                            if ($ColNum1 >= 1 && $ColNum1 <= $TextFilesColCount[$Index] && $ColNum2 >= 1 && $ColNum2 <= $TextFilesColCount[$Index]) {
                                $ColNum1 -= 1;
                                $ColNum2 -= 1;
                                push @ColPairsToAnalyze, ($ColNum1, $ColNum2);
                            }
                        }
                    }
                    else {
                        my($ColLabel1, $ColLabel2);
                        for ($ColIndex = 0; (($ColIndex + 1) < @SpecifiedColumnPairs); $ColIndex += 2) {
                            $ColLabel1 = $SpecifiedColumnPairs[$ColIndex];
                            $ColLabel2 = $SpecifiedColumnPairs[$ColIndex + 1];
                            if (exists($TextFilesColLabelToNumMap[$Index]{$ColLabel1}) && exists($TextFilesColLabelToNumMap[$Index]{$ColLabel2})) {
                                $ColNum1 = $TextFilesColLabelToNumMap[$Index]{$ColLabel1};
                                $ColNum2 = $TextFilesColLabelToNumMap[$Index]{$ColLabel2};
                                push @ColPairsToAnalyze, ($ColNum1, $ColNum2);
                            }
                        }
                    }
                }
                elsif ($AllColumnPairs) {
                    for $ColNum1 (0 .. ($TextFilesColCount[$Index] - 1)) {
                        for $ColNum2 (0 .. ($TextFilesColCount[$Index] - 1)) {
                            push @ColPairsToAnalyze, ($ColNum1, $ColNum2);
                        }
                    }
                }
                else {
                    # Fall back to the first two columns when there are any.
                    if ($TextFilesColCount[$Index] >= 2) {
                        push @ColPairsToAnalyze, (0, 1);
                    }
                }
                if (@ColPairsToAnalyze) {
                    # Every push above adds two values at a time, so this
                    # odd-count branch is a defensive check only.
                    if (@ColPairsToAnalyze % 2) {
                        warn "Warning: Ignoring file $TextFile: Invalid number values specified using \"--columnpairs\" option: It must contain even number of valid values.\n";
                        $TextFilesOkay[$Index] = 0;
                        next FILELIST;
                    }
                    else {
                        for ($ColIndex = 0; $ColIndex < @ColPairsToAnalyze; $ColIndex += 2) {
                            push @{$TextFilesColPairs1ToAnalyze[$Index]}, $ColPairsToAnalyze[$ColIndex];
                            push @{$TextFilesColPairs2ToAnalyze[$Index]}, $ColPairsToAnalyze[$ColIndex + 1];
                        }
                        # Set up unique columns map as well...
                        for $ColNum (@ColPairsToAnalyze) {
                            if (!exists $UniqueColNumsToAnalyzeMap{$ColNum}) {
                                $UniqueColNumsToAnalyzeMap{$ColNum} = $ColNum;
                            }
                        }
                    }
                }
            }
            # Setup uniques columns array; the keys are column numbers, so
            # sort them numerically rather than as strings...
            push @{$TextFilesUniqueColNumsToAnalyze[$Index]}, (sort { $a <=> $b } keys %UniqueColNumsToAnalyzeMap);
        }
    }
}

# Process option values...
sub ProcessOptions {
    $DetailLevel = $Options{detail};

    # Setup supported statistical functions...
    my($SupportedFunction, @SupportedStatisticaFunctions, %SupportedStatisticaFunctionsMap);
    %SupportedStatisticaFunctionsMap = ();
    @SupportedStatisticaFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN);

    for $SupportedFunction (@SupportedStatisticaFunctions) {
        $SupportedStatisticaFunctionsMap{lc($SupportedFunction)} = $SupportedFunction;
    }

    # Setup a list of functions to use for analysis...
779 my($SpecifiedFunction); 780 %SpecifiedStatisticalFunctionsMap = (); 781 @SpecifiedStatisticalFunctions = (); 782 # Check mode values... 783 if ($Options{mode} =~ /^DescriptiveStatisticsBasic$/i ) { 784 $FileNameMode = "DescriptiveStatisticsBasic"; 785 @SpecifiedStatisticalFunctions = qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum); 786 } 787 elsif ($Options{mode} =~ /^DescriptiveStatisticsAll$/i ) { 788 $FileNameMode = "DescriptiveStatisticsAll"; 789 @SpecifiedStatisticalFunctions = qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance RSquare Frequency KLargest KSmallest Sum); 790 } 791 elsif ($Options{mode} =~ /^All$/i ) { 792 $FileNameMode = "AllStatistics"; 793 @SpecifiedStatisticalFunctions = @SupportedStatisticaFunctions; 794 } 795 else { 796 $FileNameMode = "SpecifiedStatistics"; 797 # Comma delimited list of functions... 798 my($Mode, @SpecifiedFunctions, @UnsupportedSpecifiedFunctions); 799 $Mode = $Options{mode}; 800 $Mode =~ s/ //g; 801 @SpecifiedFunctions = split ",", $Mode; 802 @UnsupportedSpecifiedFunctions = (); 803 for $SpecifiedFunction (@SpecifiedFunctions) { 804 if (exists($SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)})) { 805 push @SpecifiedStatisticalFunctions, $SpecifiedFunction; 806 } 807 else { 808 push @UnsupportedSpecifiedFunctions, $SpecifiedFunction; 809 } 810 } 811 if (@UnsupportedSpecifiedFunctions) { 812 if (@UnsupportedSpecifiedFunctions > 1) { 813 warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedFunctions, ", ", 0)," - for option \"-m --mode\" are not valid.\n"; 814 } 815 else { 816 warn "Error: The value specified, @UnsupportedSpecifiedFunctions , for option \"-m --mode\" is not valid.\n"; 817 } 818 die "Allowed values:", JoinWords(\@SupportedStatisticaFunctions, ", ", 0), "\n"; 819 } 820 } 821 FUNCTION: for $SpecifiedFunction (@SpecifiedStatisticalFunctions) { 822 if (exists 
$SpecifiedStatisticalFunctionsMap{lc($SpecifiedFunction)} ) { 823 next FUNCTION; 824 } 825 $SpecifiedStatisticalFunctionsMap{lc($SpecifiedFunction)} = $SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)}; 826 } 827 828 # Setup delimiter and quotes... 829 $OutDelim = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,"); 830 $OutQuote = ($Options{quote} =~ /yes/i ) ? 1 : 0; 831 832 # Setup miscellaneous options... 833 $CheckData = $Options{fast} ? 0 : 1; 834 $Precision = $Options{precision}; 835 836 $KLargest = $Options{klargest}; 837 $KSmallest = $Options{ksmallest}; 838 839 $TrimFraction = $Options{trimfraction}; 840 841 # Setup frequency bin values... 842 $NumOfBins = 10; 843 @BinRange = (); 844 if ($Options{frequencybins} =~ /\,/) { 845 my($BinValue, @SpecifiedBinRange); 846 @SpecifiedBinRange = split /\,/, $Options{frequencybins}; 847 if (@SpecifiedBinRange < 2) { 848 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n"; 849 } 850 for $BinValue (@SpecifiedBinRange) { 851 if (!IsNumerical($BinValue)) { 852 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n"; 853 } 854 } 855 my($Index1, $Index2); 856 for $Index1 (0 .. $#SpecifiedBinRange) { 857 for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) { 858 if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) { 859 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must cont