MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: AnalyzeTextFilesData.pl,v $
   4 # $Date: 2008/02/14 00:19:54 $
   5 # $Revision: 1.23 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 use 5.006;
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use StatisticsUtil;
  38 
  39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  40 
  41 # Autoflush STDOUT
  42 $| = 1;
  43 
  44 # Starting message...
  45 $ScriptName = basename($0);
  46 print "\n$ScriptName: Starting...\n\n";
  47 $StartTime = new Benchmark;
  48 
  49 # Get the options and setup script...
  50 SetupScriptUsage();
  51 if ($Options{help} || @ARGV < 1) {
  52   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  53 }
  54 
  55 my(@TextFilesList);
  56 @TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");
  57 
  58 my($DetailLevel, $OutDelim, $OutQuote, $Precision, $CheckData, $KLargest, $KSmallest, $TrimFraction, $AllColumnPairs, $NumOfBins, @BinRange, @SpecifiedColumns, @SpecifiedColumnPairs, $FileNameMode, @SpecifiedStatisticalFunctions, %SpecifiedStatisticalFunctionsMap);
  59 print "Processing options...\n";
  60 ProcessOptions();
  61 
  62 # Collect column information for all the text files...
  63 print "Checking input text file(s)...\n";
  64 my(@TextFilesOkay, @TextFilesColCount, @TextFilesColLabels, @TextFilesColLabelToNumMap, @TextFilesInDelim, @TextFilesOutFileRoot, @TextFilesOutFileExt);
  65 RetrieveTextFilesInfo();
  66 
  67 # Make sure the specified columns exists in text files...
  68 my(@TextFilesColNumsToAnalyze, @TextFilesColPairs1ToAnalyze, @TextFilesColPairs2ToAnalyze, @TextFilesUniqueColNumsToAnalyze);
  69 ProcessColumnsInfo();
  70 
  71 # Generate output files...
  72 my($Index, $TextFile);
  73 if (@TextFilesList > 1) {
  74   print "Processing text files...\n";
  75 }
  76 for $Index (0 .. $#TextFilesList) {
  77   if ($TextFilesOkay[$Index]) {
  78     $TextFile = $TextFilesList[$Index];
  79     if (@TextFilesList > 1) {
  80       print "\nProcessing file $TextFile...\n";
  81     } else {
  82       print "Processing file $TextFile...\n"
  83     }
  84     AnalyzeTextFile($Index);
  85   }
  86 }
  87 
  88 print "$ScriptName:Done...\n\n";
  89 
  90 $EndTime = new Benchmark;
  91 $TotalTime = timediff ($EndTime, $StartTime);
  92 print "Total time: ", timestr($TotalTime), "\n";
  93 
  94 ###############################################################################
  95 
  96 # Analyze data...
  97 sub AnalyzeTextFile {
  98   my($Index) = @_;
  99   my($TextFile, $Line, $InDelim, $ColNum, $Value, @LineWords, @ColNumsToAnalyze, %ColValuesToAnalyzeMap);
 100 
 101   $TextFile = $TextFilesList[$Index];
 102   $InDelim = $TextFilesInDelim[$Index];
 103   @ColNumsToAnalyze = @{$TextFilesUniqueColNumsToAnalyze[$Index]};
 104   %ColValuesToAnalyzeMap = ();
 105   for $ColNum (@ColNumsToAnalyze) {
 106     @{$ColValuesToAnalyzeMap{$ColNum}} = ();
 107   }
 108 
 109   my($LineCount, $InvalidLineCount, @InvalidColLabels);
 110 
 111   open TEXTFILE, "$TextFile" or die "Error: Can't open $TextFile: $! \n";
 112   # Skip over column labels line in text file and collect appropriate column data
 113   # for analysis...
 114   $Line = GetTextLine(\*TEXTFILE);
 115   $LineCount = 1;
 116   $InvalidLineCount = 0;
 117   while ($Line = GetTextLine(\*TEXTFILE)) {
 118     $LineCount++;
 119     @LineWords = quotewords($InDelim, 0, $Line);
 120     @InvalidColLabels = ();
 121     COLNUM: for $ColNum (@ColNumsToAnalyze) {
 122       $Value = $LineWords[$ColNum];
 123       if ($CheckData) {
 124 	if (!IsNumerical($Value)) {
 125 	  push @InvalidColLabels, $TextFilesColLabels[$Index][$ColNum];
 126 	  next COLNUM;
 127 	}
 128       }
 129       push @{$ColValuesToAnalyzeMap{$ColNum}}, $Value;
 130     }
 131     if (@InvalidColLabels) {
 132       $InvalidLineCount++;
 133       if ($DetailLevel >=4 ) {
 134 	print "Line number $LineCount contains ", scalar(@InvalidColLabels)," non-numerical or empty value(s) for column(s) - ", JoinWords(\@InvalidColLabels, ", ", 0)," - to be analyzed: $Line \n";
 135       }
 136       elsif ($DetailLevel >= 3) {
 137 	print "Line number $LineCount contains ", scalar(@InvalidColLabels)," non-numerical or empty value(s) for column(s) - ", JoinWords(\@InvalidColLabels, ", ", 0)," - to be analyzed...\n";
 138       }
 139       elsif ($DetailLevel >= 2) {
 140 	print "Line number $LineCount contains ", scalar(@InvalidColLabels)," non-numerical or empty value(s) for columns to be analyzed...\n";
 141       }
 142     }
 143   }
 144   if ($InvalidLineCount && ($DetailLevel >= 1)) {
 145     print "Non-numerical or empty data present in $InvalidLineCount line(s)...\n";
 146   }
 147   close TEXTFILE;
 148 
 149   # Perform the analysis...
 150   my(@SpecifiedFunctionNames, $SpecifiedFunction);
 151   @SpecifiedFunctionNames = ();
 152 
 153   for $SpecifiedFunction (@SpecifiedStatisticalFunctions) {
 154     if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
 155       push @SpecifiedFunctionNames, $SpecifiedStatisticalFunctionsMap{lc($SpecifiedFunction)};
 156     }
 157   }
 158   if (@SpecifiedFunctionNames) {
 159     PerformAnalysis($Index, \@SpecifiedFunctionNames, \%ColValuesToAnalyzeMap)
 160   }
 161   if (exists($SpecifiedStatisticalFunctionsMap{covariance}) || exists($SpecifiedStatisticalFunctionsMap{correlation}) || exists($SpecifiedStatisticalFunctionsMap{rsquare})) {
 162     if ($AllColumnPairs) {
 163       PerformMatrixAnalysis($Index, \%ColValuesToAnalyzeMap);
 164     }
 165     else {
 166       # Perform pairwise analysis for specified columns and write out calculated values - correlation
 167       # rsquare, or covariance - in the same file.
 168       PerformColumnPairAnalysis($Index, \%ColValuesToAnalyzeMap);
 169     }
 170   }
 171   if (exists($SpecifiedStatisticalFunctionsMap{standardscores}) || exists($SpecifiedStatisticalFunctionsMap{standardscoresn}) ) {
 172     PerformStandardScoresAnalysis($Index, \%ColValuesToAnalyzeMap);
 173   }
 174   if (exists($SpecifiedStatisticalFunctionsMap{frequency})) {
 175     PerformFrequencyAnalysis($Index, \%ColValuesToAnalyzeMap);
 176   }
 177 }
 178 
 179 # Calculate values for various statistical functions...
 180 sub PerformAnalysis {
 181   my($Index, $SpecifiedFunctionNamesRef, $ColValuesToAnalyzeMapRef) = @_;
 182   my($NewTextFile, $Line, $SpecifiedFunction, $Label, @ColLabels, @ColNumsToAnalyze);
 183 
 184   $NewTextFile = $TextFilesOutFileRoot[$Index] . $FileNameMode . "." . $TextFilesOutFileExt[$Index];
 185 
 186   print "Generating new text file $NewTextFile...\n";
 187   open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 188 
 189   # Write out column labels...
 190   @ColLabels = ();
 191   push @ColLabels, "ColumnID";
 192   for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
 193     $Label = $SpecifiedFunction;
 194     if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
 195       my($KthValue);
 196       $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $KLargest : $KSmallest;
 197       $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
 198       $Label =~ s/K//g;
 199     }
 200     elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
 201       $Label = "${SpecifiedFunction}($TrimFraction)";
 202     }
 203     push @ColLabels, $Label;
 204   }
 205   $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
 206   print NEWTEXTFILE "$Line\n";
 207 
 208   # Go over each column to be analyzed...
 209   @ColNumsToAnalyze = @{$TextFilesColNumsToAnalyze[$Index]};
 210 
 211   # Turn off "strict"; otherwise, invoking statistical functions using function name string
 212   # is problematic.
 213   no strict;
 214 
 215   my($ColValuesRef, $ColNum, $Value, @RowValues, %CalculatedValues);
 216   %CalculatedValues = ();
 217   for $ColNum (@ColNumsToAnalyze) {
 218     @RowValues = ();
 219     # Setup column id...
 220     push @RowValues, $TextFilesColLabels[$Index][$ColNum];
 221     $ColValuesRef =  \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
 222     FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
 223       $Value = "";
 224       if (!@{$ColValuesToAnalyzeMapRef->{$ColNum}}) {
 225 	# Invalid column values...
 226 	push @RowValues, $Value;
 227 	next FUNCTIONNAME;
 228       }
 229       if ($SpecifiedFunction =~ /^Count$/i) {
 230 	$Value = @{$ColValuesToAnalyzeMapRef->{$ColNum}};
 231       }
 232       elsif ($SpecifiedFunction =~ /^KLargest$/i) {
 233 	$Value = &$SpecifiedFunction($ColValuesRef, $KLargest);
 234       }
 235       elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
 236 	$Value = &$SpecifiedFunction($ColValuesRef, $KSmallest);
 237       }
 238       elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
 239 	if (exists($CalculatedValues{$ColNum}{StandardDeviation})) {
 240 	  $Value = $CalculatedValues{$ColNum}{StandardDeviation};
 241 	}
 242 	else {
 243 	  $Value = &$SpecifiedFunction($ColValuesRef);
 244 	  $CalculatedValues{$ColNum}{StandardDeviation} = $Value;
 245 	}
 246       }
 247       elsif ($SpecifiedFunction =~ /^StandardError$/i) {
 248 	if (!exists($CalculatedValues{$ColNum}{StandardDeviation})) {
 249 	  $Value = StandardDeviation($ColValuesRef);
 250 	  $CalculatedValues{$ColNum}{StandardDeviation} = $Value;
 251 	}
 252 	if (defined $CalculatedValues{$ColNum}{StandardDeviation}) {
 253 	  $Value = &$SpecifiedFunction($CalculatedValues{$ColNum}{StandardDeviation}, @{$ColValuesToAnalyzeMapRef->{$ColNum}});
 254 	}
 255       }
 256       elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
 257 	$Value = &$SpecifiedFunction($ColValuesRef, $TrimFraction);
 258       }
 259       else {
 260 	$Value = &$SpecifiedFunction($ColValuesRef);
 261       }
 262       # Format the output value. And add zero to get rid of tariling zeros...
 263       $Value = (defined($Value) && length($Value)) ? (sprintf("%.${Precision}f", $Value) + 0) : "";
 264       push @RowValues, $Value;
 265     }
 266     $Line = JoinWords(\@RowValues, $OutDelim, $OutQuote);
 267     print NEWTEXTFILE "$Line\n";
 268   }
 269   close NEWTEXTFILE;
 270 }
 271 
 272 # Calculate covariance, correlation, rsquare for specified column pairs....
 273 sub PerformColumnPairAnalysis {
 274   my($Index, $ColValuesToAnalyzeMapRef) = @_;
 275   my($NewTextFile, @ColLabels, $Line, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);
 276   $CalculateCorrelation = exists($SpecifiedStatisticalFunctionsMap{correlation}) ? 1 : 0;
 277   $CalculateRSquare = exists($SpecifiedStatisticalFunctionsMap{rsquare}) ? 1 : 0;
 278   $CalculateCovariance = exists($SpecifiedStatisticalFunctionsMap{covariance}) ? 1 : 0;
 279 
 280   $NewTextFile = $TextFilesOutFileRoot[$Index] . "ColumnPairsAnalysis." .  $TextFilesOutFileExt[$Index];
 281   print "Generating new text file $NewTextFile...\n";
 282   open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 283 
 284   # Write out the column labels...
 285   @ColLabels = ();
 286   push @ColLabels, ("ColumnID1", "ColumnID2");
 287   if ($CalculateCorrelation || $CalculateRSquare) {
 288     push @ColLabels, "Correlation";
 289     if ($CalculateRSquare) {
 290       push @ColLabels, "RSquare";
 291     }
 292   }
 293   if ($CalculateCovariance) {
 294     push @ColLabels, "Covariance";
 295   }
 296   $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
 297   print NEWTEXTFILE "$Line\n";
 298 
 299   # Go over each column pair...
 300   my($CorrelationValue, $RSquareValue, $CovarianceValue,  $ColIndex, $ColNum1, $ColNum2, $ColValuesRef1, $ColValuesRef2, @ColPairs1ToAnalyze, @ColPairs2ToAnalyze, @RowValues, $Value);
 301 
 302   @ColPairs1ToAnalyze = @{$TextFilesColPairs1ToAnalyze[$Index]};
 303   @ColPairs2ToAnalyze = @{$TextFilesColPairs2ToAnalyze[$Index]};
 304   for $ColIndex (0 .. $#ColPairs1ToAnalyze) {
 305     @RowValues = ();
 306     $ColNum1 = $ColPairs1ToAnalyze[$ColIndex];
 307     $ColNum2 = $ColPairs2ToAnalyze[$ColIndex];
 308     $ColValuesRef1 =  \@{$ColValuesToAnalyzeMapRef->{$ColNum1}};
 309     $ColValuesRef2 =  \@{$ColValuesToAnalyzeMapRef->{$ColNum2}};
 310 
 311     # Setup column ids...
 312     push @RowValues, $TextFilesColLabels[$Index][$ColNum1];
 313     push @RowValues, $TextFilesColLabels[$Index][$ColNum2];
 314 
 315     if (@$ColValuesRef1 != @$ColValuesRef2) {
 316       # Print a warning...
 317       warn "Warning: Skipping analysis for column pair $TextFilesColLabels[$Index][$ColNum1], $TextFilesColLabels[$Index][$ColNum2]: Number of valid data values must be same.\n";
 318       if ($CalculateCorrelation || $CalculateRSquare) {
 319 	push @RowValues, "";
 320 	if ($CalculateRSquare) {
 321 	  push @RowValues, "";
 322 	}
 323       }
 324       if ($CalculateCovariance) {
 325 	push @RowValues, "";
 326       }
 327     }
 328     else {
 329       # Calculate appropriate value...
 330       if ($CalculateCorrelation || $CalculateRSquare) {
 331 	$CorrelationValue = Correlation($ColValuesRef1, $ColValuesRef2);
 332 	$Value = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.${Precision}f", $CorrelationValue) + 0) : "";
 333 	push @RowValues, $Value;
 334 	if ($CalculateRSquare) {
 335 	  $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
 336 	  $Value = (length($RSquareValue)) ? (sprintf("%.${Precision}f", $RSquareValue) + 0) : "";
 337 	  push @RowValues, $Value;
 338 	}
 339       }
 340       if ($CalculateCovariance) {
 341 	$CovarianceValue = Covariance($ColValuesRef1, $ColValuesRef2);
 342 	$Value = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.${Precision}f", $CovarianceValue) + 0) : "";
 343 	push @RowValues, $Value;
 344       }
 345     }
 346     $Line = JoinWords(\@RowValues, $OutDelim, $OutQuote);
 347     print NEWTEXTFILE "$Line\n";
 348   }
 349   close NEWTEXTFILE;
 350 }
 351 
 352 # Generate histogram numbers...
 353 sub PerformFrequencyAnalysis {
 354   my($Index, $ColValuesToAnalyzeMapRef) = @_;
 355   my($NewTextFile, $ColLabel, @ColLabels, @RowValues, $Line, $ColNum, @ColNumsToAnalyze, $ColValuesRef, $BinValue, $FrequencyValue, $Value, %FrequencyMap);
 356 
 357   @ColNumsToAnalyze = @{$TextFilesColNumsToAnalyze[$Index]};
 358   for $ColNum (@ColNumsToAnalyze) {
 359     $NewTextFile = $TextFilesOutFileRoot[$Index] . $TextFilesColLabels[$Index][$ColNum] . "FrequencyAnalysis." .  $TextFilesOutFileExt[$Index];
 360     print "Generating new text file $NewTextFile...\n";
 361     open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 362 
 363     # Write out the column labels...
 364     @ColLabels = ();
 365     push @ColLabels , ("Bins", "Frequency");
 366     $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
 367     print NEWTEXTFILE "$Line\n";
 368 
 369     #Calculate and write out frequency values...
 370     %FrequencyMap = ();
 371     $ColValuesRef =  \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
 372     if (@$ColValuesRef) {
 373       if (@BinRange) {
 374 	%FrequencyMap = Frequency($ColValuesRef, \@BinRange);
 375       }
 376       else {
 377 	%FrequencyMap = Frequency($ColValuesRef, $NumOfBins);
 378       }
 379     }
 380     for $BinValue (sort { $a <=> $b }  keys %FrequencyMap) {
 381       $FrequencyValue = $FrequencyMap{$BinValue};
 382 
 383       @RowValues = ();
 384       $Value = (length($BinValue)) ? (sprintf("%.${Precision}f", $BinValue) + 0) : "";
 385       push @RowValues, $Value;
 386       $Value = (length($FrequencyValue)) ? (sprintf("%.${Precision}f", $FrequencyValue) + 0) : "";
 387       push @RowValues, $Value;
 388 
 389       $Line = JoinWords(\@RowValues, $OutDelim, $OutQuote);
 390       print NEWTEXTFILE "$Line\n";
 391     }
 392     close NEWTEXTFILE;
 393   }
 394 }
 395 
 396 # Calculate covariance, correlation/rsquare matrices....
 397 sub PerformMatrixAnalysis {
 398   my($Index, $ColValuesToAnalyzeMapRef) = @_;
 399   my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);
 400 
 401   $CalculateCorrelation = exists($SpecifiedStatisticalFunctionsMap{correlation}) ? 1 : 0;
 402   $CalculateRSquare = exists($SpecifiedStatisticalFunctionsMap{rsquare}) ? 1 : 0;
 403   $CalculateCovariance = exists($SpecifiedStatisticalFunctionsMap{covariance}) ? 1 : 0;
 404 
 405   $CorrelationTextFile = $TextFilesOutFileRoot[$Index] . "CorrelationMatrix." .  $TextFilesOutFileExt[$Index];
 406   $RSquareTextFile = $TextFilesOutFileRoot[$Index] . "RSquareMatrix." .  $TextFilesOutFileExt[$Index];
 407   $CovarianceTextFile = $TextFilesOutFileRoot[$Index] . "CovarianceMatrix." .  $TextFilesOutFileExt[$Index];
 408 
 409   my($TextFilesList, $Delimiter);
 410   $TextFilesList =  "";
 411   if ($CalculateCorrelation || $CalculateRSquare) {
 412     $TextFilesList = $CorrelationTextFile;
 413     if ($CalculateRSquare) {
 414       $TextFilesList .= ", $CorrelationTextFile";
 415     }
 416   }
 417   $Delimiter = length($TextFilesList) ? "," : "";
 418   if ($CalculateCovariance) {
 419     $TextFilesList .= "${Delimiter} ${CorrelationTextFile}";
 420   }
 421   if ($TextFilesList =~ /\,/) {
 422     print "Generating new text files $TextFilesList ...\n"
 423   }
 424   else {
 425     print "Generating new text file $TextFilesList ...\n"
 426   }
 427   if ($CalculateCorrelation || $CalculateRSquare) {
 428     open CORRELATIONTEXTFILE, ">$CorrelationTextFile" or die "Error: Can't open $CorrelationTextFile: $! \n";
 429     if ($CalculateRSquare) {
 430       open RSQUARETEXTFILE, ">$RSquareTextFile" or die "Error: Can't open $RSquareTextFile: $! \n";
 431     }
 432   }
 433   if ($CalculateCovariance) {
 434     open COVARIANCETEXTFILE, ">$CovarianceTextFile" or die "Error: Can't open $CovarianceTextFile: $! \n";
 435   }
 436 
 437   my($Line, $Value, $CorrelationValue, $RSquareValue, $CovarianceValue, $ColNum, $ColNum1, $ColNum2, $ColValuesRef1, $ColValuesRef2, @ColLabels, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues);
 438 
 439   # Write out the column labels...
 440   @ColLabels = ();
 441   push @ColLabels, "";
 442   for $ColNum (0 .. ($TextFilesColCount[$Index] - 1)) {
 443     push @ColLabels, $TextFilesColLabels[$Index][$ColNum];
 444   }
 445   $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
 446   if ($CalculateCorrelation || $CalculateRSquare) {
 447     print CORRELATIONTEXTFILE "$Line\n";
 448     if ($CalculateRSquare) {
 449       print RSQUARETEXTFILE "$Line\n";
 450     }
 451   }
 452   if ($CalculateCovariance) {
 453     print COVARIANCETEXTFILE "$Line\n";
 454   }
 455 
 456   # Due to symmetric nature of these matrices, only one half needs to be
 457   # calculated. So, just calculate the lower half and copy it to upper half...
 458   my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap);
 459 
 460   %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = ();
 461   for $ColNum1 (0 .. ($TextFilesColCount[$Index] - 1)) {
 462     for $ColNum2 (0 .. $ColNum1) {
 463       $ColValuesRef1 =  \@{$ColValuesToAnalyzeMapRef->{$ColNum1}};
 464       $ColValuesRef2 =  \@{$ColValuesToAnalyzeMapRef->{$ColNum2}};
 465       if ($CalculateCorrelation || $CalculateRSquare) {
 466 	$CorrelationValue = Correlation($ColValuesRef1, $ColValuesRef2);
 467 	$CorrelationValue = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.${Precision}f", $CorrelationValue) + 0) : "";
 468 	$CorrelationMatrixMap{$ColNum1}{$ColNum2} = $CorrelationValue;
 469 	if ($ColNum1 != $ColNum2) {
 470 	  $CorrelationMatrixMap{$ColNum2}{$ColNum1} = $CorrelationValue;
 471 	}
 472 	if ($CalculateRSquare) {
 473 	  $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
 474 	  $RSquareValue = (length($RSquareValue)) ? (sprintf("%.${Precision}f", $RSquareValue) + 0) : "";
 475 	  $RSquareMatrixMap{$ColNum1}{$ColNum2} = $RSquareValue;
 476 	  if ($ColNum1 != $ColNum2) {
 477 	    $RSquareMatrixMap{$ColNum2}{$ColNum1} = $RSquareValue;
 478 	  }
 479 	}
 480       }
 481       if ($CalculateCovariance) {
 482 	$CovarianceValue = Covariance($ColValuesRef1, $ColValuesRef2);
 483 	$CovarianceValue = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.${Precision}f", $CovarianceValue) + 0) : "";
 484 	$CovarianceMatrixMap{$ColNum1}{$ColNum2} = $CovarianceValue;
 485 	if ($ColNum1 != $ColNum2) {
 486 	  $CovarianceMatrixMap{$ColNum2}{$ColNum1} = $CovarianceValue;
 487 	}
 488       }
 489     }
 490   }
 491 
 492   # Write out the matrices...
 493   for $ColNum1 (0 .. ($TextFilesColCount[$Index] - 1)) {
 494     @CorrelationRowValues = ();
 495     @RSquareRowValues = ();
 496     @CovarianceRowValues = ();
 497     if ($CalculateCorrelation || $CalculateRSquare) {
 498       push @CorrelationRowValues, $TextFilesColLabels[$Index][$ColNum1];
 499       if ($CalculateRSquare) {
 500 	push @RSquareRowValues, $TextFilesColLabels[$Index][$ColNum1];
 501       }
 502     }
 503     if ($CalculateCovariance) {
 504       push @CovarianceRowValues, $TextFilesColLabels[$Index][$ColNum1];
 505     }
 506     for $ColNum2 (0 .. ($TextFilesColCount[$Index] - 1)) {
 507       if ($CalculateCorrelation || $CalculateRSquare) {
 508 	push @CorrelationRowValues, $CorrelationMatrixMap{$ColNum1}{$ColNum2};
 509 	if ($CalculateRSquare) {
 510 	  push @RSquareRowValues, $RSquareMatrixMap{$ColNum1}{$ColNum2};
 511 	}
 512       }
 513       if ($CalculateCovariance) {
 514 	push @CovarianceRowValues, $CovarianceMatrixMap{$ColNum1}{$ColNum2};
 515       }
 516     }
 517     if ($CalculateCorrelation || $CalculateRSquare) {
 518       $Line = JoinWords(\@CorrelationRowValues, $OutDelim, $OutQuote);
 519       print CORRELATIONTEXTFILE "$Line\n";
 520       if ($CalculateRSquare) {
 521 	$Line = JoinWords(\@RSquareRowValues, $OutDelim, $OutQuote);
 522 	print RSQUARETEXTFILE "$Line\n";
 523       }
 524     }
 525     if ($CalculateCovariance) {
 526       $Line = JoinWords(\@CovarianceRowValues, $OutDelim, $OutQuote);
 527       print COVARIANCETEXTFILE "$Line\n";
 528     }
 529   }
 530   if ($CalculateCorrelation || $CalculateRSquare) {
 531     close CORRELATIONTEXTFILE;
 532     if ($CalculateRSquare) {
 533       close RSQUARETEXTFILE;
 534     }
 535   }
 536   if ($CalculateCovariance) {
 537     close COVARIANCETEXTFILE;
 538   }
 539 }
 540 
 541 # Calculate standard scores...
 542 sub PerformStandardScoresAnalysis {
 543   my($Index, $ColValuesToAnalyzeMapRef) = @_;
 544   my($StandardScores, $StandardScoresN, $NewTextFile, @ColLabels, $Label, $NewLine);
 545 
 546   $StandardScores = exists($SpecifiedStatisticalFunctionsMap{standardscores}) ? 1 : 0;
 547   $StandardScoresN = exists($SpecifiedStatisticalFunctionsMap{standardscoresn}) ? 1 : 0;
 548 
 549   $NewTextFile = $TextFilesOutFileRoot[$Index] . "StandardScores." .  $TextFilesOutFileExt[$Index];
 550   print "Generating new text file $NewTextFile...\n";
 551   open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 552 
 553   my($ColValuesRef, $ColNum, @ColNumsToAnalyze);
 554   # Write out column labels...
 555   @ColLabels = ();
 556   @ColNumsToAnalyze = @{$TextFilesColNumsToAnalyze[$Index]};
 557   for $ColNum (@ColNumsToAnalyze) {
 558     $Label = $TextFilesColLabels[$Index][$ColNum];
 559     if ($StandardScores) {
 560       push @ColLabels, "${Label}\(StandardScores)";
 561     }
 562     if ($StandardScoresN) {
 563       push @ColLabels, "${Label}\(StandardScoresN)";
 564     }
 565   }
 566   $NewLine = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
 567   print NEWTEXTFILE "$NewLine\n";
 568 
 569   # Go over each column to be analyzed and calculate standard deviation
 570   # and mean values...
 571   my(%StandardDeviationMap, %StandardDeviationNMap, %MeanMap);
 572   %StandardDeviationMap = ();
 573   %StandardDeviationNMap = ();
 574   %MeanMap = ();
 575   for $ColNum (@ColNumsToAnalyze) {
 576     $ColValuesRef =  \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
 577     if (!exists($MeanMap{$ColNum})) {
 578       $MeanMap{$ColNum} = Mean($ColValuesRef);
 579     }
 580     if ($StandardScores) {
 581       if (!exists($StandardDeviationMap{$ColNum})) {
 582 	$StandardDeviationMap{$ColNum} = StandardDeviation($ColValuesRef);
 583       }
 584     }
 585     if ($StandardScoresN) {
 586       if (!exists($StandardDeviationNMap{$ColNum})) {
 587 	$StandardDeviationNMap{$ColNum} = StandardDeviationN($ColValuesRef);
 588       }
 589     }
 590   }
 591   #
 592   # Go over each row and calculate standard scores for each column
 593   # using (x[i] - mean) / (n - 1) for StandardScores and (x[i] - mean) / n
 594   # for StandardScoresN; write out the calculated values as well...
 595 
 596   my($TextFile, $InDelim, $Line, $Value, $ValueOkay, $ScoreValue, @RowValues, @LineWords);
 597   $TextFile = $TextFilesList[$Index];
 598   $InDelim = $TextFilesInDelim[$Index];
 599 
 600   open TEXTFILE, "$TextFile" or die "Error: Can't open $TextFile: $! \n";
 601   $Line = GetTextLine(\*TEXTFILE);
 602   while ($Line = GetTextLine(\*TEXTFILE)) {
 603     @LineWords = quotewords($InDelim, 0, $Line);
 604     @RowValues = ();
 605     COLNUM: for $ColNum (@ColNumsToAnalyze) {
 606       $Value = $LineWords[$ColNum];
 607       $ValueOkay = ($CheckData && !IsNumerical($Value)) ? 0 : 1;
 608       if ($StandardScores) {
 609 	$ScoreValue = $ValueOkay ? (($Value - $MeanMap{$ColNum})/$StandardDeviationMap{$ColNum}) : "";
 610 	$ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.${Precision}f", $ScoreValue) + 0) : "";
 611 	push @RowValues, $ScoreValue;
 612       }
 613       if ($StandardScoresN) {
 614 	$ScoreValue = $ValueOkay ? (($Value - $MeanMap{$ColNum})/$StandardDeviationNMap{$ColNum}) : "";
 615 	$ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.${Precision}f", $ScoreValue) + 0) : "";
 616 	push @RowValues, $ScoreValue;
 617       }
 618     }
 619     $NewLine = JoinWords(\@RowValues, $OutDelim, $OutQuote);
 620     print NEWTEXTFILE "$NewLine\n";
 621   }
 622   close TEXTFILE;
 623   close NEWTEXTFILE;
 624 }
 625 
 626 # Make sure the specified columns exists in text files...
 627 sub ProcessColumnsInfo {
 628   my($Index, $TextFile, $ColNum, $NewColNum, $ColIndex, @ColNumsToAnalyze, %UniqueColNumsToAnalyzeMap);
 629 
 630   @TextFilesColNumsToAnalyze = ();
 631   @TextFilesColPairs1ToAnalyze= ();
 632   @TextFilesColPairs2ToAnalyze= ();
 633   @TextFilesUniqueColNumsToAnalyze = ();
 634  FILELIST: for $Index (0 .. $#TextFilesList) {
 635     $TextFile = $TextFilesList[$Index];
 636 
 637     @{$TextFilesColNumsToAnalyze[$Index]} = ();
 638     @{$TextFilesColPairs1ToAnalyze[$Index]} = ();
 639     @{$TextFilesColPairs2ToAnalyze[$Index]} = ();
 640     @{$TextFilesUniqueColNumsToAnalyze[$Index]} = ();
 641 
 642     %UniqueColNumsToAnalyzeMap = ();
 643 
 644     if ($TextFilesOkay[$Index]) {
 645       @ColNumsToAnalyze = ();
 646       if (@SpecifiedColumns) {
 647 	if ($Options{colmode} =~ /^colnum$/i) {
 648 	  for $ColNum (@SpecifiedColumns) {
 649 	    if ($ColNum >=1 && $ColNum <= $TextFilesColCount[$Index]) {
 650 	      $NewColNum = $ColNum -1;
 651 	      push @ColNumsToAnalyze, $NewColNum;
 652 	    }
 653 	  }
 654 	}
 655 	else {
 656 	  my($ColLabel);
 657 	  for $ColLabel (@SpecifiedColumns) {
 658 	    if (exists($TextFilesColLabelToNumMap[$Index]{$ColLabel})) {
 659 	      push @ColNumsToAnalyze, $TextFilesColLabelToNumMap[$Index]{$ColLabel};
 660 	    }
 661 	  }
 662 	}
 663       }
 664       elsif (defined  $Options{columns} && $Options{columns} =~ /^All$/i) {
 665 	for $ColNum (0 .. ($TextFilesColCount[$Index] - 1)) {
 666 	  push @ColNumsToAnalyze, $ColNum;
 667 	}
 668       }
 669       else {
 670 	push @ColNumsToAnalyze, 0;
 671       }
 672       if (@ColNumsToAnalyze) {
 673 	push @{$TextFilesColNumsToAnalyze[$Index]}, @ColNumsToAnalyze;
 674 	# Set up unique columns map as well...
 675 	for $ColNum (@ColNumsToAnalyze) {
 676 	  if (!exists $UniqueColNumsToAnalyzeMap{$ColNum}) {
 677 	    $UniqueColNumsToAnalyzeMap{$ColNum} = $ColNum;
 678 	  }
 679 	}
 680       }
 681       else {
 682 	warn "Warning: Ignoring file $TextFile: None of the columns specified, @SpecifiedColumns, using \"--columns\" option exist.\n";
 683 	$TextFilesOkay[$Index] = 0;
 684 	next FILELIST;
 685       }
 686       if (!$Options{overwrite} && exists($SpecifiedStatisticalFunctionsMap{frequency})) {
 687 	# Make sure specific frequency files don't exist...
 688 	my($FrequencyFile);
 689 	for $ColNum (@ColNumsToAnalyze) {
 690 	  $FrequencyFile = $TextFilesOutFileRoot[$Index] . $TextFilesColLabels[$Index][$ColNum] . "FrequencyAnalysis." .  $TextFilesOutFileExt[$Index];
 691 	  if (-e $FrequencyFile) {
 692 	    warn "Warning: Ignoring file $TextFile: The file $FrequencyFile already exists.\n";
 693 	    $TextFilesOkay[$Index] = 0;
 694 	    next FILELIST;
 695 	  }
 696 	}
 697       }
 698       # Setup specified column pairs...
 699       if (exists $SpecifiedStatisticalFunctionsMap{correlation} || exists $SpecifiedStatisticalFunctionsMap{covariance} || exists $SpecifiedStatisticalFunctionsMap{rsquare}) {
 700 	my(@ColPairsToAnalyze, $ColNum1, $ColNum2);
 701 	if (@SpecifiedColumnPairs) {
 702 	  # Make sure both columns exist...
 703 	  if ($Options{colmode} =~ /^colnum$/i) {
 704 	    for ($ColIndex = 0; (($ColIndex + 1) < @SpecifiedColumnPairs); $ColIndex += 2 ) {
 705 	      $ColNum1 = $SpecifiedColumnPairs[$ColIndex];
 706 	      $ColNum2 = $SpecifiedColumnPairs[$ColIndex + 1];
 707 	      if ($ColNum1 >=1 && $ColNum1 <= $TextFilesColCount[$Index] && $ColNum2 >=1 && $ColNum2 <= $TextFilesColCount[$Index]) {
 708 		$ColNum1 -= 1;
 709 		$ColNum2 -= 1;
 710 		push @ColPairsToAnalyze, ($ColNum1, $ColNum2);
 711 	      }
 712 	    }
 713 	  }
 714 	  else {
 715 	    my($ColLabel1, $ColLabel2);
 716 	    for ($ColIndex = 0; (($ColIndex + 1) < @SpecifiedColumnPairs); $ColIndex += 2 ) {
 717 	      $ColLabel1 = $SpecifiedColumnPairs[$ColIndex];
 718 	      $ColLabel2 = $SpecifiedColumnPairs[$ColIndex + 1];
 719 	      if (exists($TextFilesColLabelToNumMap[$Index]{$ColLabel1}) && exists($TextFilesColLabelToNumMap[$Index]{$ColLabel2})) {
 720 		$ColNum1 = $TextFilesColLabelToNumMap[$Index]{$ColLabel1};
 721 		$ColNum2 = $TextFilesColLabelToNumMap[$Index]{$ColLabel2};
 722 		push @ColPairsToAnalyze, ($ColNum1, $ColNum2);
 723 	      }
 724 	    }
 725 	  }
 726 	}
 727 	elsif ($AllColumnPairs) {
 728 	  for $ColNum1 (0 .. ($TextFilesColCount[$Index] - 1)) {
 729 	    for $ColNum2 (0 .. ($TextFilesColCount[$Index] - 1)) {
 730 	      push @ColPairsToAnalyze, ($ColNum1, $ColNum2);
 731 	    }
 732 	  }
 733 	}
 734 	else {
 735 	  if ($TextFilesColCount[$Index] >= 2) {
 736 	    push @ColPairsToAnalyze, (0,1);
 737 	  }
 738 	}
 739 	if (@ColPairsToAnalyze) {
 740 	  if (@ColPairsToAnalyze % 2) {
 741 	    warn "Warning: Ignoring file $TextFile: Invalid number  values specified using \"--columnpairs\" option: It must contain even number of valid values.\n";
 742 	    $TextFilesOkay[$Index] = 0;
 743 	    next FILELIST;
 744 	  }
 745 	  else {
 746 	    for ($ColIndex = 0; $ColIndex < @ColPairsToAnalyze; $ColIndex += 2) {
 747 	      push @{$TextFilesColPairs1ToAnalyze[$Index]}, $ColPairsToAnalyze[$ColIndex];
 748 	      push @{$TextFilesColPairs2ToAnalyze[$Index]}, $ColPairsToAnalyze[$ColIndex + 1];
 749 	    }
 750 	    # Set up unique columns map as well...
 751 	    for $ColNum (@ColPairsToAnalyze) {
 752 	      if (!exists $UniqueColNumsToAnalyzeMap{$ColNum}) {
 753 		$UniqueColNumsToAnalyzeMap{$ColNum} = $ColNum;
 754 	      }
 755 	    }
 756 	  }
 757 	}
 758       }
 759       # Setup unique columns array...
 760       push @{$TextFilesUniqueColNumsToAnalyze[$Index]}, (sort keys %UniqueColNumsToAnalyzeMap);
 761     }
 762   }
 763 }
 764 
 765 # Process option values...
 766 sub ProcessOptions {
 767   $DetailLevel = $Options{detail};
 768 
 769   # Setup supported statistical functions...
 770   my($SupportedFunction, @SupportedStatisticaFunctions, %SupportedStatisticaFunctionsMap);
 771   %SupportedStatisticaFunctionsMap = ();
 772   @SupportedStatisticaFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN);
 773 
 774   for $SupportedFunction (@SupportedStatisticaFunctions) {
 775     $SupportedStatisticaFunctionsMap{lc($SupportedFunction)} = $SupportedFunction;
 776   }
 777 
 778   # Setup a list of functions to use for analysis...
 779   my($SpecifiedFunction);
 780   %SpecifiedStatisticalFunctionsMap = ();
 781   @SpecifiedStatisticalFunctions = ();
 782   # Check mode values...
 783   if ($Options{mode} =~ /^DescriptiveStatisticsBasic$/i ) {
 784     $FileNameMode = "DescriptiveStatisticsBasic";
 785     @SpecifiedStatisticalFunctions = qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum);
 786   }
 787   elsif ($Options{mode} =~ /^DescriptiveStatisticsAll$/i ) {
 788     $FileNameMode = "DescriptiveStatisticsAll";
 789     @SpecifiedStatisticalFunctions = qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance  RSquare Frequency  KLargest KSmallest Sum);
 790   }
 791   elsif ($Options{mode} =~ /^All$/i ) {
 792     $FileNameMode = "AllStatistics";
 793     @SpecifiedStatisticalFunctions = @SupportedStatisticaFunctions;
 794   }
 795   else {
 796     $FileNameMode = "SpecifiedStatistics";
 797     # Comma delimited list of functions...
 798     my($Mode, @SpecifiedFunctions, @UnsupportedSpecifiedFunctions);
 799     $Mode = $Options{mode};
 800     $Mode =~ s/ //g;
 801     @SpecifiedFunctions = split ",", $Mode;
 802     @UnsupportedSpecifiedFunctions = ();
 803     for $SpecifiedFunction (@SpecifiedFunctions) {
 804       if (exists($SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)})) {
 805 	push @SpecifiedStatisticalFunctions, $SpecifiedFunction;
 806       }
 807       else {
 808 	push @UnsupportedSpecifiedFunctions, $SpecifiedFunction;
 809       }
 810     }
 811     if (@UnsupportedSpecifiedFunctions) {
 812       if (@UnsupportedSpecifiedFunctions > 1) {
 813 	warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedFunctions, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
 814       }
 815       else {
 816 	warn "Error: The value specified, @UnsupportedSpecifiedFunctions , for option \"-m --mode\" is not valid.\n";
 817       }
 818       die "Allowed values:", JoinWords(\@SupportedStatisticaFunctions, ", ", 0), "\n";
 819     }
 820   }
 821   FUNCTION: for $SpecifiedFunction (@SpecifiedStatisticalFunctions) {
 822     if (exists $SpecifiedStatisticalFunctionsMap{lc($SpecifiedFunction)} ) {
 823       next FUNCTION;
 824     }
 825     $SpecifiedStatisticalFunctionsMap{lc($SpecifiedFunction)} = $SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)};
 826   }
 827 
 828   # Setup delimiter and quotes...
 829   $OutDelim = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
 830   $OutQuote = ($Options{quote} =~ /yes/i ) ? 1 : 0;
 831 
 832   # Setup miscellaneous options...
 833   $CheckData = $Options{fast} ? 0 : 1;
 834   $Precision = $Options{precision};
 835 
 836   $KLargest = $Options{klargest};
 837   $KSmallest = $Options{ksmallest};
 838 
 839   $TrimFraction = $Options{trimfraction};
 840 
 841   # Setup frequency bin values...
 842   $NumOfBins = 10;
 843   @BinRange = ();
 844   if ($Options{frequencybins} =~ /\,/) {
 845     my($BinValue, @SpecifiedBinRange);
 846     @SpecifiedBinRange = split /\,/,  $Options{frequencybins};
 847     if (@SpecifiedBinRange < 2) {
 848       die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n";
 849     }
 850     for $BinValue (@SpecifiedBinRange) {
 851       if (!IsNumerical($BinValue)) {
 852 	die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n";
 853       }
 854     }
 855     my($Index1, $Index2);
 856     for $Index1 (0 .. $#SpecifiedBinRange) {
 857       for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) {
 858 	if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) {
 859 	  die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must cont