MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: AnalyzeSDFilesData.pl,v $
   4 # $Date: 2008/01/30 21:44:43 $
   5 # $Revision: 1.14 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 use 5.006;
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use SDFileUtil;
  37 use TextUtil;
  38 use StatisticsUtil;
  39 
  40 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  41 
  42 # Autoflush STDOUT
  43 $| = 1;
  44 
  45 # Starting message...
  46 $ScriptName = basename($0);
  47 print "\n$ScriptName: Starting...\n\n";
  48 $StartTime = new Benchmark;
  49 
  50 # Get the options and setup script...
  51 SetupScriptUsage();
  52 if ($Options{help} || @ARGV < 1) {
  53   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  54 }
  55 
  56 my(@SDFilesList);
  57 @SDFilesList = ExpandFileNames(\@ARGV, "sd sdf");
  58 
  59 my($DetailLevel, $OutDelim, $OutQuote, $Precision, $CheckData, $KLargest, $KSmallest, $TrimFraction, $AllDataLabelPairs, $CommonDataLabelPairs, $NumOfBins, @BinRange, @SpecifiedDataLabels, @SpecifiedDataLabelPairs, $FileNameMode, @SpecifiedStatisticalFunctions, %SpecifiedStatisticalFunctionsMap);
  60 print "Processing options...\n";
  61 ProcessOptions();
  62 
  63 # Collect information about SD files...
  64 print "Checking input SD file(s)...\n";
  65 my(@SDFilesOkay, @SDFilesCmpdCount, @SDFilesAllDataLabels, @SDFilesAllDataLabelsMap, @SDFilesCommonDataLabels, @SDFilesNewTextFileRoot, @SDFilesNewTextFileExt);
  66 RetrieveSDFilesInfo();
  67 
  68 # Make sure the specified data field labels exists in SD files...
  69 my(@SDFilesDataLabelsToAnalyze, @SDFilesDataLabelPairs1ToAnalyze, @SDFilesDataLabelPairs2ToAnalyze, @SDFilesUniqueDataLabelsToAnalyze);
  70 ProcessSDFilesDataLabelsInfo();
  71 
  72 # Generate output files...
  73 my($Index, $SDFile);
  74 if (@SDFilesList > 1) {
  75   print "Processing SD files...\n";
  76 }
  77 for $Index (0 .. $#SDFilesList) {
  78   if ($SDFilesOkay[$Index]) {
  79     $SDFile = $SDFilesList[$Index];
  80     if (@SDFilesList > 1) {
  81       print "\nProcessing file $SDFile...\n";
  82     } else {
  83       print "Processing file $SDFile...\n"
  84     }
  85     AnalyzeSDFile($Index);
  86   }
  87 }
  88 
  89 print "$ScriptName:Done...\n\n";
  90 
  91 $EndTime = new Benchmark;
  92 $TotalTime = timediff ($EndTime, $StartTime);
  93 print "Total time: ", timestr($TotalTime), "\n";
  94 
  95 ###############################################################################
  96 
  97 # Analyze data...
  98 sub AnalyzeSDFile {
  99   my($Index) = @_;
 100   my($SDFile, $DataLabel, $DataValue, @DataLabelsToAnalyze, %DataFieldValuesToAnalyzeMap);
 101 
 102   $SDFile = $SDFilesList[$Index];
 103   @DataLabelsToAnalyze = @{$SDFilesUniqueDataLabelsToAnalyze[$Index]};
 104   %DataFieldValuesToAnalyzeMap = ();
 105   for $DataLabel (@DataLabelsToAnalyze) {
 106     @{$DataFieldValuesToAnalyzeMap{$DataLabel}} = ();
 107   }
 108 
 109   # Collect appropriate data field label values for analysis...
 110   my($CmpdString, @CmpdLines, %DataFieldValues, $CmpdCount, $InvalidCmpdCount, @InvalidCmpdDataLabels);
 111   open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n";
 112   $CmpdCount = 0;
 113   $InvalidCmpdCount = 0;
 114   while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 115     $CmpdCount++;
 116     @CmpdLines = split "\n", $CmpdString;
 117     %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 118     @InvalidCmpdDataLabels = ();
 119     DATALABEL: for $DataLabel (@DataLabelsToAnalyze) {
 120       if (exists $DataFieldValues{$DataLabel}) {
 121 	$DataValue = $DataFieldValues{$DataLabel};
 122 	if ($CheckData) {
 123 	  if (!IsNumerical($DataValue)) {
 124 	    push @InvalidCmpdDataLabels, $DataLabel;
 125 	    next DATALABEL;
 126 	  }
 127 	}
 128 	push @{$DataFieldValuesToAnalyzeMap{$DataLabel}}, $DataValue;
 129       }
 130     }
 131     if (@InvalidCmpdDataLabels) {
 132       $InvalidCmpdCount++;
 133       if ($DetailLevel >=4 ) {
 134 	print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed:\n$CmpdString \n";
 135       }
 136       elsif ($DetailLevel >= 3) {
 137 	print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed...\n";
 138       }
 139       elsif ($DetailLevel >= 2) {
 140 	print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field to be analyzed...\n";
 141       }
 142     }
 143   }
 144   if ($InvalidCmpdCount && ($DetailLevel >= 1)) {
 145     print "Non-numerical or empty data present in $InvalidCmpdCount compound record(s)...\n";
 146   }
 147   close SDFILE;
 148 
 149   # Perform the analysis...
 150   my(@SpecifiedFunctionNames, $SpecifiedFunction);
 151   @SpecifiedFunctionNames = ();
 152 
 153   for $SpecifiedFunction (@SpecifiedStatisticalFunctions) {
 154     if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
 155       push @SpecifiedFunctionNames, $SpecifiedStatisticalFunctionsMap{lc($SpecifiedFunction)};
 156     }
 157   }
 158   if (@SpecifiedFunctionNames) {
 159     PerformAnalysis($Index, \@SpecifiedFunctionNames, \%DataFieldValuesToAnalyzeMap)
 160   }
 161   if (exists($SpecifiedStatisticalFunctionsMap{covariance}) || exists($SpecifiedStatisticalFunctionsMap{correlation}) || exists($SpecifiedStatisticalFunctionsMap{rsquare})) {
 162     if ($AllDataLabelPairs || $CommonDataLabelPairs) {
 163       PerformMatrixAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
 164     }
 165     else {
 166       # Perform pairwise analysis for specified columns and write out calculated values - correlation
 167       # rsquare, or covariance - in the same file.
 168       PerformDataLabelPairAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
 169     }
 170   }
 171   if (exists($SpecifiedStatisticalFunctionsMap{standardscores}) || exists($SpecifiedStatisticalFunctionsMap{standardscoresn}) ) {
 172     PerformStandardScoresAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
 173   }
 174   if (exists($SpecifiedStatisticalFunctionsMap{frequency})) {
 175     PerformFrequencyAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
 176   }
 177 
 178 }
 179 
 180 # Calculate values for various statistical functions...
 181 sub PerformAnalysis {
 182   my($Index, $SpecifiedFunctionNamesRef, $DataValuesToAnalyzeMapRef) = @_;
 183   my($NewTextFile, $Line, $SpecifiedFunction, $Label, @ColLabels, @DataLabelsToAnalyze);
 184 
 185   $NewTextFile = $SDFilesNewTextFileRoot[$Index] . $FileNameMode . "." . $SDFilesNewTextFileExt[$Index];
 186 
 187   print "Generating new text file $NewTextFile...\n";
 188   open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 189 
 190   # Write out column labels...
 191   @ColLabels = ();
 192   push @ColLabels, "DataLabel";
 193   for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
 194     $Label = $SpecifiedFunction;
 195     if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
 196       my($KthValue);
 197       $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $KLargest : $KSmallest;
 198       $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
 199       $Label =~ s/K//g;
 200     }
 201     elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
 202       $Label = "${SpecifiedFunction}($TrimFraction)";
 203     }
 204     push @ColLabels, $Label;
 205   }
 206   $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
 207   print NEWTEXTFILE "$Line\n";
 208 
 209   # Go over each column to be analyzed...
 210   @DataLabelsToAnalyze = @{$SDFilesDataLabelsToAnalyze[$Index]};
 211 
 212   # Turn off "strict"; otherwise, invoking statistical functions using function name string
 213   # is problematic.
 214   no strict;
 215 
 216   my($DataValuesRef, $DataLabel, $Value, @RowValues, %CalculatedValues);
 217   %CalculatedValues = ();
 218   for $DataLabel (@DataLabelsToAnalyze) {
 219     @RowValues = ();
 220     # Setup column id...
 221     push @RowValues, $DataLabel;
 222     $DataValuesRef =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
 223     FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
 224       $Value = "";
 225       if (!@{$DataValuesToAnalyzeMapRef->{$DataLabel}}) {
 226 	# Invalid column values...
 227 	push @RowValues, $Value;
 228 	next FUNCTIONNAME;
 229       }
 230       if ($SpecifiedFunction =~ /^Count$/i) {
 231 	$Value = @{$DataValuesToAnalyzeMapRef->{$DataLabel}};
 232       }
 233       elsif ($SpecifiedFunction =~ /^KLargest$/i) {
 234 	$Value = &$SpecifiedFunction($DataValuesRef, $KLargest);
 235       }
 236       elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
 237 	$Value = &$SpecifiedFunction($DataValuesRef, $KSmallest);
 238       }
 239       elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
 240 	if (exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
 241 	  $Value = $CalculatedValues{$DataLabel}{StandardDeviation};
 242 	}
 243 	else {
 244 	  $Value = &$SpecifiedFunction($DataValuesRef);
 245 	  $CalculatedValues{$DataLabel}{StandardDeviation} = $Value;
 246 	}
 247       }
 248       elsif ($SpecifiedFunction =~ /^StandardError$/i) {
 249 	if (!exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
 250 	  $Value = StandardDeviation($DataValuesRef);
 251 	  $CalculatedValues{$DataLabel}{StandardDeviation} = $Value;
 252 	}
 253 	if (defined $CalculatedValues{$DataLabel}{StandardDeviation}) {
 254 	  $Value = &$SpecifiedFunction($CalculatedValues{$DataLabel}{StandardDeviation}, @{$DataValuesToAnalyzeMapRef->{$DataLabel}});
 255 	}
 256       }
 257       elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
 258 	$Value = &$SpecifiedFunction($DataValuesRef, $TrimFraction);
 259       }
 260       else {
 261 	$Value = &$SpecifiedFunction($DataValuesRef);
 262       }
 263       # Format the output value. And add zero to get rid of tariling zeros...
 264       $Value = (defined($Value) && length($Value)) ? (sprintf("%.${Precision}f", $Value) + 0) : "";
 265       push @RowValues, $Value;
 266     }
 267     $Line = JoinWords(\@RowValues, $OutDelim, $OutQuote);
 268     print NEWTEXTFILE "$Line\n";
 269   }
 270   close NEWTEXTFILE;
 271 }
 272 
 273 # Calculate covariance, correlation, rsquare for specified data field label pairs....
 274 sub PerformDataLabelPairAnalysis {
 275   my($Index, $DataValuesToAnalyzeMapRef) = @_;
 276   my($NewTextFile, @ColLabels, $Line, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);
 277 
 278   $CalculateCorrelation = exists($SpecifiedStatisticalFunctionsMap{correlation}) ? 1 : 0;
 279   $CalculateRSquare = exists($SpecifiedStatisticalFunctionsMap{rsquare}) ? 1 : 0;
 280   $CalculateCovariance = exists($SpecifiedStatisticalFunctionsMap{covariance}) ? 1 : 0;
 281 
 282   $NewTextFile = $SDFilesNewTextFileRoot[$Index] . "DataFieldPairsAnalysis." .  $SDFilesNewTextFileExt[$Index];
 283   print "Generating new text file $NewTextFile...\n";
 284   open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 285 
 286   # Write out the column labels...
 287   @ColLabels = ();
 288   push @ColLabels, ("DataLabel1", "DataLabel2");
 289   if ($CalculateCorrelation || $CalculateRSquare) {
 290     push @ColLabels, "Correlation";
 291     if ($CalculateRSquare) {
 292       push @ColLabels, "RSquare";
 293     }
 294   }
 295   if ($CalculateCovariance) {
 296     push @ColLabels, "Covariance";
 297   }
 298   $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
 299   print NEWTEXTFILE "$Line\n";
 300 
 301   # Go over each data field pair...
 302   my($CorrelationValue, $RSquareValue, $CovarianceValue,  $LabelIndex, $DataLabel1, $DataLabel2, $DataValues1, $DataValues2, @DataLabelPairs1ToAnalyze, @DataLabelPairs2ToAnalyze, @RowValues, $Value);
 303 
 304   @DataLabelPairs1ToAnalyze = @{$SDFilesDataLabelPairs1ToAnalyze[$Index]};
 305   @DataLabelPairs2ToAnalyze = @{$SDFilesDataLabelPairs2ToAnalyze[$Index]};
 306   for $LabelIndex (0 .. $#DataLabelPairs1ToAnalyze) {
 307     @RowValues = ();
 308     $DataLabel1 = $DataLabelPairs1ToAnalyze[$LabelIndex];
 309     $DataLabel2 = $DataLabelPairs2ToAnalyze[$LabelIndex];
 310     $DataValues1 =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
 311     $DataValues2 =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};
 312 
 313     # Setup column ids...
 314     push @RowValues, $DataLabel1;
 315     push @RowValues, $DataLabel2;
 316 
 317     if (@$DataValues1 != @$DataValues2) {
 318       # Print a warning...
 319       warn "Warning: Skipping analysis for data field pair $DataLabel1, $DataLabel2: Number of valid data values must be same.\n";
 320       if ($CalculateCorrelation || $CalculateRSquare) {
 321 	push @RowValues, "";
 322 	if ($CalculateRSquare) {
 323 	  push @RowValues, "";
 324 	}
 325       }
 326       if ($CalculateCovariance) {
 327 	push @RowValues, "";
 328       }
 329     }
 330     else {
 331       # Calculate appropriate value...
 332       if ($CalculateCorrelation || $CalculateRSquare) {
 333 	$CorrelationValue = Correlation($DataValues1, $DataValues2);
 334 	$Value = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.${Precision}f", $CorrelationValue) + 0) : "";
 335 	push @RowValues, $Value;
 336 	if ($CalculateRSquare) {
 337 	  $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
 338 	  $Value = (length($RSquareValue)) ? (sprintf("%.${Precision}f", $RSquareValue) + 0) : "";
 339 	  push @RowValues, $Value;
 340 	}
 341       }
 342       if ($CalculateCovariance) {
 343 	$CovarianceValue = Covariance($DataValues1, $DataValues2);
 344 	$Value = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.${Precision}f", $CovarianceValue) + 0) : "";
 345 	push @RowValues, $Value;
 346       }
 347     }
 348     $Line = JoinWords(\@RowValues, $OutDelim, $OutQuote);
 349     print NEWTEXTFILE "$Line\n";
 350   }
 351   close NEWTEXTFILE;
 352 }
 353 
 354 # Generate histogram numbers...
 355 sub PerformFrequencyAnalysis {
 356   my($Index, $DataValuesToAnalyzeMapRef) = @_;
 357   my($NewTextFile, $ColLabel, @ColLabels, @RowValues, $Line, $DataLabel, @DataLabelsToAnalyze, $DataValuesRef, $BinValue, $FrequencyValue, $Value, %FrequencyMap);
 358 
 359   @DataLabelsToAnalyze = @{$SDFilesDataLabelsToAnalyze[$Index]};
 360   for $DataLabel (@DataLabelsToAnalyze) {
 361     $NewTextFile = $SDFilesNewTextFileRoot[$Index] . $DataLabel . "FrequencyAnalysis." .  $SDFilesNewTextFileExt[$Index];
 362     print "Generating new text file $NewTextFile...\n";
 363     open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 364 
 365     # Write out the column labels...
 366     @ColLabels = ();
 367     push @ColLabels , ("Bins", "Frequency");
 368     $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
 369     print NEWTEXTFILE "$Line\n";
 370 
 371     #Calculate and write out frequency values...
 372     %FrequencyMap = ();
 373     $DataValuesRef =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
 374     if (@$DataValuesRef) {
 375       if (@BinRange) {
 376 	%FrequencyMap = Frequency($DataValuesRef, \@BinRange);
 377       }
 378       else {
 379 	%FrequencyMap = Frequency($DataValuesRef, $NumOfBins);
 380       }
 381     }
 382     for $BinValue (sort { $a <=> $b }  keys %FrequencyMap) {
 383       $FrequencyValue = $FrequencyMap{$BinValue};
 384 
 385       @RowValues = ();
 386       $Value = (length($BinValue)) ? (sprintf("%.${Precision}f", $BinValue) + 0) : "";
 387       push @RowValues, $Value;
 388       $Value = (length($FrequencyValue)) ? (sprintf("%.${Precision}f", $FrequencyValue) + 0) : "";
 389       push @RowValues, $Value;
 390 
 391       $Line = JoinWords(\@RowValues, $OutDelim, $OutQuote);
 392       print NEWTEXTFILE "$Line\n";
 393     }
 394     close NEWTEXTFILE;
 395   }
 396 }
 397 
 398 # Calculate covariance, correlation/rsquare matrices....
 399 sub PerformMatrixAnalysis {
 400   my($Index, $DataValuesToAnalyzeMapRef) = @_;
 401   my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);
 402 
 403   $CalculateCorrelation = exists($SpecifiedStatisticalFunctionsMap{correlation}) ? 1 : 0;
 404   $CalculateRSquare = exists($SpecifiedStatisticalFunctionsMap{rsquare}) ? 1 : 0;
 405   $CalculateCovariance = exists($SpecifiedStatisticalFunctionsMap{covariance}) ? 1 : 0;
 406 
 407   $CorrelationTextFile = $SDFilesNewTextFileRoot[$Index] . "CorrelationMatrix." .  $SDFilesNewTextFileExt[$Index];
 408   $RSquareTextFile = $SDFilesNewTextFileRoot[$Index] . "RSquareMatrix." .  $SDFilesNewTextFileExt[$Index];
 409   $CovarianceTextFile = $SDFilesNewTextFileRoot[$Index] . "CovarianceMatrix." .  $SDFilesNewTextFileExt[$Index];
 410 
 411   my($TextFilesList, $Delimiter);
 412   $TextFilesList =  "";
 413   if ($CalculateCorrelation || $CalculateRSquare) {
 414     $TextFilesList = $CorrelationTextFile;
 415     if ($CalculateRSquare) {
 416       $TextFilesList .= ", $CorrelationTextFile";
 417     }
 418   }
 419   $Delimiter = length($TextFilesList) ? "," : "";
 420   if ($CalculateCovariance) {
 421     $TextFilesList .= "${Delimiter} ${CorrelationTextFile}";
 422   }
 423   if ($TextFilesList =~ /\,/) {
 424     print "Generating new text files $TextFilesList ...\n"
 425   }
 426   else {
 427     print "Generating new text file $TextFilesList ...\n"
 428   }
 429   if ($CalculateCorrelation || $CalculateRSquare) {
 430     open CORRELATIONTEXTFILE, ">$CorrelationTextFile" or die "Error: Can't open $CorrelationTextFile: $! \n";
 431     if ($CalculateRSquare) {
 432       open RSQUARETEXTFILE, ">$RSquareTextFile" or die "Error: Can't open $RSquareTextFile: $! \n";
 433     }
 434   }
 435   if ($CalculateCovariance) {
 436     open COVARIANCETEXTFILE, ">$CovarianceTextFile" or die "Error: Can't open $CovarianceTextFile: $! \n";
 437   }
 438 
 439   my($Line, $Value, $CorrelationValue, $RSquareValue, $CovarianceValue, $DataLabel, $DataLabel1, $DataLabel2, $DataValuesRef1, $DataValuesRef2, @ColLabels, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues);
 440 
 441   # Write out the column labels...
 442   @ColLabels = ();
 443   push @ColLabels, @{$SDFilesAllDataLabels[$Index]};
 444   $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
 445   if ($CalculateCorrelation || $CalculateRSquare) {
 446     print CORRELATIONTEXTFILE "$Line\n";
 447     if ($CalculateRSquare) {
 448       print RSQUARETEXTFILE "$Line\n";
 449     }
 450   }
 451   if ($CalculateCovariance) {
 452     print COVARIANCETEXTFILE "$Line\n";
 453   }
 454 
 455   # Due to symmetric nature of these matrices, only one half needs to be
 456   # calculated. So, just calculate the lower half and copy it to upper half...
 457   my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap, $LabelIndex1, $LabelIndex2, @DataLabelsToAnalyze);
 458 
 459   %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = ();
 460   @DataLabelsToAnalyze = ();
 461   @DataLabelsToAnalyze = $AllDataLabelPairs ? @{$SDFilesAllDataLabels[$Index]} : @{$SDFilesCommonDataLabels[$Index]};
 462 
 463   for $LabelIndex1 (0 .. (@DataLabelsToAnalyze - 1)) {
 464     $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];
 465     for $LabelIndex2 (0 .. $LabelIndex1) {
 466       $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
 467       $DataValuesRef1 =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
 468       $DataValuesRef2 =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};
 469       if ($CalculateCorrelation || $CalculateRSquare) {
 470 	$CorrelationValue = Correlation($DataValuesRef1, $DataValuesRef2);
 471 	$CorrelationValue = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.${Precision}f", $CorrelationValue) + 0) : "";
 472 	$CorrelationMatrixMap{$DataLabel1}{$DataLabel2} = $CorrelationValue;
 473 	if ($DataLabel1 ne $DataLabel2) {
 474 	  $CorrelationMatrixMap{$DataLabel2}{$DataLabel1} = $CorrelationValue;
 475 	}
 476 	if ($CalculateRSquare) {
 477 	  $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
 478 	  $RSquareValue = (length($RSquareValue)) ? (sprintf("%.${Precision}f", $RSquareValue) + 0) : "";
 479 	  $RSquareMatrixMap{$DataLabel1}{$DataLabel2} = $RSquareValue;
 480 	  if ($DataLabel1 ne $DataLabel2) {
 481 	    $RSquareMatrixMap{$DataLabel2}{$DataLabel1} = $RSquareValue;
 482 	  }
 483 	}
 484       }
 485       if ($CalculateCovariance) {
 486 	$CovarianceValue = Covariance($DataValuesRef1, $DataValuesRef2);
 487 	$CovarianceValue = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.${Precision}f", $CovarianceValue) + 0) : "";
 488 	$CovarianceMatrixMap{$DataLabel1}{$DataLabel2} = $CovarianceValue;
 489 	if ($DataLabel1 ne $DataLabel2) {
 490 	  $CovarianceMatrixMap{$DataLabel2}{$DataLabel1} = $CovarianceValue;
 491 	}
 492       }
 493     }
 494   }
 495 
 496   # Write out the matrices...
 497   for $LabelIndex1 (0 .. (@DataLabelsToAnalyze - 1)) {
 498     $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];
 499     @CorrelationRowValues = ();
 500     @RSquareRowValues = ();
 501     @CovarianceRowValues = ();
 502     if ($CalculateCorrelation || $CalculateRSquare) {
 503       push @CorrelationRowValues, $DataLabel1;
 504       if ($CalculateRSquare) {
 505 	push @RSquareRowValues, $DataLabel1;
 506       }
 507     }
 508     if ($CalculateCovariance) {
 509       push @CovarianceRowValues, $DataLabel;
 510     }
 511     for $LabelIndex2 (0 .. (@DataLabelsToAnalyze - 1)) {
 512       $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
 513       if ($CalculateCorrelation || $CalculateRSquare) {
 514 	push @CorrelationRowValues, $CorrelationMatrixMap{$DataLabel1}{$DataLabel2};
 515 	if ($CalculateRSquare) {
 516 	  push @RSquareRowValues, $RSquareMatrixMap{$DataLabel1}{$DataLabel2};
 517 	}
 518       }
 519       if ($CalculateCovariance) {
 520 	push @CovarianceRowValues, $CovarianceMatrixMap{$DataLabel1}{$DataLabel2};
 521       }
 522     }
 523     if ($CalculateCorrelation || $CalculateRSquare) {
 524       $Line = JoinWords(\@CorrelationRowValues, $OutDelim, $OutQuote);
 525       print CORRELATIONTEXTFILE "$Line\n";
 526       if ($CalculateRSquare) {
 527 	$Line = JoinWords(\@RSquareRowValues, $OutDelim, $OutQuote);
 528 	print RSQUARETEXTFILE "$Line\n";
 529       }
 530     }
 531     if ($CalculateCovariance) {
 532       $Line = JoinWords(\@CovarianceRowValues, $OutDelim, $OutQuote);
 533       print COVARIANCETEXTFILE "$Line\n";
 534     }
 535   }
 536   if ($CalculateCorrelation || $CalculateRSquare) {
 537     close CORRELATIONTEXTFILE;
 538     if ($CalculateRSquare) {
 539       close RSQUARETEXTFILE;
 540     }
 541   }
 542   if ($CalculateCovariance) {
 543     close COVARIANCETEXTFILE;
 544   }
 545 }
 546 
 547 # Calculate standard scores...
 548 sub PerformStandardScoresAnalysis {
 549   my($Index, $DataValuesToAnalyzeMapRef) = @_;
 550   my($StandardScores, $StandardScoresN, $NewTextFile, @ColLabels, $Label, $NewLine);
 551 
 552   $StandardScores = exists($SpecifiedStatisticalFunctionsMap{standardscores}) ? 1 : 0;
 553   $StandardScoresN = exists($SpecifiedStatisticalFunctionsMap{standardscoresn}) ? 1 : 0;
 554 
 555   $NewTextFile = $SDFilesNewTextFileRoot[$Index] . "StandardScores." .  $SDFilesNewTextFileExt[$Index];
 556   print "Generating new text file $NewTextFile...\n";
 557   open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 558 
 559   my($DataLabel, @DataLabelsToAnalyze);
 560   # Write out column labels...
 561   @ColLabels = ();
 562   @DataLabelsToAnalyze = @{$SDFilesDataLabelsToAnalyze[$Index]};
 563   for $DataLabel (@DataLabelsToAnalyze) {
 564     if ($StandardScores) {
 565       push @ColLabels, "${DataLabel}\(StandardScores)";
 566     }
 567     if ($StandardScoresN) {
 568       push @ColLabels, "${DataLabel}\(StandardScoresN)";
 569     }
 570   }
 571   $NewLine = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
 572   print NEWTEXTFILE "$NewLine\n";
 573 
 574   # Go over each column to be analyzed and calculate standard deviation
 575   # and mean values...
 576   my($DataValuesRef, %StandardDeviationMap, %StandardDeviationNMap, %MeanMap);
 577   %StandardDeviationMap = ();
 578   %StandardDeviationNMap = ();
 579   %MeanMap = ();
 580   for $DataLabel (@DataLabelsToAnalyze) {
 581     $DataValuesRef =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
 582     if (!exists($MeanMap{$DataLabel})) {
 583       $MeanMap{$DataLabel} = Mean($DataValuesRef);
 584     }
 585     if ($StandardScores) {
 586       if (!exists($StandardDeviationMap{$DataLabel})) {
 587 	$StandardDeviationMap{$DataLabel} = StandardDeviation($DataValuesRef);
 588       }
 589     }
 590     if ($StandardScoresN) {
 591       if (!exists($StandardDeviationNMap{$DataLabel})) {
 592 	$StandardDeviationNMap{$DataLabel} = StandardDeviationN($DataValuesRef);
 593       }
 594     }
 595   }
 596   #
 597   # Go over each data field and calculate standard scores for each column
 598   # using (x[i] - mean) / (n - 1) for StandardScores and (x[i] - mean) / n
 599   # for StandardScoresN; write out the calculated values as well...
 600 
 601   my($SDFile, $Value, $ValueOkay, $ScoreValue, @RowValues, $CmpdString, @CmpdLines, %DataFieldValues);
 602   $SDFile = $SDFilesList[$Index];
 603 
 604   open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n";
 605   while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 606     @CmpdLines = split "\n", $CmpdString;
 607     %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 608     @RowValues = ();
 609     for $DataLabel (@DataLabelsToAnalyze) {
 610       $Value = "";
 611       if (exists $DataFieldValues{$DataLabel}) {
 612 	$Value = $DataFieldValues{$DataLabel};
 613       }
 614       $ValueOkay = ($CheckData && !IsNumerical($Value)) ? 0 : 1;
 615       if ($StandardScores) {
 616 	$ScoreValue = $ValueOkay ? (($Value - $MeanMap{$DataLabel})/$StandardDeviationMap{$DataLabel}) : "";
 617 	$ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.${Precision}f", $ScoreValue) + 0) : "";
 618 	push @RowValues, $ScoreValue;
 619       }
 620       if ($StandardScoresN) {
 621 	$ScoreValue = $ValueOkay ? (($Value - $MeanMap{$DataLabel})/$StandardDeviationNMap{$DataLabel}) : "";
 622 	$ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.${Precision}f", $ScoreValue) + 0) : "";
 623 	push @RowValues, $ScoreValue;
 624       }
 625     }
 626     $NewLine = JoinWords(\@RowValues, $OutDelim, $OutQuote);
 627     print NEWTEXTFILE "$NewLine\n";
 628   }
 629   close SDFILE;
 630   close NEWTEXTFILE;
 631 
 632 }
 633 
 634 # Make sure the specified data field labels exists in SD files...
 635 sub ProcessSDFilesDataLabelsInfo {
 636   my($Index, $DataFieldIndex, $SDFiles, $DataLabel, @DataLabelsToAnalyze, %UniqueDataLabelsToAnalyzeMap);
 637 
 638   @SDFilesDataLabelsToAnalyze = ();
 639   @SDFilesDataLabelPairs1ToAnalyze = ();
 640   @SDFilesDataLabelPairs2ToAnalyze = ();
 641   @SDFilesUniqueDataLabelsToAnalyze = ();
 642  FILELIST: for $Index (0 .. $#SDFilesList) {
 643     $SDFile = $SDFilesList[$Index];
 644 
 645     @{$SDFilesDataLabelsToAnalyze[$Index]} = ();
 646     @{$SDFilesDataLabelPairs1ToAnalyze[$Index]} = ();
 647     @{$SDFilesDataLabelPairs2ToAnalyze[$Index]} = ();
 648     @{$SDFilesUniqueDataLabelsToAnalyze[$Index]} = ();
 649 
 650     %UniqueDataLabelsToAnalyzeMap = ();
 651 
 652     if ($SDFilesOkay[$Index]) {
 653       @DataLabelsToAnalyze = ();
 654       if (@SpecifiedDataLabels) {
 655 	for $DataLabel (@SpecifiedDataLabels) {
 656 	  if (exists($SDFilesAllDataLabelsMap[$Index]{$DataLabel})) {
 657 	    push @DataLabelsToAnalyze, $DataLabel;
 658 	  }
 659 	}
 660       }
 661       elsif (defined($Options{datafields}) && $Options{datafields} =~ /^All$/i) {
 662 	push @DataLabelsToAnalyze, @{$SDFilesAllDataLabels[$Index]};
 663       }
 664       else {
 665 	push @DataLabelsToAnalyze, @{$SDFilesCommonDataLabels[$Index]};
 666       }
 667       if (@DataLabelsToAnalyze) {
 668 	push @{$SDFilesDataLabelsToAnalyze[$Index]}, @DataLabelsToAnalyze;
 669 	# Set up unique data field label map as well...
 670 	for $DataLabel (@DataLabelsToAnalyze) {
 671 	  if (!exists $UniqueDataLabelsToAnalyzeMap{$DataLabel}) {
 672 	    $UniqueDataLabelsToAnalyzeMap{$DataLabel} = $DataLabel;
 673 	  }
 674 	}
 675       }
 676       else {
 677 	warn "Warning: Ignoring file $SDFile: None of the data field labels specified, @SpecifiedDataLabels, using \"--datafields\" option exist.\n";
 678 	$SDFilesOkay[$Index] = 0;
 679 	next FILELIST;
 680       }
 681       if (!$Options{overwrite} && exists($SpecifiedStatisticalFunctionsMap{frequency})) {
 682 	# Make sure specific frequency files don't exist...
 683 	my($FrequencyFile);
 684 	for $DataLabel (@DataLabelsToAnalyze) {
 685 	  $FrequencyFile = $SDFilesNewTextFileRoot[$Index] . $SDFilesAllDataLabelsMap[$Index]{$DataLabel} . "FrequencyAnalysis." .  $SDFilesNewTextFileExt[$Index];
 686 	  if (-e $FrequencyFile) {
 687 	    warn "Warning: Ignoring file $SDFile: The file $FrequencyFile already exists.\n";
 688 	    $SDFilesOkay[$Index] = 0;
 689 	    next FILELIST;
 690 	  }
 691 	}
 692       }
 693       # Setup specified data field label pairs...
 694       if (exists $SpecifiedStatisticalFunctionsMap{correlation} || exists $SpecifiedStatisticalFunctionsMap{covariance} || exists $SpecifiedStatisticalFunctionsMap{rsquare}) {
 695 	my(@DataLabelPairsToAnalyze, $DataLabel1, $DataLabel2);
 696 	if (@SpecifiedDataLabelPairs) {
 697 	  # Make sure both data field labels exist...
 698 	  my($DataFieldIndex);
 699 	  for ($DataFieldIndex = 0; (($DataFieldIndex + 1) < @SpecifiedDataLabelPairs); $DataFieldIndex += 2 ) {
 700 	    $DataLabel1 = $SpecifiedDataLabelPairs[$DataFieldIndex];
 701 	    $DataLabel2 = $SpecifiedDataLabelPairs[$DataFieldIndex + 1];
 702 	    if (exists($SDFilesAllDataLabelsMap[$Index]{$DataLabel1}) && exists($SDFilesAllDataLabelsMap[$Index]{$DataLabel2})) {
 703 	      push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
 704 	    }
 705 	  }
 706 	}
 707 	elsif ($AllDataLabelPairs) {
 708 	  for $DataLabel1 (@{$SDFilesAllDataLabels[$Index]}) {
 709 	    for $DataLabel2 (@{$SDFilesAllDataLabels[$Index]}) {
 710 	      push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
 711 	    }
 712 	  }
 713 	}
 714 	else {
 715 	  for $DataLabel1 (@{$SDFilesCommonDataLabels[$Index]}) {
 716 	    for $DataLabel2 (@{$SDFilesCommonDataLabels[$Index]}) {
 717 	      push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
 718 	    }
 719 	  }
 720 	}
 721 	if (@DataLabelPairsToAnalyze) {
 722 	  if (@DataLabelPairsToAnalyze % 2) {
 723 	    warn "Warning: Ignoring file $SDFile: Invalid number  values specified using \"--datafieldpairs\" option: It must contain even number of valid values.\n";
 724 	    $SDFilesOkay[$Index] = 0;
 725 	    next FILELIST;
 726 	  }
 727 	  else {
 728 	    for ($DataFieldIndex = 0; $DataFieldIndex < @DataLabelPairsToAnalyze; $DataFieldIndex += 2) {
 729 	      push @{$SDFilesDataLabelPairs1ToAnalyze[$Index]}, $DataLabelPairsToAnalyze[$DataFieldIndex];
 730 	      push @{$SDFilesDataLabelPairs2ToAnalyze[$Index]}, $DataLabelPairsToAnalyze[$DataFieldIndex + 1];
 731 	    }
 732 	    # Set up unique data field label map as well...
 733 	    for $DataLabel (@DataLabelPairsToAnalyze) {
 734 	      if (!exists $UniqueDataLabelsToAnalyzeMap{$DataLabel}) {
 735 		$UniqueDataLabelsToAnalyzeMap{$DataLabel} = $DataLabel;
 736 	      }
 737 	    }
 738 	  }
 739 	}
 740       }
 741       # Setup unique data field label array...
 742       push @{$SDFilesUniqueDataLabelsToAnalyze[$Index]}, (sort keys %UniqueDataLabelsToAnalyzeMap);
 743     }
 744   }
 745 }
 746 
 747 # Process option values...
 748 sub ProcessOptions {
 749   $DetailLevel = $Options{detail};
 750 
 751   # Setup supported statistical functions...
 752   my($SupportedFunction, @SupportedStatisticaFunctions, %SupportedStatisticaFunctionsMap);
 753   %SupportedStatisticaFunctionsMap = ();
 754   @SupportedStatisticaFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN);
 755 
 756   for $SupportedFunction (@SupportedStatisticaFunctions) {
 757     $SupportedStatisticaFunctionsMap{lc($SupportedFunction)} = $SupportedFunction;
 758   }
 759 
 760   # Setup a list of functions to use for analysis...
 761   my($SpecifiedFunction);
 762   %SpecifiedStatisticalFunctionsMap = ();
 763   @SpecifiedStatisticalFunctions = ();
 764   # Check mode values...
 765   if ($Options{mode} =~ /^DescriptiveStatisticsBasic$/i ) {
 766     $FileNameMode = "DescriptiveStatisticsBasic";
 767     @SpecifiedStatisticalFunctions = qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum);
 768   }
 769   elsif ($Options{mode} =~ /^DescriptiveStatisticsAll$/i ) {
 770     $FileNameMode = "DescriptiveStatisticsAll";
 771     @SpecifiedStatisticalFunctions = qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance  RSquare Frequency  KLargest KSmallest Sum);
 772   }
 773   elsif ($Options{mode} =~ /^All$/i ) {
 774     $FileNameMode = "AllStatistics";
 775     @SpecifiedStatisticalFunctions = @SupportedStatisticaFunctions;
 776   }
 777   else {
 778     $FileNameMode = "SpecifiedStatistics";
 779     # Comma delimited list of functions...
 780     my($Mode, @SpecifiedFunctions, @UnsupportedSpecifiedFunctions);
 781     $Mode = $Options{mode};
 782     $Mode =~ s/ //g;
 783     @SpecifiedFunctions = split ",", $Mode;
 784     @UnsupportedSpecifiedFunctions = ();
 785     for $SpecifiedFunction (@SpecifiedFunctions) {
 786       if (exists($SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)})) {
 787 	push @SpecifiedStatisticalFunctions, $SpecifiedFunction;
 788       }
 789       else {
 790 	push @UnsupportedSpecifiedFunctions, $SpecifiedFunction;
 791       }
 792     }
 793     if (@UnsupportedSpecifiedFunctions) {
 794       if (@UnsupportedSpecifiedFunctions > 1) {
 795 	warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedFunctions, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
 796       }
 797       else {
 798 	warn "Error: The value specified, @UnsupportedSpecifiedFunctions , for option \"-m --mode\" is not valid.\n";
 799       }
 800       die "Allowed values:", JoinWords(\@SupportedStatisticaFunctions, ", ", 0), "\n";
 801     }
 802   }
 803   FUNCTION: for $SpecifiedFunction (@SpecifiedStatisticalFunctions) {
 804     if (exists $SpecifiedStatisticalFunctionsMap{lc($SpecifiedFunction)} ) {
 805       next FUNCTION;
 806     }
 807     $SpecifiedStatisticalFunctionsMap{lc($SpecifiedFunction)} = $SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)};
 808   }
 809 
 810   # Setup delimiter and quotes...
 811   $OutDelim = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
 812   $OutQuote = ($Options{quote} =~ /yes/i ) ? 1 : 0;
 813 
 814   # Setup miscellaneous options...
 815   $CheckData = $Options{fast} ? 0 : 1;
 816   $Precision = $Options{precision};
 817 
 818   $KLargest = $Options{klargest};
 819   $KSmallest = $Options{ksmallest};
 820 
 821   $TrimFraction = $Options{trimfraction};
 822 
 823   # Setup frequency bin values...
 824   $NumOfBins = 10;
 825   @BinRange = ();
 826   if ($Options{frequencybins} =~ /\,/) {
 827     my($BinValue, @SpecifiedBinRange);
 828     @SpecifiedBinRange = split /\,/,  $Options{frequencybins};
 829     if (@SpecifiedBinRange < 2) {
 830       die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n";
 831     }
 832     for $BinValue (@SpecifiedBinRange) {
 833       if (!IsNumerical($BinValue)) {
 834 	die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n";
 835       }
 836     }
 837     my($Index1, $Index2);
 838     for $Index1 (0 .. $#SpecifiedBinRange) {
 839       for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) {
 840 	if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) {
 841 	  die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n";
 842 	}
 843       }
 844     }
 845     push @BinRange, @SpecifiedBinRange;
 846   }
 847   else {
 848     $NumOfBins = $Options{frequencybins};
 849     if (!IsPositiveInteger($NumOfBins)) {
 850       die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n";
 851     }
 852   }
 853 
 854   # Setup specified data field labels...
 855   @SpecifiedDataLabels = ();
 856   if (defined $Options{datafields} && $Options{datafields} !~ /^(All|Common)$/i ) {
 857     my(@SpecifiedValues) = split ",", $Options{datafields};
 858     push @SpecifiedDataLabels, @SpecifiedValues;
 859   }
 860   @SpecifiedDataLabelPairs = ();
 861   $AllDataLabelPairs = (defined($Options{datafieldpairs}) && $Options{datafieldpairs} =~ /^AllPairs$/i) ?