#!/usr/bin/perl -w
#
# $RCSfile: AnalyzeSDFilesData.pl,v $
# $Date: 2008/01/30 21:44:43 $
# $Revision: 1.14 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2008 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#
use 5.006;
use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use SDFileUtil;
use TextUtil;
use StatisticsUtil;

# File-scoped state: the subroutines defined further down read and write these
# lexicals directly, so they must stay declared at file scope.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct method call instead of the ambiguous indirect-object form "new Benchmark".
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sd sdf");

my($DetailLevel, $OutDelim, $OutQuote, $Precision, $CheckData, $KLargest, $KSmallest, $TrimFraction, $AllDataLabelPairs, $CommonDataLabelPairs, $NumOfBins, @BinRange, @SpecifiedDataLabels, @SpecifiedDataLabelPairs, $FileNameMode, @SpecifiedStatisticalFunctions, %SpecifiedStatisticalFunctionsMap);
print "Processing options...\n";
ProcessOptions();

# Collect information about SD files...
print "Checking input SD file(s)...\n";
my(@SDFilesOkay, @SDFilesCmpdCount, @SDFilesAllDataLabels, @SDFilesAllDataLabelsMap, @SDFilesCommonDataLabels, @SDFilesNewTextFileRoot, @SDFilesNewTextFileExt);
RetrieveSDFilesInfo();

# Make sure the specified data field labels exists in SD files...
my(@SDFilesDataLabelsToAnalyze, @SDFilesDataLabelPairs1ToAnalyze, @SDFilesDataLabelPairs2ToAnalyze, @SDFilesUniqueDataLabelsToAnalyze);
ProcessSDFilesDataLabelsInfo();

# Generate output files...
# Both variables are deliberately file-scoped: subroutines later in the file
# may refer to them by name.
my($Index, $SDFile);
if (@SDFilesList > 1) {
  print "Processing SD files...\n";
}
for $Index (0 .. $#SDFilesList) {
  if ($SDFilesOkay[$Index]) {
    $SDFile = $SDFilesList[$Index];
    if (@SDFilesList > 1) {
      print "\nProcessing file $SDFile...\n";
    } else {
      print "Processing file $SDFile...\n"
    }
    AnalyzeSDFile($Index);
  }
}

print "$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Analyze data...
# Collect the values of every data field that needs analyzing from the SD file
# at position $Index in @SDFilesList, then run each requested kind of analysis
# on them.
#
# Parameters:
#   $Index - position of the SD file in the file-scoped @SDFiles... arrays.
#
# Values that fail IsNumerical() are skipped (and reported according to
# $DetailLevel) when $CheckData is on; with $CheckData off every value present
# in a record is collected as-is.
sub AnalyzeSDFile {
  my($Index) = @_;
  my($SDFile, $DataLabel, $DataValue, @DataLabelsToAnalyze, %DataFieldValuesToAnalyzeMap);

  $SDFile = $SDFilesList[$Index];
  @DataLabelsToAnalyze = @{$SDFilesUniqueDataLabelsToAnalyze[$Index]};

  # Start every label with an empty list so later lookups never hit a missing key.
  %DataFieldValuesToAnalyzeMap = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    @{$DataFieldValuesToAnalyzeMap{$DataLabel}} = ();
  }

  # Collect appropriate data field label values for analysis...
  my($SDFileHandle, $CmpdString, @CmpdLines, %DataFieldValues, $CmpdCount, $InvalidCmpdCount, @InvalidCmpdDataLabels);

  # Three-argument open: the file name is never interpreted as a mode or a pipe.
  open($SDFileHandle, "<", $SDFile) or die "Error: Can't open $SDFile: $! \n";
  $CmpdCount = 0;
  $InvalidCmpdCount = 0;
  while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    @InvalidCmpdDataLabels = ();
    DATALABEL: for $DataLabel (@DataLabelsToAnalyze) {
      if (exists $DataFieldValues{$DataLabel}) {
        $DataValue = $DataFieldValues{$DataLabel};
        if ($CheckData) {
          if (!IsNumerical($DataValue)) {
            push @InvalidCmpdDataLabels, $DataLabel;
            next DATALABEL;
          }
        }
        push @{$DataFieldValuesToAnalyzeMap{$DataLabel}}, $DataValue;
      }
    }
    if (@InvalidCmpdDataLabels) {
      $InvalidCmpdCount++;
      # Higher detail levels print progressively more about the offending record.
      if ($DetailLevel >= 4) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed:\n$CmpdString \n";
      }
      elsif ($DetailLevel >= 3) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed...\n";
      }
      elsif ($DetailLevel >= 2) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field to be analyzed...\n";
      }
    }
  }
  if ($InvalidCmpdCount && ($DetailLevel >= 1)) {
    print "Non-numerical or empty data present in $InvalidCmpdCount compound record(s)...\n";
  }
  close $SDFileHandle;

  # Perform the analysis...
  my(@SpecifiedFunctionNames, $SpecifiedFunction);
  @SpecifiedFunctionNames = ();

  # Single-column functions go to PerformAnalysis; the functions excluded here
  # are written to their own files by the dedicated subs called below.
  for $SpecifiedFunction (@SpecifiedStatisticalFunctions) {
    if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
      push @SpecifiedFunctionNames, $SpecifiedStatisticalFunctionsMap{lc($SpecifiedFunction)};
    }
  }
  if (@SpecifiedFunctionNames) {
    PerformAnalysis($Index, \@SpecifiedFunctionNames, \%DataFieldValuesToAnalyzeMap);
  }
  if (exists($SpecifiedStatisticalFunctionsMap{covariance}) || exists($SpecifiedStatisticalFunctionsMap{correlation}) || exists($SpecifiedStatisticalFunctionsMap{rsquare})) {
    if ($AllDataLabelPairs || $CommonDataLabelPairs) {
      PerformMatrixAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
    }
    else {
      # Perform pairwise analysis for specified columns and write out calculated values - correlation
      # rsquare, or covariance - in the same file.
      PerformDataLabelPairAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
    }
  }
  if (exists($SpecifiedStatisticalFunctionsMap{standardscores}) || exists($SpecifiedStatisticalFunctionsMap{standardscoresn}) ) {
    PerformStandardScoresAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
  }
  if (exists($SpecifiedStatisticalFunctionsMap{frequency})) {
    PerformFrequencyAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
  }

}

# Calculate values for various statistical functions...
#
# Writes one text file with a header row followed by one row per data field
# label; each row holds the label and the value of every requested function.
#
# Parameters:
#   $Index                     - position of the SD file in the file-scoped arrays.
#   $SpecifiedFunctionNamesRef - reference to the list of function names to apply.
#   $DataValuesToAnalyzeMapRef - reference to a hash: data label => list of values.
#
# Dies when the output file cannot be opened or closed.
sub PerformAnalysis {
  my($Index, $SpecifiedFunctionNamesRef, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, $Line, $SpecifiedFunction, $Label, @ColLabels, @DataLabelsToAnalyze);

  $NewTextFile = $SDFilesNewTextFileRoot[$Index] . $FileNameMode . "." . $SDFilesNewTextFileExt[$Index];

  print "Generating new text file $NewTextFile...\n";
  open($NewTextFileHandle, ">", $NewTextFile) or die "Error: Can't open $NewTextFile: $! \n";

  # Write out column labels...
  @ColLabels = ();
  push @ColLabels, "DataLabel";
  for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
    $Label = $SpecifiedFunction;
    if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
      # Build the label from the number with its suffix plus the function name,
      # then drop the "K" characters from the result.
      my($KthValue);
      $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $KLargest : $KSmallest;
      $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
      $Label =~ s/K//g;
    }
    elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
      $Label = "${SpecifiedFunction}($TrimFraction)";
    }
    push @ColLabels, $Label;
  }
  $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
  print $NewTextFileHandle "$Line\n";

  # Go over each column to be analyzed...
  @DataLabelsToAnalyze = @{$SDFilesDataLabelsToAnalyze[$Index]};

  # The statistical functions are invoked through their name strings, which
  # needs symbolic references; only that part of "strict" is relaxed.
  no strict 'refs';

  my($DataValuesRef, $DataLabel, $Value, @RowValues, %CalculatedValues);
  %CalculatedValues = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    @RowValues = ();
    # Setup column id...
    push @RowValues, $DataLabel;
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
      $Value = "";
      if (!@{$DataValuesRef}) {
        # Invalid column values...
        push @RowValues, $Value;
        next FUNCTIONNAME;
      }
      if ($SpecifiedFunction =~ /^Count$/i) {
        $Value = scalar(@{$DataValuesRef});
      }
      elsif ($SpecifiedFunction =~ /^KLargest$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $KLargest);
      }
      elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $KSmallest);
      }
      elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
        # Cached so StandardError can reuse it without recomputation.
        if (exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
          $Value = $CalculatedValues{$DataLabel}{StandardDeviation};
        }
        else {
          $Value = &$SpecifiedFunction($DataValuesRef);
          $CalculatedValues{$DataLabel}{StandardDeviation} = $Value;
        }
      }
      elsif ($SpecifiedFunction =~ /^StandardError$/i) {
        if (!exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
          $CalculatedValues{$DataLabel}{StandardDeviation} = StandardDeviation($DataValuesRef);
        }
        # Stays empty when no standard deviation is available.
        $Value = "";
        if (defined $CalculatedValues{$DataLabel}{StandardDeviation}) {
          # The array is forced into scalar context so that the number of values,
          # not the flattened values themselves, is passed as the second argument.
          # NOTE(review): assumes StandardError takes (standard deviation, count);
          # confirm against StatisticsUtil.
          $Value = &$SpecifiedFunction($CalculatedValues{$DataLabel}{StandardDeviation}, scalar(@{$DataValuesRef}));
        }
      }
      elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $TrimFraction);
      }
      else {
        $Value = &$SpecifiedFunction($DataValuesRef);
      }
      # Format the output value. And add zero to get rid of trailing zeros...
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.${Precision}f", $Value) + 0) : "";
      push @RowValues, $Value;
    }
    $Line = JoinWords(\@RowValues, $OutDelim, $OutQuote);
    print $NewTextFileHandle "$Line\n";
  }
  # Buffered write errors only surface at close time.
  close($NewTextFileHandle) or die "Error: Can't close $NewTextFile: $! \n";
}

# Calculate covariance, correlation, rsquare for specified data field label pairs....
# Write one row per data field label pair holding the requested correlation,
# rsquare and covariance values.
#
# Parameters:
#   $Index                     - position of the SD file in the file-scoped arrays.
#   $DataValuesToAnalyzeMapRef - reference to a hash: data label => list of values.
#
# A pair whose value lists differ in size is reported with a warning and gets
# empty cells. Dies when the output file cannot be opened or closed.
sub PerformDataLabelPairAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, @ColLabels, $Line, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

  $CalculateCorrelation = exists($SpecifiedStatisticalFunctionsMap{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($SpecifiedStatisticalFunctionsMap{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($SpecifiedStatisticalFunctionsMap{covariance}) ? 1 : 0;

  $NewTextFile = $SDFilesNewTextFileRoot[$Index] . "DataFieldPairsAnalysis." . $SDFilesNewTextFileExt[$Index];
  print "Generating new text file $NewTextFile...\n";
  open($NewTextFileHandle, ">", $NewTextFile) or die "Error: Can't open $NewTextFile: $! \n";

  # Write out the column labels...
  # A correlation column is emitted whenever rsquare is requested as well.
  @ColLabels = ();
  push @ColLabels, ("DataLabel1", "DataLabel2");
  if ($CalculateCorrelation || $CalculateRSquare) {
    push @ColLabels, "Correlation";
    if ($CalculateRSquare) {
      push @ColLabels, "RSquare";
    }
  }
  if ($CalculateCovariance) {
    push @ColLabels, "Covariance";
  }
  $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
  print $NewTextFileHandle "$Line\n";

  # Go over each data field pair...
  my($CorrelationValue, $RSquareValue, $CovarianceValue, $LabelIndex, $DataLabel1, $DataLabel2, $DataValues1, $DataValues2, @DataLabelPairs1ToAnalyze, @DataLabelPairs2ToAnalyze, @RowValues, $Value);

  @DataLabelPairs1ToAnalyze = @{$SDFilesDataLabelPairs1ToAnalyze[$Index]};
  @DataLabelPairs2ToAnalyze = @{$SDFilesDataLabelPairs2ToAnalyze[$Index]};
  for $LabelIndex (0 .. $#DataLabelPairs1ToAnalyze) {
    @RowValues = ();
    $DataLabel1 = $DataLabelPairs1ToAnalyze[$LabelIndex];
    $DataLabel2 = $DataLabelPairs2ToAnalyze[$LabelIndex];
    $DataValues1 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
    $DataValues2 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};

    # Setup column ids...
    push @RowValues, $DataLabel1;
    push @RowValues, $DataLabel2;

    if (@$DataValues1 != @$DataValues2) {
      # Print a warning...
      warn "Warning: Skipping analysis for data field pair $DataLabel1, $DataLabel2: Number of valid data values must be same.\n";
      # Keep the row aligned with the header by emitting one empty cell per column.
      if ($CalculateCorrelation || $CalculateRSquare) {
        push @RowValues, "";
        if ($CalculateRSquare) {
          push @RowValues, "";
        }
      }
      if ($CalculateCovariance) {
        push @RowValues, "";
      }
    }
    else {
      # Calculate appropriate value...
      if ($CalculateCorrelation || $CalculateRSquare) {
        $CorrelationValue = Correlation($DataValues1, $DataValues2);
        $Value = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.${Precision}f", $CorrelationValue) + 0) : "";
        push @RowValues, $Value;
        if ($CalculateRSquare) {
          # Squared from the unrounded correlation value.
          $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
          $Value = (length($RSquareValue)) ? (sprintf("%.${Precision}f", $RSquareValue) + 0) : "";
          push @RowValues, $Value;
        }
      }
      if ($CalculateCovariance) {
        $CovarianceValue = Covariance($DataValues1, $DataValues2);
        $Value = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.${Precision}f", $CovarianceValue) + 0) : "";
        push @RowValues, $Value;
      }
    }
    $Line = JoinWords(\@RowValues, $OutDelim, $OutQuote);
    print $NewTextFileHandle "$Line\n";
  }
  close($NewTextFileHandle) or die "Error: Can't close $NewTextFile: $! \n";
}

# Generate histogram numbers...
#
# Writes one "<root><label>FrequencyAnalysis.<ext>" file per data field label,
# each holding a "Bins"/"Frequency" header followed by one row per bin.
#
# Parameters:
#   $Index                     - position of the SD file in the file-scoped arrays.
#   $DataValuesToAnalyzeMapRef - reference to a hash: data label => list of values.
#
# A label without any values yields a file containing only the header row.
sub PerformFrequencyAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, @ColLabels, @RowValues, $Line, $DataLabel, @DataLabelsToAnalyze, $DataValuesRef, $BinValue, $FrequencyValue, $Value, %FrequencyMap);

  @DataLabelsToAnalyze = @{$SDFilesDataLabelsToAnalyze[$Index]};
  for $DataLabel (@DataLabelsToAnalyze) {
    $NewTextFile = $SDFilesNewTextFileRoot[$Index] . $DataLabel . "FrequencyAnalysis." . $SDFilesNewTextFileExt[$Index];
    print "Generating new text file $NewTextFile...\n";
    # A fresh handle per file; the previous one was closed at the end of the last pass.
    $NewTextFileHandle = undef;
    open($NewTextFileHandle, ">", $NewTextFile) or die "Error: Can't open $NewTextFile: $! \n";

    # Write out the column labels...
    @ColLabels = ();
    push @ColLabels , ("Bins", "Frequency");
    $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
    print $NewTextFileHandle "$Line\n";

    # Calculate and write out frequency values...
    %FrequencyMap = ();
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    if (@$DataValuesRef) {
      # An explicit bin range takes precedence over a plain number of bins.
      if (@BinRange) {
        %FrequencyMap = Frequency($DataValuesRef, \@BinRange);
      }
      else {
        %FrequencyMap = Frequency($DataValuesRef, $NumOfBins);
      }
    }
    for $BinValue (sort { $a <=> $b } keys %FrequencyMap) {
      $FrequencyValue = $FrequencyMap{$BinValue};

      @RowValues = ();
      $Value = (length($BinValue)) ? (sprintf("%.${Precision}f", $BinValue) + 0) : "";
      push @RowValues, $Value;
      $Value = (length($FrequencyValue)) ? (sprintf("%.${Precision}f", $FrequencyValue) + 0) : "";
      push @RowValues, $Value;

      $Line = JoinWords(\@RowValues, $OutDelim, $OutQuote);
      print $NewTextFileHandle "$Line\n";
    }
    close($NewTextFileHandle) or die "Error: Can't close $NewTextFile: $! \n";
  }
}

# Calculate covariance, correlation/rsquare matrices....
#
# Writes up to three matrix files (correlation, rsquare, covariance). The rows
# and columns of every matrix are the data field labels selected by
# $AllDataLabelPairs (all labels) or otherwise the common labels.
#
# Parameters:
#   $Index                     - position of the SD file in the file-scoped arrays.
#   $DataValuesToAnalyzeMapRef - reference to a hash: data label => list of values.
#
# Each file starts with a header row of labels; every following row holds the
# row label and then one value per label. Dies when a file cannot be opened or
# closed.
sub PerformMatrixAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

  $CalculateCorrelation = exists($SpecifiedStatisticalFunctionsMap{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($SpecifiedStatisticalFunctionsMap{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($SpecifiedStatisticalFunctionsMap{covariance}) ? 1 : 0;

  $CorrelationTextFile = $SDFilesNewTextFileRoot[$Index] . "CorrelationMatrix." . $SDFilesNewTextFileExt[$Index];
  $RSquareTextFile = $SDFilesNewTextFileRoot[$Index] . "RSquareMatrix." . $SDFilesNewTextFileExt[$Index];
  $CovarianceTextFile = $SDFilesNewTextFileRoot[$Index] . "CovarianceMatrix." . $SDFilesNewTextFileExt[$Index];

  # Announce exactly the files that are about to be written. The correlation
  # file is produced whenever rsquare is requested as well.
  my($TextFilesList, @TextFiles);
  @TextFiles = ();
  if ($CalculateCorrelation || $CalculateRSquare) {
    push @TextFiles, $CorrelationTextFile;
    if ($CalculateRSquare) {
      push @TextFiles, $RSquareTextFile;
    }
  }
  if ($CalculateCovariance) {
    push @TextFiles, $CovarianceTextFile;
  }
  $TextFilesList = join(", ", @TextFiles);
  # Singular or plural wording is chosen from the file count rather than by
  # looking for a comma, which a file name could itself contain.
  if (@TextFiles > 1) {
    print "Generating new text files $TextFilesList ...\n"
  }
  else {
    print "Generating new text file $TextFilesList ...\n"
  }

  my($CorrelationFileHandle, $RSquareFileHandle, $CovarianceFileHandle);
  if ($CalculateCorrelation || $CalculateRSquare) {
    open($CorrelationFileHandle, ">", $CorrelationTextFile) or die "Error: Can't open $CorrelationTextFile: $! \n";
    if ($CalculateRSquare) {
      open($RSquareFileHandle, ">", $RSquareTextFile) or die "Error: Can't open $RSquareTextFile: $! \n";
    }
  }
  if ($CalculateCovariance) {
    open($CovarianceFileHandle, ">", $CovarianceTextFile) or die "Error: Can't open $CovarianceTextFile: $! \n";
  }

  my($Line, $CorrelationValue, $RawCorrelationValue, $HaveCorrelation, $RSquareValue, $CovarianceValue, $SameSize, $DataLabel1, $DataLabel2, $DataValuesRef1, $DataValuesRef2, @ColLabels, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues);
  my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap, $LabelIndex1, $LabelIndex2, @DataLabelsToAnalyze);

  # The same label list drives the header, the rows and the columns, so the
  # header always matches the values written beneath it.
  @DataLabelsToAnalyze = $AllDataLabelPairs ? @{$SDFilesAllDataLabels[$Index]} : @{$SDFilesCommonDataLabels[$Index]};

  # Write out the column labels...
  @ColLabels = @DataLabelsToAnalyze;
  $Line = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
  if ($CalculateCorrelation || $CalculateRSquare) {
    print $CorrelationFileHandle "$Line\n";
    if ($CalculateRSquare) {
      print $RSquareFileHandle "$Line\n";
    }
  }
  if ($CalculateCovariance) {
    print $CovarianceFileHandle "$Line\n";
  }

  # Due to symmetric nature of these matrices, only one half needs to be
  # calculated. So, just calculate the lower half and copy it to upper half...
  %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = ();

  for $LabelIndex1 (0 .. $#DataLabelsToAnalyze) {
    $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];
    for $LabelIndex2 (0 .. $LabelIndex1) {
      $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
      $DataValuesRef1 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
      $DataValuesRef2 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};

      # Value lists of different sizes cannot be paired up; such cells stay
      # empty, mirroring the size check in PerformDataLabelPairAnalysis.
      $SameSize = (@$DataValuesRef1 == @$DataValuesRef2) ? 1 : 0;

      if ($CalculateCorrelation || $CalculateRSquare) {
        $RawCorrelationValue = $SameSize ? Correlation($DataValuesRef1, $DataValuesRef2) : undef;
        $HaveCorrelation = (defined($RawCorrelationValue) && length($RawCorrelationValue)) ? 1 : 0;
        $CorrelationValue = $HaveCorrelation ? (sprintf("%.${Precision}f", $RawCorrelationValue) + 0) : "";
        $CorrelationMatrixMap{$DataLabel1}{$DataLabel2} = $CorrelationValue;
        if ($DataLabel1 ne $DataLabel2) {
          $CorrelationMatrixMap{$DataLabel2}{$DataLabel1} = $CorrelationValue;
        }
        if ($CalculateRSquare) {
          # Square the unrounded correlation so rounding is applied only once.
          $RSquareValue = $HaveCorrelation ? (sprintf("%.${Precision}f", $RawCorrelationValue ** 2) + 0) : "";
          $RSquareMatrixMap{$DataLabel1}{$DataLabel2} = $RSquareValue;
          if ($DataLabel1 ne $DataLabel2) {
            $RSquareMatrixMap{$DataLabel2}{$DataLabel1} = $RSquareValue;
          }
        }
      }
      if ($CalculateCovariance) {
        $CovarianceValue = $SameSize ? Covariance($DataValuesRef1, $DataValuesRef2) : undef;
        $CovarianceValue = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.${Precision}f", $CovarianceValue) + 0) : "";
        $CovarianceMatrixMap{$DataLabel1}{$DataLabel2} = $CovarianceValue;
        if ($DataLabel1 ne $DataLabel2) {
          $CovarianceMatrixMap{$DataLabel2}{$DataLabel1} = $CovarianceValue;
        }
      }
    }
  }

  # Write out the matrices...
  for $LabelIndex1 (0 .. $#DataLabelsToAnalyze) {
    $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];
    @CorrelationRowValues = ();
    @RSquareRowValues = ();
    @CovarianceRowValues = ();

    # Every row starts with its own label.
    if ($CalculateCorrelation || $CalculateRSquare) {
      push @CorrelationRowValues, $DataLabel1;
      if ($CalculateRSquare) {
        push @RSquareRowValues, $DataLabel1;
      }
    }
    if ($CalculateCovariance) {
      push @CovarianceRowValues, $DataLabel1;
    }
    for $LabelIndex2 (0 .. $#DataLabelsToAnalyze) {
      $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
      if ($CalculateCorrelation || $CalculateRSquare) {
        push @CorrelationRowValues, $CorrelationMatrixMap{$DataLabel1}{$DataLabel2};
        if ($CalculateRSquare) {
          push @RSquareRowValues, $RSquareMatrixMap{$DataLabel1}{$DataLabel2};
        }
      }
      if ($CalculateCovariance) {
        push @CovarianceRowValues, $CovarianceMatrixMap{$DataLabel1}{$DataLabel2};
      }
    }
    if ($CalculateCorrelation || $CalculateRSquare) {
      $Line = JoinWords(\@CorrelationRowValues, $OutDelim, $OutQuote);
      print $CorrelationFileHandle "$Line\n";
      if ($CalculateRSquare) {
        $Line = JoinWords(\@RSquareRowValues, $OutDelim, $OutQuote);
        print $RSquareFileHandle "$Line\n";
      }
    }
    if ($CalculateCovariance) {
      $Line = JoinWords(\@CovarianceRowValues, $OutDelim, $OutQuote);
      print $CovarianceFileHandle "$Line\n";
    }
  }
  if ($CalculateCorrelation || $CalculateRSquare) {
    close($CorrelationFileHandle) or die "Error: Can't close $CorrelationTextFile: $! \n";
    if ($CalculateRSquare) {
      close($RSquareFileHandle) or die "Error: Can't close $RSquareTextFile: $! \n";
    }
  }
  if ($CalculateCovariance) {
    close($CovarianceFileHandle) or die "Error: Can't close $CovarianceTextFile: $! \n";
  }
}

# Calculate standard scores...
# Write a text file with one row per compound record of the SD file and, for
# each analyzed data field label, its StandardScores and/or StandardScoresN
# value.
#
# Parameters:
#   $Index                     - position of the SD file in the file-scoped arrays.
#   $DataValuesToAnalyzeMapRef - reference to a hash: data label => list of values.
#
# A cell is left empty when the value fails the numerical check (only applied
# when $CheckData is on), or when no mean or no non-zero standard deviation is
# available for the label. Dies when a file cannot be opened or closed.
sub PerformStandardScoresAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($StandardScores, $StandardScoresN, $NewTextFile, $NewTextFileHandle, @ColLabels, $NewLine);

  $StandardScores = exists($SpecifiedStatisticalFunctionsMap{standardscores}) ? 1 : 0;
  $StandardScoresN = exists($SpecifiedStatisticalFunctionsMap{standardscoresn}) ? 1 : 0;

  $NewTextFile = $SDFilesNewTextFileRoot[$Index] . "StandardScores." . $SDFilesNewTextFileExt[$Index];
  print "Generating new text file $NewTextFile...\n";
  open($NewTextFileHandle, ">", $NewTextFile) or die "Error: Can't open $NewTextFile: $! \n";

  my($DataLabel, @DataLabelsToAnalyze);
  # Write out column labels...
  @ColLabels = ();
  @DataLabelsToAnalyze = @{$SDFilesDataLabelsToAnalyze[$Index]};
  for $DataLabel (@DataLabelsToAnalyze) {
    if ($StandardScores) {
      push @ColLabels, "${DataLabel}\(StandardScores)";
    }
    if ($StandardScoresN) {
      push @ColLabels, "${DataLabel}\(StandardScoresN)";
    }
  }
  $NewLine = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
  print $NewTextFileHandle "$NewLine\n";

  # Go over each column to be analyzed and calculate standard deviation
  # and mean values...
  my($DataValuesRef, %StandardDeviationMap, %StandardDeviationNMap, %MeanMap);
  %StandardDeviationMap = ();
  %StandardDeviationNMap = ();
  %MeanMap = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    # The exists() checks avoid recomputation when a label is listed more than once.
    if (!exists($MeanMap{$DataLabel})) {
      $MeanMap{$DataLabel} = Mean($DataValuesRef);
    }
    if ($StandardScores) {
      if (!exists($StandardDeviationMap{$DataLabel})) {
        $StandardDeviationMap{$DataLabel} = StandardDeviation($DataValuesRef);
      }
    }
    if ($StandardScoresN) {
      if (!exists($StandardDeviationNMap{$DataLabel})) {
        $StandardDeviationNMap{$DataLabel} = StandardDeviationN($DataValuesRef);
      }
    }
  }

  # Go over each compound record and calculate the standard score of each data
  # field value as (x[i] - mean) / deviation, where deviation is the value
  # returned by StandardDeviation() for StandardScores and by
  # StandardDeviationN() for StandardScoresN; write out the calculated values
  # as well...

  my($SDFile, $SDFileHandle, $Value, $ValueOkay, $Mean, $Deviation, $ScoreValue, @RowValues, $CmpdString, @CmpdLines, %DataFieldValues);
  $SDFile = $SDFilesList[$Index];

  open($SDFileHandle, "<", $SDFile) or die "Error: Can't open $SDFile: $! \n";
  while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    @RowValues = ();
    for $DataLabel (@DataLabelsToAnalyze) {
      $Value = "";
      if (exists $DataFieldValues{$DataLabel}) {
        $Value = $DataFieldValues{$DataLabel};
      }
      $ValueOkay = ($CheckData && !IsNumerical($Value)) ? 0 : 1;
      $Mean = $MeanMap{$DataLabel};
      if ($StandardScores) {
        $Deviation = $StandardDeviationMap{$DataLabel};
        $ScoreValue = "";
        # Guard against an undefined mean and an undefined, empty or zero
        # deviation, which would otherwise abort the run with a division by zero.
        if ($ValueOkay && defined($Mean) && defined($Deviation) && length($Deviation) && $Deviation != 0) {
          $ScoreValue = ($Value - $Mean)/$Deviation;
        }
        $ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.${Precision}f", $ScoreValue) + 0) : "";
        push @RowValues, $ScoreValue;
      }
      if ($StandardScoresN) {
        $Deviation = $StandardDeviationNMap{$DataLabel};
        $ScoreValue = "";
        if ($ValueOkay && defined($Mean) && defined($Deviation) && length($Deviation) && $Deviation != 0) {
          $ScoreValue = ($Value - $Mean)/$Deviation;
        }
        $ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.${Precision}f", $ScoreValue) + 0) : "";
        push @RowValues, $ScoreValue;
      }
    }
    $NewLine = JoinWords(\@RowValues, $OutDelim, $OutQuote);
    print $NewTextFileHandle "$NewLine\n";
  }
  close $SDFileHandle;
  close($NewTextFileHandle) or die "Error: Can't close $NewTextFile: $! \n";

}

# Make sure the specified data field labels exists in SD files...
#
# For every SD file marked okay this fills, at the file's index:
#   @SDFilesDataLabelsToAnalyze       - labels for single-column analysis,
#   @SDFilesDataLabelPairs1ToAnalyze  - first label of each pair,
#   @SDFilesDataLabelPairs2ToAnalyze  - second label of each pair,
#   @SDFilesUniqueDataLabelsToAnalyze - sorted union of all labels above.
#
# A file is marked not okay in @SDFilesOkay, with a warning, when none of the
# requested labels exist in it or when a frequency output file already exists
# and overwriting was not requested.
sub ProcessSDFilesDataLabelsInfo {
  # $SDFile is declared here so the sub no longer writes to the file-scoped
  # variable of the same name.
  my($Index, $DataFieldIndex, $SDFile, $DataLabel, @DataLabelsToAnalyze, %UniqueDataLabelsToAnalyzeMap);

  @SDFilesDataLabelsToAnalyze = ();
  @SDFilesDataLabelPairs1ToAnalyze = ();
  @SDFilesDataLabelPairs2ToAnalyze = ();
  @SDFilesUniqueDataLabelsToAnalyze = ();
  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    @{$SDFilesDataLabelsToAnalyze[$Index]} = ();
    @{$SDFilesDataLabelPairs1ToAnalyze[$Index]} = ();
    @{$SDFilesDataLabelPairs2ToAnalyze[$Index]} = ();
    @{$SDFilesUniqueDataLabelsToAnalyze[$Index]} = ();

    %UniqueDataLabelsToAnalyzeMap = ();

    if ($SDFilesOkay[$Index]) {
      # Explicitly listed labels win; otherwise "All" selects every label and
      # anything else falls back to the common labels.
      @DataLabelsToAnalyze = ();
      if (@SpecifiedDataLabels) {
        for $DataLabel (@SpecifiedDataLabels) {
          if (exists($SDFilesAllDataLabelsMap[$Index]{$DataLabel})) {
            push @DataLabelsToAnalyze, $DataLabel;
          }
        }
      }
      elsif (defined($Options{datafields}) && $Options{datafields} =~ /^All$/i) {
        push @DataLabelsToAnalyze, @{$SDFilesAllDataLabels[$Index]};
      }
      else {
        push @DataLabelsToAnalyze, @{$SDFilesCommonDataLabels[$Index]};
      }
      if (@DataLabelsToAnalyze) {
        push @{$SDFilesDataLabelsToAnalyze[$Index]}, @DataLabelsToAnalyze;
        # Set up unique data field label map as well...
        for $DataLabel (@DataLabelsToAnalyze) {
          if (!exists $UniqueDataLabelsToAnalyzeMap{$DataLabel}) {
            $UniqueDataLabelsToAnalyzeMap{$DataLabel} = $DataLabel;
          }
        }
      }
      else {
        warn "Warning: Ignoring file $SDFile: None of the data field labels specified, @SpecifiedDataLabels, using \"--datafields\" option exist.\n";
        $SDFilesOkay[$Index] = 0;
        next FILELIST;
      }
      if (!$Options{overwrite} && exists($SpecifiedStatisticalFunctionsMap{frequency})) {
        # Make sure specific frequency files don't exist...
        my($FrequencyFile);
        for $DataLabel (@DataLabelsToAnalyze) {
          $FrequencyFile = $SDFilesNewTextFileRoot[$Index] . $SDFilesAllDataLabelsMap[$Index]{$DataLabel} . "FrequencyAnalysis." . $SDFilesNewTextFileExt[$Index];
          if (-e $FrequencyFile) {
            warn "Warning: Ignoring file $SDFile: The file $FrequencyFile already exists.\n";
            $SDFilesOkay[$Index] = 0;
            next FILELIST;
          }
        }
      }
      # Setup specified data field label pairs...
      if (exists $SpecifiedStatisticalFunctionsMap{correlation} || exists $SpecifiedStatisticalFunctionsMap{covariance} || exists $SpecifiedStatisticalFunctionsMap{rsquare}) {
        # Flat list of labels; consecutive elements form a pair.
        my(@DataLabelPairsToAnalyze, $DataLabel1, $DataLabel2);
        if (@SpecifiedDataLabelPairs) {
          # Make sure both data field labels exist...
          for ($DataFieldIndex = 0; (($DataFieldIndex + 1) < @SpecifiedDataLabelPairs); $DataFieldIndex += 2 ) {
            $DataLabel1 = $SpecifiedDataLabelPairs[$DataFieldIndex];
            $DataLabel2 = $SpecifiedDataLabelPairs[$DataFieldIndex + 1];
            if (exists($SDFilesAllDataLabelsMap[$Index]{$DataLabel1}) && exists($SDFilesAllDataLabelsMap[$Index]{$DataLabel2})) {
              push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
            }
          }
        }
        elsif ($AllDataLabelPairs) {
          for $DataLabel1 (@{$SDFilesAllDataLabels[$Index]}) {
            for $DataLabel2 (@{$SDFilesAllDataLabels[$Index]}) {
              push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
            }
          }
        }
        else {
          for $DataLabel1 (@{$SDFilesCommonDataLabels[$Index]}) {
            for $DataLabel2 (@{$SDFilesCommonDataLabels[$Index]}) {
              push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
            }
          }
        }
        if (@DataLabelPairsToAnalyze) {
          # Defensive check: every branch above pushes labels two at a time.
          if (@DataLabelPairsToAnalyze % 2) {
            warn "Warning: Ignoring file $SDFile: Invalid number values specified using \"--datafieldpairs\" option: It must contain even number of valid values.\n";
            $SDFilesOkay[$Index] = 0;
            next FILELIST;
          }
          else {
            for ($DataFieldIndex = 0; $DataFieldIndex < @DataLabelPairsToAnalyze; $DataFieldIndex += 2) {
              push @{$SDFilesDataLabelPairs1ToAnalyze[$Index]}, $DataLabelPairsToAnalyze[$DataFieldIndex];
              push @{$SDFilesDataLabelPairs2ToAnalyze[$Index]}, $DataLabelPairsToAnalyze[$DataFieldIndex + 1];
            }
            # Set up unique data field label map as well...
            for $DataLabel (@DataLabelPairsToAnalyze) {
              if (!exists $UniqueDataLabelsToAnalyzeMap{$DataLabel}) {
                $UniqueDataLabelsToAnalyzeMap{$DataLabel} = $DataLabel;
              }
            }
          }
        }
      }
      # Setup unique data field label array...
      push @{$SDFilesUniqueDataLabelsToAnalyze[$Index]}, (sort keys %UniqueDataLabelsToAnalyzeMap);
    }
  }
}

# Process option values...
748 sub ProcessOptions { 749 $DetailLevel = $Options{detail}; 750 751 # Setup supported statistical functions... 752 my($SupportedFunction, @SupportedStatisticaFunctions, %SupportedStatisticaFunctionsMap); 753 %SupportedStatisticaFunctionsMap = (); 754 @SupportedStatisticaFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN); 755 756 for $SupportedFunction (@SupportedStatisticaFunctions) { 757 $SupportedStatisticaFunctionsMap{lc($SupportedFunction)} = $SupportedFunction; 758 } 759 760 # Setup a list of functions to use for analysis... 761 my($SpecifiedFunction); 762 %SpecifiedStatisticalFunctionsMap = (); 763 @SpecifiedStatisticalFunctions = (); 764 # Check mode values... 765 if ($Options{mode} =~ /^DescriptiveStatisticsBasic$/i ) { 766 $FileNameMode = "DescriptiveStatisticsBasic"; 767 @SpecifiedStatisticalFunctions = qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum); 768 } 769 elsif ($Options{mode} =~ /^DescriptiveStatisticsAll$/i ) { 770 $FileNameMode = "DescriptiveStatisticsAll"; 771 @SpecifiedStatisticalFunctions = qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance RSquare Frequency KLargest KSmallest Sum); 772 } 773 elsif ($Options{mode} =~ /^All$/i ) { 774 $FileNameMode = "AllStatistics"; 775 @SpecifiedStatisticalFunctions = @SupportedStatisticaFunctions; 776 } 777 else { 778 $FileNameMode = "SpecifiedStatistics"; 779 # Comma delimited list of functions... 
780 my($Mode, @SpecifiedFunctions, @UnsupportedSpecifiedFunctions); 781 $Mode = $Options{mode}; 782 $Mode =~ s/ //g; 783 @SpecifiedFunctions = split ",", $Mode; 784 @UnsupportedSpecifiedFunctions = (); 785 for $SpecifiedFunction (@SpecifiedFunctions) { 786 if (exists($SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)})) { 787 push @SpecifiedStatisticalFunctions, $SpecifiedFunction; 788 } 789 else { 790 push @UnsupportedSpecifiedFunctions, $SpecifiedFunction; 791 } 792 } 793 if (@UnsupportedSpecifiedFunctions) { 794 if (@UnsupportedSpecifiedFunctions > 1) { 795 warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedFunctions, ", ", 0)," - for option \"-m --mode\" are not valid.\n"; 796 } 797 else { 798 warn "Error: The value specified, @UnsupportedSpecifiedFunctions , for option \"-m --mode\" is not valid.\n"; 799 } 800 die "Allowed values:", JoinWords(\@SupportedStatisticaFunctions, ", ", 0), "\n"; 801 } 802 } 803 FUNCTION: for $SpecifiedFunction (@SpecifiedStatisticalFunctions) { 804 if (exists $SpecifiedStatisticalFunctionsMap{lc($SpecifiedFunction)} ) { 805 next FUNCTION; 806 } 807 $SpecifiedStatisticalFunctionsMap{lc($SpecifiedFunction)} = $SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)}; 808 } 809 810 # Setup delimiter and quotes... 811 $OutDelim = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,"); 812 $OutQuote = ($Options{quote} =~ /yes/i ) ? 1 : 0; 813 814 # Setup miscellaneous options... 815 $CheckData = $Options{fast} ? 0 : 1; 816 $Precision = $Options{precision}; 817 818 $KLargest = $Options{klargest}; 819 $KSmallest = $Options{ksmallest}; 820 821 $TrimFraction = $Options{trimfraction}; 822 823 # Setup frequency bin values... 
824 $NumOfBins = 10; 825 @BinRange = (); 826 if ($Options{frequencybins} =~ /\,/) { 827 my($BinValue, @SpecifiedBinRange); 828 @SpecifiedBinRange = split /\,/, $Options{frequencybins}; 829 if (@SpecifiedBinRange < 2) { 830 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n"; 831 } 832 for $BinValue (@SpecifiedBinRange) { 833 if (!IsNumerical($BinValue)) { 834 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n"; 835 } 836 } 837 my($Index1, $Index2); 838 for $Index1 (0 .. $#SpecifiedBinRange) { 839 for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) { 840 if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) { 841 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n"; 842 } 843 } 844 } 845 push @BinRange, @SpecifiedBinRange; 846 } 847 else { 848 $NumOfBins = $Options{frequencybins}; 849 if (!IsPositiveInteger($NumOfBins)) { 850 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n"; 851 } 852 } 853 854 # Setup specified data field labels... 855 @SpecifiedDataLabels = (); 856 if (defined $Options{datafields} && $Options{datafields} !~ /^(All|Common)$/i ) { 857 my(@SpecifiedValues) = split ",", $Options{datafields}; 858 push @SpecifiedDataLabels, @SpecifiedValues; 859 } 860 @SpecifiedDataLabelPairs = (); 861 $AllDataLabelPairs = (defined($Options{datafieldpairs}) && $Options{datafieldpairs} =~ /^AllPairs$/i) ?