#!/usr/bin/perl -w
#
# $RCSfile: SimilarityMatrixTextFiles.pl,v $
# $Date: 2008/04/19 16:12:21 $
# $Revision: 1.13 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2008 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use 5.006;
use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use Fingerprints::FingerprintsBitVector;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Process options...
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
my(%TextFilesInfo);
print "Checking input text file(s)...\n";
RetrieveTextFilesInfo();

ProcessColumnsInfo();

# Process input files..
my($FileIndex, $TextFile, $FileProcessingMsg);
$FileProcessingMsg = "Processing file";
if (@TextFilesList > 1) {
  print "Processing text files...\n";
  $FileProcessingMsg = "\n$FileProcessingMsg";
}

# Only files that passed both RetrieveTextFilesInfo and ProcessColumnsInfo
# are still flagged FileOkay at this point.
for $FileIndex (0 .. $#TextFilesList) {
  if ($TextFilesInfo{FileOkay}[$FileIndex]) {
    $TextFile = $TextFilesList[$FileIndex];
    print "$FileProcessingMsg $TextFile...\n";
    GenerateSimilarityMatrices($FileIndex);
  }
}
print "$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate similarity matrices using fingerprints data in text file...
#
# Parameters:
#   $FileIndex - index into @TextFilesList / %TextFilesInfo of the file to
#                process.
#
# Reads the fingerprints once and then writes one output file per specified
# similarity coefficient, named <OutFileRoot><CoefficientName>.<OutFileExt>.
#
sub GenerateSimilarityMatrices {
  my($FileIndex) = @_;

  # Process fingerprints data in text file...
  print "Processing fingerprints data...\n";
  my($CompoundIDsRef, $FingerprintsBitVectorsRef) = ProcessFingerprintsData($FileIndex);

  # Generate similarity matrices...
  my $CoefficientsNameMapRef = $OptionsInfo{SpecifiedSimilarityCoefficientsNameMapRef};
  for my $SpecifiedSimilarityCoefficient (@{$OptionsInfo{SpecifiedSimilarityCoefficientsRef}}) {
    # Map whatever case the user typed to the canonical coefficient name...
    my $SimilarityCoefficient = $CoefficientsNameMapRef->{lc($SpecifiedSimilarityCoefficient)};
    my $NewTextFile = $TextFilesInfo{OutFileRoot}[$FileIndex] . "${SimilarityCoefficient}." . $TextFilesInfo{OutFileExt}[$FileIndex];

    print "Generating $NewTextFile...\n";

    my $SimilarityMatrixRef = CalculateSimilarityMatrix($SimilarityCoefficient, $FingerprintsBitVectorsRef);
    WriteSimilarityMatrix($NewTextFile, $CompoundIDsRef, $SimilarityMatrixRef);
  }
}

# Calculate a specific similarity matrix...
#
# Parameters:
#   $SimilarityCoefficient     - canonical coefficient name; used to look up
#                                the method name in
#                                SpecifiedSimilarityCoefficientsMethodMapRef.
#   $FingerprintsBitVectorsRef - reference to a list of fingerprints bit
#                                vector objects.
#
# Returns a reference to a list of rows (list references); entry [i][j] is the
# value returned by calling the coefficient method on vector i with vector j,
# formatted to $OptionsInfo{Precision} decimal places, or '' when the method
# returns an undefined or empty value.
#
sub CalculateSimilarityMatrix {
  my($SimilarityCoefficient, $FingerprintsBitVectorsRef) = @_;

  my $MethodName = $OptionsInfo{SpecifiedSimilarityCoefficientsMethodMapRef}->{lc($SimilarityCoefficient)};
  my $Precision = $OptionsInfo{Precision};

  # Work out the coefficient specific extra arguments once instead of
  # re-matching the coefficient name for every pair of fingerprints...
  my @ExtraArguments = ();
  if ($SimilarityCoefficient =~ /^Tversky$/i) {
    @ExtraArguments = ($OptionsInfo{Alpha});
  }
  elsif ($SimilarityCoefficient =~ /^WeightedTversky$/i) {
    @ExtraArguments = ($OptionsInfo{Alpha}, $OptionsInfo{Beta});
  }
  elsif ($SimilarityCoefficient =~ /^WeightedTanimoto$/i) {
    @ExtraArguments = ($OptionsInfo{Beta});
  }

  # Calculate pairwise similarity coefficients. The full square matrix is
  # calculated as (A, B) and (B, A) are passed to the method separately...
  my @SimilarityMatrix = ();
  for my $FingerprintsBitVectorA (@{$FingerprintsBitVectorsRef}) {
    my @MatrixRow = ();
    for my $FingerprintsBitVectorB (@{$FingerprintsBitVectorsRef}) {
      my $Value = $FingerprintsBitVectorA->$MethodName($FingerprintsBitVectorB, @ExtraArguments);
      # Adding 0 turns the formatted string back into a number and thereby
      # drops trailing zeros...
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.${Precision}f", $Value) + 0) : '';
      push @MatrixRow, $Value;
    }
    push @SimilarityMatrix, \@MatrixRow;
  }
  return \@SimilarityMatrix;
}

# Write out similarity matrix...
#
# Parameters:
#   $NewTextFile         - name of the output file; it's created or truncated.
#   $CompundIDsRef       - reference to a list of compound IDs used as both
#                          column and row labels.
#   $SimilarityMatrixRef - reference to a list of rows as returned by
#                          CalculateSimilarityMatrix.
#
# Dies when the output file can't be opened or closed.
#
sub WriteSimilarityMatrix {
  my($NewTextFile, $CompundIDsRef, $SimilarityMatrixRef) = @_;

  my $OutDelim = $OptionsInfo{OutDelim};
  my $OutQuote = $OptionsInfo{OutQuote};

  # Three argument open keeps any special characters in the file name from
  # being interpreted as an open mode...
  open my $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out column labels; the first cell above row labels is left empty...
  my @LineWords = ('', @{$CompundIDsRef});
  my $Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
  print $NewTextFileHandle "$Line\n";

  # Write out similarity coefficients...
  for my $Index (0 .. $#{$CompundIDsRef}) {
    @LineWords = ($CompundIDsRef->[$Index], @{$SimilarityMatrixRef->[$Index]});
    $Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
    print $NewTextFileHandle "$Line\n";
  }

  # Buffered write errors only surface during close...
  close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
}

# Process fingerprints data in text file and return references to list containing
# compound IDs and corresponding FingerprintsBitVectors...
#
# Parameters:
#   $FileIndex - index into @TextFilesList / %TextFilesInfo of the file to read.
#
# Returns (\@CompoundIDs, \@FingerprintsBitVectors); both lists have the same
# length and order. Lines with missing or invalid fingerprints data are
# skipped and reported according to $OptionsInfo{Detail}. Dies when the file
# can't be opened.
#
sub ProcessFingerprintsData {
  my($FileIndex) = @_;
  my(@CompoundIDs, @FingerprintsBitVectors);

  @CompoundIDs = ();
  @FingerprintsBitVectors = ();

  my $TextFile = $TextFilesList[$FileIndex];
  open my $TextFileHandle, "<", $TextFile or die "Error: Can't open $TextFile: $! \n";

  my $LineCount = 0;
  my $InvalidDataLineCount = 0;
  my $MissingDataLineCount = 0;

  my $InDelim = $TextFilesInfo{InDelim}[$FileIndex];
  my $DetailLevel = $OptionsInfo{Detail};

  my $UseSequentialID = $TextFilesInfo{UseSequentialID}[$FileIndex];
  my $CmpdIDColNum = $TextFilesInfo{CompoundIDColNum}[$FileIndex];
  my $FingerprintsColNum = $TextFilesInfo{FingerprintsColNum}[$FileIndex];
  my $UseInternalFormat = ($OptionsInfo{FingerprintsFormatMode} =~ /^Internal$/i) ? 1 : 0;

  my $CheckData = $OptionsInfo{Fast} ? 0 : 1;

  # Type, string type and size from the first valid line; all the other
  # lines must agree with these values...
  my $FirstFingerprintsDataLine = 1;
  my($FirstFingerprintsType, $FirstFingerprintsStringType, $FirstFingerprintsSize);

  # Skip column label line...
  my $Line = GetTextLine($TextFileHandle);

  # The loop ends on the first false value returned by GetTextLine...
  LINE: while ($Line = GetTextLine($TextFileHandle)) {
    $LineCount++;
    my @LineWords = quotewords($InDelim, 0, $Line);

    if ($CheckData && $FingerprintsColNum > $#LineWords) {
      # Missing data...
      $MissingDataLineCount++;
      if ($DetailLevel >= 3) {
        print "Line number $LineCount contains no fingerprints data: $Line \n";
      }
      elsif ($DetailLevel >= 2) {
        print "Line number $LineCount contains no fingerprints data...\n";
      }
      next LINE;
    }

    # Setup fingerprints bit vector...
    my($FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString);
    my $InvalidFingerprintsData = 0;

    if ($UseInternalFormat) {
      # Internal format is <Type>:<StringType>:<Size>:<String>; all four
      # values stay undefined when the pattern doesn't match...
      ($FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString) = $LineWords[$FingerprintsColNum] =~ /^(.*?):(.*?):(.*?):(.*?)$/;
      if ($CheckData) {
        if (IsEmpty($FingerprintsType) || IsEmpty($FingerprintsStringType) || IsEmpty($FingerprintsSize) || IsEmpty($FingerprintsString)) {
          $InvalidFingerprintsData = 1;
        }
        elsif ($FirstFingerprintsDataLine) {
          $FirstFingerprintsDataLine = 0;
          ($FirstFingerprintsType, $FirstFingerprintsStringType, $FirstFingerprintsSize) = ($FingerprintsType, $FingerprintsStringType, $FingerprintsSize);
        }
        elsif (lc($FirstFingerprintsType) ne lc($FingerprintsType) || lc($FirstFingerprintsStringType) ne lc($FingerprintsStringType) || lc($FirstFingerprintsSize) ne lc($FingerprintsSize)) {
          # Compared as plain case insensitive strings: the values come from
          # the data file and must never be used as regular expressions...
          $InvalidFingerprintsData = 1;
        }
      }
    }
    else {
      $FingerprintsString = $LineWords[$FingerprintsColNum];
      $FingerprintsStringType = $OptionsInfo{FingerprintsString};
      if ($CheckData && IsEmpty($FingerprintsString)) {
        $InvalidFingerprintsData = 1;
      }
    }

    # A line without a usable bit vector object is invalid even during fast
    # mode; otherwise the similarity calculation would die later on while
    # trying to call a method on an empty string...
    my $FingerprintsBitVector = '';
    if (!$InvalidFingerprintsData) {
      $FingerprintsBitVector = _NewFingerprintsBitVector($FingerprintsStringType, $FingerprintsString);
      if (!ref($FingerprintsBitVector)) {
        $InvalidFingerprintsData = 1;
      }
    }

    if ($InvalidFingerprintsData) {
      # InvalidData data...
      $InvalidDataLineCount++;
      if ($DetailLevel >= 3) {
        print "Line number $LineCount contains invalid fingerprints data: $Line \n";
      }
      elsif ($DetailLevel >= 2) {
        print "Line number $LineCount contains invalid fingerprints data...\n";
      }
      next LINE;
    }

    # Fall back to a sequential ID when there is no usable compound ID column...
    my $CompoundID = (!$UseSequentialID && $CmpdIDColNum <= $#LineWords) ? $LineWords[$CmpdIDColNum] : "Cmpd${LineCount}";

    push @CompoundIDs, $CompoundID;
    push @FingerprintsBitVectors, $FingerprintsBitVector;
  }
  close $TextFileHandle;

  if ($DetailLevel >= 1) {
    if ($MissingDataLineCount) {
      print "Missing fingerprints data in $MissingDataLineCount line(s)...\n";
    }
    if ($InvalidDataLineCount) {
      print "Invalid fingerprints data in $InvalidDataLineCount line(s)...\n";
    }
  }

  return (\@CompoundIDs, \@FingerprintsBitVectors);
}

# Create a fingerprints bit vector from a fingerprints string using the
# constructor function matching the string type...
#
# Returns whatever the constructor function returns, or '' when either value
# is undefined or the string type is not one of Hexadecimal/Hex, Binary/Bin
# or RawBinary/RawBin.
#
sub _NewFingerprintsBitVector {
  my($FingerprintsStringType, $FingerprintsString) = @_;

  if (!(defined($FingerprintsStringType) && defined($FingerprintsString))) {
    return '';
  }
  if ($FingerprintsStringType =~ /^(Hexadecimal|Hex)$/i) {
    return FingerprintsBitVector::NewFromHexadecimalString($FingerprintsString);
  }
  if ($FingerprintsStringType =~ /^(Binary|Bin)$/i) {
    return FingerprintsBitVector::NewFromBinaryString($FingerprintsString);
  }
  if ($FingerprintsStringType =~ /^(RawBinary|RawBin)$/i) {
    return FingerprintsBitVector::NewFromRawBinaryString($FingerprintsString);
  }
  return '';
}

# Make sure the specified columns exists in text files...
#
# For each file still flagged FileOkay, resolve the compound ID and
# fingerprints columns to zero based column numbers and store them in
# %TextFilesInfo (CompoundIDColNum, UseSequentialID, FingerprintsColNum).
# A file whose columns can't be resolved gets a warning and its FileOkay
# flag is cleared.
#
sub ProcessColumnsInfo {
  @{$TextFilesInfo{CompoundIDColNum}} = ();
  @{$TextFilesInfo{UseSequentialID}} = ();
  @{$TextFilesInfo{FingerprintsColNum}} = ();

  FILELIST: for my $Index (0 .. $#TextFilesList) {
    my $TextFile = $TextFilesList[$Index];

    $TextFilesInfo{CompoundIDColNum}[$Index] = '';
    $TextFilesInfo{UseSequentialID}[$Index] = 0;
    $TextFilesInfo{FingerprintsColNum}[$Index] = '';

    if (!$TextFilesInfo{FileOkay}[$Index]) {
      next FILELIST;
    }

    # File not okay until proven okay...
    $TextFilesInfo{FileOkay}[$Index] = 0;

    # CompoundIDCol...
    my $UseSequentialCmpdIDs = 0;
    my $CmpdIDColNum = '';
    my $CmpdIDCol = $OptionsInfo{CompoundIDCol};

    if ($CmpdIDCol =~ /^UseDefault$/i) {
      # First column containing the word CompoundID in its label or sequential generation...
      my $ColNum = _FindColumnNumByLabelPattern($Index, qr/CompoundID/i);
      if (defined $ColNum) {
        $CmpdIDColNum = $ColNum;
      }
      else {
        $UseSequentialCmpdIDs = 1;
      }
    }
    else {
      my $ColNum = _ResolveSpecifiedColumnNum($Index, $CmpdIDCol, "--CompoundIDCol");
      if (!defined $ColNum) {
        next FILELIST;
      }
      $CmpdIDColNum = $ColNum;
    }

    # FingerprintsCol...
    my $FingerprintsColNum = '';
    my $FingerprintsCol = $OptionsInfo{FingerprintsCol};

    if ($FingerprintsCol =~ /^UseDefault$/i) {
      # First column containing the word Fingerprints in its label...
      my $ColNum = _FindColumnNumByLabelPattern($Index, qr/Fingerprints/i);
      if (!defined $ColNum) {
        warn "Warning: Ignoring file $TextFile: Column label containing \"Fingerprints\" string in its name doesn't exist.\n";
        next FILELIST;
      }
      $FingerprintsColNum = $ColNum;
    }
    else {
      my $ColNum = _ResolveSpecifiedColumnNum($Index, $FingerprintsCol, "--FingerprintsCol");
      if (!defined $ColNum) {
        next FILELIST;
      }
      $FingerprintsColNum = $ColNum;
    }

    $TextFilesInfo{FileOkay}[$Index] = 1;

    $TextFilesInfo{CompoundIDColNum}[$Index] = $CmpdIDColNum;
    $TextFilesInfo{UseSequentialID}[$Index] = $UseSequentialCmpdIDs;
    $TextFilesInfo{FingerprintsColNum}[$Index] = $FingerprintsColNum;
  }
}

# Return the column number recorded in ColLabelToNumMap for the first column
# label of file number $Index matching $LabelPattern, or undef when none of
# the labels match...
#
sub _FindColumnNumByLabelPattern {
  my($Index, $LabelPattern) = @_;

  for my $ColLabel (@{$TextFilesInfo{ColLabels}[$Index]}) {
    if ($ColLabel =~ $LabelPattern) {
      return $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
    }
  }
  return;
}

# Turn a column explicitly specified by number or label, depending on
# $OptionsInfo{ColMode}, into a zero based column number for file number
# $Index...
#
# Returns undef, after issuing a warning which mentions $OptionName, when the
# column doesn't exist in the file. Returns '' for any other ColMode value.
#
sub _ResolveSpecifiedColumnNum {
  my($Index, $SpecifiedCol, $OptionName) = @_;
  my $TextFile = $TextFilesList[$Index];

  if ($OptionsInfo{ColMode} =~ /^ColNum$/i) {
    # Is it a valid column number? Specified numbers start from 1...
    if ($SpecifiedCol > $TextFilesInfo{ColCount}[$Index]) {
      warn "Warning: Ignoring file $TextFile: Column number specified, $SpecifiedCol, using \"$OptionName\" option doesn't exist.\n";
      return;
    }
    return $SpecifiedCol - 1;
  }
  if ($OptionsInfo{ColMode} =~ /^ColLabel$/i) {
    # Does this column exist?
    if (!exists $TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCol}) {
      warn "Warning: Ignoring file $TextFile: Column label specified, $SpecifiedCol, using \"$OptionName\" option doesn't exist.\n";
      return;
    }
    return $TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCol};
  }
  return '';
}

# Retrieve information about text files...
#
# For each file in @TextFilesList: make sure it exists, is a csv/tsv file and
# can be opened; pick its input delimiter; read its column labels; and work
# out its output file root and extension. Results are stored per file index
# in %TextFilesInfo. A file failing any check gets a warning and is left with
# FileOkay set to 0.
#
sub RetrieveTextFilesInfo {
  %TextFilesInfo = ();
  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{OutFileRoot}} = ();
  @{$TextFilesInfo{OutFileExt}} = ();

  FILELIST: for my $Index (0 .. $#TextFilesList) {
    my $TextFile = $TextFilesList[$Index];

    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{OutFileRoot}[$Index] = '';
    $TextFilesInfo{OutFileExt}[$Index] = '';

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a text file\n";
      next FILELIST;
    }

    # Parsed once; used for both the input delimiter and the output names...
    my($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);

    # Files with tsv extension always use tab; the rest use --InDelim value...
    my $InDelim = ($FileExt =~ /^tsv$/i) ? "\t" : $OptionsInfo{InDelim};

    # Three argument open keeps any special characters in the file name from
    # being interpreted as an open mode...
    my $TextFileHandle;
    if (!open $TextFileHandle, "<", $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    my $Line = GetTextLine($TextFileHandle);
    my @ColLabels = quotewords($InDelim, 0, $Line);
    close $TextFileHandle;

    # Setup output file names...
    my $OutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $OutFileExt = "tsv";
    }

    # The --root value is only used when there is a single input file...
    my $OutFileRoot = $FileName;
    if ($OptionsInfo{OutFileRoot} && (@TextFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $OutFileRoot = $RootFileName;
      }
      else {
        $OutFileRoot = $OptionsInfo{OutFileRoot};
      }
    }

    if (!$Options{overwrite}) {
      # Skip the input file when any one of its output files already exists...
      my $SimilarityCoefficientsNameMapRef = $OptionsInfo{SpecifiedSimilarityCoefficientsNameMapRef};
      for my $SpecifiedSimilarityCoefficient (@{$OptionsInfo{SpecifiedSimilarityCoefficientsRef}}) {
        my $SimilarityCoefficient = $SimilarityCoefficientsNameMapRef->{lc($SpecifiedSimilarityCoefficient)};
        if (-e "${OutFileRoot}${SimilarityCoefficient}.${OutFileExt}") {
          warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}${SimilarityCoefficient}.${OutFileExt} already exists.\n";
          next FILELIST;
        }
      }
    }

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;
    $TextFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $TextFilesInfo{OutFileExt}[$Index] = $OutFileExt;

    $TextFilesInfo{ColCount}[$Index] = scalar @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    # For duplicate labels, the last column with that label wins in the map...
    for my $ColNum (0 .. $#ColLabels) {
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabels[$ColNum]} = $ColNum;
    }
  }
}

# Process option values...
#
# Validate the values in %Options which depend on each other and store the
# processed values in %OptionsInfo: the similarity coefficients to calculate
# along with their name and method maps, alpha and beta parameters, column
# specifications, fingerprints format, delimiters and miscellaneous flags.
# Dies with an error message on any invalid value.
#
sub ProcessOptions {
  %OptionsInfo = ();

  # Setup supported similarity coefficients...
  my(@SupportedSimilarityCoefficients, %SupportedSimilarityCoefficientsNameMap, %SupportedSimilarityCoefficientsMethodMap);

  @SupportedSimilarityCoefficients = ();
  %SupportedSimilarityCoefficientsNameMap = ();
  %SupportedSimilarityCoefficientsMethodMap = ();
  for my $SupportedSimilarityCoefficient (FingerprintsBitVector::GetSupportedSimilarityCoefficients()) {
    # Similarity coefficient function/method names contain "SimilarityCoefficient" in their names.
    # So take 'em out and setup a map to original function/method name...
    my $SimilarityCoefficient = $SupportedSimilarityCoefficient;
    $SimilarityCoefficient =~ s/SimilarityCoefficient$//;
    push @SupportedSimilarityCoefficients, $SimilarityCoefficient;
    $SupportedSimilarityCoefficientsNameMap{lc($SimilarityCoefficient)} = $SimilarityCoefficient;
    $SupportedSimilarityCoefficientsMethodMap{lc($SimilarityCoefficient)} = $SupportedSimilarityCoefficient;
  }

  # Setup a list of similarity coefficients to use for calculating similarity matrices...
  my($SpecifiedCoefficient, @SpecifiedSimilarityCoefficients, %SpecifiedSimilarityCoefficientsNameMap, %SpecifiedSimilarityCoefficientsMethodMap);

  @SpecifiedSimilarityCoefficients = ();
  %SpecifiedSimilarityCoefficientsNameMap = ();
  %SpecifiedSimilarityCoefficientsMethodMap = ();

  if ($Options{mode} =~ /^All$/i) {
    push @SpecifiedSimilarityCoefficients, @SupportedSimilarityCoefficients;
  }
  else {
    # Comma delimited list of similarity coefficients...
    my($Mode, @SpecifiedCoefficients, @UnsupportedSpecifiedCoefficients);

    $Mode = $Options{mode};
    $Mode =~ s/ //g;
    @SpecifiedCoefficients = split ",", $Mode;
    @UnsupportedSpecifiedCoefficients = ();

    for $SpecifiedCoefficient (@SpecifiedCoefficients) {
      if (exists($SupportedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)})) {
        push @SpecifiedSimilarityCoefficients, $SpecifiedCoefficient;
      }
      else {
        push @UnsupportedSpecifiedCoefficients, $SpecifiedCoefficient;
      }
    }
    if (@UnsupportedSpecifiedCoefficients) {
      if (@UnsupportedSpecifiedCoefficients > 1) {
        warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedCoefficients, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
      }
      else {
        warn "Error: The value specified, @UnsupportedSpecifiedCoefficients, for option \"-m --mode\" is not valid.\n";
      }
      die "Allowed values:", JoinWords(\@SupportedSimilarityCoefficients, ", ", 0), "\n";
    }
  }

  # Maps are keyed by lower case name; a name seen before is left alone...
  COEFFICIENT: for $SpecifiedCoefficient (@SpecifiedSimilarityCoefficients) {
    if (exists $SpecifiedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)} ) {
      next COEFFICIENT;
    }
    $SpecifiedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)} = $SupportedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)};
    $SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient)} = $SupportedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient)};
  }
  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{SpecifiedSimilarityCoefficientsRef} = \@SpecifiedSimilarityCoefficients;
  $OptionsInfo{SpecifiedSimilarityCoefficientsNameMapRef} = \%SpecifiedSimilarityCoefficientsNameMap;
  $OptionsInfo{SpecifiedSimilarityCoefficientsMethodMapRef} = \%SpecifiedSimilarityCoefficientsMethodMap;

  # Make sure valid alpha parameter is specified for Tversky calculation...
  $OptionsInfo{Alpha} = '';
  $SpecifiedCoefficient = 'Tversky';
  if ($SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient)}) {
    if (IsEmpty($Options{alpha})) {
      die "Error: You must specify a value for \"-a, --alpha\" option in \"$SpecifiedCoefficient or All\" \"-m --mode\". \n";
    }
    my($Alpha);
    $Alpha = $Options{alpha};
    if (!(IsFloat($Alpha) && $Alpha >=0 && $Alpha <= 1)) {
      die "Error: The value specified, $Options{alpha}, for option \"-a, --alpha\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Alpha} = $Alpha;
  }

  # Make sure valid beta parameter is specified for WeightedTanimoto and WeightedTversky
  # calculations. Both coefficient names must be checked: beta has to be
  # validated and stored even when WeightedTanimoto is the only one specified...
  my($SpecifiedCoefficient1, $SpecifiedCoefficient2);
  $OptionsInfo{Beta} = '';
  $SpecifiedCoefficient1 = 'WeightedTversky';
  $SpecifiedCoefficient2 = 'WeightedTanimoto';
  if ($SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient1)} || $SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient2)}) {
    if (IsEmpty($Options{beta})) {
      die "Error: You must specify a value for \"-b, --beta\" option in \"$SpecifiedCoefficient1, $SpecifiedCoefficient2, or All\" \"-m --mode\". \n";
    }
    my($Beta);
    $Beta = $Options{beta};
    if (!(IsFloat($Beta) && $Beta >=0 && $Beta <= 1)) {
      die "Error: The value specified, $Options{beta}, for option \"-b, --beta\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Beta} = $Beta;
  }

  $OptionsInfo{ColMode} = $Options{colmode};

  # Column values must be positive integers only during ColNum mode...
  if (IsNotEmpty($Options{compoundidcol})) {
    if ($Options{colmode} =~ /^ColNum$/i) {
      if (!IsPositiveInteger($Options{compoundidcol})) {
        die "Error: Column value, $Options{compoundidcol}, specified using \"--CompoundIDCol\" is not valid: Allowed integer values: > 0.\n";
      }
    }
    $OptionsInfo{CompoundIDCol} = $Options{compoundidcol};
  }
  else {
    $OptionsInfo{CompoundIDCol} = 'UseDefault';
  }

  if (IsNotEmpty($Options{fingerprintscol})) {
    if ($Options{colmode} =~ /^ColNum$/i) {
      if (!IsPositiveInteger($Options{fingerprintscol})) {
        die "Error: Column value, $Options{fingerprintscol}, specified using \"--FingerprintsCol\" is not valid: Allowed integer values: > 0.\n";
      }
    }
    $OptionsInfo{FingerprintsCol} = $Options{fingerprintscol};
  }
  else {
    $OptionsInfo{FingerprintsCol} = 'UseDefault';
  }

  # Compound ID and fingerprints columns must differ: compare as numbers when
  # both look like positive integers and as strings otherwise...
  if (IsNotEmpty($Options{compoundidcol}) && IsNotEmpty($Options{fingerprintscol})) {
    if (IsPositiveInteger($Options{compoundidcol}) && IsPositiveInteger($Options{fingerprintscol})) {
      if (($Options{compoundidcol} == $Options{fingerprintscol})) {
        die "Error: Values specified using \"--CompoundIDCol\" and \"--FingerprintsCol\", $Options{compoundidcol}, must be different.\n";
      }
    }
    else {
      if (($Options{compoundidcol} eq $Options{fingerprintscol})) {
        die "Error: Values specified using \"--CompoundIDCol\" and \"--FingerprintsCol\", $Options{compoundidcol}, must be different.\n";
      }
    }
  }

  $OptionsInfo{Detail} = $Options{detail};

  $OptionsInfo{FingerprintsFormatMode} = $Options{fingerprintsformatmode};
  $OptionsInfo{FingerprintsString} = '';
  if ($Options{fingerprintsformatmode} =~ /^Specify$/i) {
    if (IsEmpty($Options{fingerprintsstring})) {
      die "Error: You must specify a value for \"--FingerprintsString\" option in \"Specify\" \"--FingerprintsFormatMode\". \n";
    }
    if ($Options{fingerprintsstring} !~ /^(Hexadecimal|Binary|RawBinary)$/i) {
      die "Error: The value specified, $Options{fingerprintsstring}, for option \"--FingerprintsString\" is not valid. Allowed values: Hexadecimal, Binary, or RawBinary\n";
    }
    $OptionsInfo{FingerprintsString} = $Options{fingerprintsstring};
  }

  $OptionsInfo{InDelim} = ($Options{indelim} =~ /semicolon/i) ? "\;" : "\,";
  $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  $OptionsInfo{Fast} = $Options{fast} ? 1 : 0;

  $OptionsInfo{Precision} = $Options{precision};
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options with default values, lets GetOptions override them from the
# command line, changes to the working directory when one is specified, and
# dies with an error message when an option with a fixed set of allowed
# values holds anything else.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  # Default values...
  $Options{alpha} = 0.5;
  $Options{beta} = 1;
  $Options{colmode} = 'colnum';
  $Options{fingerprintsformatmode} = 'Internal';
  $Options{detail} = 1;
  $Options{mode} = 'Tanimoto';
  $Options{indelim} = 'comma';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';
  $Options{precision} = 2;

  if (!GetOptions(\%Options, "alpha|a=f", "beta|b=f", "colmode|c=s", "compoundidcol=s", "detail|d=i", "fast|f", "fingerprintscol=s", "fingerprintsformatmode=s", "fingerprintsstring=s", "help|h", "mode|m=s", "indelim=s", "outdelim=s", "overwrite|o", "precision|p=s", "quote|q=s", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{colmode} !~ /^(ColNum|ColLabel)$/i) {
    die "Error: The value specified, $Options{colmode}, for option \"-c, --ColMode\" is not valid. Allowed values: ColNum, or ColLabel\n";
  }
  if (!IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{fingerprintsformatmode} !~ /^(Internal|Specify)$/i) {
    die "Error: The value specified, $Options{fingerprintsformatmode}, for option \"--FingerprintsFormatMode\" is not valid. Allowed values: Internal or Specify\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--InDelim\" is not valid. Allowed values: comma, or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--OutDelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if (!IsPositiveInteger($Options{precision})) {
    die "Error: The value specified, $Options{precision}, for option \"--precision\" is not valid. Allowed values: > 0 \n";
  }
}
