#!/usr/bin/perl -w
#
# $RCSfile: SimilarityMatrixSDFiles.pl,v $
# $Date: 2008/04/19 16:12:21 $
# $Revision: 1.12 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2008 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use 5.006;
use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use Fingerprints::FingerprintsBitVector;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
my(%SDFilesInfo);
print "Checking input SD file(s)...\n";
RetrieveSDFilesInfo();

# Process input files..
my($FileIndex, $SDFile, $FileProcessingMsg);
$FileProcessingMsg = "Processing file";
if (@SDFilesList > 1) {
  print "Processing SD files...\n";
  # Separate the per-file progress output when more than one file is processed...
  $FileProcessingMsg = "\n$FileProcessingMsg";
}

# Only files flagged as okay by RetrieveSDFilesInfo are processed...
for $FileIndex (0 .. $#SDFilesList) {
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    $SDFile = $SDFilesList[$FileIndex];
    print "$FileProcessingMsg $SDFile...\n";
    GenerateSimilarityMatrices($FileIndex);
  }
}
print "$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate similarity matrices using fingerprints data in SD file...
#
# Arguments: index of the SD file in @SDFilesList.
#
# Reads the fingerprints data once and then, for each specified similarity
# coefficient, calculates the matrix and writes it to a text file named
# <OutFileRoot><SimilarityCoefficient>.<OutFileExt>.
#
sub GenerateSimilarityMatrices {
  my($FileIndex) = @_;
  my($CompoundIDsRef, $FingerprintsBitVectorsRef);

  # Process fingerprints data in SD file...
  print "Processing fingerprints data...\n";
  ($CompoundIDsRef, $FingerprintsBitVectorsRef) = ProcessFingerprintsData($FileIndex);

  # Generate similarity matrices...
  my($SpecifiedSimilarityCoefficient, $SimilarityCoefficient, $NewTextFile, $SimilarityMatrixRef);
  for $SpecifiedSimilarityCoefficient (@{$OptionsInfo{SpecifiedSimilarityCoefficientsRef}}) {
    # Map the user specified name to its canonical name; the canonical name is used in the output file name...
    $SimilarityCoefficient = $OptionsInfo{SpecifiedSimilarityCoefficientsNameMapRef}->{lc($SpecifiedSimilarityCoefficient)};
    $NewTextFile = $SDFilesInfo{OutFileRoot}[$FileIndex] . "${SimilarityCoefficient}." . $SDFilesInfo{OutFileExt}[$FileIndex];

    print "Generating $NewTextFile...\n";

    $SimilarityMatrixRef = CalculateSimilarityMatrix($SimilarityCoefficient, $FingerprintsBitVectorsRef);
    WriteSimilarityMatrix($NewTextFile, $CompoundIDsRef, $SimilarityMatrixRef);
  }
}

# Calculate a specific similarity matrix...
#
# Arguments: canonical similarity coefficient name and a reference to a list of
# fingerprints bit vector objects.
#
# Returns a reference to a list of rows; row I holds the coefficient of bit
# vector I against every bit vector (itself included), formatted to the
# requested precision. Values which come back undefined or empty are stored as ''.
#
sub CalculateSimilarityMatrix {
  my($SimilarityCoefficient, $FingerprintsBitVectorsRef) = @_;
  my($Index1, $Index2, $Value, $MethodName, $FingerprintsBitVectorA, $Precision, $LastIndex, @MethodParameters, @SimilarityMatrix);

  $MethodName = $OptionsInfo{SpecifiedSimilarityCoefficientsMethodMapRef}->{lc($SimilarityCoefficient)};
  $Precision = $OptionsInfo{Precision};

  # Additional parameters depend only on the coefficient; set them up once
  # instead of matching the coefficient name for each pair...
  @MethodParameters = ();
  if ($SimilarityCoefficient =~ /^Tversky$/i) {
    @MethodParameters = ($OptionsInfo{Alpha});
  }
  elsif ($SimilarityCoefficient =~ /^WeightedTversky$/i) {
    @MethodParameters = ($OptionsInfo{Alpha}, $OptionsInfo{Beta});
  }
  elsif ($SimilarityCoefficient =~ /^WeightedTanimoto$/i) {
    @MethodParameters = ($OptionsInfo{Beta});
  }

  # Calculate pairwise similarity coefficients...
  @SimilarityMatrix = ();
  $LastIndex = $#{$FingerprintsBitVectorsRef};
  for $Index1 (0 .. $LastIndex) {
    $FingerprintsBitVectorA = $FingerprintsBitVectorsRef->[$Index1];
    @{$SimilarityMatrix[$Index1]} = ();

    for $Index2 (0 .. $LastIndex) {
      $Value = $FingerprintsBitVectorA->$MethodName($FingerprintsBitVectorsRef->[$Index2], @MethodParameters);
      # Adding 0 drops trailing zeros left by sprintf...
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.${Precision}f", $Value) + 0) : '';
      push @{$SimilarityMatrix[$Index1]}, $Value;
    }
  }
  return \@SimilarityMatrix;
}

# Write out similarity matrix...
#
# Arguments: output file name, reference to a list of compound IDs and reference
# to the similarity matrix (list of rows). The first line contains an empty
# corner cell followed by compound IDs; each following line contains a compound
# ID followed by its row of coefficients. Dies when the file can't be opened or
# closed.
#
sub WriteSimilarityMatrix {
  my($NewTextFile, $CompundIDsRef, $SimilarityMatrixRef) = @_;
  my($Index, $Line, $OutDelim, $OutQuote, $NewTextFileHandle, @LineWords);

  $OutDelim = $OptionsInfo{OutDelim};
  $OutQuote = $OptionsInfo{OutQuote};

  # Write out similarity matrix...
  open($NewTextFileHandle, '>', $NewTextFile) or die "Error: Can't open $NewTextFile: $! \n";

  # Write out column labels...
  @LineWords = ('', @{$CompundIDsRef});
  $Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
  print $NewTextFileHandle "$Line\n";

  # Write out similarity coefficients...
  for $Index (0 .. $#{$CompundIDsRef}) {
    @LineWords = ($CompundIDsRef->[$Index], @{$SimilarityMatrixRef->[$Index]});
    $Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
    print $NewTextFileHandle "$Line\n";
  }

  # Buffered write errors show up at close...
  close($NewTextFileHandle) or die "Error: Couldn't close $NewTextFile: $! \n";
}

# Process fingerprints data in SD file and return references to list containing
# compound IDs and corresponding FingerprintsBitVectors...
#
# Arguments: index of the SD file in @SDFilesList.
#
# Compounds with missing or invalid fingerprints data are skipped and counted;
# the counts are reported at detail level 1 and individual compounds at higher
# levels. The "fast" option turns off the missing and consistency checks.
#
sub ProcessFingerprintsData {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $CmpdString, $MolName, $CmpdCount, $InvalidCmpdDataCount, $MissingCmpdDataCount, $DetailLevel, $FingerprintsFieldLabel, $UseInternalFormat, $CheckData, $FirstFingerprintsCmpdData, $CompoundID, $FingerprintsData, $FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString, $InvalidFingerprintsData, $FirstFingerprintsType, $FirstFingerprintsStringType, $FirstFingerprintsSize, $FingerprintsBitVector, @CmpdLines, @CompundIDs, @FingerprintsBitVectors, %DataFieldLabelsAndValues);

  @CompundIDs = ();
  @FingerprintsBitVectors = ();

  $SDFile = $SDFilesList[$FileIndex];
  open($SDFileHandle, '<', $SDFile) or die "Error: Couldn't open $SDFile: $! \n";

  $CmpdCount = 0;
  $InvalidCmpdDataCount = 0;
  $MissingCmpdDataCount = 0;

  $DetailLevel = $OptionsInfo{Detail};

  $FingerprintsFieldLabel = $SDFilesInfo{FingerprintsFieldLabel}[$FileIndex];
  $UseInternalFormat = ($OptionsInfo{FingerprintsFormatMode} =~ /^Internal$/i) ? 1 : 0;

  $CheckData = $OptionsInfo{Fast} ? 0 : 1;
  $FirstFingerprintsCmpdData = 1;

  COMPOUND: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldLabelsAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    if ($CheckData) {
      if (!exists $DataFieldLabelsAndValues{$FingerprintsFieldLabel}) {
        # Missing data...
        $MissingCmpdDataCount++;
        if ($DetailLevel >= 3) {
          print "Compound number $CmpdCount contains no fingerprints data: $CmpdString \n";
        }
        elsif ($DetailLevel >= 2) {
          print "Compound number $CmpdCount contains no fingerprints data...\n";
        }
        next COMPOUND;
      }
    }
    $FingerprintsData = $DataFieldLabelsAndValues{$FingerprintsFieldLabel};

    # Setup fingerprints bit vector...
    $InvalidFingerprintsData = 0;
    if ($UseInternalFormat) {
      # Internal format: <Type>:<StringType>:<Size>:<String>
      ($FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString) = defined($FingerprintsData) ? ($FingerprintsData =~ /^(.*?):(.*?):(.*?):(.*?)$/) : ();
      if ($CheckData) {
        if (IsEmpty($FingerprintsType) || IsEmpty($FingerprintsStringType) || IsEmpty($FingerprintsSize) || IsEmpty($FingerprintsString)) {
          $InvalidFingerprintsData = 1;
        }
        elsif ($FirstFingerprintsCmpdData) {
          # First valid compound defines type, string type and size for the rest of the file...
          $FirstFingerprintsCmpdData = 0;
          ($FirstFingerprintsType, $FirstFingerprintsStringType, $FirstFingerprintsSize) = ($FingerprintsType, $FingerprintsStringType, $FingerprintsSize);
        }
        else {
          # Compare as strings: data values must not be interpreted as regular expressions...
          if (lc($FirstFingerprintsType) ne lc($FingerprintsType) || lc($FirstFingerprintsStringType) ne lc($FingerprintsStringType) || lc($FirstFingerprintsSize) ne lc($FingerprintsSize)) {
            $InvalidFingerprintsData = 1;
          }
        }
      }
    }
    else {
      $FingerprintsString = $FingerprintsData;
      $FingerprintsStringType = $OptionsInfo{FingerprintsString};
      if ($CheckData && IsEmpty($FingerprintsString)) {
        $InvalidFingerprintsData = 1;
      }
    }
    # No bit vector can be created for an unknown string type; treat it as invalid
    # data instead of failing later during similarity calculation...
    if (!$InvalidFingerprintsData && (!defined($FingerprintsStringType) || $FingerprintsStringType !~ /^(Hexadecimal|Hex|Binary|Bin|RawBinary|RawBin)$/i)) {
      $InvalidFingerprintsData = 1;
    }
    if ($InvalidFingerprintsData) {
      # InvalidData data...
      $InvalidCmpdDataCount++;
      if ($DetailLevel >= 3) {
        print "Compound number $CmpdCount contains invalid fingerprints data:\n$CmpdString\n";
      }
      elsif ($DetailLevel >= 2) {
        print "Compound number $CmpdCount contains invalid fingerprints data:\n";
        if ($UseInternalFormat) {
          my($ReportedFingerprintsData);
          $ReportedFingerprintsData = defined($FingerprintsData) ? $FingerprintsData : '';
          print "$ReportedFingerprintsData\n";
        }
        else {
          print "FingerprintsStringType: $FingerprintsStringType\nFingerprintsString: $FingerprintsString\n";
        }
      }
      next COMPOUND;
    }

    $FingerprintsBitVector = '';
    if ($FingerprintsStringType =~ /^(Hexadecimal|Hex)$/i) {
      $FingerprintsBitVector = FingerprintsBitVector::NewFromHexadecimalString($FingerprintsString);
    }
    elsif ($FingerprintsStringType =~ /^(Binary|Bin)$/i) {
      $FingerprintsBitVector = FingerprintsBitVector::NewFromBinaryString($FingerprintsString);
    }
    elsif ($FingerprintsStringType =~ /^(RawBinary|RawBin)$/i) {
      $FingerprintsBitVector = FingerprintsBitVector::NewFromRawBinaryString($FingerprintsString);
    }

    # First line of the compound record is used as molecule name...
    $MolName = $CmpdLines[0];
    $CompoundID = SetupCompoundID($CmpdCount, $MolName, \%DataFieldLabelsAndValues);

    push @CompundIDs, $CompoundID;
    push @FingerprintsBitVectors, $FingerprintsBitVector;
  }
  close $SDFileHandle;

  if ($DetailLevel >= 1) {
    if ($MissingCmpdDataCount) {
      print "Missing fingerprints data in $MissingCmpdDataCount compound(s)...\n";
    }
    if ($InvalidCmpdDataCount) {
      print "Invalid fingerprints data in $InvalidCmpdDataCount compound(s)...\n";
    }
  }

  return (\@CompundIDs, \@FingerprintsBitVectors);
}

# Generate compound ID...
#
# Arguments: compound number, molecule name and a reference to a hash of data
# field labels and values. Returns the ID according to CompoundIDMode: molecule
# name, label prefix followed by compound number, or the value of the specified
# data field ('' when the field doesn't exist).
#
sub SetupCompoundID {
  my($CmpdCount, $MolName, $DataFieldLabelAndValuesRef) = @_;
  my($CmpdID);

  $CmpdID = '';
  if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
    $CmpdID = IsNotEmpty($MolName) ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
    $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
    my($SpecifiedDataField);
    $SpecifiedDataField = $OptionsInfo{CompoundID};
    $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
    $CmpdID = $MolName;
  }
  return $CmpdID;
}

# Retrieve information about SD files...
#
# Fills %SDFilesInfo with one entry per file in @SDFilesList: FileOkay,
# FingerprintsFieldLabel, OutFileRoot and OutFileExt. A file is ignored, with a
# warning, when it doesn't exist, isn't a SD file, can't be opened, its first
# compound doesn't contain the required data fields, or an output file already
# exists and overwriting hasn't been requested.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $OutFileExt, $FingerprintsFieldLabel, $SpecifiedDataField, $CmpdString, $DataFieldLabel, @CmpdLines, %DataFieldValues);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{FingerprintsFieldLabel}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{OutFileExt}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{FingerprintsFieldLabel}[$Index] = '';
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{OutFileExt}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sdf sd")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Data fields in the first compound are taken as representative of the file...
    my($SDFileHandle);
    if (!open($SDFileHandle, '<', $SDFile)) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    $CmpdString = ReadCmpdString($SDFileHandle);
    close $SDFileHandle;

    # Nothing may have been read from an empty file...
    @CmpdLines = defined($CmpdString) ? split("\n", $CmpdString) : ();
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    # Make sure data field exists in SD file..
    if ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value specified, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }
    $FingerprintsFieldLabel = '';
    if ($OptionsInfo{FingerprintsField} !~ /^UseDefault$/i) {
      $FingerprintsFieldLabel = $OptionsInfo{FingerprintsField};
      if (!exists $DataFieldValues{$FingerprintsFieldLabel}) {
        warn "Warning: Ignoring file $SDFile: Data field value specified, $FingerprintsFieldLabel, using \"--FingerprintsField\" option doesn't exist\n";
        next FILELIST;
      }
    }
    else {
      # Make sure default fingerprints field does exist. Labels are sorted to
      # make the choice reproducible when more than one label matches...
      DATAFIELDLABEL: for $DataFieldLabel (sort keys %DataFieldValues) {
        if ($DataFieldLabel =~ /Fingerprints/i) {
          $FingerprintsFieldLabel = $DataFieldLabel;
          last DATAFIELDLABEL;
        }
      }
      if (!length($FingerprintsFieldLabel)) {
        warn "Warning: Ignoring file $SDFile: Data field label containing \"Fingerprints\" string in its name doesn't exist.\n";
        next FILELIST;
      }
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $OutFileExt = ($OptionsInfo{OutDelim} eq "\t") ? "tsv" : "csv";

    $OutFileRoot = $FileName;
    # Specified root is used only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $OutFileRoot = $RootFileName;
      }
      else {
        $OutFileRoot = $OptionsInfo{OutFileRoot};
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      my($SpecifiedSimilarityCoefficient, $SimilarityCoefficient, $SimilarityCoefficientsNameMapRef);
      $SimilarityCoefficientsNameMapRef = $OptionsInfo{SpecifiedSimilarityCoefficientsNameMapRef};
      for $SpecifiedSimilarityCoefficient (@{$OptionsInfo{SpecifiedSimilarityCoefficientsRef}}) {
        $SimilarityCoefficient = $SimilarityCoefficientsNameMapRef->{lc($SpecifiedSimilarityCoefficient)};
        if (-e "${OutFileRoot}${SimilarityCoefficient}.${OutFileExt}") {
          warn "Warning: Ignoring file $SDFile: The file ${OutFileRoot}${SimilarityCoefficient}.${OutFileExt} already exists.\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{FingerprintsFieldLabel}[$Index] = $FingerprintsFieldLabel;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{OutFileExt}[$Index] = $OutFileExt;
  }
}

# Process option values...
#
# Validates values in %Options which depend on each other and fills
# %OptionsInfo. Dies with an error message for any invalid value.
#
sub ProcessOptions {
  %OptionsInfo = ();

  # Setup supported similarity coefficients...
  my($SimilarityCoefficient, $SupportedSimilarityCoefficient, @SupportedSimilarityCoefficients, %SupportedSimilarityCoefficientsNameMap, %SupportedSimilarityCoefficientsMethodMap);

  @SupportedSimilarityCoefficients = ();
  %SupportedSimilarityCoefficientsNameMap = ();
  %SupportedSimilarityCoefficientsMethodMap = ();
  for $SupportedSimilarityCoefficient (FingerprintsBitVector::GetSupportedSimilarityCoefficients()) {
    # Similarity coefficient function/method names contain "SimilarityCoefficient" in their names.
    # So take 'em out and setup a map to original function/method name...
    $SimilarityCoefficient = $SupportedSimilarityCoefficient;
    $SimilarityCoefficient =~ s/SimilarityCoefficient$//;
    push @SupportedSimilarityCoefficients, $SimilarityCoefficient;
    $SupportedSimilarityCoefficientsNameMap{lc($SimilarityCoefficient)} = $SimilarityCoefficient;
    $SupportedSimilarityCoefficientsMethodMap{lc($SimilarityCoefficient)} = $SupportedSimilarityCoefficient;
  }

  # Setup a list of similarity coefficients to use for calculating similarity matrices...
  my($SpecifiedCoefficient, @SpecifiedSimilarityCoefficients, %SpecifiedSimilarityCoefficientsNameMap, %SpecifiedSimilarityCoefficientsMethodMap);

  @SpecifiedSimilarityCoefficients = ();
  %SpecifiedSimilarityCoefficientsNameMap = ();
  %SpecifiedSimilarityCoefficientsMethodMap = ();

  if ($Options{mode} =~ /^All$/i) {
    push @SpecifiedSimilarityCoefficients, @SupportedSimilarityCoefficients;
  }
  else {
    # Comma delimited list of similarity coefficients...
    my($Mode, @SpecifiedCoefficients, @UnsupportedSpecifiedCoefficients);

    $Mode = $Options{mode};
    $Mode =~ s/ //g;
    @SpecifiedCoefficients = split ",", $Mode;
    @UnsupportedSpecifiedCoefficients = ();

    for $SpecifiedCoefficient (@SpecifiedCoefficients) {
      if (exists($SupportedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)})) {
        push @SpecifiedSimilarityCoefficients, $SpecifiedCoefficient;
      }
      else {
        push @UnsupportedSpecifiedCoefficients, $SpecifiedCoefficient;
      }
    }
    if (@UnsupportedSpecifiedCoefficients) {
      if (@UnsupportedSpecifiedCoefficients > 1) {
        warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedCoefficients, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
      }
      else {
        warn "Error: The value specified, @UnsupportedSpecifiedCoefficients, for option \"-m --mode\" is not valid.\n";
      }
      die "Allowed values:", JoinWords(\@SupportedSimilarityCoefficients, ", ", 0), "\n";
    }
  }
  # Maps are keyed by lower case name; a name specified more than once is mapped only once...
  COEFFICIENT: for $SpecifiedCoefficient (@SpecifiedSimilarityCoefficients) {
    if (exists $SpecifiedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)} ) {
      next COEFFICIENT;
    }
    $SpecifiedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)} = $SupportedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)};
    $SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient)} = $SupportedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient)};
  }
  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{SpecifiedSimilarityCoefficientsRef} = \@SpecifiedSimilarityCoefficients;
  $OptionsInfo{SpecifiedSimilarityCoefficientsNameMapRef} = \%SpecifiedSimilarityCoefficientsNameMap;
  $OptionsInfo{SpecifiedSimilarityCoefficientsMethodMapRef} = \%SpecifiedSimilarityCoefficientsMethodMap;

  # Make sure valid alpha parameter is specified for Tversky calculation...
  $OptionsInfo{Alpha} = '';
  $SpecifiedCoefficient = 'Tversky';
  if ($SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient)}) {
    if (IsEmpty($Options{alpha})) {
      die "Error: You must specify a value for \"-a, --alpha\" option in \"$SpecifiedCoefficient or All\" \"-m --mode\". \n";
    }
    my($Alpha);
    $Alpha = $Options{alpha};
    if (!(IsFloat($Alpha) && $Alpha >=0 && $Alpha <= 1)) {
      die "Error: The value specified, $Options{alpha}, for option \"-a, --alpha\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Alpha} = $Alpha;
  }

  # Make sure valid beta parameter is specified for WeightedTanimoto and WeightedTversky
  # calculations...
  my($SpecifiedCoefficient1, $SpecifiedCoefficient2);
  $OptionsInfo{Beta} = '';
  $SpecifiedCoefficient1 = 'WeightedTversky';
  $SpecifiedCoefficient2 = 'WeightedTanimoto';
  # Either coefficient needs beta; both names must be checked...
  if ($SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient1)} || $SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient2)}) {
    if (IsEmpty($Options{beta})) {
      die "Error: You must specify a value for \"-b, --beta\" option in \"$SpecifiedCoefficient1, $SpecifiedCoefficient2, or All\" \"-m --mode\". \n";
    }
    my($Beta);
    $Beta = $Options{beta};
    if (!(IsFloat($Beta) && $Beta >=0 && $Beta <= 1)) {
      die "Error: The value specified, $Options{beta}, for option \"-b, --beta\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Beta} = $Beta;
  }

  if (IsNotEmpty($Options{fingerprintsfield})) {
    $OptionsInfo{FingerprintsField} = $Options{fingerprintsfield};
  }
  else {
    $OptionsInfo{FingerprintsField} = 'UseDefault';
  }

  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};

  if ($Options{compoundidmode} =~ /^DataField$/i) {
    if (!$Options{compoundid}) {
      die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
    }
    $OptionsInfo{CompoundID} = $Options{compoundid};
  }
  elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
    $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
  }

  $OptionsInfo{Detail} = $Options{detail};

  $OptionsInfo{FingerprintsFormatMode} = $Options{fingerprintsformatmode};
  $OptionsInfo{FingerprintsString} = '';
  if ($Options{fingerprintsformatmode} =~ /^Specify$/i) {
    if (IsEmpty($Options{fingerprintsstring})) {
      die "Error: You must specify a value for \"--FingerprintsString\" option in \"Specify\" \"--FingerprintsFormatMode\". \n";
    }
    if ($Options{fingerprintsstring} !~ /^(Hexadecimal|Binary|RawBinary)$/i) {
      die "Error: The value specified, $Options{fingerprintsstring}, for option \"--FingerprintsString\" is not valid. Allowed values: Hexadecimal, Binary, or RawBinary\n";
    }
    $OptionsInfo{FingerprintsString} = $Options{fingerprintsstring};
  }

  $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? ";" : ",");
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  $OptionsInfo{Fast} = $Options{fast} ? 1 : 0;

  $OptionsInfo{Precision} = $Options{precision};
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options with default values, overrides them with command line values,
# changes to the working directory when one is specified, and dies with an error
# message for any option value which is invalid on its own.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{alpha} = 0.5;
  $Options{beta} = 1;
  $Options{fingerprintsformatmode} = 'Internal';

  $Options{compoundidmode} = 'LabelPrefix';
  # NOTE(review): no option named "compoundidlabel" is passed to GetOptions below and
  # nothing else in the code shown here reads this value - confirm before removing...
  $Options{compoundidlabel} = 'CompoundID';

  $Options{detail} = 1;
  $Options{mode} = 'Tanimoto';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';
  $Options{precision} = 2;

  if (!GetOptions(\%Options, "alpha|a=f", "beta|b=f", "compoundid=s", "compoundidmode=s", "detail|d=i", "fast|f", "fingerprintsfield=s", "fingerprintsformatmode=s", "fingerprintsstring=s", "help|h", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=s", "quote|q=s", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{compoundidmode} !~ /(^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$)/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if (!IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{fingerprintsformatmode} !~ /^(Internal|Specify)$/i) {
    die "Error: The value specified, $Options{fingerprintsformatmode}, for option \"--FingerprintsFormatMode\" is not valid. Allowed values: Internal or Specify\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--OutDelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if (!IsPositiveInteger($Options{precision})) {
    die "Error: The value specified, $Options{precision}, for option \"--precision\" is not valid. Allowed values: > 0 \n";
  }
}