MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: SimilarityMatrixSDFiles.pl,v $
   4 # $Date: 2008/04/19 16:12:21 $
   5 # $Revision: 1.12 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileUtil;
  37 use TextUtil;
  38 use SDFileUtil;
  39 use Fingerprints::FingerprintsBitVector;
  40 
  # File-scoped state: the script name and raw command line options are read by
  # the subroutines below; the remaining scalars hold benchmark timestamps...
  my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

  # Autoflush STDOUT
  $| = 1;

  # Starting message...
  $ScriptName = basename($0);
  print "\n$ScriptName: Starting...\n\n";
  # Explicit class method call instead of the ambiguous indirect object
  # form "new Benchmark"...
  $StartTime = Benchmark->new;

  # Get the options and setup script; show usage when help is requested or no
  # input file is named on the command line...
  SetupScriptUsage();
  if ($Options{help} || @ARGV < 1) {
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  }

  # Input file names; filled here and read by the subroutines below...
  my(@SDFilesList);
  @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

  # Process options; ProcessOptions() fills %OptionsInfo...
  my(%OptionsInfo);
  ProcessOptions();

  # Setup information about input files; RetrieveSDFilesInfo() fills
  # %SDFilesInfo with one entry per input file...
  my(%SDFilesInfo);
  print "Checking input SD file(s)...\n";
  RetrieveSDFilesInfo();

  # Process input files; with several files, each per-file message is set off
  # by a leading blank line...
  my($FileIndex, $SDFile, $FileProcessingMsg);
  $FileProcessingMsg = "Processing file";
  if (@SDFilesList > 1) {
    print "Processing SD files...\n";
    $FileProcessingMsg = "\n$FileProcessingMsg";
  }

  # Files flagged as not okay by RetrieveSDFilesInfo() are silently skipped
  # here; a warning has already been issued for them...
  for $FileIndex (0 .. $#SDFilesList) {
    if ($SDFilesInfo{FileOkay}[$FileIndex]) {
      $SDFile = $SDFilesList[$FileIndex];
      print "$FileProcessingMsg $SDFile...\n";
      GenerateSimilarityMatrices($FileIndex);
    }
  }
  print "$ScriptName:Done...\n\n";

  # Report elapsed time...
  $EndTime = Benchmark->new;
  $TotalTime = timediff ($EndTime, $StartTime);
  print "Total time: ", timestr($TotalTime), "\n";
  89 
  90 ###############################################################################
  91 
  # Write one similarity matrix text file per requested similarity coefficient
  # for the SD file stored at index $FileIndex of @SDFilesList. Fingerprints are
  # read from the SD file once and reused for every coefficient...
  #
  sub GenerateSimilarityMatrices {
    my($FileIndex) = @_;

    # Collect compound IDs and fingerprints bit vectors from the SD file...
    print "Processing fingerprints data...\n";
    my($CompundIDsRef, $FingerprintsBitVectorsRef) = ProcessFingerprintsData($FileIndex);

    my $CoefficientNameMapRef = $OptionsInfo{SpecifiedSimilarityCoefficientsNameMapRef};
    my $OutFileRoot = $SDFilesInfo{OutFileRoot}[$FileIndex];
    my $OutFileExt = $SDFilesInfo{OutFileExt}[$FileIndex];

    # One output file per coefficient: <OutFileRoot><CoefficientName>.<OutFileExt>
    foreach my $RequestedCoefficient (@{$OptionsInfo{SpecifiedSimilarityCoefficientsRef}}) {
      # Map the name as typed by the user to its canonical spelling...
      my $CoefficientName = $CoefficientNameMapRef->{lc($RequestedCoefficient)};
      my $MatrixFile = $OutFileRoot . "${CoefficientName}." . $OutFileExt;

      print "Generating $MatrixFile...\n";

      my $MatrixRef = CalculateSimilarityMatrix($CoefficientName, $FingerprintsBitVectorsRef);
      WriteSimilarityMatrix($MatrixFile, $CompundIDsRef, $MatrixRef);
    }
  }
 114 
 # Compute all pairwise similarity values for a list of fingerprints bit vectors
 # using one similarity coefficient.
 #
 # Arguments: canonical coefficient name and a reference to the list of bit
 # vector objects. The method to call on the objects is looked up in
 # $OptionsInfo{SpecifiedSimilarityCoefficientsMethodMapRef}; alpha, beta and
 # precision come from %OptionsInfo as well.
 #
 # Returns a reference to a list of rows (list references); entry [i][j] holds
 # the value obtained by calling the method on vector i with vector j, rounded
 # to the specified precision, or an empty string when the method returned
 # nothing usable...
 #
 sub CalculateSimilarityMatrix {
   my($SimilarityCoefficient, $FingerprintsBitVectorsRef) = @_;

   my $MethodName = $OptionsInfo{SpecifiedSimilarityCoefficientsMethodMapRef}->{lc($SimilarityCoefficient)};
   my $Precision = $OptionsInfo{Precision};

   # Coefficient specific arguments following the second bit vector in each
   # method call; empty for coefficients taking no parameters...
   my @ExtraArguments = ();
   if ($SimilarityCoefficient =~ /^Tversky$/i) {
     @ExtraArguments = ($OptionsInfo{Alpha});
   }
   elsif ($SimilarityCoefficient =~ /^WeightedTversky$/i) {
     @ExtraArguments = ($OptionsInfo{Alpha}, $OptionsInfo{Beta});
   }
   elsif ($SimilarityCoefficient =~ /^WeightedTanimoto$/i) {
     @ExtraArguments = ($OptionsInfo{Beta});
   }

   # Every ordered pair is calculated, including the diagonal; no symmetry is
   # assumed...
   my $LastIndex = $#{$FingerprintsBitVectorsRef};
   my @SimilarityMatrix = ();
   foreach my $RowIndex (0 .. $LastIndex) {
     my $RowBitVector = $FingerprintsBitVectorsRef->[$RowIndex];
     my @Row = ();

     foreach my $ColIndex (0 .. $LastIndex) {
       my $RawValue = $RowBitVector->$MethodName($FingerprintsBitVectorsRef->[$ColIndex], @ExtraArguments);
       if (defined($RawValue) && length($RawValue)) {
         # Adding zero turns the formatted string back into a number and
         # thereby drops trailing zeros...
         push @Row, sprintf("%.${Precision}f", $RawValue) + 0;
       }
       else {
         push @Row, '';
       }
     }
     push @SimilarityMatrix, \@Row;
   }
   return \@SimilarityMatrix;
 }
 171 
 # Write a similarity matrix to a delimited text file.
 #
 # Arguments: output file name, reference to the list of compound IDs and
 # reference to the list of matrix rows. The first line holds an empty corner
 # cell followed by all compound IDs; each following line holds one compound ID
 # followed by its row of values. Delimiter and quoting are taken from
 # $OptionsInfo{OutDelim} and $OptionsInfo{OutQuote}.
 #
 # Dies when the file can't be opened or closed...
 #
 sub WriteSimilarityMatrix {
   my($NewTextFile, $CompundIDsRef, $SimilarityMatrixRef) = @_;
   my($Index, $Line, $OutDelim, $OutQuote, $NewTextFileHandle, @LineWords);

   $OutDelim = $OptionsInfo{OutDelim};
   $OutQuote = $OptionsInfo{OutQuote};

   # Three argument open keeps the file name from being interpreted as a mode;
   # the message names the file itself (earlier it interpolated "$>", the
   # effective user ID, in front of the name)...
   open $NewTextFileHandle, '>', $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

   # Write out column labels...
   @LineWords = ('', @{$CompundIDsRef});
   $Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
   print {$NewTextFileHandle} "$Line\n";

   # Write out similarity coefficients, one row per compound...
   for $Index (0 .. $#{$CompundIDsRef}) {
     @LineWords = ($CompundIDsRef->[$Index], @{$SimilarityMatrixRef->[$Index]});
     $Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
     print {$NewTextFileHandle} "$Line\n";
   }

   # Buffered write errors surface at close, so check it...
   close $NewTextFileHandle or die "Error: Couldn't close $NewTextFile: $! \n";
 }
 202 
 # Process fingerprints data in SD file and return references to list containing
 # compound IDs and corresponding FingerprintsBitVectors...
 #
 # The SD file is the one at index $FileIndex of @SDFilesList and the data field
 # to read comes from $SDFilesInfo{FingerprintsFieldLabel}. In "Internal" format
 # mode the field value is split into type, string type, size and fingerprints
 # string at the first three colons; otherwise the whole value is the
 # fingerprints string and its type comes from $OptionsInfo{FingerprintsString}.
 #
 # Unless $OptionsInfo{Fast} is set, compounds without the data field are
 # counted as missing, and compounds with empty pieces or with type, string type
 # or size different from the first valid compound are counted as invalid.
 # Compounds whose string type is none of Hexadecimal, Binary or RawBinary are
 # always counted as invalid, as no bit vector can be created for them. Missing
 # and invalid compounds are skipped and reported according to
 # $OptionsInfo{Detail}.
 #
 # Dies when the SD file can't be opened...
 #
 sub ProcessFingerprintsData {
   my($FileIndex) = @_;
   my($SDFile, $SDFileHandle, $CmpdString, $CmpdCount, $InvalidCmpdDataCount, $MissingCmpdDataCount, $DetailLevel, $FingerprintsFieldLabel, $UseInternalFormat, $CheckData, $FirstFingerprintsCmpdData, $FirstFingerprintsType, $FirstFingerprintsStringType, $FirstFingerprintsSize, @CompundIDs, @FingerprintsBitVectors);

   @CompundIDs = ();
   @FingerprintsBitVectors = ();

   $SDFile = $SDFilesList[$FileIndex];
   open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";

   $CmpdCount = 0;
   $InvalidCmpdDataCount = 0;
   $MissingCmpdDataCount = 0;

   $DetailLevel = $OptionsInfo{Detail};

   $FingerprintsFieldLabel = $SDFilesInfo{FingerprintsFieldLabel}[$FileIndex];
   $UseInternalFormat = ($OptionsInfo{FingerprintsFormatMode} =~ /^Internal$/i) ? 1 : 0;

   $CheckData = $OptionsInfo{Fast} ? 0 : 1;
   $FirstFingerprintsCmpdData = 1;

   COMPOUND: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
     my($FingerprintsData, $FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString, $FingerprintsBitVector, $InvalidFingerprintsData, $MolName, $CompoundID, @CmpdLines, %DataFieldLabelsAndValues);

     $CmpdCount++;
     @CmpdLines = split "\n", $CmpdString;
     %DataFieldLabelsAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

     if ($CheckData && !exists $DataFieldLabelsAndValues{$FingerprintsFieldLabel}) {
       # Missing data...
       $MissingCmpdDataCount++;
       if ($DetailLevel >= 3) {
         print "Compound number $CmpdCount contains no fingerprints data: $CmpdString \n";
       }
       elsif ($DetailLevel >= 2) {
         print "Compound number $CmpdCount contains no fingerprints data...\n";
       }
       next COMPOUND;
     }

     # Raw field value; it can be undefined only when data checks are turned off
     # and the field is absent...
     $FingerprintsData = $DataFieldLabelsAndValues{$FingerprintsFieldLabel};

     # Setup fingerprints bit vector...
     $InvalidFingerprintsData = 0;
     if ($UseInternalFormat) {
       ($FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString) = defined($FingerprintsData) ? $FingerprintsData =~ /^(.*?):(.*?):(.*?):(.*?)$/ : ();
       if ($CheckData) {
         if (IsEmpty($FingerprintsType) || IsEmpty($FingerprintsStringType) || IsEmpty($FingerprintsSize) || IsEmpty($FingerprintsString)) {
           $InvalidFingerprintsData = 1;
         }
         elsif ($FirstFingerprintsCmpdData) {
           # The first valid compound sets the reference type, string type and size...
           $FirstFingerprintsCmpdData = 0;
           ($FirstFingerprintsType, $FirstFingerprintsStringType, $FirstFingerprintsSize) = ($FingerprintsType, $FingerprintsStringType, $FingerprintsSize);
         }
         elsif (lc($FirstFingerprintsType) ne lc($FingerprintsType) || lc($FirstFingerprintsStringType) ne lc($FingerprintsStringType) || lc($FirstFingerprintsSize) ne lc($FingerprintsSize)) {
           # Case insensitive string comparison; the values are data and must
           # not be interpolated into a regular expression, where any
           # metacharacter in them would change the outcome...
           $InvalidFingerprintsData = 1;
         }
       }
     }
     else {
       $FingerprintsString = $FingerprintsData;
       $FingerprintsStringType = $OptionsInfo{FingerprintsString};
       if ($CheckData && IsEmpty($FingerprintsString)) {
         $InvalidFingerprintsData = 1;
       }
     }

     if (!$InvalidFingerprintsData) {
       my($StringType);
       $StringType = defined($FingerprintsStringType) ? $FingerprintsStringType : '';
       if ($StringType =~ /^(Hexadecimal|Hex)$/i) {
         $FingerprintsBitVector = FingerprintsBitVector::NewFromHexadecimalString($FingerprintsString);
       }
       elsif ($StringType =~ /^(Binary|Bin)$/i) {
         $FingerprintsBitVector = FingerprintsBitVector::NewFromBinaryString($FingerprintsString);
       }
       elsif ($StringType =~ /^(RawBinary|RawBin)$/i) {
         $FingerprintsBitVector = FingerprintsBitVector::NewFromRawBinaryString($FingerprintsString);
       }
       else {
         # Unsupported string type: without a bit vector object, the method
         # calls made during similarity calculation would die, so the compound
         # is treated as invalid instead of being stored...
         $InvalidFingerprintsData = 1;
       }
     }

     if ($InvalidFingerprintsData) {
       # InvalidData data...
       $InvalidCmpdDataCount++;
       if ($DetailLevel >= 3) {
         print "Compound number $CmpdCount contains invalid fingerprints data:\n$CmpdString\n";
       }
       elsif ($DetailLevel >= 2) {
         print "Compound number $CmpdCount contains invalid fingerprints data:\n";
         if ($UseInternalFormat) {
           my($ShownData);
           $ShownData = defined($FingerprintsData) ? $FingerprintsData : '';
           print "$ShownData\n";
         }
         else {
           print "FingerprintsStringType: $FingerprintsStringType\nFingerprintsString: $FingerprintsString\n";
         }
       }
       next COMPOUND;
     }

     # First line of the compound string is used as the molecule name...
     $MolName = $CmpdLines[0];
     $CompoundID = SetupCompoundID($CmpdCount, $MolName, \%DataFieldLabelsAndValues);

     push @CompundIDs, $CompoundID;
     push @FingerprintsBitVectors, $FingerprintsBitVector;
   }
   close $SDFileHandle;

   if ($DetailLevel >= 1) {
     if ($MissingCmpdDataCount) {
       print "Missing fingerprints data in $MissingCmpdDataCount compound(s)...\n";
     }
     if ($InvalidCmpdDataCount) {
       print "Invalid fingerprints data in $InvalidCmpdDataCount compound(s)...\n";
     }
   }

   return (\@CompundIDs, \@FingerprintsBitVectors);
 }
 321 
 # Derive the ID used to label a compound in the similarity matrix.
 #
 # Arguments: running compound number, molecule name and a reference to the
 # hash of data field labels and values for the compound. The strategy is
 # selected by $OptionsInfo{CompoundIDMode}, matched case insensitively:
 #
 #   MolNameOrLabelPrefix - molecule name, or prefix plus number when it's empty
 #   LabelPrefix          - $OptionsInfo{CompoundID} prefix plus compound number
 #   DataField            - value of data field named by $OptionsInfo{CompoundID}
 #   MolName              - molecule name as is
 #
 # An empty string is returned for a missing data field or an unknown mode...
 #
 sub SetupCompoundID {
   my($CmpdCount, $MolName, $DataFieldLabelAndValuesRef) = @_;
   my $IDMode = $OptionsInfo{CompoundIDMode};

   if ($IDMode =~ /^MolNameOrLabelPrefix$/i) {
     return $MolName if IsNotEmpty($MolName);
     return "$OptionsInfo{CompoundID}${CmpdCount}";
   }
   if ($IDMode =~ /^LabelPrefix$/i) {
     return "$OptionsInfo{CompoundID}${CmpdCount}";
   }
   if ($IDMode =~ /^DataField$/i) {
     my $FieldLabel = $OptionsInfo{CompoundID};
     if (exists $DataFieldLabelAndValuesRef->{$FieldLabel}) {
       return $DataFieldLabelAndValuesRef->{$FieldLabel};
     }
     return '';
   }
   if ($IDMode =~ /^MolName$/i) {
     return $MolName;
   }
   return '';
 }
 345 
 # Retrieve information about SD files...
 #
 # Fills %SDFilesInfo with four lists indexed like @SDFilesList: FileOkay,
 # FingerprintsFieldLabel, OutFileRoot and OutFileExt. A file is marked okay
 # only when all of the following hold; otherwise a warning is issued and the
 # file keeps FileOkay set to 0:
 #
 #   - it exists, passes CheckFileType for "sdf sd" and can be opened
 #   - in "DataField" compound ID mode, its first compound has the data field
 #     named by $OptionsInfo{CompoundID}
 #   - its first compound has the fingerprints data field: either the one named
 #     by $OptionsInfo{FingerprintsField} or, by default, a field whose label
 #     contains "Fingerprints"
 #   - none of the output files exists already, unless overwriting is allowed
 #
 sub RetrieveSDFilesInfo {
   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $OutFileExt, $FingerprintsFieldLabel);

   %SDFilesInfo = ();
   @{$SDFilesInfo{FileOkay}} = ();
   @{$SDFilesInfo{FingerprintsFieldLabel}} = ();
   @{$SDFilesInfo{OutFileRoot}} = ();
   @{$SDFilesInfo{OutFileExt}} = ();

   FILELIST: for $Index (0 .. $#SDFilesList) {
     $SDFile = $SDFilesList[$Index];

     $SDFilesInfo{FileOkay}[$Index] = 0;
     $SDFilesInfo{FingerprintsFieldLabel}[$Index] = '';
     $SDFilesInfo{OutFileRoot}[$Index] = '';
     $SDFilesInfo{OutFileExt}[$Index] = '';

     if (!(-e $SDFile)) {
       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
       next FILELIST;
     }
     if (!CheckFileType($SDFile, "sdf sd")) {
       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
       next FILELIST;
     }

     # Read the first compound to find out which data fields are available. The
     # file is opened just once, and a failure to open it only skips the file...
     my($SDFileHandle, $CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

     if (!open($SDFileHandle, '<', $SDFile)) {
       warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
       next FILELIST;
     }
     $CmpdString = ReadCmpdString($SDFileHandle);
     close $SDFileHandle;

     # Nothing is split when no compound could be read; the data field checks
     # below then reject the file...
     @CmpdLines = defined($CmpdString) ? split("\n", $CmpdString) : ();
     %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

     # Make sure data field exists in SD file..
     if ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
       $SpecifiedDataField = $OptionsInfo{CompoundID};
       if (!exists $DataFieldValues{$SpecifiedDataField}) {
         warn "Warning: Ignoring file $SDFile: Data field value specified, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
         next FILELIST;
       }
     }
     $FingerprintsFieldLabel = '';
     if ($OptionsInfo{FingerprintsField} !~ /^UseDefault$/i) {
       $FingerprintsFieldLabel = $OptionsInfo{FingerprintsField};
       if (!exists $DataFieldValues{$FingerprintsFieldLabel}) {
         warn "Warning: Ignoring file $SDFile: Data field value specified, $FingerprintsFieldLabel, using  \"--FingerprintsField\" option doesn't exist\n";
         next FILELIST;
       }
     }
     else {
       # Make sure default fingerprints field does exist. Labels are checked in
       # sorted order, so the same field is picked on every run even when
       # several labels contain "Fingerprints"...
       my($FingerprintsFieldFound, $DataFieldLabel);
       $FingerprintsFieldFound = 0;
       DATAFIELDLABEL: for $DataFieldLabel (sort keys %DataFieldValues) {
         if ($DataFieldLabel =~ /Fingerprints/i) {
           $FingerprintsFieldFound = 1;
           $FingerprintsFieldLabel = $DataFieldLabel;
           last DATAFIELDLABEL;
         }
       }
       if (!$FingerprintsFieldFound) {
         warn "Warning: Ignoring file $SDFile: Data field label containing \"Fingerprints\" string in its name doesn't exist.\n";
         next FILELIST;
       }
     }

     # Setup output file names...
     $FileDir = ""; $FileName = ""; $FileExt = "";
     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

     $OutFileExt = "csv";
     if ($Options{outdelim} =~ /^tab$/i) {
       $OutFileExt = "tsv";
     }

     # A user supplied root name is honored only for a single input file...
     $OutFileRoot = $FileName;
     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
       if ($RootFileName && $RootFileExt) {
         $FileName = $RootFileName;
       }
       else {
         $FileName = $OptionsInfo{OutFileRoot};
       }
       $OutFileRoot = $FileName;
     }

     # Without the overwrite option, any existing output file rejects the input
     # file as a whole...
     if (!$Options{overwrite}) {
       my($SpecifiedSimilarityCoefficient, $SimilarityCoefficient, $SimilarityCoefficientsNameMapRef);
       $SimilarityCoefficientsNameMapRef = $OptionsInfo{SpecifiedSimilarityCoefficientsNameMapRef};
       for $SpecifiedSimilarityCoefficient (@{$OptionsInfo{SpecifiedSimilarityCoefficientsRef}}) {
         $SimilarityCoefficient = $SimilarityCoefficientsNameMapRef->{lc($SpecifiedSimilarityCoefficient)};
         if (-e "${OutFileRoot}${SimilarityCoefficient}.${OutFileExt}") {
           warn "Warning: Ignoring file $SDFile: The file ${OutFileRoot}${SimilarityCoefficient}.${OutFileExt} already exists.\n";
           next FILELIST;
         }
       }
     }

     $SDFilesInfo{FileOkay}[$Index] = 1;
     $SDFilesInfo{FingerprintsFieldLabel}[$Index] = $FingerprintsFieldLabel;

     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
     $SDFilesInfo{OutFileExt}[$Index] = $OutFileExt;
   }
 }
 466 
 # Process option values...
 #
 # Validates the interdependent values in %Options and fills %OptionsInfo with
 # the settings used by the rest of the script: the list of specified
 # similarity coefficients along with maps from their lowercased names to
 # canonical names and method names, alpha and beta parameters, fingerprints
 # field and format, compound ID mode and value, detail level, output delimiter
 # and quoting, overwrite flag, output file root, fast mode and precision.
 #
 # Dies with an error message on any invalid or missing value...
 #
 sub ProcessOptions {
   %OptionsInfo = ();

   # Setup supported similarity coefficients...
   my($SimilarityCoefficient, $SupportedSimilarityCoefficient, @SupportedSimilarityCoefficients, %SupportedSimilarityCoefficientsNameMap, %SupportedSimilarityCoefficientsMethodMap);

   @SupportedSimilarityCoefficients = ();
   %SupportedSimilarityCoefficientsNameMap = ();
   %SupportedSimilarityCoefficientsMethodMap = ();
   for $SupportedSimilarityCoefficient (FingerprintsBitVector::GetSupportedSimilarityCoefficients()) {
     # Similarity coefficient function/method names contain "SimilarityCoefficient" in their names.
     # So take 'em out and setup a map to original function/method name...
     $SimilarityCoefficient = $SupportedSimilarityCoefficient;
     $SimilarityCoefficient =~ s/SimilarityCoefficient$//;
     push @SupportedSimilarityCoefficients, $SimilarityCoefficient;
     $SupportedSimilarityCoefficientsNameMap{lc($SimilarityCoefficient)} = $SimilarityCoefficient;
     $SupportedSimilarityCoefficientsMethodMap{lc($SimilarityCoefficient)} = $SupportedSimilarityCoefficient;
   }

   # Setup a list of similarity coefficients to use for calculating similarity matrices...
   my($SpecifiedCoefficient, @SpecifiedSimilarityCoefficients, %SpecifiedSimilarityCoefficientsNameMap, %SpecifiedSimilarityCoefficientsMethodMap);

   @SpecifiedSimilarityCoefficients = ();
   %SpecifiedSimilarityCoefficientsNameMap = ();
   %SpecifiedSimilarityCoefficientsMethodMap = ();

   if ($Options{mode} =~ /^All$/i) {
     push @SpecifiedSimilarityCoefficients, @SupportedSimilarityCoefficients;
   }
   else {
     # Comma delimited list of similarity coefficients...
     my($Mode, @SpecifiedCoefficients, @UnsupportedSpecifiedCoefficients);

     $Mode = $Options{mode};
     $Mode =~ s/ //g;
     @SpecifiedCoefficients = split ",", $Mode;
     @UnsupportedSpecifiedCoefficients = ();

     for $SpecifiedCoefficient (@SpecifiedCoefficients) {
       if (exists($SupportedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)})) {
         push @SpecifiedSimilarityCoefficients, $SpecifiedCoefficient;
       }
       else {
         push @UnsupportedSpecifiedCoefficients, $SpecifiedCoefficient;
       }
     }
     if (@UnsupportedSpecifiedCoefficients) {
       if (@UnsupportedSpecifiedCoefficients > 1) {
         warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedCoefficients, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
       }
       else {
         warn "Error: The value specified, @UnsupportedSpecifiedCoefficients, for option \"-m --mode\" is not valid.\n";
       }
       die "Allowed values:", JoinWords(\@SupportedSimilarityCoefficients, ", ", 0), "\n";
     }
   }

   # Map each specified name, lowercased, to its canonical name and method name...
   COEFFICIENT: for $SpecifiedCoefficient (@SpecifiedSimilarityCoefficients) {
     if (exists $SpecifiedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)} ) {
       next COEFFICIENT;
     }
     $SpecifiedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)} = $SupportedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)};
     $SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient)} = $SupportedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient)};
   }
   $OptionsInfo{Mode} = $Options{mode};
   $OptionsInfo{SpecifiedSimilarityCoefficientsRef} = \@SpecifiedSimilarityCoefficients;
   $OptionsInfo{SpecifiedSimilarityCoefficientsNameMapRef} = \%SpecifiedSimilarityCoefficientsNameMap;
   $OptionsInfo{SpecifiedSimilarityCoefficientsMethodMapRef} = \%SpecifiedSimilarityCoefficientsMethodMap;

   # Make sure valid alpha parameter is specified for Tversky calculation...
   $OptionsInfo{Alpha} = '';
   $SpecifiedCoefficient = 'Tversky';
   if ($SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient)}) {
     if (IsEmpty($Options{alpha})) {
       die "Error: You must specify a value for \"-a, --alpha\" option in \"$SpecifiedCoefficient or All\" \"-m --mode\". \n";
     }
     my($Alpha);
     $Alpha = $Options{alpha};
     if (!(IsFloat($Alpha) && $Alpha >=0 && $Alpha <= 1)) {
       die "Error: The value specified, $Options{alpha}, for option \"-a, --alpha\" is not valid. Allowed values: >= 0 and <= 1\n";
     }
     $OptionsInfo{Alpha} = $Alpha;
   }

   # Make sure valid beta parameter is specified for WeightedTanimoto and WeightedTversky
   # calculations. Both coefficients are checked: earlier, WeightedTversky was
   # tested twice, which left beta unset for WeightedTanimoto on its own...
   my($SpecifiedCoefficient1, $SpecifiedCoefficient2);
   $OptionsInfo{Beta} = '';
   $SpecifiedCoefficient1 = 'WeightedTversky';
   $SpecifiedCoefficient2 = 'WeightedTanimoto';
   if ($SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient1)} || $SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient2)}) {
     if (IsEmpty($Options{beta})) {
       die "Error: You must specify a value for \"-b, --beta\" option in \"$SpecifiedCoefficient1, $SpecifiedCoefficient2, or All\" \"-m --mode\". \n";
     }
     my($Beta);
     $Beta = $Options{beta};
     if (!(IsFloat($Beta) && $Beta >=0 && $Beta <= 1)) {
       die "Error: The value specified, $Options{beta}, for option \"-b, --beta\" is not valid. Allowed values: >= 0 and <= 1\n";
     }
     $OptionsInfo{Beta} = $Beta;
   }

   # 'UseDefault' asks for a data field whose label contains "Fingerprints"...
   if (IsNotEmpty($Options{fingerprintsfield})) {
     $OptionsInfo{FingerprintsField} = $Options{fingerprintsfield};
   }
   else {
     $OptionsInfo{FingerprintsField} = 'UseDefault';
   }

   $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};

   # CompoundID holds a data field name in DataField mode and a label prefix in
   # the prefix modes; it stays unset in MolName mode...
   if ($Options{compoundidmode} =~ /^DataField$/i) {
     if (!$Options{compoundid}) {
       die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
     }
     $OptionsInfo{CompoundID} = $Options{compoundid};
   }
   elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
     $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
   }

   $OptionsInfo{Detail} = $Options{detail};

   $OptionsInfo{FingerprintsFormatMode} = $Options{fingerprintsformatmode};
   $OptionsInfo{FingerprintsString} = '';
   if ($Options{fingerprintsformatmode} =~ /^Specify$/i) {
     if (IsEmpty($Options{fingerprintsstring})) {
       die "Error: You must specify a value for \"--FingerprintsString\" option in \"Specify\" \"--FingerprintsFormatMode\". \n";
     }
     if ($Options{fingerprintsstring} !~ /^(Hexadecimal|Binary|RawBinary)$/i) {
       die "Error: The value specified, $Options{fingerprintsstring}, for option \"--FingerprintsString\" is not valid. Allowed values: Hexadecimal, Binary, or RawBinary\n";
     }
     $OptionsInfo{FingerprintsString} = $Options{fingerprintsstring};
   }

   # Translate delimiter name into the delimiter character...
   $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

   $OptionsInfo{Fast} = $Options{fast} ? 1 : 0;

   $OptionsInfo{Precision} = $Options{precision};
 }
 612 
 # Load default option values into %Options, overlay whatever is specified on
 # the command line, change to the working directory when one is given, and
 # validate the values which can be checked on their own. Dies with an error
 # message on the first problem found...
 #
 sub SetupScriptUsage {

   # Default values for the options...
   %Options = (
     alpha => 0.5,
     beta => 1,
     fingerprintsformatmode => 'Internal',
     compoundidmode => 'LabelPrefix',
     compoundidlabel => 'CompoundID',
     detail => 1,
     mode => 'Tanimoto',
     outdelim => 'comma',
     quote => 'yes',
     precision => 2,
   );

   # Retrieve all the options...
   my @OptionSpecifications = (
     "alpha|a=f", "beta|b=f", "compoundid=s", "compoundidmode=s", "detail|d=i",
     "fast|f", "fingerprintsfield=s", "fingerprintsformatmode=s",
     "fingerprintsstring=s", "help|h", "mode|m=s", "outdelim=s", "overwrite|o",
     "precision|p=s", "quote|q=s", "root|r=s", "workingdir|w=s",
   );
   GetOptions(\%Options, @OptionSpecifications)
     or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

   # Switch to the working directory before any file is looked at...
   if ($Options{workingdir}) {
     unless (-d $Options{workingdir}) {
       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
     }
     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
   }

   # Validate individual option values...
   unless ($Options{compoundidmode} =~ /(^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$)/i) {
     die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
   }
   IsPositiveInteger($Options{detail})
     or die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n";
   unless ($Options{fingerprintsformatmode} =~ /^(Internal|Specify)$/i) {
     die "Error: The value specified, $Options{fingerprintsformatmode}, for option \"--FingerprintsFormatMode\" is not valid. Allowed values: Internal or Specify\n";
   }
   unless ($Options{outdelim} =~ /^(comma|semicolon|tab)$/i) {
     die "Error: The value specified, $Options{outdelim}, for option \"--OutDelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
   }
   unless ($Options{quote} =~ /^(Yes|No)$/i) {
     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
   }
   IsPositiveInteger($Options{precision})
     or die "Error: The value specified, $Options{precision}, for option \"--precision\" is not valid. Allowed values: > 0 \n";
 }
 660