MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: SimilarityMatrixTextFiles.pl,v $
   4 # $Date: 2008/04/19 16:12:21 $
   5 # $Revision: 1.13 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileUtil;
  37 use TextUtil;
  38 use Fingerprints::FingerprintsBitVector;
  39 
  40 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  41 
  42 # Autoflush STDOUT
  43 $| = 1;
  44 
  45 # Starting message...
  46 $ScriptName = basename($0);
  47 print "\n$ScriptName: Starting...\n\n";
  48 $StartTime = new Benchmark;
  49 
  50 # Get the options and setup script...
  51 SetupScriptUsage();
  52 if ($Options{help} || @ARGV < 1) {
  53   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  54 }
  55 
  56 my(@TextFilesList);
  57 @TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");
  58 
  59 # Process options...
  60 my(%OptionsInfo);
  61 ProcessOptions();
  62 
  63 # Setup information about input files...
  64 my(%TextFilesInfo);
  65 print "Checking input text file(s)...\n";
  66 RetrieveTextFilesInfo();
  67 
  68 ProcessColumnsInfo();
  69 
  70 # Process input files..
  71 my($FileIndex, $TextFile, $FileProcessingMsg);
  72 $FileProcessingMsg = "Processing file";
  73 if (@TextFilesList > 1) {
  74   print "Processing text files...\n";
  75   $FileProcessingMsg = "\n$FileProcessingMsg";
  76 }
  77 
  78 for $FileIndex (0 .. $#TextFilesList) {
  79   if ($TextFilesInfo{FileOkay}[$FileIndex]) {
  80     $TextFile = $TextFilesList[$FileIndex];
  81     print "$FileProcessingMsg $TextFile...\n";
  82     GenerateSimilarityMatrices($FileIndex);
  83   }
  84 }
  85 print "$ScriptName:Done...\n\n";
  86 
  87 $EndTime = new Benchmark;
  88 $TotalTime = timediff ($EndTime, $StartTime);
  89 print "Total time: ", timestr($TotalTime), "\n";
  90 
  91 ###############################################################################
  92 
  93 # Generate similarity matrices using fingerprints data in text file...
  94 #
  95 sub GenerateSimilarityMatrices {
  96   my($FileIndex) = @_;
  97   my($CompundIDsRef, $FingerprintsBitVectorsRef);
  98 
  99   # Process fingerprints data in text file...
 100   print "Processing fingerprints data...\n";
 101   ($CompundIDsRef, $FingerprintsBitVectorsRef) = ProcessFingerprintsData($FileIndex);
 102 
 103   # Generate similarity matrices...
 104   my($SpecifiedSimilarityCoefficient, $SimilarityCoefficient, $NewTextFile, $SimilarityMatrixRef);
 105   for $SpecifiedSimilarityCoefficient (@{$OptionsInfo{SpecifiedSimilarityCoefficientsRef}}) {
 106     $SimilarityCoefficient = $OptionsInfo{SpecifiedSimilarityCoefficientsNameMapRef}->{lc($SpecifiedSimilarityCoefficient)};
 107     $NewTextFile = $TextFilesInfo{OutFileRoot}[$FileIndex] . "${SimilarityCoefficient}." . $TextFilesInfo{OutFileExt}[$FileIndex];
 108 
 109     print "Generating $NewTextFile...\n";
 110 
 111     $SimilarityMatrixRef = CalculateSimilarityMatrix($SimilarityCoefficient, $FingerprintsBitVectorsRef);
 112     WriteSimilarityMatrix($NewTextFile, $CompundIDsRef, $SimilarityMatrixRef);
 113   }
 114 }
 115 
 116 # Calculate a specific similarity matrix...
 117 #
 118 sub CalculateSimilarityMatrix {
 119   my($SimilarityCoefficient, $FingerprintsBitVectorsRef) = @_;
 120   my($Index, $Index1, $Index2, $Value, $MethodName, $FingerprintsBitVectorA, $FingerprintsBitVectorB, $UseAlphaOrBeta, $Alpha, $Beta, $Precision, @SimilarityMatrix);
 121 
 122   # Initialize data...
 123   @SimilarityMatrix = ();
 124   for $Index (0 .. $#{$FingerprintsBitVectorsRef}) {
 125     @{$SimilarityMatrix[$Index]} = ();
 126   }
 127   $MethodName = $OptionsInfo{SpecifiedSimilarityCoefficientsMethodMapRef}->{lc($SimilarityCoefficient)};
 128 
 129   $UseAlphaOrBeta = 1;
 130   if ($SimilarityCoefficient =~ /^Tversky$/i) {
 131     $Alpha = $OptionsInfo{Alpha};
 132   }
 133   elsif ($SimilarityCoefficient =~ /^WeightedTversky$/i) {
 134     $Alpha = $OptionsInfo{Alpha};
 135     $Beta = $OptionsInfo{Beta};
 136   }
 137   elsif ($SimilarityCoefficient =~ /^WeightedTanimoto$/i) {
 138     $Beta = $OptionsInfo{Beta};
 139   }
 140   else {
 141     $UseAlphaOrBeta = 0;
 142   }
 143   $Precision = $OptionsInfo{Precision};
 144 
 145   # Calculate pairwise similarity coefficients...
 146   for $Index1 (0 .. $#{$FingerprintsBitVectorsRef}) {
 147     $FingerprintsBitVectorA = $FingerprintsBitVectorsRef->[$Index1];
 148 
 149     for $Index2 (0 .. $#{$FingerprintsBitVectorsRef}) {
 150       $FingerprintsBitVectorB = $FingerprintsBitVectorsRef->[$Index2];
 151       $Value = '';
 152       if ($UseAlphaOrBeta) {
 153 	if ($SimilarityCoefficient =~ /^Tversky$/i) {
 154 	  $Value = $FingerprintsBitVectorA->$MethodName($FingerprintsBitVectorB, $Alpha);
 155 	}
 156 	elsif ($SimilarityCoefficient =~ /^WeightedTversky$/i) {
 157 	  $Value = $FingerprintsBitVectorA->$MethodName($FingerprintsBitVectorB, $Alpha, $Beta);
 158 	}
 159 	elsif ($SimilarityCoefficient =~ /^WeightedTanimoto$/i) {
 160 	  $Value = $FingerprintsBitVectorA->$MethodName($FingerprintsBitVectorB, $Beta);
 161 	}
 162       }
 163       else {
 164 	$Value = $FingerprintsBitVectorA->$MethodName($FingerprintsBitVectorB);
 165       }
 166       $Value = (defined($Value) && length($Value)) ? (sprintf("%.${Precision}f", $Value) + 0) : '';
 167       push @{$SimilarityMatrix[$Index1]}, $Value;
 168     }
 169   }
 170   return \@SimilarityMatrix;
 171 }
 172 
 173 # Write out similarity matrix...
 174 #
 175 sub WriteSimilarityMatrix {
 176   my($NewTextFile, $CompundIDsRef, $SimilarityMatrixRef) = @_;
 177   my($Index, $Line, $OutDelim, $OutQuote, @LineWords);
 178 
 179   $OutDelim = $OptionsInfo{OutDelim};
 180   $OutQuote = $OptionsInfo{OutQuote};
 181 
 182   # Write out similarity matrix...
 183   open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $>$NewTextFile: $! \n";
 184 
 185   # Write out column labels...
 186   @LineWords = ();
 187   push @LineWords, '';
 188   push @LineWords, @{$CompundIDsRef};
 189   $Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
 190   print NEWTEXTFILE "$Line\n";
 191 
 192   # Write out similarity coefficients...
 193   for $Index (0 .. $#{$CompundIDsRef}) {
 194     @LineWords = ();
 195     push @LineWords, $CompundIDsRef->[$Index];
 196     push @LineWords,  @{$SimilarityMatrixRef->[$Index]};
 197     $Line = JoinWords(\@LineWords, $OutDelim, $OutQuote);
 198     print NEWTEXTFILE "$Line\n";
 199   }
 200   close NEWTEXTFILE;
 201 
 202 }
 203 
 204 # Process fingerprints data in text file and return references to list containing
 205 # compound IDs and corresponding FingerprintsBitVectors...
 206 #
 207 sub ProcessFingerprintsData {
 208   my($FileIndex) = @_;
 209   my($TextFile, $Line, $InDelim, $LineCount, $InvalidDataLineCount, $MissingDataLineCount, $UseSequentialID, $CmpdIDColNum, $FingerprintsColNum, $DetailLevel, $UseInternalFormat, $FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString, $CheckData, $InvalidFingerprintsData, $CompoundID, $FirstFingerprintsDataLine, $FirstFingerprintsType, $FirstFingerprintsStringType, $FirstFingerprintsSize, @LineWords, @CompundIDs, @FingerprintsBitVectors);
 210 
 211   @CompundIDs = ();
 212   @FingerprintsBitVectors = ();
 213 
 214   $TextFile = $TextFilesList[$FileIndex];
 215   open TEXTFILE, "$TextFile" or die "Error: Can't open $TextFile: $! \n";
 216 
 217   $LineCount = 0;
 218   $InvalidDataLineCount = 0;
 219   $MissingDataLineCount = 0;
 220 
 221   $InDelim = $TextFilesInfo{InDelim}[$FileIndex];
 222   $DetailLevel = $OptionsInfo{Detail};
 223 
 224   $UseSequentialID = $TextFilesInfo{UseSequentialID}[$FileIndex];
 225   $CmpdIDColNum = $TextFilesInfo{CompoundIDColNum}[$FileIndex];
 226   $FingerprintsColNum = $TextFilesInfo{FingerprintsColNum}[$FileIndex];
 227   $UseInternalFormat = ($OptionsInfo{FingerprintsFormatMode} =~ /^Internal$/i) ? 1 : 0;
 228 
 229   $CheckData = $OptionsInfo{Fast} ? 0 : 1;
 230   $FirstFingerprintsDataLine = 1;
 231 
 232   # Skip column label line...
 233   $Line = GetTextLine(\*TEXTFILE);
 234 
 235   LINE: while ($Line = GetTextLine(\*TEXTFILE)) {
 236     $LineCount++;
 237     @LineWords = quotewords($InDelim, 0, $Line);
 238 
 239     if ($CheckData) {
 240       if ($FingerprintsColNum > $#LineWords) {
 241 	# Missing data...
 242 	$MissingDataLineCount++;
 243 	if ($DetailLevel >= 3) {
 244 	  print "Line number $LineCount contains no fingerprints data: $Line \n";
 245 	}
 246 	elsif ($DetailLevel >= 2) {
 247 	  print "Line number $LineCount contains no fingerprints data...\n";
 248 	}
 249 	next LINE;
 250       }
 251     }
 252 
 253     # Setup fingerprints bit vector...
 254     $InvalidFingerprintsData = 0;
 255     if ($UseInternalFormat) {
 256       ($FingerprintsType, $FingerprintsStringType, $FingerprintsSize, $FingerprintsString) = $LineWords[$FingerprintsColNum] =~ /^(.*?):(.*?):(.*?):(.*?)$/;
 257       if ($CheckData) {
 258 	if (IsEmpty($FingerprintsType) || IsEmpty($FingerprintsStringType) || IsEmpty($FingerprintsSize) || IsEmpty($FingerprintsString)) {
 259 	  $InvalidFingerprintsData = 1;
 260 	}
 261 	elsif ($FirstFingerprintsDataLine) {
 262 	  $FirstFingerprintsDataLine = 0;
 263 	  ($FirstFingerprintsType, $FirstFingerprintsStringType, $FirstFingerprintsSize) = ($FingerprintsType, $FingerprintsStringType, $FingerprintsSize);
 264 	}
 265 	else {
 266 	  if ($FirstFingerprintsType !~ /^$FingerprintsType$/i || $FirstFingerprintsStringType !~ /^$FingerprintsStringType$/i || $FirstFingerprintsSize !~ /^$FingerprintsSize$/i) {
 267 	    $InvalidFingerprintsData = 1;
 268 	  }
 269 	}
 270       }
 271     }
 272     else {
 273       $FingerprintsString = $LineWords[$FingerprintsColNum];
 274       $FingerprintsStringType = $OptionsInfo{FingerprintsString};
 275       if ($CheckData && IsEmpty($FingerprintsString)) {
 276 	$InvalidFingerprintsData = 1;
 277       }
 278     }
 279     if ($InvalidFingerprintsData) {
 280       # InvalidData data...
 281       $InvalidDataLineCount++;
 282       if ($DetailLevel >= 3) {
 283 	print "Line number $LineCount contains invalid fingerprints data: $Line \n";
 284       }
 285       elsif ($DetailLevel >= 2) {
 286 	print "Line number $LineCount contains invalid fingerprints data...\n";
 287       }
 288       next LINE;
 289     }
 290     my($FingerprintsBitVector);
 291 
 292     $FingerprintsBitVector = '';
 293     if ($FingerprintsStringType =~ /^(Hexadecimal|Hex)$/i) {
 294       $FingerprintsBitVector = FingerprintsBitVector::NewFromHexadecimalString($FingerprintsString);
 295     }
 296     elsif ($FingerprintsStringType =~ /^(Binary|Bin)$/i) {
 297       $FingerprintsBitVector = FingerprintsBitVector::NewFromBinaryString($FingerprintsString);
 298     }
 299     elsif ($FingerprintsStringType =~ /^(RawBinary|RawBin)$/i) {
 300       $FingerprintsBitVector = FingerprintsBitVector::NewFromRawBinaryString($FingerprintsString);
 301     }
 302 
 303     $CompoundID = (!$UseSequentialID && $CmpdIDColNum <= $#LineWords) ? $LineWords[$CmpdIDColNum] : "Cmpd${LineCount}";
 304 
 305     push @CompundIDs, $CompoundID;
 306     push @FingerprintsBitVectors, $FingerprintsBitVector;
 307   }
 308   close TEXTFILE;
 309 
 310   if ($DetailLevel >= 1) {
 311     if ($MissingDataLineCount) {
 312       print "Missing fingerprints data in $MissingDataLineCount line(s)...\n";
 313     }
 314     if ($InvalidDataLineCount) {
 315       print "Invalid fingerprints data in $InvalidDataLineCount line(s)...\n";
 316     }
 317   }
 318 
 319   return (\@CompundIDs, \@FingerprintsBitVectors);
 320 }
 321 
 322 # Make sure the specified columns exists in text files...
 323 sub ProcessColumnsInfo {
 324   my($Index, $TextFile, $ColNum, $ColLabel, $ColFound, $CmpdIDCol, $CmpdIDColNum, $FingerprintsCol, $FingerprintsColNum, $UseSequentialCmpdIDs);
 325 
 326   @{$TextFilesInfo{CompoundIDColNum}} = ();
 327   @{$TextFilesInfo{UseSequentialID}} = ();
 328   @{$TextFilesInfo{FingerprintsColNum}} = ();
 329 
 330   FILELIST: for $Index (0 .. $#TextFilesList) {
 331     $TextFile = $TextFilesList[$Index];
 332 
 333     $TextFilesInfo{CompoundIDColNum}[$Index] = '';
 334     $TextFilesInfo{UseSequentialID}[$Index] = 0;
 335     $TextFilesInfo{FingerprintsColNum}[$Index] = '';
 336 
 337     if (!$TextFilesInfo{FileOkay}[$Index]) {
 338       next FILELIST;
 339     }
 340 
 341     # File not okay untill proven okay...
 342     $TextFilesInfo{FileOkay}[$Index] = 0;
 343 
 344     # CompoundIDCol...
 345     $CmpdIDCol = $OptionsInfo{CompoundIDCol};
 346 
 347     $UseSequentialCmpdIDs = 0;
 348     $CmpdIDColNum = '';
 349 
 350     if ($CmpdIDCol =~ /^UseDefault$/i) {
 351       # First column containing the word CompoundID in its label or sequential generation...
 352       $ColFound = 0;
 353       COLLABEL: for $ColLabel (@{$TextFilesInfo{ColLabels}[$Index]}) {
 354 	if ($ColLabel =~ /CompoundID/i) {
 355 	  $ColFound = 1;
 356 	  $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
 357 	  last COLLABEL;
 358 	}
 359       }
 360       if ($ColFound) {
 361 	$CmpdIDColNum = $ColNum;
 362       }
 363       else {
 364 	$UseSequentialCmpdIDs = 1;
 365       }
 366     }
 367     else {
 368       if ($OptionsInfo{ColMode} =~ /^ColNum$/i) {
 369 	# Is it a valid column number?
 370 	if ($CmpdIDCol > $TextFilesInfo{ColCount}[$Index]) {
 371 	  warn "Warning: Ignoring file $TextFile: Column number specified, $CmpdIDCol, using \"--CompoundIDCol\" option doesn't exist.\n";
 372 	  next FILELIST;
 373 	}
 374 	$CmpdIDColNum = $CmpdIDCol - 1;
 375       }
 376       elsif ($OptionsInfo{ColMode} =~ /^ColLabel$/i) {
 377 	# Does this column exists?
 378 	if (!exists $TextFilesInfo{ColLabelToNumMap}[$Index]{$CmpdIDCol}) {
 379 	  warn "Warning: Ignoring file $TextFile: Column label specified, $CmpdIDCol, using \"--CompoundIDCol\" option doesn't exist.\n";
 380 	  next FILELIST;
 381 	}
 382 	$CmpdIDColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$CmpdIDCol};
 383       }
 384     }
 385     # FingerprintsCol...
 386     $FingerprintsColNum = '';
 387     $FingerprintsCol = $OptionsInfo{FingerprintsCol};
 388 
 389     if ($FingerprintsCol =~ /^UseDefault$/i) {
 390       # First column containing the word Fingerprints in its label...
 391       $ColFound = 0;
 392       COLLABEL: for $ColLabel (@{$TextFilesInfo{ColLabels}[$Index]}) {
 393 	if ($ColLabel =~ /Fingerprints/i) {
 394 	  $ColFound = 1;
 395 	  $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
 396 	  last COLLABEL;
 397 	}
 398       }
 399       if (!$ColFound) {
 400 	warn "Warning: Ignoring file $TextFile: Column label containing \"Fingerprints\" string in its name doesn't exist.\n";
 401 	next FILELIST;
 402       }
 403       $FingerprintsColNum = $ColNum;
 404     }
 405     else {
 406       if ($OptionsInfo{ColMode} =~ /^ColNum$/i) {
 407 	# Is it a valid column number...
 408 	if ($FingerprintsCol > $TextFilesInfo{ColCount}[$Index]) {
 409 	  warn "Warning: Ignoring file $TextFile: Column number specified, $FingerprintsCol, using \"--FingerprintsCol\" option doesn't exist.\n";
 410 	  next FILELIST;
 411 	}
 412 	$FingerprintsColNum = $FingerprintsCol - 1;
 413       }
 414       elsif ($OptionsInfo{ColMode} =~ /^ColLabel$/i) {
 415 	# Does this column exists?
 416 	if (!exists $TextFilesInfo{ColLabelToNumMap}[$Index]{$FingerprintsCol}) {
 417 	  warn "Warning: Ignoring file $TextFile: Column label specified, $FingerprintsCol, using \"--FingerprintsCol\" option doesn't exist.\n";
 418 	  next FILELIST;
 419 	}
 420 	$FingerprintsColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$FingerprintsCol};
 421       }
 422     }
 423     $TextFilesInfo{FileOkay}[$Index] = 1;
 424 
 425     $TextFilesInfo{CompoundIDColNum}[$Index] = $CmpdIDColNum;
 426     $TextFilesInfo{UseSequentialID}[$Index] = $UseSequentialCmpdIDs;
 427     $TextFilesInfo{FingerprintsColNum}[$Index] = $FingerprintsColNum;
 428   }
 429 }
 430 
 431 # Retrieve information about text files...
 432 #
 433 sub RetrieveTextFilesInfo {
 434   my($TextFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $OutFileExt, $InDelim, $Line, $ColNum, $ColLabel, @ColLabels);
 435 
 436   %TextFilesInfo = ();
 437   @{$TextFilesInfo{FileOkay}} = ();
 438   @{$TextFilesInfo{ColCount}} = ();
 439   @{$TextFilesInfo{ColLabels}} = ();
 440   @{$TextFilesInfo{ColLabelToNumMap}} = ();
 441   @{$TextFilesInfo{InDelim}} = ();
 442   @{$TextFilesInfo{OutFileRoot}} = ();
 443   @{$TextFilesInfo{OutFileExt}} = ();
 444 
 445   FILELIST: for $Index (0 .. $#TextFilesList) {
 446     $TextFile = $TextFilesList[$Index];
 447 
 448     $TextFilesInfo{FileOkay}[$Index] = 0;
 449     $TextFilesInfo{ColCount}[$Index] = 0;
 450     @{$TextFilesInfo{ColLabels}[$Index]} = ();
 451     %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();
 452     $TextFilesInfo{InDelim}[$Index] = "";
 453     $TextFilesInfo{OutFileRoot}[$Index] = '';
 454     $TextFilesInfo{OutFileExt}[$Index] = '';
 455 
 456     $TextFile = $TextFilesList[$Index];
 457     if (!(-e $TextFile)) {
 458       warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
 459       next FILELIST;
 460     }
 461     if (!CheckFileType($TextFile, "csv tsv")) {
 462       warn "Warning: Ignoring file $TextFile: It's not a text file\n";
 463       next FILELIST;
 464     }
 465 
 466     $FileDir = ""; $FileName = ""; $FileExt = "";
 467     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 468 
 469     if ($FileExt =~ /^tsv$/i) {
 470       $InDelim = "\t";
 471     }
 472     else {
 473       $InDelim = $OptionsInfo{InDelim};
 474     }
 475 
 476     if (!open TEXTFILE, "$TextFile") {
 477       warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
 478       next FILELIST;
 479     }
 480     $Line = GetTextLine(\*TEXTFILE);
 481     @ColLabels = quotewords($InDelim, 0, $Line);
 482     close TEXTFILE;
 483 
 484     # Setup output file names...
 485     $FileDir = ""; $FileName = ""; $FileExt = "";
 486     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 487 
 488     $OutFileExt = "csv";
 489     if ($Options{outdelim} =~ /^tab$/i) {
 490       $OutFileExt = "tsv";
 491     }
 492 
 493     $OutFileRoot = $FileName;
 494     if ($OptionsInfo{OutFileRoot} && (@TextFilesList == 1)) {
 495       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 496       if ($RootFileName && $RootFileExt) {
 497 	$FileName = $RootFileName;
 498       }
 499       else {
 500 	$FileName = $OptionsInfo{OutFileRoot};
 501       }
 502       $OutFileRoot = $FileName;
 503     }
 504 
 505     if (!$Options{overwrite}) {
 506       my($SpecifiedSimilarityCoefficient, $SimilarityCoefficient, $SimilarityCoefficientsNameMapRef);
 507       $SimilarityCoefficientsNameMapRef = $OptionsInfo{SpecifiedSimilarityCoefficientsNameMapRef};
 508       for $SpecifiedSimilarityCoefficient (@{$OptionsInfo{SpecifiedSimilarityCoefficientsRef}}) {
 509 	$SimilarityCoefficient = $SimilarityCoefficientsNameMapRef->{lc($SpecifiedSimilarityCoefficient)};
 510 	if (-e "${OutFileRoot}${SimilarityCoefficient}.${OutFileExt}") {
 511 	  warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}${SimilarityCoefficient}.${OutFileExt} already exists.\n";
 512 	  next FILELIST;
 513 	}
 514       }
 515     }
 516 
 517     $TextFilesInfo{FileOkay}[$Index] = 1;
 518     $TextFilesInfo{InDelim}[$Index] = $InDelim;
 519     $TextFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 520     $TextFilesInfo{OutFileExt}[$Index] = $OutFileExt;
 521 
 522 
 523     $TextFilesInfo{ColCount}[$Index] = scalar @ColLabels;
 524     push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
 525     for $ColNum (0 .. $#ColLabels) {
 526       $ColLabel = $ColLabels[$ColNum];
 527       $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
 528     }
 529   }
 530 }
 531 
 532 # Process option values...
 533 sub ProcessOptions {
 534   %OptionsInfo = ();
 535 
 536   # Setup supported similarity coefficients...
 537   my($SimilarityCoefficient, $SupportedSimilarityCoefficient, @SupportedSimilarityCoefficients, %SupportedSimilarityCoefficientsNameMap, %SupportedSimilarityCoefficientsMethodMap);
 538 
 539   @SupportedSimilarityCoefficients = ();
 540   %SupportedSimilarityCoefficientsNameMap = ();
 541   %SupportedSimilarityCoefficientsMethodMap = ();
 542   for $SupportedSimilarityCoefficient (FingerprintsBitVector::GetSupportedSimilarityCoefficients()) {
 543     # Similarity coefficient function/method names contain "SimilarityCoefficient" in their names.
 544     # So take 'em out and setup a map to original function/method name...
 545     $SimilarityCoefficient = $SupportedSimilarityCoefficient;
 546     $SimilarityCoefficient =~ s/SimilarityCoefficient$//;
 547     push @SupportedSimilarityCoefficients, $SimilarityCoefficient;
 548     $SupportedSimilarityCoefficientsNameMap{lc($SimilarityCoefficient)} = $SimilarityCoefficient;
 549     $SupportedSimilarityCoefficientsMethodMap{lc($SimilarityCoefficient)} = $SupportedSimilarityCoefficient;
 550   }
 551 
 552   # Setup a list of similarity coefficients to use for calculating similarity matrices...
 553   my($SpecifiedCoefficient, @SpecifiedSimilarityCoefficients, %SpecifiedSimilarityCoefficientsNameMap, %SpecifiedSimilarityCoefficientsMethodMap);
 554 
 555   @SpecifiedSimilarityCoefficients = ();
 556   %SpecifiedSimilarityCoefficientsNameMap = ();
 557   %SpecifiedSimilarityCoefficientsMethodMap = ();
 558 
 559   if ($Options{mode} =~ /^All$/i) {
 560     push @SpecifiedSimilarityCoefficients, @SupportedSimilarityCoefficients;
 561   }
 562   else {
 563     # Comma delimited list of similarity coefficients...
 564     my($Mode, @SpecifiedCoefficients, @UnsupportedSpecifiedCoefficients);
 565 
 566     $Mode = $Options{mode};
 567     $Mode =~ s/ //g;
 568     @SpecifiedCoefficients = split ",", $Mode;
 569     @UnsupportedSpecifiedCoefficients = ();
 570 
 571     for $SpecifiedCoefficient (@SpecifiedCoefficients) {
 572       if (exists($SupportedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)})) {
 573 	push @SpecifiedSimilarityCoefficients, $SpecifiedCoefficient;
 574       }
 575       else {
 576 	push @UnsupportedSpecifiedCoefficients, $SpecifiedCoefficient;
 577       }
 578     }
 579     if (@UnsupportedSpecifiedCoefficients) {
 580       if (@UnsupportedSpecifiedCoefficients > 1) {
 581 	warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedCoefficients, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
 582       }
 583       else {
 584 	warn "Error: The value specified, @UnsupportedSpecifiedCoefficients, for option \"-m --mode\" is not valid.\n";
 585       }
 586       die "Allowed values:", JoinWords(\@SupportedSimilarityCoefficients, ", ", 0), "\n";
 587     }
 588   }
 589   COEFFICIENT: for $SpecifiedCoefficient (@SpecifiedSimilarityCoefficients) {
 590     if (exists $SpecifiedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)} ) {
 591       next COEFFICIENT;
 592     }
 593     $SpecifiedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)} = $SupportedSimilarityCoefficientsMethodMap{lc($SpecifiedCoefficient)};
 594     $SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient)} = $SupportedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient)};
 595   }
 596   $OptionsInfo{Mode} = $Options{mode};
 597   $OptionsInfo{SpecifiedSimilarityCoefficientsRef} = \@SpecifiedSimilarityCoefficients;
 598   $OptionsInfo{SpecifiedSimilarityCoefficientsNameMapRef} = \%SpecifiedSimilarityCoefficientsNameMap;
 599   $OptionsInfo{SpecifiedSimilarityCoefficientsMethodMapRef} = \%SpecifiedSimilarityCoefficientsMethodMap;
 600 
 601   # Make sure valid alpha parameter is specified for Tversky calculation...
 602   $OptionsInfo{Alpha} = '';
 603   $SpecifiedCoefficient = 'Tversky';
 604   if ($SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient)}) {
 605     if (IsEmpty($Options{alpha})) {
 606       die "Error: You must specify a value for \"-a, --alpha\" option in \"$SpecifiedCoefficient or All\" \"-m --mode\". \n";
 607     }
 608     my($Alpha);
 609     $Alpha = $Options{alpha};
 610     if (!(IsFloat($Alpha) && $Alpha >=0 && $Alpha <= 1)) {
 611       die "Error: The value specified, $Options{alpha}, for option \"-a, --alpha\" is not valid. Allowed values: >= 0 and <= 1\n";
 612     }
 613     $OptionsInfo{Alpha} = $Alpha;
 614   }
 615 
 616   # Make sure valid beta parameter is specified for WeightedTanimoto and WeightedTversky
 617   # calculations...
 618   my($SpecifiedCoefficient1, $SpecifiedCoefficient2);
 619   $OptionsInfo{Beta} = '';
 620   $SpecifiedCoefficient1 = 'WeightedTversky';
 621   $SpecifiedCoefficient2 = 'WeightedTanimoto';
 622   if ($SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient1)} || $SpecifiedSimilarityCoefficientsNameMap{lc($SpecifiedCoefficient1)}) {
 623     if (IsEmpty($Options{beta})) {
 624       die "Error: You must specify a value for \"-b, --beta\" option in \"$SpecifiedCoefficient1, $SpecifiedCoefficient2, or All\" \"-m --mode\". \n";
 625     }
 626     my($Beta);
 627     $Beta = $Options{beta};
 628     if (!(IsFloat($Beta) && $Beta >=0 && $Beta <= 1)) {
 629       die "Error: The value specified, $Options{beta}, for option \"-b, --beta\" is not valid. Allowed values: >= 0 and <= 1\n";
 630     }
 631     $OptionsInfo{Beta} = $Beta;
 632   }
 633 
 634   $OptionsInfo{ColMode} = $Options{colmode};
 635 
 636   if (IsNotEmpty($Options{compoundidcol})) {
 637     if ($Options{colmode} =~ /^ColNum$/i) {
 638       if (!IsPositiveInteger($Options{compoundidcol})) {
 639 	die "Error: Column value, $Options{compoundidcol}, specified using \"--CompoundIDCol\" is not valid: Allowed integer values: > 0.\n";
 640       }
 641     }
 642     $OptionsInfo{CompoundIDCol} = $Options{compoundidcol};
 643   }
 644   else {
 645     $OptionsInfo{CompoundIDCol} = 'UseDefault';
 646   }
 647 
 648   if (IsNotEmpty($Options{fingerprintscol})) {
 649     if ($Options{colmode} =~ /^ColNum$/i) {
 650       if (!IsPositiveInteger($Options{fingerprintscol})) {
 651 	die "Error: Column value, $Options{fingerprintscol}, specified using \"--FingerprintsCol\" is not valid: Allowed integer values: > 0.\n";
 652       }
 653     }
 654     $OptionsInfo{FingerprintsCol} = $Options{fingerprintscol};
 655   }
 656   else {
 657     $OptionsInfo{FingerprintsCol} = 'UseDefault';
 658   }
 659   if (IsNotEmpty($Options{compoundidcol}) && IsNotEmpty($Options{fingerprintscol})) {
 660     if (IsPositiveInteger($Options{compoundidcol}) && IsPositiveInteger($Options{fingerprintscol})) {
 661       if (($Options{compoundidcol} == $Options{fingerprintscol})) {
 662 	die "Error: Values specified using \"--CompoundIDCol\" and \"--FingerprintsCol\", $Options{compoundidcol}, must be different.\n";
 663       }
 664     }
 665     else {
 666       if (($Options{compoundidcol} eq $Options{fingerprintscol})) {
 667 	die "Error: Values specified using \"--CompoundIDCol\" and \"--FingerprintsCol\", $Options{compoundidcol}, must be different.\n";
 668       }
 669     }
 670   }
 671 
 672 
 673   $OptionsInfo{Detail} = $Options{detail};
 674 
 675   $OptionsInfo{FingerprintsFormatMode} = $Options{fingerprintsformatmode};
 676   $OptionsInfo{FingerprintsString} = '';
 677   if ($Options{fingerprintsformatmode} =~ /^Specify$/i) {
 678     if (IsEmpty($Options{fingerprintsstring})) {
 679       die "Error: You must specify a value for \"--FingerprintsString\" option in \"Specify\" \"--FingerprintsFormatMode\". \n";
 680     }
 681     if ($Options{fingerprintsstring} !~ /^(Hexadecimal|Binary|RawBinary)$/i) {
 682       die "Error: The value specified, $Options{fingerprintsstring}, for option \"--FingerprintsString\" is not valid. Allowed values: Hexadecimal, Binary, or RawBinary\n";
 683     }
 684     $OptionsInfo{FingerprintsString} = $Options{fingerprintsstring};
 685   }
 686 
 687   $OptionsInfo{InDelim} = ($Options{indelim} =~ /semicolon/i) ? "\;" : "\,";
 688   $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
 689   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;
 690 
 691   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 692   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 693 
 694   $OptionsInfo{Fast} = $Options{fast} ? 1 : 0;
 695 
 696   $OptionsInfo{Precision} = $Options{precision};
 697 }
 698 
 699 # Setup script usage  and retrieve command line arguments specified using various options...
 700 sub SetupScriptUsage {
 701 
 702   # Retrieve all the options...
 703   %Options = ();
 704 
 705   $Options{alpha} = 0.5;
 706   $Options{beta} = 1;
 707   $Options{colmode} = 'colnum';
 708   $Options{fingerprintsformatmode} = 'Internal';
 709   $Options{detail} = 1;
 710   $Options{mode} = 'Tanimoto';
 711   $Options{indelim} = 'comma';
 712   $Options{outdelim} = 'comma';
 713   $Options{quote} = 'yes';
 714   $Options{precision} = 2;
 715 
 716   if (!GetOptions(\%Options, "alpha|a=f", "beta|b=f", "colmode|c=s", "compoundidcol=s", "detail|d=i", "fast|f", "fingerprintscol=s", "fingerprintsformatmode=s", "fingerprintsstring=s", "help|h", "mode|m=s",  "indelim=s", "outdelim=s", "overwrite|o", "precision|p=s", "quote|q=s", "root|r=s", "workingdir|w=s")) {
 717     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 718   }
 719   if ($Options{workingdir}) {
 720     if (! -d $Options{workingdir}) {
 721       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 722     }
 723     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 724   }
 725   if ($Options{colmode} !~ /^(ColNum|ColLabel)$/i) {
 726     die "Error: The value specified, $Options{colmode}, for option \"-c, --ColMode\" is not valid. Allowed values: ColNum, or ColLabel\n";
 727   }
 728   if (!IsPositiveInteger($Options{detail})) {
 729     die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n";
 730   }
 731   if ($Options{fingerprintsformatmode} !~ /^(Internal|Specify)$/i) {
 732     die "Error: The value specified, $Options{fingerprintsformatmode}, for option \"--FingerprintsFormatMode\" is not valid. Allowed values: Internal or Specify\n";
 733   }
 734   if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 735     die "Error: The value specified, $Options{indelim}, for option \"--InDelim\" is not valid. Allowed values: comma, or semicolon\n";
 736   }
 737   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 738     die "Error: The value specified, $Options{outdelim}, for option \"--OutDelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 739   }
 740   if ($Options{quote} !~ /^(Yes|No)$/i) {
 741     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
 742   }
 743   if (!IsPositiveInteger($Options{precision})) {
 744     die "Error: The value specified, $Options{precision}, for option \"--precision\" is not valid. Allowed values: > 0 \n";
 745   }
 746 }
 747