MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: TopologicalPharmacophoreAtomTripletsFingerprints.pl,v $
   4 # $Date: 2011/12/16 00:03:32 $
   5 # $Revision: 1.27 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2012 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use SDFileUtil;
  38 use MoleculeFileIO;
  39 use FileIO::FingerprintsSDFileIO;
  40 use FileIO::FingerprintsTextFileIO;
  41 use FileIO::FingerprintsFPFileIO;
  42 use AtomTypes::FunctionalClassAtomTypes;
  43 use Fingerprints::TopologicalPharmacophoreAtomTripletsFingerprints;
  44 
  45 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  46 
  47 # Autoflush STDOUT
  48 $| = 1;
  49 
  50 # Starting message...
  51 $ScriptName = basename($0);
  52 print "\n$ScriptName: Starting...\n\n";
  53 $StartTime = new Benchmark;
  54 
  55 # Get the options and setup script...
  56 SetupScriptUsage();
  57 if ($Options{help} || @ARGV < 1) {
  58   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  59 }
  60 
  61 my(@SDFilesList);
  62 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  63 
  64 # Process options...
  65 print "Processing options...\n";
  66 my(%OptionsInfo);
  67 ProcessOptions();
  68 
  69 # Setup information about input files...
  70 print "Checking input SD file(s)...\n";
  71 my(%SDFilesInfo);
  72 RetrieveSDFilesInfo();
  73 
  74 # Process input files..
  75 my($FileIndex);
  76 if (@SDFilesList > 1) {
  77   print "\nProcessing SD files...\n";
  78 }
  79 for $FileIndex (0 .. $#SDFilesList) {
  80   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  81     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  82     GenerateTopologicalPharmacophoreAtomTripletsFingerprints($FileIndex);
  83   }
  84 }
  85 print "\n$ScriptName:Done...\n\n";
  86 
  87 $EndTime = new Benchmark;
  88 $TotalTime = timediff ($EndTime, $StartTime);
  89 print "Total time: ", timestr($TotalTime), "\n";
  90 
  91 ###############################################################################
  92 
  93 # Generate fingerprints for a SD file...
  94 #
  95 sub GenerateTopologicalPharmacophoreAtomTripletsFingerprints {
  96   my($FileIndex) = @_;
  97   my($CmpdCount, $IgnoredCmpdCount, $SDFile, $MoleculeFileIO, $Molecule, $TopologicalPharmacophoreAtomTripletsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, $SetupOutputFiles);
  98 
  99   $SDFile = $SDFilesList[$FileIndex];
 100 
 101   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;
 102   $SetupOutputFiles = 1;
 103 
 104   $MoleculeFileIO = new MoleculeFileIO('Name' => $SDFile);
 105   $MoleculeFileIO->Open();
 106 
 107   $CmpdCount = 0;
 108   $IgnoredCmpdCount = 0;
 109 
 110   COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
 111     $CmpdCount++;
 112 
 113     # Filter compound data before calculating fingerprints...
 114     if ($OptionsInfo{Filter}) {
 115       if (CheckAndFilterCompound($CmpdCount, $Molecule)) {
 116         $IgnoredCmpdCount++;
 117         next COMPOUND;
 118       }
 119     }
 120 
 121     $TopologicalPharmacophoreAtomTripletsFingerprints = GenerateMoleculeFingerprints($Molecule);
 122     if (!$TopologicalPharmacophoreAtomTripletsFingerprints) {
 123       $IgnoredCmpdCount++;
 124       ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
 125       next COMPOUND;
 126     }
 127 
 128     if ($SetupOutputFiles) {
 129       $SetupOutputFiles = 0;
 130       SetupFingerprintsLabelValueIDs($TopologicalPharmacophoreAtomTripletsFingerprints);
 131       ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);
 132     }
 133 
 134     WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $TopologicalPharmacophoreAtomTripletsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 135   }
 136   $MoleculeFileIO->Close();
 137 
 138   if ($NewFPSDFileIO) {
 139     $NewFPSDFileIO->Close();
 140   }
 141   if ($NewFPTextFileIO) {
 142     $NewFPTextFileIO->Close();
 143   }
 144   if ($NewFPFileIO) {
 145     $NewFPFileIO->Close();
 146   }
 147 
 148   WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
 149 }
 150 
 151 # Process compound being ignored due to problems in fingerprints geneation...
 152 #
 153 sub ProcessIgnoredCompound {
 154   my($Mode, $CmpdCount, $Molecule) = @_;
 155   my($CmpdID, $DataFieldLabelAndValuesRef);
 156 
 157   $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 158   $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 159 
 160   MODE: {
 161     if ($Mode =~ /^ContainsNonElementalData$/i) {
 162       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
 163       next MODE;
 164     }
 165 
 166     if ($Mode =~ /^ContainsNoElementalData$/i) {
 167       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
 168       next MODE;
 169     }
 170 
 171     if ($Mode =~ /^FingerprintsGenerationFailed$/i) {
 172       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 173       next MODE;
 174     }
 175     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 176   }
 177 }
 178 
 179 # Check and filter compounds....
 180 #
 181 sub CheckAndFilterCompound {
 182   my($CmpdCount, $Molecule) = @_;
 183   my($ElementCount, $NonElementCount);
 184 
 185   ($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();
 186 
 187   if ($NonElementCount) {
 188     ProcessIgnoredCompound('ContainsNonElementalData', $CmpdCount, $Molecule);
 189     return 1;
 190   }
 191 
 192   if (!$ElementCount) {
 193     ProcessIgnoredCompound('ContainsNoElementalData', $CmpdCount, $Molecule);
 194     return 1;
 195   }
 196 
 197   return 0;
 198 }
 199 
 200 # Write out compounds fingerprints generation summary statistics...
 201 #
 202 sub WriteFingerprintsGenerationSummaryStatistics {
 203   my($CmpdCount, $IgnoredCmpdCount) = @_;
 204   my($ProcessedCmpdCount);
 205 
 206   $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;
 207 
 208   print "\nNumber of compounds: $CmpdCount\n";
 209   print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
 210   print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
 211 }
 212 
 213 # Append atom pair value IDs to fingerprint label...
 214 #
 215 sub SetupFingerprintsLabelValueIDs {
 216   my($TopologicalPharmacophoreAtomTripletsFingerprints) = @_;
 217 
 218   if ($OptionsInfo{AtomTripletsSetSizeToUse} =~ /^ArbitrarySize$/i ||
 219       $OptionsInfo{FingerprintsLabelMode} !~ /^FingerprintsLabelWithIDs$/i) {
 220     return;
 221   }
 222   $OptionsInfo{FingerprintsLabel} .= "; Value IDs: " . $TopologicalPharmacophoreAtomTripletsFingerprints->GetFingerprintsVector->GetValueIDsString();
 223 }
 224 
 225 # Open output files...
 226 #
 227 sub SetupAndOpenOutputFiles {
 228   my($FileIndex) = @_;
 229   my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams);
 230 
 231   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;
 232 
 233   # Setup common parameters for fingerprints file IO objects...
 234   #
 235   %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});
 236 
 237   if ($OptionsInfo{SDOutput}) {
 238     $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
 239     print "Generating SD file $NewFPSDFile...\n";
 240     $NewFPSDFileIO = new FingerprintsSDFileIO('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
 241     $NewFPSDFileIO->Open();
 242   }
 243 
 244   if ($OptionsInfo{FPOutput}) {
 245     $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
 246     print "Generating FP file $NewFPFile...\n";
 247     $NewFPFileIO = new FingerprintsFPFileIO('Name' => $NewFPFile, %FingerprintsFileIOParams);
 248     $NewFPFileIO->Open();
 249   }
 250 
 251   if ($OptionsInfo{TextOutput}) {
 252     my($ColLabelsRef);
 253 
 254     $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
 255     $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);
 256 
 257     print "Generating text file $NewFPTextFile...\n";
 258     $NewFPTextFileIO = new FingerprintsTextFileIO('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
 259     $NewFPTextFileIO->Open();
 260   }
 261 
 262   return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 263 }
 264 
 265 # Write fingerpritns and other data to appropriate output files...
 266 #
 267 sub WriteDataToOutputFiles {
 268   my($FileIndex, $CmpdCount, $Molecule, $TopologicalPharmacophoreAtomTripletsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;
 269   my($DataFieldLabelAndValuesRef);
 270 
 271   $DataFieldLabelAndValuesRef = undef;
 272   if ($NewFPTextFileIO || $NewFPFileIO) {
 273     $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 274   }
 275 
 276   if ($NewFPSDFileIO) {
 277     my($CmpdString);
 278 
 279     $CmpdString = $Molecule->GetInputMoleculeString();
 280     $NewFPSDFileIO->WriteFingerprints($TopologicalPharmacophoreAtomTripletsFingerprints, $CmpdString);
 281   }
 282 
 283   if ($NewFPTextFileIO) {
 284     my($ColValuesRef);
 285 
 286     $ColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 287     $NewFPTextFileIO->WriteFingerprints($TopologicalPharmacophoreAtomTripletsFingerprints, $ColValuesRef);
 288   }
 289 
 290   if ($NewFPFileIO) {
 291     my($CompoundID);
 292 
 293     $CompoundID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 294     $NewFPFileIO->WriteFingerprints($TopologicalPharmacophoreAtomTripletsFingerprints, $CompoundID);
 295   }
 296 }
 297 
 298 # Generate approriate column labels for FPText output file...
 299 #
 300 sub SetupFPTextFileCoulmnLabels {
 301   my($FileIndex) = @_;
 302   my($Line, @ColLabels);
 303 
 304   @ColLabels = ();
 305   if ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 306     push @ColLabels, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 307   }
 308   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 309     push @ColLabels, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 310   }
 311   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 312     push @ColLabels, @{$OptionsInfo{SpecifiedDataFields}};
 313   }
 314   elsif ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 315     push @ColLabels, $OptionsInfo{CompoundIDLabel};
 316   }
 317   # Add fingerprints label...
 318   push @ColLabels, $OptionsInfo{FingerprintsLabel};
 319 
 320   return \@ColLabels;
 321 }
 322 
 323 # Generate column values FPText output file..
 324 #
 325 sub SetupFPTextFileCoulmnValues {
 326   my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 327   my(@ColValues);
 328 
 329   @ColValues = ();
 330   if ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 331     push @ColValues, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 332   }
 333   elsif ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 334     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 335   }
 336   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 337     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 338   }
 339   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 340     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$OptionsInfo{SpecifiedDataFields}};
 341   }
 342 
 343   return \@ColValues;
 344 }
 345 
 346 # Generate compound ID for FP and FPText output files..
 347 #
 348 sub SetupCmpdIDForOutputFiles {
 349   my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 350   my($CmpdID);
 351 
 352   $CmpdID = '';
 353   if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
 354     my($MolName);
 355     $MolName = $Molecule->GetName();
 356     $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
 357   }
 358   elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
 359     $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
 360   }
 361   elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
 362     my($SpecifiedDataField);
 363     $SpecifiedDataField = $OptionsInfo{CompoundID};
 364     $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
 365   }
 366   elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
 367     $CmpdID = $Molecule->GetName();
 368   }
 369   return $CmpdID;
 370 }
 371 
 372 # Generate fingerprints for molecule...
 373 #
 374 sub GenerateMoleculeFingerprints {
 375   my($Molecule) = @_;
 376   my($TopologicalPharmacophoreAtomTripletsFingerprints);
 377 
 378   if ($OptionsInfo{KeepLargestComponent}) {
 379     $Molecule->KeepLargestComponent();
 380   }
 381   if (!$Molecule->DetectRings()) {
 382     return undef;
 383   }
 384   $Molecule->DetectAromaticity();
 385 
 386   $TopologicalPharmacophoreAtomTripletsFingerprints = new TopologicalPharmacophoreAtomTripletsFingerprints('Molecule' => $Molecule, 'AtomTripletsSetSizeToUse' => $OptionsInfo{AtomTripletsSetSizeToUse}, 'MinDistance' => $OptionsInfo{MinDistance},  'MaxDistance' => $OptionsInfo{MaxDistance}, 'DistanceBinSize' => $OptionsInfo{DistanceBinSize}, 'UseTriangleInequality' => $OptionsInfo{UseTriangleInequality}, 'AtomTypesToUse' => \@{$OptionsInfo{AtomTypesToUse}});
 387 
 388   # Generate fingerprints...
 389   $TopologicalPharmacophoreAtomTripletsFingerprints->GenerateFingerprints();
 390 
 391   # Make sure fingerprints generation is successful...
 392   if (!$TopologicalPharmacophoreAtomTripletsFingerprints->IsFingerprintsGenerationSuccessful()) {
 393     return undef;
 394   }
 395 
 396   return $TopologicalPharmacophoreAtomTripletsFingerprints;
 397 }
 398 
 399 # Retrieve information about SD files...
 400 #
 401 sub RetrieveSDFilesInfo {
 402   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);
 403 
 404   %SDFilesInfo = ();
 405   @{$SDFilesInfo{FileOkay}} = ();
 406   @{$SDFilesInfo{OutFileRoot}} = ();
 407   @{$SDFilesInfo{SDOutFileNames}} = ();
 408   @{$SDFilesInfo{FPOutFileNames}} = ();
 409   @{$SDFilesInfo{TextOutFileNames}} = ();
 410   @{$SDFilesInfo{AllDataFieldsRef}} = ();
 411   @{$SDFilesInfo{CommonDataFieldsRef}} = ();
 412 
 413   $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
 414   $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;
 415 
 416   FILELIST: for $Index (0 .. $#SDFilesList) {
 417     $SDFile = $SDFilesList[$Index];
 418 
 419     $SDFilesInfo{FileOkay}[$Index] = 0;
 420     $SDFilesInfo{OutFileRoot}[$Index] = '';
 421     $SDFilesInfo{SDOutFileNames}[$Index] = '';
 422     $SDFilesInfo{FPOutFileNames}[$Index] = '';
 423     $SDFilesInfo{TextOutFileNames}[$Index] = '';
 424 
 425     $SDFile = $SDFilesList[$Index];
 426     if (!(-e $SDFile)) {
 427       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 428       next FILELIST;
 429     }
 430     if (!CheckFileType($SDFile, "sd sdf")) {
 431       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 432       next FILELIST;
 433     }
 434 
 435     if ($CheckDataField) {
 436       # Make sure data field exists in SD file..
 437       my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);
 438 
 439       @CmpdLines = ();
 440       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 441       $CmpdString = ReadCmpdString(\*SDFILE);
 442       close SDFILE;
 443       @CmpdLines = split "\n", $CmpdString;
 444       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 445       $SpecifiedDataField = $OptionsInfo{CompoundID};
 446       if (!exists $DataFieldValues{$SpecifiedDataField}) {
 447         warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
 448         next FILELIST;
 449       }
 450     }
 451 
 452     $AllDataFieldsRef = '';
 453     $CommonDataFieldsRef = '';
 454     if ($CollectDataFields) {
 455       my($CmpdCount);
 456       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 457       ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
 458       close SDFILE;
 459     }
 460 
 461     # Setup output file names...
 462     $FileDir = ""; $FileName = ""; $FileExt = "";
 463     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 464 
 465     $TextOutFileExt = "csv";
 466     if ($Options{outdelim} =~ /^tab$/i) {
 467       $TextOutFileExt = "tsv";
 468     }
 469     $SDOutFileExt = $FileExt;
 470     $FPOutFileExt = "fpf";
 471 
 472     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 473       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 474       if ($RootFileName && $RootFileExt) {
 475         $FileName = $RootFileName;
 476       }
 477       else {
 478         $FileName = $OptionsInfo{OutFileRoot};
 479       }
 480       $OutFileRoot = $FileName;
 481     }
 482     else {
 483       $OutFileRoot = "${FileName}TopologicalPharmacophoreAtomTripletsFP";
 484     }
 485 
 486     $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
 487     $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
 488     $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";
 489 
 490     if ($OptionsInfo{SDOutput}) {
 491       if ($SDFile =~ /$NewSDFileName/i) {
 492         warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
 493         print "Specify a different name using \"-r --root\" option or use default name.\n";
 494         next FILELIST;
 495       }
 496     }
 497 
 498     if (!$OptionsInfo{OverwriteFiles}) {
 499       # Check SD and text outout files...
 500       if ($OptionsInfo{SDOutput}) {
 501         if (-e $NewSDFileName) {
 502           warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
 503           next FILELIST;
 504         }
 505       }
 506       if ($OptionsInfo{FPOutput}) {
 507         if (-e $NewFPFileName) {
 508           warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
 509           next FILELIST;
 510         }
 511       }
 512       if ($OptionsInfo{TextOutput}) {
 513         if (-e $NewTextFileName) {
 514           warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
 515           next FILELIST;
 516         }
 517       }
 518     }
 519 
 520     $SDFilesInfo{FileOkay}[$Index] = 1;
 521 
 522     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 523     $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
 524     $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
 525     $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;
 526 
 527     $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
 528     $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
 529   }
 530 }
 531 
 532 # Process option values...
 533 sub ProcessOptions {
 534   %OptionsInfo = ();
 535 
 536   ProcessAtomTypesToUseOption();
 537 
 538   $OptionsInfo{AtomTripletsSetSizeToUse} = $Options{atomtripletssetsizetouse};
 539 
 540   $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
 541   $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
 542   $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};
 543 
 544   my(@SpecifiedDataFields);
 545   @SpecifiedDataFields = ();
 546 
 547   @{$OptionsInfo{SpecifiedDataFields}} = ();
 548   $OptionsInfo{CompoundID} = '';
 549 
 550   if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
 551     if ($Options{compoundidmode} =~ /^DataField$/i) {
 552       if (!$Options{compoundid}) {
 553         die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
 554       }
 555       $OptionsInfo{CompoundID} = $Options{compoundid};
 556     }
 557     elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
 558       $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
 559     }
 560   }
 561   elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
 562     if (!$Options{datafields}) {
 563       die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
 564     }
 565     @SpecifiedDataFields = split /\,/, $Options{datafields};
 566     push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
 567   }
 568 
 569   $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;
 570 
 571   $OptionsInfo{FingerprintsLabelMode} = $Options{fingerprintslabelmode};
 572   $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'TopologicalPharmacophoreAtomTripletsFingerprints';
 573 
 574   $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;
 575 
 576   $OptionsInfo{DistanceBinSize} = $Options{distancebinsize};
 577 
 578   $OptionsInfo{MinDistance} = $Options{mindistance};
 579   $OptionsInfo{MaxDistance} = $Options{maxdistance};
 580 
 581   $OptionsInfo{Output} = $Options{output};
 582   $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
 583   $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
 584   $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;
 585 
 586   $OptionsInfo{OutDelim} = $Options{outdelim};
 587   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;
 588 
 589   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 590   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 591 
 592   $OptionsInfo{UseTriangleInequality} = ($Options{usetriangleinequality} =~ /^Yes$/i) ? 1 : 0;
 593 
 594   # Setup default vector string format...
 595   my($VectorStringFormat);
 596   $VectorStringFormat = '';
 597 
 598   if ($Options{vectorstringformat}) {
 599     $VectorStringFormat = $Options{vectorstringformat};
 600 
 601     if ($Options{atomtripletssetsizetouse} =~ /^ArbitrarySize$/i && $VectorStringFormat =~ /^ValuesString$/i) {
 602       die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid for $Options{atomtripletssetsizetouse} value of \"--AtomTripletsSetSizeToUse\" option. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
 603     }
 604   }
 605   else {
 606     $VectorStringFormat = ($Options{atomtripletssetsizetouse} =~ /^FixedSize$/) ? "ValuesString" : "IDsAndValuesString";
 607   }
 608   $OptionsInfo{VectorStringFormat} = $VectorStringFormat;
 609 }
 610 
 611 # Process atom type to use option...
 612 #
 613 sub ProcessAtomTypesToUseOption {
 614   my($AtomType, $SpecifiedAtomTypesToUse, @AtomTypesWords);
 615 
 616   @{$OptionsInfo{AtomTypesToUse}} = ();
 617   if (IsEmpty($Options{atomtypestouse})) {
 618     die "Error: Atom types value specified using \"-a, --AtomTypesToUse\" option is empty\n";
 619   }
 620 
 621   $SpecifiedAtomTypesToUse = $Options{atomtypestouse};
 622   $SpecifiedAtomTypesToUse =~ s/ //g;
 623   @AtomTypesWords = split /\,/, $SpecifiedAtomTypesToUse;
 624 
 625   for $AtomType (@AtomTypesWords) {
 626     if (!FunctionalClassAtomTypes::IsFunctionalClassAvailable($AtomType)) {
 627       die "Error: Atom type specified, $AtomType, using \"-a, --AtomTypesToUse\" option is not valid...\n ";
 628     }
 629     push @{$OptionsInfo{AtomTypesToUse}}, $AtomType;
 630   }
 631 }
 632 
 633 # Setup script usage  and retrieve command line arguments specified using various options...
 634 sub SetupScriptUsage {
 635 
 636   # Retrieve all the options...
 637   %Options = ();
 638 
 639   $Options{atomtripletssetsizetouse} = 'ArbitrarySize';
 640 
 641   $Options{atomtypestouse} = 'HBD,HBA,PI,NI,H,Ar';
 642 
 643   $Options{compoundidmode} = 'LabelPrefix';
 644   $Options{compoundidlabel} = 'CompoundID';
 645   $Options{datafieldsmode} = 'CompoundID';
 646 
 647   $Options{filter} = 'Yes';
 648 
 649   $Options{fingerprintslabelmode} = 'FingerprintsLabelOnly';
 650 
 651   $Options{keeplargestcomponent} = 'Yes';
 652 
 653   $Options{mindistance} = 1;
 654   $Options{maxdistance} = 10;
 655 
 656   $Options{distancebinsize} = 2;
 657 
 658   $Options{usetriangleinequality} = 'Yes';
 659 
 660   $Options{output} = 'text';
 661   $Options{outdelim} = 'comma';
 662   $Options{quote} = 'yes';
 663 
 664   $Options{vectorstringformat} = '';
 665 
 666   if (!GetOptions(\%Options, "atomtripletssetsizetouse=s", "atomtypestouse|a=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "distancebinsize=s", "filter|f=s", "fingerprintslabelmode=s", "fingerprintslabel=s", "help|h", "keeplargestcomponent|k=s",  "mindistance=s", "maxdistance=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "usetriangleinequality|u=s", "vectorstringformat|v=s", "workingdir|w=s")) {
 667     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 668   }
 669   if ($Options{workingdir}) {
 670     if (! -d $Options{workingdir}) {
 671       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 672     }
 673     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 674   }
 675   if ($Options{atomtripletssetsizetouse} !~ /^(ArbitrarySize|FixedSize)$/i) {
 676     die "Error: The value specified, $Options{atomtripletssetsizetouse}, for option \"--AtomTripletsSetSizeToUse\" is not valid. Allowed values: ArbitrarySize or FixedSize\n";
 677   }
 678   if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
 679     die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
 680   }
 681   if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
 682     die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
 683   }
 684   if (!IsPositiveInteger($Options{distancebinsize})) {
 685     die "Error: The value specified, $Options{distancebinsize}, for option \"--DistanceBinSize\" is not valid. Allowed values: > 0 \n";
 686   }
 687   if ($Options{filter} !~ /^(Yes|No)$/i) {
 688     die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
 689   }
 690   if ($Options{fingerprintslabelmode} !~ /^(FingerprintsLabelOnly|FingerprintsLabelWithIDs)$/i) {
 691     die "Error: The value specified, $Options{fingerprintslabelmode}, for option \"--FingerprintsLabelMode\" is not valid. Allowed values: FingerprintsLabelOnly or FingerprintsLabelWithIDs\n";
 692   }
 693   if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
 694     die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
 695   }
 696   if (!IsPositiveInteger($Options{mindistance})) {
 697     die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: > 0 \n";
 698   }
 699   if (!IsPositiveInteger($Options{maxdistance})) {
 700     die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
 701   }
 702   if ($Options{mindistance} > $Options{maxdistance}) {
 703     die "Error: The value specified, specified, $Options{mindistance}, for option \"--MinDistance\" must be less than the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
 704   }
 705   if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
 706     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
 707   }
 708   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 709     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 710   }
 711   if ($Options{quote} !~ /^(Yes|No)$/i) {
 712     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
 713   }
 714   if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
 715     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
 716   }
 717   if ($Options{usetriangleinequality} !~ /^(Yes|No)$/i) {
 718     die "Error: The value specified, $Options{usetriangleinequality}, for option \"-u, --UseTriangleInequality\" is not valid. Allowed values: Yes or No\n";
 719   }
 720   if ($Options{vectorstringformat} && $Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
 721     die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
 722   }
 723 }
 724