MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: TopologicalAtomPairsFingerprints.pl,v $
   4 # $Date: 2011/12/16 00:03:31 $
   5 # $Revision: 1.27 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2012 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use SDFileUtil;
  38 use MoleculeFileIO;
  39 use FileIO::FingerprintsSDFileIO;
  40 use FileIO::FingerprintsTextFileIO;
  41 use FileIO::FingerprintsFPFileIO;
  42 use AtomTypes::AtomicInvariantsAtomTypes;
  43 use AtomTypes::FunctionalClassAtomTypes;
  44 use Fingerprints::TopologicalAtomPairsFingerprints;
  45 
  46 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  47 
  48 # Autoflush STDOUT
  49 $| = 1;
  50 
  51 # Starting message...
  52 $ScriptName = basename($0);
  53 print "\n$ScriptName: Starting...\n\n";
  54 $StartTime = new Benchmark;
  55 
  56 # Get the options and setup script...
  57 SetupScriptUsage();
  58 if ($Options{help} || @ARGV < 1) {
  59   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  60 }
  61 
  62 my(@SDFilesList);
  63 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  64 
  65 # Process options...
  66 print "Processing options...\n";
  67 my(%OptionsInfo);
  68 ProcessOptions();
  69 
  70 # Setup information about input files...
  71 print "Checking input SD file(s)...\n";
  72 my(%SDFilesInfo);
  73 RetrieveSDFilesInfo();
  74 
  75 # Process input files..
  76 my($FileIndex);
  77 if (@SDFilesList > 1) {
  78   print "\nProcessing SD files...\n";
  79 }
  80 for $FileIndex (0 .. $#SDFilesList) {
  81   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  82     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  83     GenerateTopologicalAtomPairsFingerprints($FileIndex);
  84   }
  85 }
  86 print "\n$ScriptName:Done...\n\n";
  87 
  88 $EndTime = new Benchmark;
  89 $TotalTime = timediff ($EndTime, $StartTime);
  90 print "Total time: ", timestr($TotalTime), "\n";
  91 
  92 ###############################################################################
  93 
  94 # Generate fingerprints for a SD file...
  95 #
  96 sub GenerateTopologicalAtomPairsFingerprints {
  97   my($FileIndex) = @_;
  98   my($CmpdCount, $IgnoredCmpdCount, $SDFile, $MoleculeFileIO, $Molecule, $TopologicalAtomPairsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
  99 
 100   $SDFile = $SDFilesList[$FileIndex];
 101 
 102   # Setup output files...
 103   #
 104   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);
 105 
 106   $MoleculeFileIO = new MoleculeFileIO('Name' => $SDFile);
 107   $MoleculeFileIO->Open();
 108 
 109   $CmpdCount = 0;
 110   $IgnoredCmpdCount = 0;
 111 
 112   COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
 113     $CmpdCount++;
 114 
 115     # Filter compound data before calculating fingerprints...
 116     if ($OptionsInfo{Filter}) {
 117       if (CheckAndFilterCompound($CmpdCount, $Molecule)) {
 118         $IgnoredCmpdCount++;
 119         next COMPOUND;
 120       }
 121     }
 122 
 123     $TopologicalAtomPairsFingerprints = GenerateMoleculeFingerprints($Molecule);
 124     if (!$TopologicalAtomPairsFingerprints) {
 125       $IgnoredCmpdCount++;
 126       ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
 127       next COMPOUND;
 128     }
 129 
 130     WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $TopologicalAtomPairsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 131   }
 132   $MoleculeFileIO->Close();
 133 
 134   if ($NewFPSDFileIO) {
 135     $NewFPSDFileIO->Close();
 136   }
 137   if ($NewFPTextFileIO) {
 138     $NewFPTextFileIO->Close();
 139   }
 140   if ($NewFPFileIO) {
 141     $NewFPFileIO->Close();
 142   }
 143 
 144   WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
 145 }
 146 
 147 # Process compound being ignored due to problems in fingerprints geneation...
 148 #
 149 sub ProcessIgnoredCompound {
 150   my($Mode, $CmpdCount, $Molecule) = @_;
 151   my($CmpdID, $DataFieldLabelAndValuesRef);
 152 
 153   $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 154   $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 155 
 156   MODE: {
 157     if ($Mode =~ /^ContainsNonElementalData$/i) {
 158       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
 159       next MODE;
 160     }
 161 
 162     if ($Mode =~ /^ContainsNoElementalData$/i) {
 163       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
 164       next MODE;
 165     }
 166 
 167     if ($Mode =~ /^FingerprintsGenerationFailed$/i) {
 168       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 169       next MODE;
 170     }
 171     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 172   }
 173 }
 174 
 175 # Check and filter compounds....
 176 #
 177 sub CheckAndFilterCompound {
 178   my($CmpdCount, $Molecule) = @_;
 179   my($ElementCount, $NonElementCount);
 180 
 181   ($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();
 182 
 183   if ($NonElementCount) {
 184     ProcessIgnoredCompound('ContainsNonElementalData', $CmpdCount, $Molecule);
 185     return 1;
 186   }
 187 
 188   if (!$ElementCount) {
 189     ProcessIgnoredCompound('ContainsNoElementalData', $CmpdCount, $Molecule);
 190     return 1;
 191   }
 192 
 193   return 0;
 194 }
 195 
 196 # Write out compounds fingerprints generation summary statistics...
 197 #
 198 sub WriteFingerprintsGenerationSummaryStatistics {
 199   my($CmpdCount, $IgnoredCmpdCount) = @_;
 200   my($ProcessedCmpdCount);
 201 
 202   $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;
 203 
 204   print "\nNumber of compounds: $CmpdCount\n";
 205   print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
 206   print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
 207 }
 208 
 209 # Open output files...
 210 #
 211 sub SetupAndOpenOutputFiles {
 212   my($FileIndex) = @_;
 213   my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams);
 214 
 215   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;
 216 
 217   # Setup common parameters for fingerprints file IO objects...
 218   #
 219   %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});
 220 
 221   if ($OptionsInfo{SDOutput}) {
 222     $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
 223     print "Generating SD file $NewFPSDFile...\n";
 224     $NewFPSDFileIO = new FingerprintsSDFileIO('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
 225     $NewFPSDFileIO->Open();
 226   }
 227 
 228   if ($OptionsInfo{FPOutput}) {
 229     $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
 230     print "Generating FP file $NewFPFile...\n";
 231     $NewFPFileIO = new FingerprintsFPFileIO('Name' => $NewFPFile, %FingerprintsFileIOParams);
 232     $NewFPFileIO->Open();
 233   }
 234 
 235   if ($OptionsInfo{TextOutput}) {
 236     my($ColLabelsRef);
 237 
 238     $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
 239     $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);
 240 
 241     print "Generating text file $NewFPTextFile...\n";
 242     $NewFPTextFileIO = new FingerprintsTextFileIO('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
 243     $NewFPTextFileIO->Open();
 244   }
 245 
 246   return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 247 }
 248 
 249 # Write fingerpritns and other data to appropriate output files...
 250 #
 251 sub WriteDataToOutputFiles {
 252   my($FileIndex, $CmpdCount, $Molecule, $TopologicalAtomPairsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;
 253   my($DataFieldLabelAndValuesRef);
 254 
 255   $DataFieldLabelAndValuesRef = undef;
 256   if ($NewFPTextFileIO || $NewFPFileIO) {
 257     $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 258   }
 259 
 260   if ($NewFPSDFileIO) {
 261     my($CmpdString);
 262 
 263     $CmpdString = $Molecule->GetInputMoleculeString();
 264     $NewFPSDFileIO->WriteFingerprints($TopologicalAtomPairsFingerprints, $CmpdString);
 265   }
 266 
 267   if ($NewFPTextFileIO) {
 268     my($ColValuesRef);
 269 
 270     $ColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 271     $NewFPTextFileIO->WriteFingerprints($TopologicalAtomPairsFingerprints, $ColValuesRef);
 272   }
 273 
 274   if ($NewFPFileIO) {
 275     my($CompoundID);
 276 
 277     $CompoundID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 278     $NewFPFileIO->WriteFingerprints($TopologicalAtomPairsFingerprints, $CompoundID);
 279   }
 280 }
 281 
 282 # Generate approriate column labels for FPText output file...
 283 #
 284 sub SetupFPTextFileCoulmnLabels {
 285   my($FileIndex) = @_;
 286   my($Line, @ColLabels);
 287 
 288   @ColLabels = ();
 289   if ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 290     push @ColLabels, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 291   }
 292   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 293     push @ColLabels, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 294   }
 295   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 296     push @ColLabels, @{$OptionsInfo{SpecifiedDataFields}};
 297   }
 298   elsif ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 299     push @ColLabels, $OptionsInfo{CompoundIDLabel};
 300   }
 301   # Add fingerprints label...
 302   push @ColLabels, $OptionsInfo{FingerprintsLabel};
 303 
 304   return \@ColLabels;
 305 }
 306 
 307 # Generate column values FPText output file..
 308 #
 309 sub SetupFPTextFileCoulmnValues {
 310   my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 311   my(@ColValues);
 312 
 313   @ColValues = ();
 314   if ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 315     push @ColValues, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 316   }
 317   elsif ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 318     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 319   }
 320   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 321     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 322   }
 323   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 324     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$OptionsInfo{SpecifiedDataFields}};
 325   }
 326 
 327   return \@ColValues;
 328 }
 329 
 330 # Generate compound ID for FP and FPText output files..
 331 #
 332 sub SetupCmpdIDForOutputFiles {
 333   my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 334   my($CmpdID);
 335 
 336   $CmpdID = '';
 337   if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
 338     my($MolName);
 339     $MolName = $Molecule->GetName();
 340     $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
 341   }
 342   elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
 343     $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
 344   }
 345   elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
 346     my($SpecifiedDataField);
 347     $SpecifiedDataField = $OptionsInfo{CompoundID};
 348     $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
 349   }
 350   elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
 351     $CmpdID = $Molecule->GetName();
 352   }
 353   return $CmpdID;
 354 }
 355 
 356 # Generate fingerprints for molecule...
 357 #
 358 sub GenerateMoleculeFingerprints {
 359   my($Molecule) = @_;
 360   my($TopologicalAtomPairsFingerprints);
 361 
 362   if ($OptionsInfo{KeepLargestComponent}) {
 363     $Molecule->KeepLargestComponent();
 364   }
 365   if (!$Molecule->DetectRings()) {
 366     return undef;
 367   }
 368   $Molecule->DetectAromaticity();
 369 
 370   $TopologicalAtomPairsFingerprints = new TopologicalAtomPairsFingerprints('Molecule' => $Molecule, 'MinDistance' => $OptionsInfo{MinDistance},  'MaxDistance' => $OptionsInfo{MaxDistance}, 'AtomIdentifierType' => $OptionsInfo{AtomIdentifierType});
 371   SetAtomIdentifierTypeValuesToUse($TopologicalAtomPairsFingerprints);
 372 
 373   # Generate fingerprints...
 374   $TopologicalAtomPairsFingerprints->GenerateFingerprints();
 375 
 376   # Make sure fingerprints generation is successful...
 377   if (!$TopologicalAtomPairsFingerprints->IsFingerprintsGenerationSuccessful()) {
 378     return undef;
 379   }
 380 
 381   return $TopologicalAtomPairsFingerprints;
 382 }
 383 
 384 # Set atom identifier type to use for generating fingerprints...
 385 #
 386 sub SetAtomIdentifierTypeValuesToUse {
 387   my($TopologicalAtomPairsFingerprints) = @_;
 388 
 389   if ($OptionsInfo{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) {
 390     $TopologicalAtomPairsFingerprints->SetAtomicInvariantsToUse(\@{$OptionsInfo{AtomicInvariantsToUse}});
 391   }
 392   elsif ($OptionsInfo{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
 393     $TopologicalAtomPairsFingerprints->SetFunctionalClassesToUse(\@{$OptionsInfo{FunctionalClassesToUse}});
 394   }
 395   elsif ($OptionsInfo{AtomIdentifierType} =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
 396     # Nothing to do for now...
 397   }
 398   else {
 399     die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
 400   }
 401 }
 402 
 403 # Retrieve information about SD files...
 404 #
 405 sub RetrieveSDFilesInfo {
 406   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);
 407 
 408   %SDFilesInfo = ();
 409   @{$SDFilesInfo{FileOkay}} = ();
 410   @{$SDFilesInfo{OutFileRoot}} = ();
 411   @{$SDFilesInfo{SDOutFileNames}} = ();
 412   @{$SDFilesInfo{FPOutFileNames}} = ();
 413   @{$SDFilesInfo{TextOutFileNames}} = ();
 414   @{$SDFilesInfo{AllDataFieldsRef}} = ();
 415   @{$SDFilesInfo{CommonDataFieldsRef}} = ();
 416 
 417   $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
 418   $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;
 419 
 420   FILELIST: for $Index (0 .. $#SDFilesList) {
 421     $SDFile = $SDFilesList[$Index];
 422 
 423     $SDFilesInfo{FileOkay}[$Index] = 0;
 424     $SDFilesInfo{OutFileRoot}[$Index] = '';
 425     $SDFilesInfo{SDOutFileNames}[$Index] = '';
 426     $SDFilesInfo{FPOutFileNames}[$Index] = '';
 427     $SDFilesInfo{TextOutFileNames}[$Index] = '';
 428 
 429     $SDFile = $SDFilesList[$Index];
 430     if (!(-e $SDFile)) {
 431       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 432       next FILELIST;
 433     }
 434     if (!CheckFileType($SDFile, "sd sdf")) {
 435       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 436       next FILELIST;
 437     }
 438 
 439     if ($CheckDataField) {
 440       # Make sure data field exists in SD file..
 441       my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);
 442 
 443       @CmpdLines = ();
 444       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 445       $CmpdString = ReadCmpdString(\*SDFILE);
 446       close SDFILE;
 447       @CmpdLines = split "\n", $CmpdString;
 448       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 449       $SpecifiedDataField = $OptionsInfo{CompoundID};
 450       if (!exists $DataFieldValues{$SpecifiedDataField}) {
 451         warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
 452         next FILELIST;
 453       }
 454     }
 455 
 456     $AllDataFieldsRef = '';
 457     $CommonDataFieldsRef = '';
 458     if ($CollectDataFields) {
 459       my($CmpdCount);
 460       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 461       ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
 462       close SDFILE;
 463     }
 464 
 465     # Setup output file names...
 466     $FileDir = ""; $FileName = ""; $FileExt = "";
 467     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 468 
 469     $TextOutFileExt = "csv";
 470     if ($Options{outdelim} =~ /^tab$/i) {
 471       $TextOutFileExt = "tsv";
 472     }
 473     $SDOutFileExt = $FileExt;
 474     $FPOutFileExt = "fpf";
 475 
 476     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 477       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 478       if ($RootFileName && $RootFileExt) {
 479         $FileName = $RootFileName;
 480       }
 481       else {
 482         $FileName = $OptionsInfo{OutFileRoot};
 483       }
 484       $OutFileRoot = $FileName;
 485     }
 486     else {
 487       $OutFileRoot = "${FileName}TopologicalAtomPairsFP";
 488     }
 489 
 490     $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
 491     $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
 492     $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";
 493 
 494     if ($OptionsInfo{SDOutput}) {
 495       if ($SDFile =~ /$NewSDFileName/i) {
 496         warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
 497         print "Specify a different name using \"-r --root\" option or use default name.\n";
 498         next FILELIST;
 499       }
 500     }
 501 
 502     if (!$OptionsInfo{OverwriteFiles}) {
 503       # Check SD and text outout files...
 504       if ($OptionsInfo{SDOutput}) {
 505         if (-e $NewSDFileName) {
 506           warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
 507           next FILELIST;
 508         }
 509       }
 510       if ($OptionsInfo{FPOutput}) {
 511         if (-e $NewFPFileName) {
 512           warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
 513           next FILELIST;
 514         }
 515       }
 516       if ($OptionsInfo{TextOutput}) {
 517         if (-e $NewTextFileName) {
 518           warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
 519           next FILELIST;
 520         }
 521       }
 522     }
 523 
 524     $SDFilesInfo{FileOkay}[$Index] = 1;
 525 
 526     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 527     $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
 528     $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
 529     $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;
 530 
 531     $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
 532     $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
 533   }
 534 }
 535 
 536 # Process option values...
 537 sub ProcessOptions {
 538   %OptionsInfo = ();
 539 
 540   ProcessAtomIdentifierTypeOptions();
 541 
 542   $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
 543   $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
 544   $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};
 545 
 546   my(@SpecifiedDataFields);
 547   @SpecifiedDataFields = ();
 548 
 549   @{$OptionsInfo{SpecifiedDataFields}} = ();
 550   $OptionsInfo{CompoundID} = '';
 551 
 552   if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
 553     if ($Options{compoundidmode} =~ /^DataField$/i) {
 554       if (!$Options{compoundid}) {
 555         die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
 556       }
 557       $OptionsInfo{CompoundID} = $Options{compoundid};
 558     }
 559     elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
 560       $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
 561     }
 562   }
 563   elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
 564     if (!$Options{datafields}) {
 565       die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
 566     }
 567     @SpecifiedDataFields = split /\,/, $Options{datafields};
 568     push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
 569   }
 570 
 571   $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;
 572 
 573   $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'TopologicalAtomPairsFingerprints';
 574 
 575   $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;
 576 
 577   $OptionsInfo{MinDistance} = $Options{mindistance};
 578   $OptionsInfo{MaxDistance} = $Options{maxdistance};
 579 
 580   $OptionsInfo{Output} = $Options{output};
 581   $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
 582   $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
 583   $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;
 584 
 585   $OptionsInfo{OutDelim} = $Options{outdelim};
 586   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;
 587 
 588   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 589   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 590 
 591   $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat};
 592 }
 593 
 594 # Process atom identifier type and related options...
 595 #
 596 sub ProcessAtomIdentifierTypeOptions {
 597 
 598   $OptionsInfo{AtomIdentifierType} = $Options{atomidentifiertype};
 599 
 600   if ($Options{atomidentifiertype} =~ /^AtomicInvariantsAtomTypes$/i) {
 601     ProcessAtomicInvariantsToUseOption();
 602   }
 603   elsif ($Options{atomidentifiertype} =~ /^FunctionalClassAtomTypes$/i) {
 604     ProcessFunctionalClassesToUse();
 605   }
 606   elsif ($OptionsInfo{AtomIdentifierType} =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
 607     # Nothing to do for now...
 608   }
 609   else {
 610     die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
 611   }
 612 }
 613 
 614 # Process specified atomic invariants to use...
 615 #
 616 sub ProcessAtomicInvariantsToUseOption {
 617   my($AtomicInvariant, $AtomSymbolSpecified, @AtomicInvariantsWords);
 618 
 619   @{$OptionsInfo{AtomicInvariantsToUse}} = ();
 620   if (IsEmpty($Options{atomicinvariantstouse})) {
 621     die "Error: Atomic invariants value specified using \"--AtomicInvariantsToUse\" option is empty\n";
 622   }
 623   $AtomSymbolSpecified = 0;
 624   @AtomicInvariantsWords = split /\,/, $Options{atomicinvariantstouse};
 625   for $AtomicInvariant (@AtomicInvariantsWords) {
 626     if (!AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($AtomicInvariant)) {
 627       die "Error: Atomic invariant specified, $AtomicInvariant, using \"--AtomicInvariantsToUse\" option is not valid...\n ";
 628     }
 629     if ($AtomicInvariant =~ /^(AS|AtomSymbol)$/i) {
 630       $AtomSymbolSpecified = 1;
 631     }
 632     push @{$OptionsInfo{AtomicInvariantsToUse}}, $AtomicInvariant;
 633   }
 634   if (!$AtomSymbolSpecified) {
 635     die "Error: Atomic invariant, AS or AtomSymbol, must be specified as using \"--AtomicInvariantsToUse\" option...\n ";
 636   }
 637 }
 638 
 639 # Process specified functional classes invariants to use...
 640 #
 641 sub ProcessFunctionalClassesToUse {
 642   my($FunctionalClass, @FunctionalClassesToUseWords);
 643 
 644   @{$OptionsInfo{FunctionalClassesToUse}} = ();
 645   if (IsEmpty($Options{functionalclassestouse})) {
 646     die "Error: Functional classes value specified using \"--FunctionalClassesToUse\" option is empty\n";
 647   }
 648   @FunctionalClassesToUseWords = split /\,/, $Options{functionalclassestouse};
 649   for $FunctionalClass (@FunctionalClassesToUseWords) {
 650     if (!FunctionalClassAtomTypes::IsFunctionalClassAvailable($FunctionalClass)) {
 651       die "Error: Functional class specified, $FunctionalClass, using \"--FunctionalClassesToUse\" option is not valid...\n ";
 652     }
 653     push @{$OptionsInfo{FunctionalClassesToUse}}, $FunctionalClass;
 654   }
 655 }
 656 
 657 # Setup script usage  and retrieve command line arguments specified using various options...
 658 sub SetupScriptUsage {
 659 
 660   # Retrieve all the options...
 661   %Options = ();
 662 
 663   $Options{atomidentifiertype} = 'AtomicInvariantsAtomTypes';
 664   $Options{atomicinvariantstouse} = 'AS,X,BO,H,FC';
 665 
 666   $Options{functionalclassestouse} = 'HBD,HBA,PI,NI,Ar,Hal';
 667 
 668   $Options{compoundidmode} = 'LabelPrefix';
 669   $Options{compoundidlabel} = 'CompoundID';
 670   $Options{datafieldsmode} = 'CompoundID';
 671 
 672   $Options{filter} = 'Yes';
 673 
 674   $Options{keeplargestcomponent} = 'Yes';
 675 
 676   $Options{mindistance} = 1;
 677   $Options{maxdistance} = 10;
 678 
 679   $Options{output} = 'text';
 680   $Options{outdelim} = 'comma';
 681   $Options{quote} = 'yes';
 682 
 683   $Options{vectorstringformat} = 'IDsAndValuesString';
 684 
 685   if (!GetOptions(\%Options, "atomidentifiertype|a=s", "atomicinvariantstouse=s", "functionalclassestouse=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabel=s",  "help|h", "keeplargestcomponent|k=s",  "mindistance=s", "maxdistance=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "vectorstringformat|v=s", "workingdir|w=s")) {
 686     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 687   }
 688   if ($Options{workingdir}) {
 689     if (! -d $Options{workingdir}) {
 690       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 691     }
 692     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 693   }
 694   if ($Options{atomidentifiertype} !~ /^(AtomicInvariantsAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|FunctionalClassAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
 695     die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
 696   }
 697   if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
 698     die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
 699   }
 700   if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
 701     die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
 702   }
 703   if ($Options{filter} !~ /^(Yes|No)$/i) {
 704     die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
 705   }
 706   if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
 707     die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
 708   }
 709   if (!IsPositiveInteger($Options{mindistance})) {
 710     die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: > 0 \n";
 711   }
 712   if (!IsPositiveInteger($Options{maxdistance})) {
 713     die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
 714   }
 715   if ($Options{mindistance} > $Options{maxdistance}) {
 716     die "Error: The value specified, specified, $Options{mindistance}, for option \"--MinDistance\" must be less than the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
 717   }
 718   if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
 719     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
 720   }
 721   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 722     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 723   }
 724   if ($Options{quote} !~ /^(Yes|No)$/i) {
 725     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
 726   }
 727   if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
 728     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
 729   }
 730   if ($Options{vectorstringformat} !~ /^(IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
 731     die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
 732   }
 733 }
 734