MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: TopologicalAtomPairsFingerprints.pl,v $
   4 # $Date: 2010/07/03 20:32:14 $
   5 # $Revision: 1.18 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2010 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use SDFileUtil;
  38 use MoleculeFileIO;
  39 use AtomTypes::AtomicInvariantsAtomTypes;
  40 use AtomTypes::FunctionalClassAtomTypes;
  41 use Fingerprints::TopologicalAtomPairsFingerprints;
  42 use Fingerprints::FingerprintsStringUtil;
  43 
  # File-scoped state: %Options holds the raw command line values filled in by
  # SetupScriptUsage(); the remaining scalars are used for the start/end messages
  # and timing below...
  my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

  # Turn on autoflush for STDOUT...
  $| = 1;

  # Announce the start and take the first timestamp...
  $ScriptName = basename($0);
  print "\n$ScriptName: Starting...\n\n";
  $StartTime = Benchmark->new;

  # Retrieve command line options; show usage when help is requested or no
  # input file is named...
  SetupScriptUsage();
  die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

  # Expand the file names specified on the command line...
  my(@SDFilesList) = ExpandFileNames(\@ARGV, "sdf sd");

  # Validated option values; filled in by ProcessOptions()...
  my(%OptionsInfo);
  ProcessOptions();

  # Per input file information; filled in by RetrieveSDFilesInfo()...
  my(%SDFilesInfo);
  print "Checking input SD file(s)...\n";
  RetrieveSDFilesInfo();

  # Generate fingerprints for each input file marked okay...
  my($FileIndex, $SDFile, $FileProcessingMsg);
  $FileProcessingMsg = "Processing file";
  if (@SDFilesList > 1) {
    # Separate the per file messages with an empty line for multiple files...
    print "Processing SD files...\n";
    $FileProcessingMsg = "\n$FileProcessingMsg";
  }

  for $FileIndex (0 .. $#SDFilesList) {
    next unless $SDFilesInfo{FileOkay}[$FileIndex];

    $SDFile = $SDFilesList[$FileIndex];
    print "$FileProcessingMsg $SDFile...\n";
    GenerateTopologicalAtomPairsFingerprints($FileIndex);
  }
  print "$ScriptName:Done...\n\n";

  # Report the elapsed time...
  $EndTime = Benchmark->new;
  $TotalTime = timediff($EndTime, $StartTime);
  print "Total time: " . timestr($TotalTime) . "\n";
  92 
  93 ###############################################################################
  94 
  # Generate fingerprints for all compounds in a SD file...
  #
  # $FileIndex is the position of the SD file in @SDFilesList. Each compound is
  # optionally filtered, its fingerprints are generated and written to the output
  # file(s) opened for this input file. Compounds which are filtered out, or for
  # which fingerprints generation fails, are counted as ignored. Summary counts
  # are printed at the end.
  #
  sub GenerateTopologicalAtomPairsFingerprints {
    my($FileIndex) = @_;
    my($Mol, $Fingerprints);

    my $InputSDFile = $SDFilesList[$FileIndex];

    # Open output file(s); a handle is '' for an output type which is not requested...
    my($SDOutRef, $TextOutRef) = SetupAndOpenOutputFiles($FileIndex);

    my $Reader = MoleculeFileIO->new('Name' => $InputSDFile);
    $Reader->Open();

    my $TotalCount = 0;
    my $SkippedCount = 0;

    COMPOUND: while ($Mol = $Reader->ReadMolecule()) {
      $TotalCount++;

      # Filter compound data before calculating fingerprints...
      if ($OptionsInfo{Filter} && CheckAndFilterCompound($TotalCount, $Mol)) {
        $SkippedCount++;
        next COMPOUND;
      }

      $Fingerprints = GenerateMoleculeFingerprints($Mol);
      unless ($Fingerprints) {
        $SkippedCount++;
        ProcessIgnoredCompound('FingerprintsGenerationFailed', $TotalCount, $Mol);
        next COMPOUND;
      }

      WriteDataToOutputFiles($FileIndex, $TotalCount, $Mol, $Fingerprints, $SDOutRef, $TextOutRef);
    }
    $Reader->Close();

    # Close only the output file(s) which were opened...
    close $SDOutRef if ($OptionsInfo{SDOutput});
    close $TextOutRef if ($OptionsInfo{TextOutput});

    WriteFingerprintsGenerationSummaryStatistics($TotalCount, $SkippedCount);
  }
 145 
 # Warn about a compound being ignored due to problems in fingerprints generation...
 #
 # $Mode names the reason: ContainsNonElementalData, ContainsNoElementalData or
 # FingerprintsGenerationFailed (matched case insensitively). Any other value is
 # reported using the same message as FingerprintsGenerationFailed.
 #
 sub ProcessIgnoredCompound {
   my($Mode, $CmpdCount, $Molecule) = @_;

   # Compound ID used in the warning is set up the same way as for text files...
   my $FieldValuesRef = $Molecule->GetMDLDataFieldLabelAndValues();
   my $CmpdID = SetupCmpdIDForTextFiles($CmpdCount, $Molecule, $FieldValuesRef);

   if ($Mode =~ /^ContainsNonElementalData$/i) {
     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
   }
   elsif ($Mode =~ /^ContainsNoElementalData$/i) {
     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
   }
   elsif ($Mode =~ /^FingerprintsGenerationFailed$/i) {
     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
   }
   else {
     # Unrecognized mode: fall back to the generic failure message...
     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
   }
 }
 173 
 # Check and filter compounds...
 #
 # Returns 1, after issuing a warning through ProcessIgnoredCompound(), when the
 # molecule contains any non-element atoms or no element atoms at all; returns 0
 # otherwise. The non-element check takes precedence.
 #
 sub CheckAndFilterCompound {
   my($CmpdCount, $Molecule) = @_;

   my($NumElements, $NumNonElements) = $Molecule->GetNumOfElementsAndNonElements();

   my $Reason = $NumNonElements ? 'ContainsNonElementalData'
              : !$NumElements   ? 'ContainsNoElementalData'
              :                   '';

   # Nothing wrong with the compound...
   return 0 unless $Reason;

   ProcessIgnoredCompound($Reason, $CmpdCount, $Molecule);
   return 1;
 }
 194 
 # Write out compounds fingerprints generation summary statistics...
 #
 # Prints the total number of compounds along with the number processed
 # successfully (total minus ignored) and the number ignored.
 #
 sub WriteFingerprintsGenerationSummaryStatistics {
   my($CmpdCount, $IgnoredCmpdCount) = @_;

   my $SuccessCount = $CmpdCount - $IgnoredCmpdCount;

   print "\nNumber of compounds: $CmpdCount\n",
     "Number of compounds processed successfully during fingerprints generation: $SuccessCount\n",
     "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
 }
 207 
 # Open output files...
 #
 # Opens, for writing, the SD and/or text output file set up for the input file
 # at $FileIndex, depending on $OptionsInfo{SDOutput} and $OptionsInfo{TextOutput}.
 # Column labels are written to a newly opened text file.
 #
 # Returns ($NewSDFileRef, $NewTextFileRef); each is an open file handle, or an
 # empty string when that output type is not requested. Dies when a file can't be
 # opened.
 #
 sub SetupAndOpenOutputFiles {
   my($FileIndex) = @_;
   my($NewSDFile, $NewTextFile, $NewSDFileRef, $NewTextFileRef);

   $NewSDFileRef = '';
   $NewTextFileRef = '';

   # Three-argument open keeps the mode separate from the file name, so a name
   # with leading or trailing special characters can't change how the file is
   # opened. Lexical handles avoid sharing a package-wide bareword handle across
   # input files.
   if ($OptionsInfo{SDOutput}) {
     $NewSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
     print "Generating SD file $NewSDFile...\n";
     open my $SDOutFH, '>', $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
     $NewSDFileRef = $SDOutFH;
   }
   if ($OptionsInfo{TextOutput}) {
     $NewTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
     print "Generating text file $NewTextFile...\n";
     open my $TextOutFH, '>', $NewTextFile or die "Error: Couldn't open $NewTextFile: $! \n";
     WriteTextFileCoulmnLabels($FileIndex, $TextOutFH);
     $NewTextFileRef = $TextOutFH;
   }
   return ($NewSDFileRef, $NewTextFileRef);
 }
 232 
 # Write fingerprints and other data to appropriate output files...
 #
 # For SD output, the input compound string is written without its trailing
 # delimiter line, followed by a data field holding the fingerprints string and
 # a new delimiter line. For text output, a single line is written containing
 # the columns selected by $OptionsInfo{DataFieldsMode} followed by the
 # fingerprints string.
 #
 sub WriteDataToOutputFiles {
   my($FileIndex, $CmpdCount, $Molecule, $TopologicalAtomPairsFingerprints, $NewSDFileRef, $NewTextFileRef) = @_;

   my $FPString = GetFingerprintsString($TopologicalAtomPairsFingerprints);

   if ($OptionsInfo{SDOutput}) {
     # Retrieve input compound string used to create molecule and drop its last
     # line containing the delimiter...
     my $Record = $Molecule->GetMDLCmpdString();
     $Record =~ s/\$\$\$\$$//;

     # Compound data, fingerprints data field and delimiter...
     print $NewSDFileRef "$Record";
     print $NewSDFileRef ">  <$OptionsInfo{FingerprintsLabel}>\n$FPString\n\n";
     print $NewSDFileRef "\$\$\$\$\n";
   }

   if ($OptionsInfo{TextOutput}) {
     my $FieldValuesRef = $Molecule->GetMDLDataFieldLabelAndValues();
     my $Mode = $OptionsInfo{DataFieldsMode};
     my @Columns = ();

     if ($Mode =~ /^CompoundID$/i) {
       @Columns = (SetupCmpdIDForTextFiles($CmpdCount, $Molecule, $FieldValuesRef));
     }
     else {
       # Pick the list of data field labels to report; a label missing from
       # this compound gets an empty value...
       my $LabelsRef = ($Mode =~ /^All$/i)     ? $SDFilesInfo{AllDataFieldsRef}[$FileIndex]
                     : ($Mode =~ /^Common$/i)  ? $SDFilesInfo{CommonDataFieldsRef}[$FileIndex]
                     : ($Mode =~ /^Specify$/i) ? $OptionsInfo{SpecifiedDataFields}
                     :                           [];
       @Columns = map { exists $FieldValuesRef->{$_} ? $FieldValuesRef->{$_} : ''} @{$LabelsRef};
     }

     # Add fingerprints string...
     push @Columns, $FPString;

     my $OutLine = JoinWords(\@Columns, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
     print $NewTextFileRef "$OutLine\n";
   }
 }
 281 
 # Write out appropriate column labels to text file...
 #
 # The labels are chosen by $OptionsInfo{DataFieldsMode}: all or common data
 # field labels collected for the input file at $FileIndex, the data fields
 # specified by the user, or the compound ID label. The fingerprints label is
 # always the last column.
 #
 sub WriteTextFileCoulmnLabels {
   my($FileIndex, $NewTextFileRef) = @_;

   my $Mode = $OptionsInfo{DataFieldsMode};

   my @Labels = ($Mode =~ /^All$/i)        ? @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]}
              : ($Mode =~ /^Common$/i)     ? @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]}
              : ($Mode =~ /^Specify$/i)    ? @{$OptionsInfo{SpecifiedDataFields}}
              : ($Mode =~ /^CompoundID$/i) ? ($OptionsInfo{CompoundIDLabel})
              :                              ();

   # Add fingerprints label...
   push @Labels, $OptionsInfo{FingerprintsLabel};

   my $HeaderLine = JoinWords(\@Labels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
   print $NewTextFileRef "$HeaderLine\n";
 }
 306 
 # Generate compound ID for text files...
 #
 # The ID depends on $OptionsInfo{CompoundIDMode}:
 #   MolNameOrLabelPrefix - molecule name, or label prefix plus compound count
 #                          when the name is empty
 #   LabelPrefix          - label prefix ($OptionsInfo{CompoundID}) plus compound count
 #   DataField            - value of the data field named by $OptionsInfo{CompoundID},
 #                          or an empty string when the field is missing
 #   MolName              - molecule name
 # An empty string is returned for any other mode.
 #
 sub SetupCmpdIDForTextFiles {
   my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

   my $Mode = $OptionsInfo{CompoundIDMode};

   if ($Mode =~ /^MolNameOrLabelPrefix$/i) {
     my $Name = $Molecule->GetName();
     return $Name if $Name;
     return "$OptionsInfo{CompoundID}${CmpdCount}";
   }

   if ($Mode =~ /^LabelPrefix$/i) {
     return "$OptionsInfo{CompoundID}${CmpdCount}";
   }

   if ($Mode =~ /^DataField$/i) {
     my $Label = $OptionsInfo{CompoundID};
     return exists $DataFieldLabelAndValuesRef->{$Label} ? $DataFieldLabelAndValuesRef->{$Label} : '';
   }

   if ($Mode =~ /^MolName$/i) {
     # Force scalar context so the ID is always a single value...
     return scalar $Molecule->GetName();
   }

   return '';
 }
 332 
 # Get fingerprints as a string...
 #
 # Formats the fingerprints object using the vector string format held in
 # $OptionsInfo{VectorStringFormat}.
 #
 sub GetFingerprintsString {
   my $FingerprintsObject = shift;
   my $Format = $OptionsInfo{VectorStringFormat};

   return FingerprintsStringUtil::GenerateFingerprintsString($FingerprintsObject, $Format);
 }
 340 
 # Generate fingerprints for molecule...
 #
 # Optionally keeps only the largest component of the molecule, detects rings
 # and aromaticity, and then generates topological atom pairs fingerprints using
 # the distance range and atom identifier type from %OptionsInfo.
 #
 # Returns the fingerprints object, or undef when ring detection or fingerprints
 # generation doesn't succeed.
 #
 sub GenerateMoleculeFingerprints {
   my($Molecule) = @_;

   $Molecule->KeepLargestComponent() if ($OptionsInfo{KeepLargestComponent});

   # Give up on the molecule when rings can't be detected...
   $Molecule->DetectRings() or return undef;
   $Molecule->DetectAromaticity();

   my $Fingerprints = TopologicalAtomPairsFingerprints->new(
     'Molecule' => $Molecule,
     'MinDistance' => $OptionsInfo{MinDistance},
     'MaxDistance' => $OptionsInfo{MaxDistance},
     'AtomIdentifierType' => $OptionsInfo{AtomIdentifierType}
   );
   SetAtomIdentifierTypeValuesToUse($Fingerprints);

   # Generate fingerprints and make sure the generation is successful...
   $Fingerprints->GenerateFingerprints();
   $Fingerprints->IsFingerprintsGenerationSuccessful() or return undef;

   return $Fingerprints;
 }
 368 
 # Set atom identifier type values to use for generating fingerprints...
 #
 # Passes the atomic invariants or functional classes collected in %OptionsInfo
 # to the fingerprints object for the corresponding atom identifier types; the
 # other supported types need no additional setup. Dies for an unsupported type.
 #
 sub SetAtomIdentifierTypeValuesToUse {
   my($TopologicalAtomPairsFingerprints) = @_;

   my $Type = $OptionsInfo{AtomIdentifierType};

   if ($Type =~ /^AtomicInvariantsAtomTypes$/i) {
     return $TopologicalAtomPairsFingerprints->SetAtomicInvariantsToUse(\@{$OptionsInfo{AtomicInvariantsToUse}});
   }

   if ($Type =~ /^FunctionalClassAtomTypes$/i) {
     return $TopologicalAtomPairsFingerprints->SetFunctionalClassesToUse(\@{$OptionsInfo{FunctionalClassesToUse}});
   }

   # Nothing to do for now...
   return if ($Type =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i);

   die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
 }
 387 
 # Retrieve information about SD files...
 #
 # For each file in @SDFilesList, checks the input file and sets up output file
 # names. Results are stored in %SDFilesInfo as arrays indexed by file position:
 # FileOkay, OutFileRoot, SDOutFileNames, TextOutFileNames, AllDataFieldsRef and
 # CommonDataFieldsRef.
 #
 # A file is left marked as not okay, with a warning, when: it doesn't exist; it
 # isn't a SD file; the data field needed for compound IDs is missing from its
 # first compound; its SD output file would have the same name as the input
 # file; or an output file already exists and overwriting isn't requested.
 #
 # Dies when an input file can't be opened.
 #
 sub RetrieveSDFilesInfo {
   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $NewSDFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

   %SDFilesInfo = ();
   @{$SDFilesInfo{FileOkay}} = ();
   @{$SDFilesInfo{OutFileRoot}} = ();
   @{$SDFilesInfo{SDOutFileNames}} = ();
   @{$SDFilesInfo{TextOutFileNames}} = ();
   @{$SDFilesInfo{AllDataFieldsRef}} = ();
   @{$SDFilesInfo{CommonDataFieldsRef}} = ();

   # The compound ID data field is checked only when it's actually used for text output...
   $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
   # Data field labels are collected only when all or common data fields are written out...
   $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

   FILELIST: for $Index (0 .. $#SDFilesList) {
     $SDFile = $SDFilesList[$Index];

     # Assume the worst until all checks have passed...
     $SDFilesInfo{FileOkay}[$Index] = 0;
     $SDFilesInfo{OutFileRoot}[$Index] = '';
     $SDFilesInfo{SDOutFileNames}[$Index] = '';
     $SDFilesInfo{TextOutFileNames}[$Index] = '';

     if (!(-e $SDFile)) {
       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
       next FILELIST;
     }
     if (!CheckFileType($SDFile, "sd sdf")) {
       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
       next FILELIST;
     }

     if ($CheckDataField) {
       # Make sure data field exists in the first compound of SD file..
       my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

       @CmpdLines = ();
       # Three-argument open: the file name is never interpreted as a mode or a command...
       open SDFILE, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
       $CmpdString = ReadCmpdString(\*SDFILE);
       close SDFILE;
       @CmpdLines = split "\n", $CmpdString;
       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
       $SpecifiedDataField = $OptionsInfo{CompoundID};
       if (!exists $DataFieldValues{$SpecifiedDataField}) {
         warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
         next FILELIST;
       }
     }

     $AllDataFieldsRef = '';
     $CommonDataFieldsRef = '';
     if ($CollectDataFields) {
       my($CmpdCount);
       open SDFILE, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
       ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
       close SDFILE;
     }

     # Setup output file names...
     $FileDir = ""; $FileName = ""; $FileExt = "";
     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

     $TextOutFileExt = "csv";
     if ($Options{outdelim} =~ /^tab$/i) {
       $TextOutFileExt = "tsv";
     }
     $SDOutFileExt = $FileExt;

     # A user specified root is honored only for a single input file...
     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
       if ($RootFileName && $RootFileExt) {
         $FileName = $RootFileName;
       }
       else {
         $FileName = $OptionsInfo{OutFileRoot};
       }
       $OutFileRoot = $FileName;
     }
     else {
       $OutFileRoot = "${FileName}TopologicalAtomPairsFP";
     }

     $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
     $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

     if ($OptionsInfo{SDOutput}) {
       # Compare the output name against the file name part of the input path.
       # The name is quoted so that regex metacharacters in it are taken
       # literally, and the match is anchored so that an input name which merely
       # contains the output name isn't rejected.
       if ($SDFile =~ /(?:^|[\/\\])\Q$NewSDFileName\E\z/i) {
         warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
         print "Specify a different name using \"-r --root\" option or use default name.\n";
         next FILELIST;
       }
     }

     if (!$OptionsInfo{OverwriteFiles}) {
       # Check SD and text output files...
       if ($OptionsInfo{SDOutput}) {
         if (-e $NewSDFileName) {
           warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
           next FILELIST;
         }
       }
       if ($OptionsInfo{TextOutput}) {
         if (-e $NewTextFileName) {
           warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
           next FILELIST;
         }
       }
     }

     $SDFilesInfo{FileOkay}[$Index] = 1;

     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
     $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
     $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

     $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
     $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
   }
 }
 509 
 # Process option values...
 #
 # Fills %OptionsInfo from the raw command line values in %Options. Dies when
 # "--CompoundID" is missing for "DataField" compound ID mode or "--DataFields"
 # is missing for "Specify" data fields mode.
 #
 sub ProcessOptions {
   %OptionsInfo = ();

   ProcessAtomIdentifierTypeOptions();

   @OptionsInfo{qw(CompoundIDMode CompoundIDLabel DataFieldsMode)} = @Options{qw(compoundidmode compoundidlabel datafieldsmode)};

   $OptionsInfo{SpecifiedDataFields} = [];
   $OptionsInfo{CompoundID} = '';

   if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
     if ($Options{compoundidmode} =~ /^DataField$/i) {
       $Options{compoundid}
         or die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
       $OptionsInfo{CompoundID} = $Options{compoundid};
     }
     elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
       # Label prefix defaults to Cmpd...
       $OptionsInfo{CompoundID} = $Options{compoundid} || 'Cmpd';
     }
   }
   elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
     $Options{datafields}
       or die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
     push @{$OptionsInfo{SpecifiedDataFields}}, split(/\,/, $Options{datafields});
   }

   # Yes/No options are stored as 1/0 flags...
   $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;
   $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;
   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

   $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} || 'TopologicalAtomPairsFingerprints';

   $OptionsInfo{MinDistance} = $Options{mindistance};
   $OptionsInfo{MaxDistance} = $Options{maxdistance};

   $OptionsInfo{Output} = $Options{output};
   $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|Both)$/i) ? 1 : 0;
   $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|Both)$/i) ? 1 : 0;

   # Delimiter name to delimiter character; anything other than tab or
   # semicolon gets a comma...
   if ($Options{outdelim} =~ /tab/i ) {
     $OptionsInfo{OutDelim} = "\t";
   }
   elsif ($Options{outdelim} =~ /semicolon/i) {
     $OptionsInfo{OutDelim} = "\;";
   }
   else {
     $OptionsInfo{OutDelim} = "\,";
   }

   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
   $OptionsInfo{OutFileRoot} = $Options{root} || 0;

   $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat};
 }
 566 
 # Process atom identifier type and related options...
 #
 # Stores the atom identifier type in %OptionsInfo and processes the values to
 # use for atomic invariants or functional class atom types; the other
 # supported types need no additional processing. Dies for an unsupported type.
 #
 sub ProcessAtomIdentifierTypeOptions {
   my $Type = $Options{atomidentifiertype};

   $OptionsInfo{AtomIdentifierType} = $Type;

   return ProcessAtomicInvariantsToUseOption() if ($Type =~ /^AtomicInvariantsAtomTypes$/i);
   return ProcessFunctionalClassesToUse() if ($Type =~ /^FunctionalClassAtomTypes$/i);

   # Nothing to do for now...
   return if ($Type =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i);

   die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
 }
 586 
 # Process specified atomic invariants to use...
 #
 # Splits the comma delimited "--AtomicInvariantsToUse" value and stores the
 # invariants in $OptionsInfo{AtomicInvariantsToUse}. Dies when the value is
 # empty, an invariant is not available, or neither AS nor AtomSymbol is
 # specified.
 #
 sub ProcessAtomicInvariantsToUseOption {
   $OptionsInfo{AtomicInvariantsToUse} = [];

   IsEmpty($Options{atomicinvariantstouse})
     and die "Error: Atomic invariants value specified using \"--AtomicInvariantsToUse\" option is empty\n";

   my $SeenAtomSymbol = 0;
   for my $Invariant (split /\,/, $Options{atomicinvariantstouse}) {
     AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($Invariant)
       or die "Error: Atomic invariant specified, $Invariant, using \"--AtomicInvariantsToUse\" option is not valid...\n ";

     $SeenAtomSymbol = 1 if ($Invariant =~ /^(AS|AtomSymbol)$/i);
     push @{$OptionsInfo{AtomicInvariantsToUse}}, $Invariant;
   }

   # Atom symbol is a required invariant...
   $SeenAtomSymbol
     or die "Error: Atomic invariant, AS or AtomSymbol, must be specified as using \"--AtomicInvariantsToUse\" option...\n ";
 }
 611 
 # Process specified functional classes to use...
 #
 # Splits the comma delimited "--FunctionalClassesToUse" value and stores the
 # classes in $OptionsInfo{FunctionalClassesToUse}. Dies when the value is empty
 # or a functional class is not available.
 #
 sub ProcessFunctionalClassesToUse {
   $OptionsInfo{FunctionalClassesToUse} = [];

   IsEmpty($Options{functionalclassestouse})
     and die "Error: Functional classes value specified using \"--FunctionalClassesToUse\" option is empty\n";

   for my $ClassName (split /\,/, $Options{functionalclassestouse}) {
     FunctionalClassAtomTypes::IsFunctionalClassAvailable($ClassName)
       or die "Error: Functional class specified, $ClassName, using \"--FunctionalClassesToUse\" option is not valid...\n ";

     push @{$OptionsInfo{FunctionalClassesToUse}}, $ClassName;
   }
 }
 629 
 # Setup script usage and retrieve command line arguments specified using various options...
 #
 # Sets default values in %Options, overrides them with the values specified on
 # the command line, changes to the working directory when one is specified,
 # and validates the option values. Dies with an error message for an invalid
 # option or option value.
 #
 sub SetupScriptUsage {

   # Default values for all the options...
   %Options = ();

   $Options{atomidentifiertype} = 'AtomicInvariantsAtomTypes';
   $Options{atomicinvariantstouse} = 'AS,X,BO,H,FC';

   $Options{functionalclassestouse} = 'HBD,HBA,PI,NI,Ar,Hal';

   $Options{compoundidmode} = 'LabelPrefix';
   $Options{compoundidlabel} = 'CompoundID';
   $Options{datafieldsmode} = 'CompoundID';

   $Options{filter} = 'Yes';

   $Options{keeplargestcomponent} = 'Yes';

   $Options{mindistance} = 1;
   $Options{maxdistance} = 10;

   $Options{output} = 'text';
   $Options{outdelim} = 'comma';
   $Options{quote} = 'yes';

   $Options{vectorstringformat} = 'IDsAndValuesString';

   # Retrieve all the options...
   if (!GetOptions(\%Options, "atomidentifiertype|a=s", "atomicinvariantstouse=s", "functionalclassestouse=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabel=s",  "help|h", "keeplargestcomponent|k=s",  "mindistance=s", "maxdistance=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "vectorstringformat|v=s", "workingdir|w=s")) {
     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
   }

   # Change to the working directory before any file names are looked at...
   if ($Options{workingdir}) {
     if (! -d $Options{workingdir}) {
       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
     }
     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
   }

   # Validate option values...
   if ($Options{atomidentifiertype} !~ /^(AtomicInvariantsAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|FunctionalClassAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
     die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
   }
   if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
     die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
   }
   if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
     die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
   }
   if ($Options{filter} !~ /^(Yes|No)$/i) {
     die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
   }
   if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
     die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
   }
   if (!IsPositiveInteger($Options{mindistance})) {
     die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: > 0 \n";
   }
   if (!IsPositiveInteger($Options{maxdistance})) {
     die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
   }
   # Equal minimum and maximum distances are allowed; only a minimum greater
   # than the maximum is rejected...
   if ($Options{mindistance} > $Options{maxdistance}) {
     die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" must be less than or equal to the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
   }
   if ($Options{output} !~ /^(SD|text|both)$/i) {
     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
   }
   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
   }
   if ($Options{quote} !~ /^(Yes|No)$/i) {
     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
   }
   if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
   }
   if ($Options{vectorstringformat} !~ /^(IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
     die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
   }
 }
 707