MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: MACCSKeysFingerprints.pl,v $
   4 # $Date: 2011/12/16 00:03:31 $
   5 # $Revision: 1.24 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2012 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use SDFileUtil;
  38 use MoleculeFileIO;
  39 use FileIO::FingerprintsSDFileIO;
  40 use FileIO::FingerprintsTextFileIO;
  41 use FileIO::FingerprintsFPFileIO;
  42 use Fingerprints::MACCSKeys;
  43 
  44 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  45 
  46 # Autoflush STDOUT
  47 $| = 1;
  48 
  49 # Starting message...
  50 $ScriptName = basename($0);
  51 print "\n$ScriptName: Starting...\n\n";
  52 $StartTime = new Benchmark;
  53 
  54 # Get the options and setup script...
  55 SetupScriptUsage();
  56 if ($Options{help} || @ARGV < 1) {
  57   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  58 }
  59 
  60 my(@SDFilesList);
  61 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  62 
  63 # Process options...
  64 print "Processing options...\n";
  65 my(%OptionsInfo);
  66 ProcessOptions();
  67 
  68 # Setup information about input files...
  69 print "Checking input SD file(s)...\n";
  70 my(%SDFilesInfo);
  71 RetrieveSDFilesInfo();
  72 
  73 # Process input files..
  74 my($FileIndex);
  75 if (@SDFilesList > 1) {
  76   print "\nProcessing SD files...\n";
  77 }
  78 for $FileIndex (0 .. $#SDFilesList) {
  79   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  80     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  81     GenerateMACCSKeysFingerprints($FileIndex);
  82   }
  83 }
  84 print "\n$ScriptName:Done...\n\n";
  85 
  86 $EndTime = new Benchmark;
  87 $TotalTime = timediff ($EndTime, $StartTime);
  88 print "Total time: ", timestr($TotalTime), "\n";
  89 
  90 ###############################################################################
  91 
  92 # Generate fingerprints for a SD file...
  93 #
  94 sub GenerateMACCSKeysFingerprints {
  95   my($FileIndex) = @_;
  96   my($CmpdCount, $IgnoredCmpdCount, $SDFile, $MoleculeFileIO, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
  97 
  98   $SDFile = $SDFilesList[$FileIndex];
  99 
 100   # Setup output files...
 101   #
 102   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);
 103 
 104   $MoleculeFileIO = new MoleculeFileIO('Name' => $SDFile);
 105   $MoleculeFileIO->Open();
 106 
 107   $CmpdCount = 0;
 108   $IgnoredCmpdCount = 0;
 109 
 110   COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
 111     $CmpdCount++;
 112 
 113     # Filter compound data before calculating fingerprints...
 114     if ($OptionsInfo{Filter}) {
 115       if (CheckAndFilterCompound($CmpdCount, $Molecule)) {
 116         $IgnoredCmpdCount++;
 117         next COMPOUND;
 118       }
 119     }
 120 
 121     $MACCSKeysFingerprints = GenerateMoleculeFingerprints($Molecule);
 122     if (!$MACCSKeysFingerprints) {
 123       $IgnoredCmpdCount++;
 124       ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
 125       next COMPOUND;
 126     }
 127 
 128     WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 129   }
 130   $MoleculeFileIO->Close();
 131 
 132   if ($NewFPSDFileIO) {
 133     $NewFPSDFileIO->Close();
 134   }
 135   if ($NewFPTextFileIO) {
 136     $NewFPTextFileIO->Close();
 137   }
 138   if ($NewFPFileIO) {
 139     $NewFPFileIO->Close();
 140   }
 141 
 142   WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
 143 }
 144 
 145 # Process compound being ignored due to problems in fingerprints geneation...
 146 #
 147 sub ProcessIgnoredCompound {
 148   my($Mode, $CmpdCount, $Molecule) = @_;
 149   my($CmpdID, $DataFieldLabelAndValuesRef);
 150 
 151   $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 152   $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 153 
 154   MODE: {
 155     if ($Mode =~ /^ContainsNonElementalData$/i) {
 156       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
 157       next MODE;
 158     }
 159 
 160     if ($Mode =~ /^ContainsNoElementalData$/i) {
 161       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
 162       next MODE;
 163     }
 164 
 165     if ($Mode =~ /^FingerprintsGenerationFailed$/i) {
 166       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 167       next MODE;
 168     }
 169     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 170   }
 171 }
 172 
 173 # Check and filter compounds....
 174 #
 175 sub CheckAndFilterCompound {
 176   my($CmpdCount, $Molecule) = @_;
 177   my($ElementCount, $NonElementCount);
 178 
 179   ($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();
 180 
 181   if ($NonElementCount) {
 182     ProcessIgnoredCompound('ContainsNonElementalData', $CmpdCount, $Molecule);
 183     return 1;
 184   }
 185 
 186   if (!$ElementCount) {
 187     ProcessIgnoredCompound('ContainsNoElementalData', $CmpdCount, $Molecule);
 188     return 1;
 189   }
 190 
 191   return 0;
 192 }
 193 
 194 # Write out compounds fingerprints generation summary statistics...
 195 #
 196 sub WriteFingerprintsGenerationSummaryStatistics {
 197   my($CmpdCount, $IgnoredCmpdCount) = @_;
 198   my($ProcessedCmpdCount);
 199 
 200   $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;
 201 
 202   print "\nNumber of compounds: $CmpdCount\n";
 203   print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
 204   print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
 205 }
 206 
 207 # Open output files...
 208 #
 209 sub SetupAndOpenOutputFiles {
 210   my($FileIndex) = @_;
 211   my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams);
 212 
 213   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;
 214 
 215   # Setup common parameters for fingerprints file IO objects...
 216   #
 217   %FingerprintsFileIOParams = ();
 218   if ($OptionsInfo{Mode} =~ /^MACCSKeyBits$/i) {
 219     %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsBitVectorString', 'BitStringFormat' => $OptionsInfo{BitStringFormat}, 'BitsOrder' => $OptionsInfo{BitsOrder});
 220   }
 221   elsif ($OptionsInfo{Mode} =~ /^MACCSKeyCount$/i) {
 222     %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});
 223   }
 224 
 225   if ($OptionsInfo{SDOutput}) {
 226     $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
 227     print "Generating SD file $NewFPSDFile...\n";
 228     $NewFPSDFileIO = new FingerprintsSDFileIO('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
 229     $NewFPSDFileIO->Open();
 230   }
 231 
 232   if ($OptionsInfo{FPOutput}) {
 233     $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
 234     print "Generating FP file $NewFPFile...\n";
 235     $NewFPFileIO = new FingerprintsFPFileIO('Name' => $NewFPFile, %FingerprintsFileIOParams);
 236     $NewFPFileIO->Open();
 237   }
 238 
 239   if ($OptionsInfo{TextOutput}) {
 240     my($ColLabelsRef);
 241 
 242     $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
 243     $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);
 244 
 245     print "Generating text file $NewFPTextFile...\n";
 246     $NewFPTextFileIO = new FingerprintsTextFileIO('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
 247     $NewFPTextFileIO->Open();
 248   }
 249 
 250   return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 251 }
 252 
 253 # Write fingerpritns and other data to appropriate output files...
 254 #
 255 sub WriteDataToOutputFiles {
 256   my($FileIndex, $CmpdCount, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;
 257   my($DataFieldLabelAndValuesRef);
 258 
 259   $DataFieldLabelAndValuesRef = undef;
 260   if ($NewFPTextFileIO || $NewFPFileIO) {
 261     $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 262   }
 263 
 264   if ($NewFPSDFileIO) {
 265     my($CmpdString);
 266 
 267     $CmpdString = $Molecule->GetInputMoleculeString();
 268     $NewFPSDFileIO->WriteFingerprints($MACCSKeysFingerprints, $CmpdString);
 269   }
 270 
 271   if ($NewFPTextFileIO) {
 272     my($ColValuesRef);
 273 
 274     $ColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 275     $NewFPTextFileIO->WriteFingerprints($MACCSKeysFingerprints, $ColValuesRef);
 276   }
 277 
 278   if ($NewFPFileIO) {
 279     my($CompoundID);
 280 
 281     $CompoundID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 282     $NewFPFileIO->WriteFingerprints($MACCSKeysFingerprints, $CompoundID);
 283   }
 284 }
 285 
 286 # Generate approriate column labels for FPText output file...
 287 #
 288 sub SetupFPTextFileCoulmnLabels {
 289   my($FileIndex) = @_;
 290   my($Line, @ColLabels);
 291 
 292   @ColLabels = ();
 293   if ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 294     push @ColLabels, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 295   }
 296   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 297     push @ColLabels, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 298   }
 299   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 300     push @ColLabels, @{$OptionsInfo{SpecifiedDataFields}};
 301   }
 302   elsif ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 303     push @ColLabels, $OptionsInfo{CompoundIDLabel};
 304   }
 305   # Add fingerprints label...
 306   push @ColLabels, $OptionsInfo{FingerprintsLabel};
 307 
 308   return \@ColLabels;
 309 }
 310 
 311 # Generate column values FPText output file..
 312 #
 313 sub SetupFPTextFileCoulmnValues {
 314   my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 315   my(@ColValues);
 316 
 317   @ColValues = ();
 318   if ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 319     push @ColValues, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 320   }
 321   elsif ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 322     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 323   }
 324   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 325     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 326   }
 327   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 328     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$OptionsInfo{SpecifiedDataFields}};
 329   }
 330 
 331   return \@ColValues;
 332 }
 333 
 334 # Generate compound ID for FP and FPText output files..
 335 #
 336 sub SetupCmpdIDForOutputFiles {
 337   my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 338   my($CmpdID);
 339 
 340   $CmpdID = '';
 341   if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
 342     my($MolName);
 343     $MolName = $Molecule->GetName();
 344     $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
 345   }
 346   elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
 347     $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
 348   }
 349   elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
 350     my($SpecifiedDataField);
 351     $SpecifiedDataField = $OptionsInfo{CompoundID};
 352     $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
 353   }
 354   elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
 355     $CmpdID = $Molecule->GetName();
 356   }
 357   return $CmpdID;
 358 }
 359 
 360 # Generate fingerprints for molecule...
 361 #
 362 sub GenerateMoleculeFingerprints {
 363   my($Molecule) = @_;
 364   my($MACCSKeysFingerprints);
 365 
 366   if ($OptionsInfo{KeepLargestComponent}) {
 367     $Molecule->KeepLargestComponent();
 368   }
 369   if (!$Molecule->DetectRings()) {
 370     return undef;
 371   }
 372   $Molecule->DetectAromaticity();
 373 
 374   $MACCSKeysFingerprints = undef;
 375   if ($OptionsInfo{Mode} =~ /^MACCSKeyBits$/i) {
 376     $MACCSKeysFingerprints = new MACCSKeys('Molecule' => $Molecule, 'Type' => 'MACCSKeyBits', 'Size' => $OptionsInfo{Size});
 377   }
 378   elsif ($OptionsInfo{Mode} =~ /^MACCSKeyCount$/i) {
 379     $MACCSKeysFingerprints = new MACCSKeys('Molecule' => $Molecule, 'Type' => 'MACCSKeyCount', 'Size' => $OptionsInfo{Size});
 380   }
 381   else {
 382     die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: MACCSKeyBits or MACCSKeyCount\n";
 383   }
 384   $MACCSKeysFingerprints->GenerateMACCSKeys();
 385 
 386   return $MACCSKeysFingerprints;
 387 }
 388 
 389 # Retrieve information about SD files...
 390 #
 391 sub RetrieveSDFilesInfo {
 392   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);
 393 
 394   %SDFilesInfo = ();
 395   @{$SDFilesInfo{FileOkay}} = ();
 396   @{$SDFilesInfo{OutFileRoot}} = ();
 397   @{$SDFilesInfo{SDOutFileNames}} = ();
 398   @{$SDFilesInfo{FPOutFileNames}} = ();
 399   @{$SDFilesInfo{TextOutFileNames}} = ();
 400   @{$SDFilesInfo{AllDataFieldsRef}} = ();
 401   @{$SDFilesInfo{CommonDataFieldsRef}} = ();
 402 
 403   $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
 404   $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;
 405 
 406   FILELIST: for $Index (0 .. $#SDFilesList) {
 407     $SDFile = $SDFilesList[$Index];
 408 
 409     $SDFilesInfo{FileOkay}[$Index] = 0;
 410     $SDFilesInfo{OutFileRoot}[$Index] = '';
 411     $SDFilesInfo{SDOutFileNames}[$Index] = '';
 412     $SDFilesInfo{FPOutFileNames}[$Index] = '';
 413     $SDFilesInfo{TextOutFileNames}[$Index] = '';
 414 
 415     $SDFile = $SDFilesList[$Index];
 416     if (!(-e $SDFile)) {
 417       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 418       next FILELIST;
 419     }
 420     if (!CheckFileType($SDFile, "sd sdf")) {
 421       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 422       next FILELIST;
 423     }
 424 
 425     if ($CheckDataField) {
 426       # Make sure data field exists in SD file..
 427       my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);
 428 
 429       @CmpdLines = ();
 430       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 431       $CmpdString = ReadCmpdString(\*SDFILE);
 432       close SDFILE;
 433       @CmpdLines = split "\n", $CmpdString;
 434       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 435       $SpecifiedDataField = $OptionsInfo{CompoundID};
 436       if (!exists $DataFieldValues{$SpecifiedDataField}) {
 437         warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
 438         next FILELIST;
 439       }
 440     }
 441 
 442     $AllDataFieldsRef = '';
 443     $CommonDataFieldsRef = '';
 444     if ($CollectDataFields) {
 445       my($CmpdCount);
 446       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 447       ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
 448       close SDFILE;
 449     }
 450 
 451     # Setup output file names...
 452     $FileDir = ""; $FileName = ""; $FileExt = "";
 453     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 454 
 455     $TextOutFileExt = "csv";
 456     if ($Options{outdelim} =~ /^tab$/i) {
 457       $TextOutFileExt = "tsv";
 458     }
 459     $SDOutFileExt = $FileExt;
 460     $FPOutFileExt = "fpf";
 461 
 462     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 463       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 464       if ($RootFileName && $RootFileExt) {
 465         $FileName = $RootFileName;
 466       }
 467       else {
 468         $FileName = $OptionsInfo{OutFileRoot};
 469       }
 470       $OutFileRoot = $FileName;
 471     }
 472     else {
 473       $OutFileRoot = "${FileName}MACCSKeysFP";
 474     }
 475 
 476     $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
 477     $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
 478     $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";
 479 
 480     if ($OptionsInfo{SDOutput}) {
 481       if ($SDFile =~ /$NewSDFileName/i) {
 482         warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
 483         print "Specify a different name using \"-r --root\" option or use default name.\n";
 484         next FILELIST;
 485       }
 486     }
 487 
 488     if (!$OptionsInfo{OverwriteFiles}) {
 489       # Check SD and text outout files...
 490       if ($OptionsInfo{SDOutput}) {
 491         if (-e $NewSDFileName) {
 492           warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
 493           next FILELIST;
 494         }
 495       }
 496       if ($OptionsInfo{FPOutput}) {
 497         if (-e $NewFPFileName) {
 498           warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
 499           next FILELIST;
 500         }
 501       }
 502       if ($OptionsInfo{TextOutput}) {
 503         if (-e $NewTextFileName) {
 504           warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
 505           next FILELIST;
 506         }
 507       }
 508     }
 509 
 510     $SDFilesInfo{FileOkay}[$Index] = 1;
 511 
 512     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 513     $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
 514     $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
 515     $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;
 516 
 517     $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
 518     $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
 519   }
 520 }
 521 
 522 # Process option values...
 523 sub ProcessOptions {
 524   %OptionsInfo = ();
 525 
 526   $OptionsInfo{Mode} = $Options{mode};
 527 
 528   $OptionsInfo{BitsOrder} = $Options{bitsorder};
 529   $OptionsInfo{BitStringFormat} = $Options{bitstringformat};
 530 
 531   $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
 532   $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
 533   $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};
 534 
 535   $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;
 536 
 537   my(@SpecifiedDataFields);
 538   @SpecifiedDataFields = ();
 539 
 540   @{$OptionsInfo{SpecifiedDataFields}} = ();
 541   $OptionsInfo{CompoundID} = '';
 542 
 543   if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
 544     if ($Options{compoundidmode} =~ /^DataField$/i) {
 545       if (!$Options{compoundid}) {
 546         die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
 547       }
 548       $OptionsInfo{CompoundID} = $Options{compoundid};
 549     }
 550     elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
 551       $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
 552     }
 553   }
 554   elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
 555     if (!$Options{datafields}) {
 556       die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
 557     }
 558     @SpecifiedDataFields = split /\,/, $Options{datafields};
 559     push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
 560   }
 561 
 562   $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'MACCSKeysFingerprints';
 563 
 564   $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;
 565 
 566   $OptionsInfo{Output} = $Options{output};
 567   $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
 568   $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
 569   $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;
 570 
 571   $OptionsInfo{OutDelim} = $Options{outdelim};
 572   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;
 573 
 574   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 575   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 576 
 577   $OptionsInfo{Size} = $Options{size};
 578 
 579   $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat};
 580 }
 581 
 582 # Setup script usage  and retrieve command line arguments specified using various options...
 583 sub SetupScriptUsage {
 584 
 585   # Retrieve all the options...
 586   %Options = ();
 587 
 588   $Options{bitsorder} = 'Ascending';
 589   $Options{bitstringformat} = 'BinaryString';
 590 
 591   $Options{compoundidmode} = 'LabelPrefix';
 592   $Options{compoundidlabel} = 'CompoundID';
 593   $Options{datafieldsmode} = 'CompoundID';
 594 
 595   $Options{filter} = 'Yes';
 596 
 597   $Options{detectaromaticity} = 'Yes';
 598   $Options{keeplargestcomponent} = 'Yes';
 599 
 600   $Options{mode} = 'MACCSKeyBits';
 601 
 602   $Options{output} = 'text';
 603   $Options{outdelim} = 'comma';
 604   $Options{quote} = 'yes';
 605 
 606   $Options{size} = 166;
 607 
 608   $Options{vectorstringformat} = 'ValuesString';
 609 
 610   if (!GetOptions(\%Options, "bitsorder=s", "bitstringformat|b=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabel=s",  "help|h", "keeplargestcomponent|k=s", "mode|m=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "size|s=i", "vectorstringformat|v=s", "workingdir|w=s")) {
 611     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 612   }
 613   if ($Options{workingdir}) {
 614     if (! -d $Options{workingdir}) {
 615       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 616     }
 617     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 618   }
 619   if ($Options{bitsorder} !~ /^(Ascending|Descending)$/i) {
 620     die "Error: The value specified, $Options{bitsorder}, for option \"--BitsOrder\" is not valid. Allowed values: Ascending or Descending\n";
 621   }
 622   if ($Options{bitstringformat} !~ /^(BinaryString|HexadecimalString)$/i) {
 623     die "Error: The value specified, $Options{bitstringformat}, for option \"-b, --bitstringformat\" is not valid. Allowed values: BinaryString or HexadecimalString\n";
 624   }
 625   if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
 626     die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
 627   }
 628   if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
 629     die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
 630   }
 631   if ($Options{filter} !~ /^(Yes|No)$/i) {
 632     die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
 633   }
 634   if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
 635     die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
 636   }
 637   if ($Options{mode} !~ /^(MACCSKeyBits|MACCSKeyCount)$/i) {
 638     die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: MACCSKeyBits or MACCSKeyCount\n";
 639   }
 640   if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
 641     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
 642   }
 643   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 644     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 645   }
 646   if ($Options{quote} !~ /^(Yes|No)$/i) {
 647     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
 648   }
 649   if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
 650     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
 651   }
 652   if (!(IsPositiveInteger($Options{size}) && ($Options{size} == 166 || $Options{size} == 322))) {
 653     die "Error: The value specified, $Options{size}, for option \"-s, --size\" is not valid. Allowed values: 166 or 322 \n";
 654   }
 655   if ($Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
 656     die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
 657   }
 658 }
 659