MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: PathLengthFingerprints.pl,v $
   4 # $Date: 2008/04/19 16:12:21 $
   5 # $Revision: 1.16 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileUtil;
  37 use TextUtil;
  38 use SDFileUtil;
  39 use MoleculeFileIO;
  40 use Fingerprints::PathLengthFingerprints;
  41 
  42 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  43 
  44 # Autoflush STDOUT
  45 $| = 1;
  46 
  47 # Starting message...
  48 $ScriptName = basename($0);
  49 print "\n$ScriptName: Starting...\n\n";
  50 $StartTime = new Benchmark;
  51 
  52 # Get the options and setup script...
  53 SetupScriptUsage();
  54 if ($Options{help} || @ARGV < 1) {
  55   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  56 }
  57 
  58 my(@SDFilesList);
  59 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  60 
  61 # Process options...
  62 my(%OptionsInfo);
  63 ProcessOptions();
  64 
  65 # Setup information about input files...
  66 my(%SDFilesInfo);
  67 print "Checking input SD file(s)...\n";
  68 RetrieveSDFilesInfo();
  69 
  70 # Process input files..
  71 my($FileIndex, $SDFile, $FileProcessingMsg);
  72 $FileProcessingMsg = "Processing file";
  73 if (@SDFilesList > 1) {
  74   print "Processing SD files...\n";
  75   $FileProcessingMsg = "\n$FileProcessingMsg";
  76 }
  77 
  78 for $FileIndex (0 .. $#SDFilesList) {
  79   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  80     $SDFile = $SDFilesList[$FileIndex];
  81     print "$FileProcessingMsg $SDFile...\n";
  82     GeneratePathLengthFingerprints($FileIndex);
  83   }
  84 }
  85 print "$ScriptName:Done...\n\n";
  86 
  87 $EndTime = new Benchmark;
  88 $TotalTime = timediff ($EndTime, $StartTime);
  89 print "Total time: ", timestr($TotalTime), "\n";
  90 
  91 ###############################################################################
  92 
  93 # Generate fingerprints for a SD file...
  94 #
  95 sub GeneratePathLengthFingerprints {
  96   my($FileIndex) = @_;
  97   my($CmpdCount, $SDFile, $MoleculeFileIO, $Molecule, $PathLengthFingerprints, $NewSDFileRef, $NewTextFileRef);
  98 
  99   $SDFile = $SDFilesList[$FileIndex];
 100 
 101   # Setup output files...
 102   $NewSDFileRef = '';
 103   $NewTextFileRef = '';
 104   ($NewSDFileRef, $NewTextFileRef) = SetupAndOpenOutputFiles($FileIndex);
 105 
 106   $MoleculeFileIO = new MoleculeFileIO('Name' => $SDFile);
 107   $MoleculeFileIO->Open();
 108 
 109   $CmpdCount = 0;
 110   while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
 111     $CmpdCount++;
 112     $PathLengthFingerprints = GenerateMoleculeFingerprints($Molecule);
 113     WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $PathLengthFingerprints, $NewSDFileRef, $NewTextFileRef);
 114   }
 115   $MoleculeFileIO->Close();
 116 
 117   if ($OptionsInfo{SDOutput}) {
 118     close $NewSDFileRef;
 119   }
 120   if ($OptionsInfo{TextOutput}) {
 121     close $NewTextFileRef;
 122   }
 123 }
 124 
 125 # Open output files...
 126 #
 127 sub SetupAndOpenOutputFiles {
 128   my($FileIndex) = @_;
 129   my($NewSDFile, $NewTextFile, $NewSDFileRef, $NewTextFileRef);
 130 
 131   $NewSDFileRef = '';
 132   $NewTextFileRef = '';
 133 
 134   if ($OptionsInfo{SDOutput}) {
 135     $NewSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
 136     print "Generating SD file $NewSDFile...\n";
 137     open NEWSDFILE, ">$NewSDFile" or die "Error: Couldn't open $NewSDFile: $! \n";
 138     $NewSDFileRef = \*NEWSDFILE;
 139   }
 140   if ($OptionsInfo{TextOutput}) {
 141     $NewTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
 142     print "Generating text file $NewTextFile...\n";
 143     open NEWTEXTFILE, ">$NewTextFile" or die "Error: Couldn't open $NewTextFile: $! \n";
 144     WriteTextFileCoulmnLabels($FileIndex, \*NEWTEXTFILE);
 145     $NewTextFileRef = \*NEWTEXTFILE;
 146   }
 147   return ($NewSDFileRef, $NewTextFileRef);
 148 }
 149 
 150 # Write fingerpritns and other data to appropriate output files...
 151 #
 152 sub WriteDataToOutputFiles {
 153   my($FileIndex, $CmpdCount, $Molecule, $PathLengthFingerprints, $NewSDFileRef, $NewTextFileRef) = @_;
 154   my($FingerprintsString);
 155 
 156   $FingerprintsString = GetFingerprintsString($PathLengthFingerprints);
 157 
 158   if ($OptionsInfo{SDOutput}) {
 159     # Retrieve input compound string used to create molecule and write it out
 160     # without last line containing a delimiter...
 161     my($CmpdString);
 162     $CmpdString = $Molecule->GetMDLCmpdString();
 163     $CmpdString =~ s/\$\$\$\$$//;
 164     print $NewSDFileRef "$CmpdString";
 165 
 166     # Write out fingerprints data...
 167     print $NewSDFileRef  ">  <$OptionsInfo{FingerprintsLabel}>\n$FingerprintsString\n\n";
 168 
 169     # Write out delimiter...
 170     print $NewSDFileRef "\$\$\$\$\n";
 171   }
 172 
 173   if ($OptionsInfo{TextOutput}) {
 174     my($Line, $DataFieldLabelAndValuesRef, $DataFieldLabel, $DataFieldValue, @LineWords,);
 175 
 176     $DataFieldLabelAndValuesRef = $Molecule->GetMDLDataFieldLabelAndValues();
 177     @LineWords = ();
 178     if ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 179       push @LineWords, SetupCmpdIDForTextFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 180     }
 181     elsif ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 182       @LineWords = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 183     }
 184     elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 185       @LineWords = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 186     }
 187     elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 188       @LineWords = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$OptionsInfo{SpecifiedDataFields}};
 189     }
 190 
 191     # Add fingerprints string...
 192     push @LineWords, $FingerprintsString;
 193 
 194     $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 195     print $NewTextFileRef "$Line\n";
 196   }
 197 }
 198 
 199 # Write out approriate column labels to text file...
 200 sub WriteTextFileCoulmnLabels {
 201   my($FileIndex, $NewTextFileRef) = @_;
 202   my($Line, @LineWords);
 203 
 204   @LineWords = ();
 205   if ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 206     push @LineWords, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 207   }
 208   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 209     push @LineWords, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 210   }
 211   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 212     push @LineWords, @{$OptionsInfo{SpecifiedDataFields}};
 213   }
 214   elsif ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 215     push @LineWords, $OptionsInfo{CompoundIDLabel};
 216   }
 217   # Add fingerprints label...
 218   push @LineWords, $OptionsInfo{FingerprintsLabel};
 219 
 220   $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 221   print $NewTextFileRef "$Line\n";
 222 }
 223 
 224 # Generate compound ID for text files..
 225 #
 226 sub SetupCmpdIDForTextFiles {
 227   my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 228   my($CmpdID);
 229 
 230   $CmpdID = '';
 231   if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
 232     my($MolName);
 233     $MolName = $Molecule->GetName();
 234     $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
 235   }
 236   elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
 237     $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
 238   }
 239   elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
 240     my($SpecifiedDataField);
 241     $SpecifiedDataField = $OptionsInfo{CompoundID};
 242     $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
 243   }
 244   elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
 245     $CmpdID = $Molecule->GetName();
 246   }
 247   return $CmpdID;
 248 }
 249 
 250 # Get fingerprints bits as a string...
 251 #
 252 sub GetFingerprintsString {
 253   my($PathLengthFingerprints) = @_;
 254   my($FingerprintsString, $Size);
 255 
 256   $FingerprintsString = 'PathLength';
 257   $Size = $PathLengthFingerprints->GetSize();
 258   if ($OptionsInfo{FingerprintsOutput} =~ /^HexadecimalString$/i) {
 259     $FingerprintsString .= ":Hexadecimal:${Size}:" . $PathLengthFingerprints->GetFingerprintBitsAsHexadecimalString();
 260   }
 261   elsif ($OptionsInfo{FingerprintsOutput} =~ /^BinaryString$/i) {
 262     $FingerprintsString .= ":Binary:${Size}:" . $PathLengthFingerprints->GetFingerprintBitsAsBinaryString();
 263   }
 264   elsif ($OptionsInfo{FingerprintsOutput} =~ /^RawBinaryString$/i) {
 265     $FingerprintsString .= ":RawBinary:${Size}:" . $PathLengthFingerprints->GetFingerprintBitsAsRawBinaryString();
 266   }
 267   return $FingerprintsString;
 268 }
 269 
 270 # Generate fingerprints for molecule...
 271 #
 272 sub GenerateMoleculeFingerprints {
 273   my($Molecule) = @_;
 274   my($PathLengthFingerprints);
 275 
 276   if ($OptionsInfo{KeepLargestComponent}) {
 277     $Molecule->KeepLargestComponent();
 278   }
 279   if ($OptionsInfo{IgnoreHydrogens}) {
 280     $Molecule->DeleteHydrogens();
 281   }
 282   if ($OptionsInfo{DetectAromaticity} && $OptionsInfo{UseBondSymbols}) {
 283     $Molecule->DetectRings();
 284     $Molecule->DetectAromaticity();
 285   }
 286   $PathLengthFingerprints = new PathLengthFingerprints('Molecule' => $Molecule, 'Size' => $OptionsInfo{Size}, 'MinLength' => $OptionsInfo{MinPathLength}, 'MaxLength' => $OptionsInfo{MaxPathLength}, 'AllowRings' => $OptionsInfo{AllowRings}, 'AllowSharedBonds' => $OptionsInfo{AllowSharedBonds}, 'UseBondSymbols' => $OptionsInfo{UseBondSymbols});
 287 
 288   $PathLengthFingerprints->GenerateFingerprints();
 289 
 290   if ($OptionsInfo{Fold}) {
 291     my($CheckSizeValue) = 0;
 292     $PathLengthFingerprints->FoldFingerprintsBySize($OptionsInfo{FoldedSize}, $CheckSizeValue);
 293   }
 294   return $PathLengthFingerprints;
 295 }
 296 
 297 # Retrieve information about SD files...
 298 #
 299 sub RetrieveSDFilesInfo {
 300   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $NewSDFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);
 301 
 302   %SDFilesInfo = ();
 303   @{$SDFilesInfo{FileOkay}} = ();
 304   @{$SDFilesInfo{OutFileRoot}} = ();
 305   @{$SDFilesInfo{SDOutFileNames}} = ();
 306   @{$SDFilesInfo{TextOutFileNames}} = ();
 307   @{$SDFilesInfo{AllDataFieldsRef}} = ();
 308   @{$SDFilesInfo{CommonDataFieldsRef}} = ();
 309 
 310   $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
 311   $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;
 312 
 313   FILELIST: for $Index (0 .. $#SDFilesList) {
 314     $SDFile = $SDFilesList[$Index];
 315 
 316     $SDFilesInfo{FileOkay}[$Index] = 0;
 317     $SDFilesInfo{OutFileRoot}[$Index] = '';
 318     $SDFilesInfo{SDOutFileNames}[$Index] = '';
 319     $SDFilesInfo{TextOutFileNames}[$Index] = '';
 320 
 321     $SDFile = $SDFilesList[$Index];
 322     if (!(-e $SDFile)) {
 323       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 324       next FILELIST;
 325     }
 326     if (!CheckFileType($SDFile, "sd sdf")) {
 327       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 328       next FILELIST;
 329     }
 330 
 331     if ($CheckDataField) {
 332       # Make sure data field exists in SD file..
 333       my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);
 334 
 335       @CmpdLines = ();
 336       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 337       $CmpdString = ReadCmpdString(\*SDFILE);
 338       close SDFILE;
 339       @CmpdLines = split "\n", $CmpdString;
 340       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 341       $SpecifiedDataField = $OptionsInfo{CompoundID};
 342       if (!exists $DataFieldValues{$SpecifiedDataField}) {
 343 	warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
 344 	next FILELIST;
 345       }
 346     }
 347 
 348     $AllDataFieldsRef = '';
 349     $CommonDataFieldsRef = '';
 350     if ($CollectDataFields) {
 351       my($CmpdCount);
 352       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 353       ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
 354       close SDFILE;
 355     }
 356 
 357     # Setup output file names...
 358     $FileDir = ""; $FileName = ""; $FileExt = "";
 359     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 360 
 361     $TextOutFileExt = "csv";
 362     if ($Options{outdelim} =~ /^tab$/i) {
 363       $TextOutFileExt = "tsv";
 364     }
 365     $SDOutFileExt = $FileExt;
 366 
 367     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 368       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 369       if ($RootFileName && $RootFileExt) {
 370 	$FileName = $RootFileName;
 371       }
 372       else {
 373 	$FileName = $OptionsInfo{OutFileRoot};
 374       }
 375       $OutFileRoot = $FileName;
 376     }
 377     else {
 378       $OutFileRoot = "${FileName}PathLengthFP";
 379     }
 380 
 381     $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
 382     $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";
 383 
 384     if ($OptionsInfo{SDOutput}) {
 385       if ($SDFile =~ /$NewSDFileName/i) {
 386 	warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
 387 	print "Specify a different name using \"-r --root\" option or use default name.\n";
 388 	next FILELIST;
 389       }
 390     }
 391 
 392     if (!$OptionsInfo{OverwriteFiles}) {
 393       # Check SD and text outout files...
 394       if ($OptionsInfo{SDOutput}) {
 395 	if (-e $NewSDFileName) {
 396 	  warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
 397 	  next FILELIST;
 398 	}
 399       }
 400       if ($OptionsInfo{TextOutput}) {
 401 	if (-e $NewTextFileName) {
 402 	  warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
 403 	  next FILELIST;
 404 	}
 405       }
 406     }
 407 
 408     $SDFilesInfo{FileOkay}[$Index] = 1;
 409 
 410     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 411     $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
 412     $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;
 413 
 414     $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
 415     $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
 416   }
 417 }
 418 
 419 # Process option values...
 420 sub ProcessOptions {
 421   %OptionsInfo = ();
 422 
 423   $OptionsInfo{Mode} = $Options{mode};
 424   $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
 425   $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
 426   $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};
 427 
 428   my(@SpecifiedDataFields);
 429   @SpecifiedDataFields = ();
 430 
 431   $OptionsInfo{DataFields} = '';
 432   @{$OptionsInfo{SpecifiedDataFields}} = ();
 433   $OptionsInfo{CompoundID} = '';
 434 
 435   if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
 436     if ($Options{compoundidmode} =~ /^DataField$/i) {
 437       if (!$Options{compoundid}) {
 438 	die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
 439       }
 440       $OptionsInfo{CompoundID} = $Options{compoundid};
 441     }
 442     elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
 443       $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
 444     }
 445   }
 446   elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
 447     if (!$Options{datafields}) {
 448       die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
 449     }
 450     @SpecifiedDataFields = split /\,/, $Options{datafields};
 451     push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
 452   }
 453   $OptionsInfo{DetectAromaticity} = ($Options{detectaromaticity} =~ /^Yes$/i) ? 1 : 0;
 454 
 455   $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'PathLengthFingerprints';
 456   $OptionsInfo{FingerprintsOutput} = $Options{fingerprintsoutput};
 457 
 458   my($Size, $MinSize, $MaxSize);
 459   $MinSize = 32;
 460   $MaxSize = 2**32;
 461   $Size = $Options{size};
 462   if (!(IsPositiveInteger($Size) && $Size >= $MinSize && $Size <= $MaxSize && IsNumberPowerOfNumber($Size, 2))) {
 463     die "Error: Invalid size value, $Size, for \"-s, --size\" option. Allowed values: power of 2, >= minimum size of $MinSize, and <= maximum size of $MaxSize.\n";
 464   }
 465   $OptionsInfo{Size} = $Size;
 466 
 467   $OptionsInfo{Fold} = ($Options{fold} =~ /^Yes$/i) ? 1 : 0;
 468   my($FoldedSize);
 469   $FoldedSize = $Options{foldedsize};
 470   if ($Options{fold} =~ /^Yes$/i) {
 471     if (!(IsPositiveInteger($FoldedSize) && $FoldedSize < $Size && IsNumberPowerOfNumber($FoldedSize, 2))) {
 472       die "Error: Invalid folded size value, $FoldedSize, for \"--FoldedSize\" option. Allowed values: power of 2, >= minimum size of $MinSize, and < size value of $Size.\n";
 473     }
 474   }
 475   $OptionsInfo{FoldedSize} = $FoldedSize;
 476 
 477   $OptionsInfo{IgnoreHydrogens} = ($Options{ignorehydrogens} =~ /^Yes$/i) ? 1 : 0;
 478   $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;
 479 
 480   my($MinPathLength, $MaxPathLength);
 481   $MinPathLength = $Options{minpathlength};
 482   $MaxPathLength = $Options{maxpathlength};
 483   if (!IsPositiveInteger($MinPathLength)) {
 484     die "Error: Invalid path length value, $MinPathLength, for \"--MinPathLength\" option. Allowed values: > 0\n";
 485   }
 486   if (!IsPositiveInteger($MaxPathLength)) {
 487     die "Error: Invalid path length value, $MaxPathLength, for \"--MinPathLength\" option. Allowed values: > 0\n";
 488   }
 489   if ($MinPathLength >= $MaxPathLength) {
 490     die "Error: Invalid minimum and maximum path length values, $MinPathLength and $MaxPathLength, for \"--MinPathLength\"  and \"--MaxPathLength\"options. Allowed values: minimum path length value must be smaller than maximum path length value.\n";
 491   }
 492   $OptionsInfo{MinPathLength} = $MinPathLength;
 493   $OptionsInfo{MaxPathLength} = $MaxPathLength;
 494 
 495   $OptionsInfo{Output} = $Options{output};
 496   $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|Both)$/i) ? 1 : 0;
 497   $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|Both)$/i) ? 1 : 0;
 498 
 499   $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
 500   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;
 501 
 502   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 503   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 504 
 505   $OptionsInfo{UseBondSymbols} = ($Options{usebondsymbols} =~ /^Yes$/i) ? 1 : 0;
 506 
 507   # Setup parameters used during generation of fingerprints by PathLengthFingerprints class...
 508   my($AllowRings, $AllowSharedBonds);
 509   $AllowRings = 1;
 510   $AllowSharedBonds = 1;
 511   MODE: {
 512     if ($Options{mode} =~ /^AtomPathsWithoutRings$/i) { $AllowSharedBonds = 0; $AllowRings = 0; last MODE;}
 513     if ($Options{mode} =~ /^AtomPathsWithRings$/i) { $AllowSharedBonds = 0; $AllowRings = 1; last MODE;}
 514     if ($Options{mode} =~ /^AllAtomPathsWithoutRings$/i) { $AllowSharedBonds = 1; $AllowRings = 0; last MODE;}
 515     if ($Options{mode} =~ /^AllAtomPathsWithRings$/i) { $AllowSharedBonds = 1; $AllowRings = 1; last MODE;}
 516     die "Error: ProcessOptions: mode value, $Options{mode}, is not supported.\n";
 517   }
 518   $OptionsInfo{AllowRings} = $AllowRings;
 519   $OptionsInfo{AllowSharedBonds} = $AllowSharedBonds;
 520 }
 521 
 522 # Setup script usage  and retrieve command line arguments specified using various options...
 523 sub SetupScriptUsage {
 524 
 525   # Retrieve all the options...
 526   %Options = ();
 527 
 528   $Options{compoundidmode} = 'LabelPrefix';
 529   $Options{compoundidlabel} = 'CompoundID';
 530   $Options{datafieldsmode} = 'CompoundID';
 531   $Options{detectaromaticity} = 'Yes';
 532 
 533   $Options{fingerprintsoutput} = 'HexadecimalString';
 534 
 535   $Options{fold} = 'No';
 536   $Options{foldedsize} = 256;
 537 
 538   $Options{ignorehydrogens} = 'Yes';
 539   $Options{keeplargestcomponent} = 'Yes';
 540 
 541   $Options{mode} = 'AllAtomPathsWithRings';
 542 
 543   $Options{minpathlength} = 1;
 544   $Options{maxpathlength} = 8;
 545 
 546   $Options{output} = 'text';
 547   $Options{outdelim} = 'comma';
 548   $Options{quote} = 'yes';
 549 
 550   $Options{size} = 1024;
 551 
 552   $Options{usebondsymbols} = 'yes';
 553 
 554   if (!GetOptions(\%Options, "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "detectaromaticity=s", "fingerprintslabel=s", "fingerprintsoutput|f=s", "fold=s", "foldedsize=i", "help|h", "ignorehydrogens|i=s", "keeplargestcomponent|k=s", "mode|m=s", "minpathlength=i", "maxpathlength=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "size|s=i", "usebondsymbols|u=s", "workingdir|w=s")) {
 555     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 556   }
 557   if ($Options{workingdir}) {
 558     if (! -d $Options{workingdir}) {
 559       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 560     }
 561     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 562   }
 563   if ($Options{compoundidmode} !~ /(^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$)/i) {
 564     die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
 565   }
 566   if ($Options{datafieldsmode} !~ /(^(All|Common|Specify|CompoundID)$)/i) {
 567     die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
 568   }
 569   if ($Options{detectaromaticity} !~ /(^(Yes|No)$)/i) {
 570     die "Error: The value specified, $Options{detectaromaticity}, for option \"--DetectAromaticity\" is not valid. Allowed values: Yes or No\n";
 571   }
 572   if ($Options{fingerprintsoutput} !~ /(^(BinaryString|HexadecimalString|RawBinaryString)$)/i) {
 573     die "Error: The value specified, $Options{fingerprintsoutput}, for option \"-f, --FingerprintsOutput\" is not valid. Allowed values: BinaryString, HexadecimalString or RawBinaryString\n";
 574   }
 575   if ($Options{fold} !~ /(^(Yes|No)$)/i) {
 576     die "Error: The value specified, $Options{fold}, for option \"--fold\" is not valid. Allowed values: Yes or No\n";
 577   }
 578   if (!IsPositiveInteger($Options{foldedsize})) {
 579     die "Error: The value specified, $Options{foldedsize}, for option \"--FoldedSize\" is not valid. Allowed values: > 0 \n";
 580   }
 581   if ($Options{ignorehydrogens} !~ /(^(Yes|No)$)/i) {
 582     die "Error: The value specified, $Options{ignorehydrogens}, for option \"-i, --IgnoreHydrogens\" is not valid. Allowed values: Yes or No\n";
 583   }
 584   if ($Options{keeplargestcomponent} !~ /(^(Yes|No)$)/i) {
 585     die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
 586   }
 587   if ($Options{mode} !~ /(^(AtomPathsWithoutRings|AtomPathsWithRings|AllAtomPathsWithoutRings|AllAtomPathsWithRings)$)/i) {
 588     die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: AtomPathsWithoutRings, AtomPathsWithRings, AllAtomPathsWithoutRings or AllAtomPathsWithRings\n";
 589   }
 590   if (!IsPositiveInteger($Options{minpathlength})) {
 591     die "Error: The value specified, $Options{minpathlength}, for option \"--MinPathLength\" is not valid. Allowed values: > 0 \n";
 592   }
 593   if (!IsPositiveInteger($Options{maxpathlength})) {
 594     die "Error: The value specified, $Options{maxpathlength}, for option \"--MaxPathLength\" is not valid. Allowed values: > 0 \n";
 595   }
 596   if ($Options{output} !~ /(^(SD|text|both)$)/i) {
 597     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
 598   }
 599   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 600     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 601   }
 602   if ($Options{quote} !~ /^(Yes|No)$/i) {
 603     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
 604   }
 605   if (!IsPositiveInteger($Options{size})) {
 606     die "Error: The value specified, $Options{size}, for option \"-s, --size\" is not valid. Allowed values: > 0 \n";
 607   }
 608   if ($Options{usebondsymbols} !~ /^(Yes|No)$/i) {
 609     die "Error: The value specified, $Options{usebondsymbols}, for option \"-u, --UseBondSymbols\" is not valid. Allowed values: Yes or No\n";
 610   }
 611 }
 612