MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: ExtractFromSDFiles.pl,v $
   4 # $Date: 2011/12/16 00:03:30 $
   5 # $Revision: 1.41 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2012 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use SDFileUtil;
  36 use FileUtil;
  37 use TextUtil;
  38 
  39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  40 
  41 # Autoflush STDOUT
  42 $| = 1;
  43 
  44 # Starting message...
  45 $ScriptName = basename($0);
  46 print "\n$ScriptName:Starting...\n\n";
  47 $StartTime = new Benchmark;
  48 
  49 # Get the options and setup script...
  50 SetupScriptUsage();
  51 if ($Options{help} || @ARGV < 1) {
  52   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  53 }
  54 
  55 my(@SDFilesList);
  56 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  57 
  58 # Process options...
  59 print "Processing options...\n";
  60 my(%OptionsInfo);
  61 ProcessOptions();
  62 
  63 # Collect information about SD files...
  64 print "Checking input SD file(s)...\n";
  65 my(%SDFilesInfo);
  66 RetrieveSDFilesInfo();
  67 
  68 # Generate output files...
  69 my($FileIndex);
  70 if (@SDFilesList > 1) {
  71   print "\nProcessing SD files...\n";
  72 }
  73 for $FileIndex (0 .. $#SDFilesList) {
  74   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  75     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  76     ExtractFromSDFile($FileIndex);
  77   }
  78 }
  79 print "\n$ScriptName:Done...\n\n";
  80 
  81 $EndTime = new Benchmark;
  82 $TotalTime = timediff ($EndTime, $StartTime);
  83 print "Total time: ", timestr($TotalTime), "\n";
  84 
  85 ###############################################################################
  86 
  87 # Extract data from a SD file...
  88 sub ExtractFromSDFile {
  89   my($FileIndex) = @_;
  90 
  91   OpenInputAndOutputFiles($FileIndex);
  92 
  93   MODE: {
  94     if ($OptionsInfo{Mode} =~ /^AllDataFields$/i) {
  95       ExtractAllDataFields($FileIndex);
  96       last MODE;
  97     }
  98     if ($OptionsInfo{Mode} =~ /^CommonDataFields$/i) {
  99       ExtractCommonDataFields($FileIndex);
 100       last MODE;
 101     }
 102     if ($OptionsInfo{Mode} =~ /^DataFields$/i) {
 103       ExtractDataFields($FileIndex);
 104       last MODE;
 105     }
 106     if ($OptionsInfo{Mode} =~ /^(DataFieldByList|DatafieldUniqueByList)$/i) {
 107       ExtractDataFieldByList($FileIndex);
 108       last MODE;
 109     }
 110     if ($OptionsInfo{Mode} =~ /^DataFieldsByValue$/i) {
 111       ExtractDataFieldsByValue($FileIndex);
 112       last MODE;
 113     }
 114     if ($OptionsInfo{Mode} =~ /^DataFieldsByRegex$/i) {
 115       ExtractDataFieldsByRegex($FileIndex);
 116       last MODE;
 117     }
 118     if ($OptionsInfo{Mode} =~ /^RandomCmpds$/i) {
 119       ExtractRandomCompounds($FileIndex);
 120       last MODE;
 121     }
 122     if ($OptionsInfo{Mode} =~ /^MolNames$/i) {
 123       ExtractMolNames($FileIndex);
 124       last MODE;
 125     }
 126     if ($OptionsInfo{Mode} =~ /^RecordNum$/i) {
 127       ExtractRecordNum($FileIndex);
 128       last MODE;
 129     }
 130     if ($OptionsInfo{Mode} =~ /^RecordRange$/i) {
 131       ExtractRecordRange($FileIndex);
 132       last MODE;
 133     }
 134     if ($OptionsInfo{Mode} =~ /^2DCmpdRecords$/i) {
 135       Extract2DCmpdRecords($FileIndex);
 136       last MODE;
 137     }
 138     if ($OptionsInfo{Mode} =~ /^3DCmpdRecords$/i) {
 139       Extract3DCmpdRecords($FileIndex);
 140       last MODE;
 141     }
 142     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, molnames, randomcmpds, recordnum, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
 143   }
 144 
 145   CloseInputAndOutputFiles();
 146 }
 147 
 148 # Extract all data fields...
 149 sub ExtractAllDataFields {
 150   my($FileIndex) = @_;
 151   my(@CmpdLines);
 152 
 153   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 154   WriteTextFileColLabels();
 155 
 156   while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 157     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 158     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 159 
 160     SetupDataValues();
 161     WriteTextFileCmpdData();
 162     WriteSDFileCmpdData();
 163   }
 164 }
 165 
 166 # Extract common data fields...
 167 sub ExtractCommonDataFields {
 168   my($FileIndex) = @_;
 169   my(@CmpdLines);
 170 
 171   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{CommonDataFieldLabels}[$FileIndex]};
 172   WriteTextFileColLabels();
 173 
 174   while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 175     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 176     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 177 
 178     SetupDataValues();
 179     WriteTextFileCmpdData();
 180     WriteSDFileCmpdData();
 181   }
 182 }
 183 
 184 # Extract specified data fields...
 185 sub ExtractDataFields {
 186   my($FileIndex) = @_;
 187   my(@CmpdLines);
 188 
 189   @{$SDFilesInfo{DataLabels}} = @{$OptionsInfo{SpecifiedDataFieldLabels}};
 190   WriteTextFileColLabels();
 191 
 192   while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 193     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 194     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 195 
 196     SetupDataValues();
 197     WriteTextFileCmpdData();
 198     WriteSDFileCmpdData();
 199   }
 200 }
 201 
 202 # Extract data fields using a list...
 203 sub ExtractDataFieldByList {
 204   my($FileIndex) = @_;
 205   my($CmpdNum, $Value, $SpecifiedDataFieldValuesFoundCount, $CurrentValue, $SpecifiedDataFieldLabel, @CmpdLines);
 206 
 207   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 208   WriteTextFileColLabels();
 209 
 210   for $Value (keys %{$OptionsInfo{SpecifiedDataFieldValues}}) {
 211     $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound";
 212   }
 213   $SpecifiedDataFieldValuesFoundCount = 0;
 214   $SpecifiedDataFieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};
 215 
 216   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 217     $CmpdNum++;
 218 
 219     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 220     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 221 
 222     if (!exists $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel}) {
 223       next CMPDSTRING;
 224     }
 225 
 226     SetupDataValues();
 227 
 228     $SpecifiedDataFieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};
 229     $CurrentValue = $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel};
 230 
 231     if (exists $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue}) {
 232       if ($SpecifiedDataFieldValuesFoundCount < $OptionsInfo{SpecifiedDataFieldValuesCount}) {
 233         if ($OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue} eq "NotFound") {
 234           $SpecifiedDataFieldValuesFoundCount++;
 235           $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue} = "Found";
 236           if ($OptionsInfo{Mode} =~ /^DataFieldUniqueByList$/i) {
 237             WriteSDFileCmpdString();
 238             WriteTextFileCmpdData();
 239           }
 240         }
 241         if ($OptionsInfo{Mode} =~ /^DataFieldByList$/i) {
 242           WriteSDFileCmpdString();
 243           WriteTextFileCmpdData();
 244         }
 245       }
 246       if ($SpecifiedDataFieldValuesFoundCount >= $OptionsInfo{SpecifiedDataFieldValuesCount}) {
 247         last CMPDSTRING;
 248       }
 249     }
 250   }
 251 }
 252 
 253 # Extract data fields by value...
 254 sub ExtractDataFieldsByValue {
 255   my($FileIndex) = @_;
 256   my($Label, $CurrentValue, $SpecifiedCriterion, $SpecifiedValue, $ViolationCount, $Nothing, @CmpdLines);
 257 
 258   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 259   WriteTextFileColLabels();
 260 
 261   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 262     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 263     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 264 
 265     SetupDataValues();
 266     $ViolationCount = 0;
 267 
 268     for $Label (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
 269       if (exists $SDFilesInfo{DataFieldValues}{$Label}) {
 270         $CurrentValue = $SDFilesInfo{DataFieldValues}{$Label};
 271         $SpecifiedCriterion = $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label};
 272         $SpecifiedValue = $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label};
 273 
 274         if ($OptionsInfo{NumericalComparison}) {
 275           CRITERION: {
 276               if ($SpecifiedCriterion =~ /^eq$/i) { if ($CurrentValue != $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 277               if ($SpecifiedCriterion =~ /^le$/i) { if ($CurrentValue > $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 278               if ($SpecifiedCriterion =~ /^ge$/i) { if ($CurrentValue < $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 279               $Nothing = 1;
 280             }
 281         }
 282         else {
 283           CRITERION: {
 284               if ($SpecifiedCriterion =~ /^eq$/i) { if ($CurrentValue ne $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 285               if ($SpecifiedCriterion =~ /^le$/i) { if ($CurrentValue gt $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 286               if ($SpecifiedCriterion =~ /^ge$/i) { if ($CurrentValue lt $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 287               $Nothing = 1;
 288             }
 289         }
 290       }
 291     }
 292     if ($ViolationCount <= $OptionsInfo{Violations}) {
 293       WriteSDFileCmpdString();
 294       WriteTextFileCmpdData();
 295     }
 296   }
 297 }
 298 
 299 # Extract data fields by value using regular expression match...
 300 sub ExtractDataFieldsByRegex {
 301   my($FileIndex) = @_;
 302   my($Label, $CurrentValue, $SpecifiedRegexCriterion, $SpecifiedRegex, $ViolationCount, $Nothing, @CmpdLines);
 303 
 304   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 305   WriteTextFileColLabels();
 306 
 307   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 308     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 309     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 310 
 311     SetupDataValues();
 312     $ViolationCount = 0;
 313 
 314     for $Label (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
 315       if (exists $SDFilesInfo{DataFieldValues}{$Label}) {
 316         $CurrentValue = $SDFilesInfo{DataFieldValues}{$Label};
 317            $SpecifiedRegexCriterion = $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label};
 318            $SpecifiedRegex = $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label};
 319 
 320         if ($OptionsInfo{RegexIgnoreCase}) {
 321           CRITERION: {
 322                  if ($SpecifiedRegexCriterion =~ /^eq$/i) { if ($CurrentValue !~ /$SpecifiedRegex/i) { $ViolationCount++; last CRITERION; } }
 323                  if ($SpecifiedRegexCriterion =~ /^ne$/i) { if ($CurrentValue =~ /$SpecifiedRegex/i) {  $ViolationCount++; last CRITERION; } }
 324               $Nothing = 1;
 325             }
 326         }
 327         else {
 328           CRITERION: {
 329                  if ($SpecifiedRegexCriterion =~ /^eq$/i) { if ($CurrentValue !~ /$SpecifiedRegex/) { $ViolationCount++; last CRITERION; } }
 330                  if ($SpecifiedRegexCriterion =~ /^ne$/i) { if ($CurrentValue =~ /$SpecifiedRegex/) {  $ViolationCount++; last CRITERION; } }
 331               $Nothing = 1;
 332             }
 333         }
 334       }
 335     }
 336     if ($ViolationCount <= $OptionsInfo{Violations}) {
 337       WriteSDFileCmpdString();
 338       WriteTextFileCmpdData();
 339     }
 340   }
 341 }
 342 
 343 # Extract random compounds...
 344 sub ExtractRandomCompounds {
 345   my($FileIndex) = @_;
 346   my($CmpdNum, $CmpdCount, $RandomCycleCount, $RandomIndex, @CmpdLines, %RandomCmpdIndexMap);
 347 
 348   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 349   WriteTextFileColLabels();
 350 
 351   $CmpdCount = $SDFilesInfo{CmpdCount}[$FileIndex];
 352   srand($OptionsInfo{Seed});
 353   $RandomCycleCount = 0;
 354 
 355   %RandomCmpdIndexMap = ();
 356   while ($RandomCycleCount <= $CmpdCount && $RandomCycleCount <= $OptionsInfo{NumOfCmpds}) {
 357     $RandomCycleCount++;
 358     $RandomIndex = int (rand $CmpdCount) + 1;
 359     $RandomCmpdIndexMap{$RandomIndex} = $RandomIndex;
 360   }
 361 
 362   $CmpdNum = 0;
 363   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 364     $CmpdNum++;
 365     if (!exists $RandomCmpdIndexMap{$CmpdNum}) {
 366       next CMPDSTRING;
 367     }
 368 
 369     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 370 
 371     WriteSDFileCmpdString();
 372 
 373     if ($OptionsInfo{OutputTextFile}) {
 374       %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 375       SetupDataValues();
 376       WriteTextFileCmpdData();
 377     }
 378   }
 379 }
 380 
 381 # Extract mol names...
 382 sub ExtractMolNames {
 383   my($FileIndex) = @_;
 384   my($MolName, $NewTextFileRef, @CmpdLines);
 385 
 386   push @{$SDFilesInfo{DataLabels}}, "MolName";
 387   WriteTextFileColLabels();
 388 
 389   $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
 390   while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 391     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 392     $MolName = QuoteAWord(ParseCmpdMolNameLine($CmpdLines[0]), $OptionsInfo{OutQuote});
 393     print $NewTextFileRef "$MolName\n";
 394   }
 395 }
 396 
 397 # Extract a specific compound records...
 398 sub ExtractRecordNum {
 399   my($FileIndex) = @_;
 400   my($CmpdNum, @CmpdLines);
 401 
 402   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 403   WriteTextFileColLabels();
 404 
 405   $CmpdNum = 0;
 406 
 407   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 408     $CmpdNum++;
 409     if ($CmpdNum != $OptionsInfo{RecordNum}) {
 410       next CMPDSTRING;
 411     }
 412 
 413     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 414     WriteSDFileCmpdString();
 415 
 416     if ($OptionsInfo{OutputTextFile}) {
 417       %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 418       SetupDataValues();
 419       WriteTextFileCmpdData();
 420     }
 421     last CMPDSTRING;
 422   }
 423 }
 424 
 425 # Extract compounds in a specific record range...
 426 sub ExtractRecordRange {
 427   my($FileIndex) = @_;
 428   my($CmpdNum, @CmpdLines);
 429 
 430   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 431   WriteTextFileColLabels();
 432 
 433   $CmpdNum = 0;
 434   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 435     $CmpdNum++;
 436 
 437     if ($CmpdNum >= $OptionsInfo{StartRecordNum} && $CmpdNum <= $OptionsInfo{EndRecordNum}) {
 438       @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 439 
 440       WriteSDFileCmpdString();
 441 
 442       if ($OptionsInfo{OutputTextFile}) {
 443         %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 444         SetupDataValues();
 445         WriteTextFileCmpdData(\*NEWTEXTFILE);
 446       }
 447     }
 448     elsif ($CmpdNum > $OptionsInfo{EndRecordNum}) {
 449       last CMPDSTRING;
 450     }
 451   }
 452 }
 453 
 454 # Extract 2D compound records...
 455 sub Extract2DCmpdRecords {
 456   my($FileIndex) = @_;
 457   my(@CmpdLines);
 458 
 459   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 460   WriteTextFileColLabels();
 461 
 462 
 463   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 464     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 465     if (!IsCmpd2D(\@CmpdLines)) {
 466       next CMPDSTRING;
 467     }
 468 
 469     WriteSDFileCmpdString();
 470 
 471     if ($OptionsInfo{OutputTextFile}) {
 472       %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 473       SetupDataValues();
 474       WriteTextFileCmpdData();
 475     }
 476   }
 477 }
 478 
 479 # Extract 3D compound records...
 480 sub Extract3DCmpdRecords {
 481   my($FileIndex) = @_;
 482   my(@CmpdLines);
 483 
 484   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 485   WriteTextFileColLabels();
 486 
 487 
 488   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 489     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 490     if (!IsCmpd3D(\@CmpdLines)) {
 491       next CMPDSTRING;
 492     }
 493 
 494     WriteSDFileCmpdString();
 495 
 496     if ($OptionsInfo{OutputTextFile}) {
 497       %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 498       SetupDataValues();
 499       WriteTextFileCmpdData();
 500     }
 501   }
 502 }
 503 
 504 
 505 # Open input and output files...
 506 sub OpenInputAndOutputFiles {
 507   my($FileIndex) = @_;
 508 
 509   $SDFilesInfo{NewTextFileRef} = undef;
 510   $SDFilesInfo{NewSDFileRef} = undef;
 511 
 512   if ($OptionsInfo{OutputTextFile} && $OptionsInfo{OutputSDFile}) {
 513     print "Generating files $SDFilesInfo{NewSDFileName}[$FileIndex] and $SDFilesInfo{NewTextFileName}[$FileIndex]...\n";
 514   }
 515   elsif ($OptionsInfo{OutputSDFile}) {
 516     print "Generating file $SDFilesInfo{NewSDFileName}[$FileIndex]...\n";
 517   }
 518   else {
 519     print "Generating file $SDFilesInfo{NewTextFileName}[$FileIndex]...\n";
 520   }
 521 
 522   if ($OptionsInfo{OutputSDFile}) {
 523     open NEWSDFILE, ">$SDFilesInfo{NewSDFileName}[$FileIndex]" or die "Error: Couldn't open $SDFilesInfo{NewSDFileName}[$FileIndex]: $! \n";
 524     $SDFilesInfo{NewSDFileRef} = \*NEWSDFILE;
 525   }
 526   if ($OptionsInfo{OutputTextFile}) {
 527     open NEWTEXTFILE, ">$SDFilesInfo{NewTextFileName}[$FileIndex]" or die "Error: Couldn't open $SDFilesInfo{NewTextFileName}[$FileIndex]: $! \n";
 528     $SDFilesInfo{NewTextFileRef} = \*NEWTEXTFILE;
 529   }
 530 
 531   open SDFILE, "$SDFilesList[$FileIndex]" or die "Error: Couldn't open $SDFilesList[$FileIndex]: $! \n";
 532   $SDFilesInfo{InputSDFileRef} = \*SDFILE;
 533 
 534 }
 535 
 536 # Close open input and output files...
 537 sub CloseInputAndOutputFiles {
 538   if ($SDFilesInfo{NewSDFileRef}) {
 539     close $SDFilesInfo{NewSDFileRef};
 540   }
 541   if ($SDFilesInfo{NewTextFileRef}) {
 542     close $SDFilesInfo{NewTextFileRef};
 543   }
 544 
 545   if ($SDFilesInfo{InputSDFileRef}) {
 546     close $SDFilesInfo{InputSDFileRef};
 547   }
 548 
 549   $SDFilesInfo{NewTextFileRef} = undef;
 550   $SDFilesInfo{NewSDFileRef} = undef;
 551   $SDFilesInfo{InputSDFileRef} = undef;
 552 }
 553 
 554 # Write out column labels for text file...
 555 sub WriteTextFileColLabels {
 556   my($ColLabelsLine, $NewTextFileRef);
 557 
 558   if (!$OptionsInfo{OutputTextFile}) {
 559     return;
 560   }
 561 
 562   $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
 563 
 564   if ($OptionsInfo{OutoutStrDataString}) {
 565     # Append structure data string label...
 566     my(@DataLabels);
 567 
 568     @DataLabels = ();
 569     push @DataLabels, @{$SDFilesInfo{DataLabels}};
 570     push @DataLabels, "StructureDataString";
 571 
 572     $ColLabelsLine = JoinWords(\@DataLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 573   }
 574   else {
 575     $ColLabelsLine = JoinWords(\@{$SDFilesInfo{DataLabels}}, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 576   }
 577   print $NewTextFileRef "$ColLabelsLine\n";
 578 }
 579 
 580 # Setup values for data fields...
 581 sub SetupDataValues {
 582   @{$SDFilesInfo{DataValues}} = map { exists $SDFilesInfo{DataFieldValues}{$_} ? $SDFilesInfo{DataFieldValues}{$_} : "" } @{$SDFilesInfo{DataLabels}};
 583 }
 584 
 585 # Write out structure data and specific data fields to SD file...
 586 sub WriteSDFileCmpdData {
 587   my($MolString, $Count, $NewSDFileRef);
 588 
 589   if (!$OptionsInfo{OutputSDFile}) {
 590     return;
 591   }
 592 
 593   $NewSDFileRef = $SDFilesInfo{NewSDFileRef};
 594 
 595   ($MolString) = split "M  END", $SDFilesInfo{CmpdString};
 596   $MolString .= "M  END";
 597   print $NewSDFileRef "$MolString\n";
 598 
 599   for $Count (0 .. $#{$SDFilesInfo{DataLabels}}) {
 600     print $NewSDFileRef ">  <$SDFilesInfo{DataLabels}[$Count]>\n$SDFilesInfo{DataValues}[$Count]\n\n";
 601   }
 602   print $NewSDFileRef "\$\$\$\$\n";
 603 }
 604 
 605 # Write out compound string...
 606 sub WriteSDFileCmpdString {
 607   my($NewSDFileRef);
 608 
 609   if (!$OptionsInfo{OutputSDFile}) {
 610     return;
 611   }
 612 
 613   $NewSDFileRef = $SDFilesInfo{NewSDFileRef};
 614   print $NewSDFileRef "$SDFilesInfo{CmpdString}\n";
 615 }
 616 
 617 # Write out data for text file...
 618 sub WriteTextFileCmpdData {
 619   my($DataValuesLine, $NewTextFileRef);
 620 
 621   if (!$OptionsInfo{OutputTextFile}) {
 622     return;
 623   }
 624 
 625   $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
 626   $DataValuesLine = JoinWords(\@{$SDFilesInfo{DataValues}}, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 627 
 628   # Handle multiple lines data values for data fields by joining 'em using semicolons...
 629   if ($DataValuesLine =~ /\n/) {
 630     $DataValuesLine =~ s/\n/;/g;
 631   }
 632 
 633   if ($OptionsInfo{OutoutStrDataString}) {
 634     # Append structure data string...
 635     my($StrDataString, $OutQuoteValue, $OutDelim, $StrDataStringDelimiter);
 636 
 637     if ($OptionsInfo{StrDataStringWithFields}) {
 638       $StrDataString = $SDFilesInfo{CmpdString};
 639     }
 640     else {
 641       ($StrDataString) = split "M  END", $SDFilesInfo{CmpdString};
 642       $StrDataString .= "M  END";
 643     }
 644     $StrDataStringDelimiter = $OptionsInfo{StrDataStringDelimiter};
 645     $StrDataString =~ s/\n/$StrDataStringDelimiter/g;
 646 
 647     $OutDelim = $OptionsInfo{OutDelim};
 648     $OutQuoteValue = $OptionsInfo{OutQuote} ? "\"" : "";
 649 
 650     print $NewTextFileRef "$DataValuesLine${OutDelim}${OutQuoteValue}${StrDataString}${OutQuoteValue}\n";
 651   }
 652   else {
 653     print $NewTextFileRef "$DataValuesLine\n";
 654   }
 655 }
 656 
 657 # Retrieve information about input SD files...
 658 sub RetrieveSDFilesInfo {
 659   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $NewFileName, $NewSDFileName, $NewTextFileName, $CmpdCount);
 660 
 661   %SDFilesInfo = ();
 662 
 663   @{$SDFilesInfo{FileOkay}} = ();
 664   @{$SDFilesInfo{CmpdCount}} = ();
 665   @{$SDFilesInfo{NewTextFileName}} = ();
 666   @{$SDFilesInfo{NewSDFileName}} = ();
 667 
 668   @{$SDFilesInfo{AllDataFieldLabels}} = ();
 669   @{$SDFilesInfo{CommonDataFieldLabels}} = ();
 670 
 671   FILELIST: for $Index (0 .. $#SDFilesList) {
 672     $SDFile = $SDFilesList[$Index];
 673 
 674     $SDFilesInfo{FileOkay}[$Index] = 0;
 675 
 676     $SDFilesInfo{CmpdCount}[$Index] = 0;
 677     $SDFilesInfo{NewTextFileName}[$Index] = "";
 678     $SDFilesInfo{NewSDFileName}[$Index] = "";
 679 
 680     @{$SDFilesInfo{AllDataFieldLabels}[$Index]} = ();
 681     @{$SDFilesInfo{CommonDataFieldLabels}[$Index]} = ();
 682 
 683     if (!(-e $SDFile)) {
 684       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 685       next FILELIST;
 686     }
 687 
 688     if (!CheckFileType($SDFile, "sd sdf")) {
 689       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 690       next FILELIST;
 691     }
 692 
 693     # Generate appropriate name for the new output file.
 694     $FileDir = ""; $FileName = ""; $FileExt = "";
 695     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 696     $NewFileName = $FileName;
 697     $NewFileName = $FileName  . $OptionsInfo{FileNameMode};
 698     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 699       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 700       if ($RootFileName && $RootFileExt) {
 701         $NewFileName = $RootFileName;
 702       }
 703       else {
 704         $NewFileName = $OptionsInfo{OutFileRoot};
 705       }
 706     }
 707     $NewSDFileName = $NewFileName . ".$OptionsInfo{SDFileExt}";
 708     $NewTextFileName = $NewFileName . ".$OptionsInfo{TextFileExt}";
 709 
 710     if ($OptionsInfo{OutputSDFile}) {
 711       if (lc($NewSDFileName) eq lc($SDFile)) {
 712         warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
 713         print "Specify a different name using \"-r --root\" option or use default name.\n";
 714         next FILELIST;
 715       }
 716     }
 717 
 718     if (!$OptionsInfo{Overwrite}) {
 719       if ($OptionsInfo{OutputSDFile}) {
 720         if (-e $NewSDFileName) {
 721           warn "Warning: Ignoring file $SDFile: New file, $NewSDFileName, already exists\n";
 722           next FILELIST;
 723         }
 724       }
 725       if ($OptionsInfo{OutputTextFile}) {
 726         if (-e $NewTextFileName) {
 727           warn "Warning: Ignoring file $SDFile: New file, $NewTextFileName, already exists\n";
 728           next FILELIST;
 729         }
 730       }
 731     }
 732 
 733     if (!open SDFILE, "$SDFile") {
 734       warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 735       next FILELIST;
 736     }
 737 
 738     my($CountCmpds, $CollectDataFields);
 739     my($CmpdString, @CmpdLines, @DataFieldLabels, %DataFieldLabelsMap,@CommonDataFieldLabels);
 740 
 741     $CountCmpds = ($OptionsInfo{Mode} =~ /^randomcmpds$/i) ? 1 : 0;
 742 
 743     $CollectDataFields = (($OptionsInfo{Mode} =~ /^(alldatafields|commondatafields|randomcmpds)$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafieldsbyvalue$/i  && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafieldbylist$/i  && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafielduniquebylist$/i  && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^recordrange$/i && $OptionsInfo{OutputTextFile})) ? 1 : 0;
 744 
 745     $CmpdCount = 0;
 746     if ($CountCmpds || $CollectDataFields) {
 747       @DataFieldLabels = ();
 748       @CommonDataFieldLabels = ();
 749       %DataFieldLabelsMap = ();
 750       CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 751         $CmpdCount++;
 752         if ($OptionsInfo{Mode} =~ /^recordnum$/i) {
 753           if ($CmpdCount == $OptionsInfo{RecordNum}) {
 754             @CmpdLines = split "\n", $CmpdString;
 755             @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 756             last CMPDSTRING;
 757           }
 758         }
 759         if ($CollectDataFields) {
 760           my($Label);
 761           @CmpdLines = split "\n", $CmpdString;
 762           # Process compound data header labels and figure out which ones are present for
 763           # all the compounds...
 764           if (@DataFieldLabels) {
 765             my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);
 766             my(%CmpdDataFieldLabelsMap) = ();
 767             # Setup a map for the current labels...
 768             for $Label (@CmpdDataFieldLabels) {
 769               $CmpdDataFieldLabelsMap{$Label} = "PresentInSome";
 770             }
 771             # Check the presence old labels for this compound; otherwise, mark 'em new...
 772             for $Label (@DataFieldLabels) {
 773               if (!$CmpdDataFieldLabelsMap{$Label}) {
 774                 $DataFieldLabelsMap{$Label} = "PresentInSome";
 775               }
 776             }
 777             # Check the presence this compound in the old labels; otherwise, add 'em...
 778             for $Label (@CmpdDataFieldLabels ) {
 779               if (!$DataFieldLabelsMap{$Label}) {
 780                 # It's a new label...
 781                 push @DataFieldLabels, $Label;
 782                 $DataFieldLabelsMap{$Label} = "PresentInSome";
 783               }
 784             }
 785           }
 786           else {
 787             # Get the initial label set and set up a map...
 788             @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 789             for $Label (@DataFieldLabels) {
 790               $DataFieldLabelsMap{$Label} = "PresentInAll";
 791             }
 792           }
 793           # Identify the common data field labels...
 794           if ($Options{mode} =~ /^commondatafields$/i) {
 795             @CommonDataFieldLabels = ();
 796             for $Label (@DataFieldLabels) {
 797               if ($DataFieldLabelsMap{$Label} eq "PresentInAll") {
 798                 push @CommonDataFieldLabels, $Label;
 799               }
 800             }
 801           }
 802         }
 803       }
 804     }
 805 
 806     $SDFilesInfo{FileOkay}[$Index] = 1;
 807 
 808     $SDFilesInfo{NewTextFileName}[$Index] = $NewTextFileName;
 809     $SDFilesInfo{NewSDFileName}[$Index] = $NewSDFileName;
 810 
 811     $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;
 812 
 813     push @{$SDFilesInfo{AllDataFieldLabels}[$Index]}, @DataFieldLabels;
 814     push @{$SDFilesInfo{CommonDataFieldLabels}[$Index]}, @CommonDataFieldLabels;
 815 
 816     close SDFILE;
 817   }
 818 }
 819 
 820 # Process options...
 821 sub ProcessOptions {
 822   %OptionsInfo = ();
 823 
 824   $OptionsInfo{Mode} = $Options{mode};
 825 
 826   $OptionsInfo{InDelim} = "\,";
 827   if ($Options{indelim} =~ /^semicolon$/i) {
 828     $OptionsInfo{InDelim} = "\;";
 829   }
 830   elsif ($Options{indelim} =~ /^tab$/i) {
 831     $OptionsInfo{InDelim} = "\t";
 832   }
 833 
 834   $OptionsInfo{OutDelim} = "\,";
 835   if ($Options{outdelim} =~ /^semicolon$/i) {
 836     $OptionsInfo{OutDelim} = "\;";
 837   }
 838   elsif ($Options{outdelim} =~ /^tab$/i) {
 839     $OptionsInfo{OutDelim} = "\t";
 840   }
 841 
 842   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
 843 
 844   $OptionsInfo{RegexIgnoreCase} = ($Options{regexignorecase} =~ /^yes$/i) ? 1 : 0;
 845 
 846   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
 847   $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef;
 848 
 849   $OptionsInfo{NumOfCmpds} = $Options{numofcmpds};
 850 
 851   $OptionsInfo{ValueComparisonMode} = $Options{valuecomparisonmode};
 852   $OptionsInfo{NumericalComparison} = ($Options{valuecomparisonmode} =~ /^Numeric$/i) ? 1 : 0;
 853 
 854   $OptionsInfo{Violations} = $Options{violations};
 855   $OptionsInfo{Seed} = $Options{seed};
 856 
 857 
 858   if ($Options{mode} =~ /^(datafields|datafieldsbyregex|datafieldsbyvalue|datafieldbylist|datafielduniquebylist)$/i) {
 859     if ($Options{datafields} || $Options{datafieldsfile}) {
 860       if ($Options{datafields} && $Options{datafieldsfile}) {
 861         die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, or datafielduniquebylist, specify only one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
 862       }
 863     }
 864     else {
 865       die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, or datafielduniquebylist, specify one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
 866     }
 867   }
 868   $OptionsInfo{DataFields} = $Options{datafields} ? $Options{datafields} : undef;
 869   $OptionsInfo{DataFieldsFile} = $Options{datafieldsfile} ? $Options{datafieldsfile} : undef;
 870 
 871   $OptionsInfo{RecordNum} = 0; $OptionsInfo{StartRecordNum} = 0; $OptionsInfo{EndRecordNum} = 0;
 872   $OptionsInfo{Record} = $Options{record} ? $Options{record} : undef;
 873 
 874   if ($Options{mode} =~ /^(recordnum|recordrange)$/i) {
 875     if ($Options{record}) {
 876       my(@RecordSplit) = split ",", $Options{record};
 877       if ($Options{mode} =~ /^recordnum$/i ) {
 878         if (@RecordSplit == 1) {
 879           $OptionsInfo{RecordNum} = $RecordSplit[0];
 880           if ($OptionsInfo{RecordNum} <= 0) {
 881             die "Error: The value specified, $OptionsInfo{RecordNum},  for option \"--records\" is not valid. Allowed values: > 0 \n";
 882           }
 883         }
 884         else {
 885           die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 1 value is allowed.\n";
 886         }
 887       }
 888       else {
 889         if (@RecordSplit == 2) {
 890           $OptionsInfo{StartRecordNum} = $RecordSplit[0];
 891           $OptionsInfo{EndRecordNum} = $RecordSplit[1];
 892           if ($OptionsInfo{StartRecordNum} <= 0 || $OptionsInfo{EndRecordNum} <= 0) {
 893             die "Error: The value pair specified, $Options{record},  for option \"--records\" is not valid. Allowed values: > 0 \n";
 894           }
 895         }
 896         else {
 897           die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 2 values is allowed.\n";
 898         }
 899         if ($OptionsInfo{StartRecordNum} > $OptionsInfo{EndRecordNum}) {
 900           die "Error: Start record number, $OptionsInfo{StartRecordNum}, must be smaller than end record number, $OptionsInfo{EndRecordNum}.\nSpecify different values using \"--record\" option.\n";
 901         }
 902       }
 903     }
 904     else {
 905       die "Error: For \"-m --mode\" option values recordnum, or recordrange, specify \"--record\" option value.\n";
 906     }
 907   }
 908 
 909   @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
 910 
 911   my(@Words, $Line, $Value);
 912   if ($Options{mode} =~ /^datafields$/i) {
 913     @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
 914     if ($Options{datafields}) {
 915       @{$OptionsInfo{SpecifiedDataFieldLabels}} = split $OptionsInfo{InDelim}, $Options{datafields};
 916     }
 917     elsif ($Options{datafieldsfile}) {
 918       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
 919       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
 920         @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
 921         if (@Words) {
 922           push @{$OptionsInfo{SpecifiedDataFieldLabels}}, @Words;
 923         }
 924       }
 925       close DATAFIELDSFILE;
 926     }
 927   }
 928   elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) {
 929     my(@DataFieldsByValueTriplets);
 930     @DataFieldsByValueTriplets = ();
 931     if ($Options{datafields}) {
 932       @DataFieldsByValueTriplets = split $OptionsInfo{InDelim}, $Options{datafields};
 933     }
 934     elsif ($Options{datafieldsfile}) {
 935       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
 936       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
 937         @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
 938         if (@Words) {
 939           push @DataFieldsByValueTriplets, @Words;
 940         }
 941       }
 942       close DATAFIELDSFILE;
 943     }
 944     if ((@DataFieldsByValueTriplets % 3)) {
 945       if ($Options{datafields}) {
 946         die "Error: Triplets not found in values specified by \"-d --datafields\" option\n";
 947       }
 948       elsif ($Options{datafieldsfile}) {
 949         die "Error: Triplets not found in values specified by \"--datafieldsfile\" option\n";
 950       }
 951     }
 952     my($Index, $Label, $Value, $Criterion);
 953 
 954     @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
 955     %{$OptionsInfo{SpecifiedDataFieldValuesMap}} = ();
 956     %{$OptionsInfo{SpecifiedDataFieldCriteriaMap}} = ();
 957 
 958     for ($Index = 0; $Index < @DataFieldsByValueTriplets; $Index = $Index + 3) {
 959       $Label = $DataFieldsByValueTriplets[$Index];
 960       $Value = $DataFieldsByValueTriplets[$Index + 1];
 961       $Criterion = $DataFieldsByValueTriplets[$Index + 2];
 962 
 963       if ($Criterion =~ /^(eq|le|ge)$/i) {
 964         push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label;
 965         $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label} = $Value;
 966         $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label} = $Criterion;
 967       }
 968       else {
 969         warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion\n";
 970       }
 971     }
 972   }
 973   elsif ($Options{mode} =~ /^datafieldsbyregex$/i) {
 974     my(@DataFieldsByRegexTriplets);
 975 
 976     @DataFieldsByRegexTriplets = ();
 977     if ($Options{datafields}) {
 978       @DataFieldsByRegexTriplets = quotewords($OptionsInfo{InDelim}, 0, $Options{datafields});
 979     }
 980     elsif ($Options{datafieldsfile}) {
 981       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
 982       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
 983           @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
 984           if (@Words) {
 985             push @DataFieldsByRegexTriplets, @Words;
 986           }
 987       }
 988       close DATAFIELDSFILE;
 989     }
 990     if ((@DataFieldsByRegexTriplets % 3)) {
 991       if ($Options{datafields}) {
 992           die "Error: Triplet not found in values specified by \"-d --datafields\" option\n";
 993       }
 994       elsif ($Options{datafieldsfile}) {
 995           die "Error: Triplet not found in values specified by \"--datafieldsfile\" option\n";
 996       }
 997     }
 998 
 999     my($Index, $Label, $Value, $Criterion);
1000 
1001     @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
1002     %{$OptionsInfo{SpecifiedDataFieldRegexMap}} = ();
1003     %{$OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}} = ();
1004 
1005     for ($Index = 0; $Index < @DataFieldsByRegexTriplets; $Index = $Index + 3) {
1006       $Label = $DataFieldsByRegexTriplets[$Index];
1007       $Value = $DataFieldsByRegexTriplets[$Index + 1];
1008       $Criterion = $DataFieldsByRegexTriplets[$Index + 2];
1009 
1010       if ($Criterion =~ /^(eq|ne)$/i) {
1011           push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label;
1012           $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label} = $Value;
1013           $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label} = $Criterion;
1014       }
1015       else {
1016           warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion; Supported values: eq or ne\n";
1017       }
1018     }
1019   }
1020   elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist)$/i) {
1021     my($Index, @DataFieldAndValuesList);
1022     if ($Options{datafields}) {
1023       @DataFieldAndValuesList = split $OptionsInfo{InDelim}, $Options{datafields};
1024     }
1025     elsif ($Options{datafieldsfile}) {
1026       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
1027       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
1028         @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
1029         if (@Words) {
1030           push @DataFieldAndValuesList, @Words;
1031         }
1032       }
1033       close DATAFIELDSFILE;
1034     }
1035     if (@DataFieldAndValuesList < 2) {
1036       if ($Options{datafields}) {
1037         die "Error: Invalid number of values specified by \"-d --datafields\" option\n";
1038       }
1039       elsif ($Options{datafieldsfile}) {
1040         die "Error: Invalid number values specified by \"--datafieldsfile\" option\n";
1041       }
1042     }
1043 
1044     $OptionsInfo{SpecifiedDataFieldLabel} = $DataFieldAndValuesList[0];
1045     $OptionsInfo{SpecifiedDataFieldValuesCount} = @DataFieldAndValuesList - 1;
1046     %{$OptionsInfo{SpecifiedDataFieldValues}} = ();
1047 
1048     for ($Index = 1; $Index < @DataFieldAndValuesList; $Index++) {
1049       $Value = $DataFieldAndValuesList[$Index];
1050       $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound";
1051     }
1052   }
1053 
1054   $OptionsInfo{SDFileExt} = "sdf";
1055   $OptionsInfo{TextFileExt} = "csv";
1056 
1057   if ($Options{outdelim} =~ /^tab$/i) {
1058     $OptionsInfo{TextFileExt} = "tsv";
1059   }
1060 
1061   if ($Options{mode} =~ /^(alldatafields|molnames)$/i) {
1062     $OptionsInfo{OutputSDFile} = 0;
1063     $OptionsInfo{OutputTextFile} = 1;
1064   }
1065   else {
1066     $OptionsInfo{OutputSDFile} = ($Options{output} =~ /^(SD|both)$/i) ? 1 : 0;
1067     $OptionsInfo{OutputTextFile} = ($Options{output} =~ /^(text|both)$/i) ? 1 : 0;
1068   }
1069 
1070   $OptionsInfo{StrDataString} = $Options{strdatastring};
1071   $OptionsInfo{OutoutStrDataString} = ($Options{strdatastring} =~ /^Yes$/i) ? 1 : 0;
1072 
1073   $OptionsInfo{StrDataStringDelimiter} = $Options{strdatastringdelimiter};
1074 
1075   if (IsEmpty($Options{strdatastringdelimiter})) {
1076     die "Error: No value specified for \"--StrDataStringDelimiter\" option.\n";
1077   }
1078   $OptionsInfo{StrDataStringMode} = $Options{strdatastringmode};
1079   $OptionsInfo{StrDataStringWithFields} = $Options{strdatastringmode} =~ /^StrAndDataFields$/i ? 1 : 0;
1080 
1081   MODE: {
1082     if ($Options{mode} =~ /^alldatafields$/i) { $OptionsInfo{FileNameMode} = "AllDataDields"; last MODE; }
1083     if ($Options{mode} =~ /^commondatafields$/i) { $OptionsInfo{FileNameMode} = "CommonDataDields"; last MODE; }
1084     if ($Options{mode} =~ /^datafields$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFields"; last MODE; }
1085     if ($Options{mode} =~ /^datafieldsbyvalue$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByValue"; last MODE; }
1086     if ($Options{mode} =~ /^datafieldsbyregex$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByRegex"; last MODE; }
1087     if ($Options{mode} =~ /^datafieldbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataField"; last MODE; }
1088     if ($Options{mode} =~ /^datafielduniquebylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedUniqueDataField"; last MODE; }
1089     if ($Options{mode} =~ /^molnames$/i) { $OptionsInfo{FileNameMode} = "MolName"; last MODE; }
1090     if ($Options{mode} =~ /^randomcmpds$/i) { $OptionsInfo{FileNameMode} = "RandomCmpds"; last MODE; }
1091     if ($Options{mode} =~ /^recordnum$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{RecordNum}"; last MODE; }
1092     if ($Options{mode} =~ /^recordrange$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{StartRecordNum}" . "To" . "$OptionsInfo{EndRecordNum}"; last MODE; }
1093     if ($Options{mode} =~ /^2dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "2DCmpdRecords"; last MODE; }
1094     if ($Options{mode} =~ /^3dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "3DCmpdRecords"; last MODE; }
1095     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, molnames, randomcmpds, recordnum, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
1096   }
1097 
1098 }
1099 
# Setup script usage  and retrieve command line arguments specified using various options...
#
# Resets the file-level %Options hash to its default values, lets GetOptions
# overlay whatever was specified on the command line, changes to the working
# directory when "-w --workingdir" is given, and validates the values of the
# enumerated and numerical options.
#
# Takes no arguments. Dies with an error message on an unknown option, an
# invalid working directory, or an option value outside its allowed set.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{numofcmpds} = 1;
  $Options{mode} = "alldatafields";
  $Options{indelim} = "comma";
  $Options{outdelim} = "comma";
  $Options{output} = "SD";
  $Options{quote} = "yes";
  $Options{regexignorecase} = "yes";
  $Options{valuecomparisonmode} = "numeric";
  $Options{violations} = 0;
  $Options{seed} = 123456789;

  $Options{strdatastring} = "no";
  $Options{strdatastringdelimiter} = "|";
  $Options{strdatastringmode} = "StrOnly";

  if (!GetOptions(\%Options, "help|h", "datafields|d=s", "datafieldsfile=s", "indelim=s", "mode|m=s", "numofcmpds|n=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "regexignorecase=s", "record=s", "root|r=s", "seed|s=i", "strdatastring=s", "strdatastringdelimiter=s", "strdatastringmode=s", "valuecomparisonmode=s", "violations|v=i", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Switch to the working directory before any further processing...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate numerical option values...
  if ($Options{numofcmpds} < 1) {
    die "Error: The value specified, $Options{numofcmpds},  for option \"-n --numofcmpds\" is not valid. Allowed values: >= 1 \n";
  }
  if ($Options{violations} < 0) {
    die "Error: The value specified, $Options{violations},  for option \"-v --violations\" is not valid. Allowed values: >= 0 \n";
  }

  # Validate enumerated option values; all comparisons ignore case...
  if ($Options{valuecomparisonmode} !~ /^(Numeric|Alphanumeric)$/i) {
    die "Error: The value specified, $Options{valuecomparisonmode}, for option \"--ValueComparisonMode\" is not valid. Allowed values: Numeric or Alphanumeric\n";
  }
  if ($Options{mode} !~ /^(alldatafields|commondatafields|datafields|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|molnames|randomcmpds|recordnum|recordrange|2dcmpdrecords|3dcmpdrecords)$/i) {
    # The list of allowed values mirrors the pattern above...
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, molnames, randomcmpds, recordnum, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }
  if ($Options{output} !~ /^(SD|text|both)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{regexignorecase} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{regexignorecase}, for option \"--regexignorecase\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{strdatastring} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{strdatastring}, for option \"--StrDataString\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{strdatastringmode} !~ /^(StrOnly|StrAndDataFields)$/i) {
    die "Error: The value specified, $Options{strdatastringmode}, for option \"--StrDataStringMode\" is not valid. Allowed values: StrOnly or StrAndDataFields\n";
  }
}
1163