MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: ExtractFromSDFiles.pl,v $
   4 # $Date: 2010/07/18 17:14:05 $
   5 # $Revision: 1.35 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2010 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use Scalar::Util qw(looks_like_number);
use SDFileUtil;
use FileUtil;
use TextUtil;
  38 
  39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  40 
  41 # Autoflush STDOUT
  42 $| = 1;
  43 
  44 # Starting message...
  45 $ScriptName = basename($0);
  46 print "\n$ScriptName:Starting...\n\n";
  47 $StartTime = new Benchmark;
  48 
  49 # Get the options and setup script...
  50 SetupScriptUsage();
  51 if ($Options{help} || @ARGV < 1) {
  52   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  53 }
  54 
  55 my(@SDFilesList);
  56 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  57 
  58 print "Processing options...\n";
  59 my($InDelim, $OutDelim, $OutQuote, $RecordNum, $StartRecordNum, $EndRecordNum, $FileNameMode, $SDFileExt, $TextFileExt, $OutputSDFileFlag, $OutputTextFileFlag, $OutoutStrDataStringFlag, $StrDataStringDelimiter, $StrDataStringWithFieldsFlag, @SpecifiedDataFieldLabels, %SpecifiedDataFieldValuesMap, %SpecifiedDataFieldCriteriaMap, $SpecifiedDataFieldLabel, $SpecifiedDataFieldValuesCount, %SpecifiedDataFieldValues);
  60 ProcessOptions();
  61 
  62 # Collect information about SD files...
  63 print "Checking input SD file(s)...\n";
  64 my(@SDFilesOkay, @SDFilesCmpdCount, @SDFilesNewTextFileName, @SDFilesNewSDFileName, @SDFilesAllDataFieldLabels, @SDFilesCommonDataFieldLabels);
  65 RetrieveSDFilesInfo();
  66 
  67 my($SDFile, $Index, $CmpdString, @CmpdLines, @DataLabels, @DataValues, %DataFieldValues, $DataValuesLine, $ColLabelsLine, $Label, $MolName, $CmpdNum, $CmpdCount, %RandomCmpdIndexMap, $SpecifiedDataFieldValuesFoundCount, @Words, $Line, $Value);
  68 
  69 if (@SDFilesList > 1) {
  70   print "Processing SD files...\n";
  71 }
  72 SDFILE: for $Index (0 .. $#SDFilesList) {
  73   if (!$SDFilesOkay[$Index]) {
  74     next SDFILE;
  75   }
  76   $SDFile = $SDFilesList[$Index];
  77   if (@SDFilesList > 1) {
  78     print "\nProcessing file $SDFile...\n";
  79   }
  80   else {
  81     print "Processing file $SDFile...\n"
  82   }
  83   # Open output files...
  84   if ($OutputTextFileFlag && $OutputSDFileFlag) {
  85     print "Generating $SDFilesNewSDFileName[$Index] and $SDFilesNewTextFileName[$Index]...\n";
  86   }
  87   elsif ($OutputSDFileFlag) {
  88     print "Generating $SDFilesNewSDFileName[$Index]...\n";
  89   }
  90   else {
  91     print "Generating $SDFilesNewTextFileName[$Index]...\n";
  92   }
  93   if ($OutputSDFileFlag) {
  94     open NEWSDFILE, ">$SDFilesNewSDFileName[$Index]" or die "Error: Couldn't open $SDFilesNewSDFileName[$Index]: $! \n";
  95   }
  96   if ($OutputTextFileFlag) {
  97     open NEWTEXTFILE, ">$SDFilesNewTextFileName[$Index]" or die "Error: Couldn't open $SDFilesNewTextFileName[$Index]: $! \n";
  98   }
  99   # Prepare for mode specific processing....
 100   @DataLabels = ();
 101   if ($Options{mode} =~ /^alldatafields$/i) {
 102     @DataLabels = @{$SDFilesAllDataFieldLabels[$Index]};
 103   }
 104   elsif ($Options{mode} =~ /^commondatafields$/i) {
 105     @DataLabels = @{$SDFilesCommonDataFieldLabels[$Index]};
 106   }
 107   elsif ($Options{mode} =~ /^datafields$/i) {
 108     @DataLabels = @SpecifiedDataFieldLabels;
 109   }
 110   elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist)$/i) {
 111     for $Value (keys %SpecifiedDataFieldValues) {
 112       $SpecifiedDataFieldValues{$Value} = "NotFound";
 113     }
 114     $SpecifiedDataFieldValuesFoundCount = 0;
 115     @DataLabels = @{$SDFilesAllDataFieldLabels[$Index]};
 116   }
 117   elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) {
 118     @DataLabels = @{$SDFilesAllDataFieldLabels[$Index]};
 119   }
 120   elsif ($Options{mode} =~ /^randomcmpds$/i) {
 121     my($RandomCycleCount, $RandomIndex);
 122     $CmpdCount = $SDFilesCmpdCount[$Index];
 123     %RandomCmpdIndexMap = ();
 124     srand($Options{seed});
 125     $RandomCycleCount = 0;
 126     while ($RandomCycleCount <= $CmpdCount && $RandomCycleCount <= $Options{numofcmpds}) {
 127       $RandomCycleCount++;
 128       $RandomIndex = int (rand $CmpdCount) + 1;
 129       $RandomCmpdIndexMap{$RandomIndex} = $RandomIndex;
 130     }
 131     @DataLabels = @{$SDFilesAllDataFieldLabels[$Index]};
 132   }
 133   elsif ($Options{mode} =~ /^molnames$/i) {
 134     push @DataLabels, "MolName";
 135   }
 136   elsif ($Options{mode} =~ /^recordnum$/i) {
 137     @DataLabels = @{$SDFilesAllDataFieldLabels[$Index]};
 138   }
 139   elsif ($Options{mode} =~ /^recordrange$/i) {
 140     @DataLabels = @{$SDFilesAllDataFieldLabels[$Index]};
 141   }
 142 
 143   if ($OutputTextFileFlag) {
 144     if ($OutoutStrDataStringFlag) {
 145       # Append structure data string label...
 146       my(@NewDataLabels);
 147 
 148       @NewDataLabels = ();
 149       push @NewDataLabels, @DataLabels;
 150       push @NewDataLabels, "StructureDataString";
 151 
 152       $ColLabelsLine = JoinWords(\@NewDataLabels, $OutDelim, $OutQuote);
 153     }
 154     else {
 155       $ColLabelsLine = JoinWords(\@DataLabels, $OutDelim, $OutQuote);
 156     }
 157     print NEWTEXTFILE "$ColLabelsLine\n";
 158   }
 159 
 160   open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 161   $CmpdNum = 0;
 162   CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 163     $CmpdNum++;
 164     @DataValues = ();
 165 
 166     if ($Options{mode} =~ /^(alldatafields|commondatafields|datafields)$/i) {
 167       @CmpdLines = split "\n", $CmpdString;
 168       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 169 
 170       SetupDataValues();
 171       WriteTextFileCmpdData();
 172       WriteSDFileCmpdData();
 173     }
 174     elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist)$/i) {
 175       my($CurrentValue);
 176 
 177       @CmpdLines = split "\n", $CmpdString;
 178       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 179 
 180       SetupDataValues();
 181       if (exists $DataFieldValues{$SpecifiedDataFieldLabel}) {
 182 	$CurrentValue = $DataFieldValues{$SpecifiedDataFieldLabel};
 183 	if (exists $SpecifiedDataFieldValues{$CurrentValue}) {
 184 	  if ($SpecifiedDataFieldValuesFoundCount < $SpecifiedDataFieldValuesCount) {
 185 	    if ($SpecifiedDataFieldValues{$CurrentValue} eq "NotFound") {
 186 	      $SpecifiedDataFieldValuesFoundCount++;
 187 	      $SpecifiedDataFieldValues{$CurrentValue} = "Found";
 188 	      if ($Options{mode} =~ /^datafielduniquebylist$/i) {
 189 		WriteSDFileCmpdString();
 190 		WriteTextFileCmpdData();
 191 	      }
 192 	    }
 193 	    if ($Options{mode} =~ /^datafieldbylist$/i) {
 194 	      WriteSDFileCmpdString();
 195 	      WriteTextFileCmpdData();
 196 	    }
 197 	  }
 198 	  if ($SpecifiedDataFieldValuesFoundCount >= $SpecifiedDataFieldValuesCount) {
 199 	    last CMPDSTRING;
 200 	  }
 201 	}
 202       }
 203     }
 204     elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) {
 205       my($CurrentValue, $SpecifiedCriterion, $SpecifiedValue, $ViolationCount, $Nothing);
 206 
 207       @CmpdLines = split "\n", $CmpdString;
 208       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 209 
 210       SetupDataValues();
 211       $ViolationCount = 0;
 212       for $Label (@SpecifiedDataFieldLabels) {
 213 	if (exists $DataFieldValues{$Label}) {
 214 	  $CurrentValue = $DataFieldValues{$Label};
 215 	  $SpecifiedCriterion = $SpecifiedDataFieldCriteriaMap{$Label};
 216 	  $SpecifiedValue = $SpecifiedDataFieldValuesMap{$Label};
 217 	SWITCH: {
 218 	    if ($SpecifiedCriterion =~ /^eq$/i) { if ($CurrentValue ne $SpecifiedValue) { $ViolationCount++; last SWITCH; } }
 219 	    if ($SpecifiedCriterion =~ /^le$/i) { if ($CurrentValue gt $SpecifiedValue) { $ViolationCount++; } }
 220 	    if ($SpecifiedCriterion =~ /^ge$/i) { if ($CurrentValue lt $SpecifiedValue) { $ViolationCount++; } }
 221 	    $Nothing = 1;
 222 	  }
 223 	}
 224       }
 225       if ($ViolationCount <= $Options{violations}) {
 226 	WriteSDFileCmpdString();
 227 	WriteTextFileCmpdData();
 228       }
 229     }
 230     elsif ($Options{mode} =~ /^randomcmpds$/i) {
 231       if (exists $RandomCmpdIndexMap{$CmpdNum}) {
 232 	@CmpdLines = split "\n", $CmpdString;
 233 
 234 	WriteSDFileCmpdString();
 235 	if ($OutputTextFileFlag) {
 236 	  %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 237 	  SetupDataValues();
 238 	  WriteTextFileCmpdData();
 239 	}
 240       }
 241     }
 242     elsif ($Options{mode} =~ /^molnames$/i) {
 243       @CmpdLines = split "\n", $CmpdString;
 244       $MolName = QuoteAWord(ParseCmpdMolNameLine($CmpdLines[0]), $OutQuote);
 245       print NEWTEXTFILE "$MolName\n";
 246     }
 247     elsif ($Options{mode} =~ /^recordnum$/i) {
 248       if ($CmpdNum == $RecordNum) {
 249 	@CmpdLines = split "\n", $CmpdString;
 250 
 251 	WriteSDFileCmpdString();
 252 	if ($OutputTextFileFlag) {
 253 	  %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 254 	  SetupDataValues();
 255 	  WriteTextFileCmpdData();
 256 	}
 257 	last CMPDSTRING;
 258       }
 259     }
 260     elsif ($Options{mode} =~ /^recordrange$/i) {
 261       if ($CmpdNum >= $StartRecordNum && $CmpdNum <= $EndRecordNum) {
 262 	@CmpdLines = split "\n", $CmpdString;
 263 
 264 	WriteSDFileCmpdString();
 265 	if ($OutputTextFileFlag) {
 266 	  %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 267 	  SetupDataValues();
 268 	  WriteTextFileCmpdData();
 269 	}
 270       }
 271       elsif ($CmpdNum > $EndRecordNum) {
 272 	last CMPDSTRING;
 273       }
 274     }
 275   }
 276   close SDFILE;
 277 
 278   if ($OutputSDFileFlag) {
 279     close NEWSDFILE;
 280   }
 281   if ($OutputTextFileFlag) {
 282     close NEWTEXTFILE;
 283   }
 284 }
 285 print "$ScriptName:Done...\n\n";
 286 
 287 $EndTime = new Benchmark;
 288 $TotalTime = timediff ($EndTime, $StartTime);
 289 print "Total time: ", timestr($TotalTime), "\n";
 290 
 291 ###############################################################################
 292 
 293 # Process options...
 294 sub ProcessOptions {
 295   $InDelim = "\,";
 296   if ($Options{indelim} =~ /^semicolon$/i) {
 297     $InDelim = "\;";
 298   }
 299   elsif ($Options{indelim} =~ /^tab$/i) {
 300     $InDelim = "\t";
 301   }
 302   $OutDelim = "\,";
 303   if ($Options{outdelim} =~ /^semicolon$/i) {
 304     $OutDelim = "\;";
 305   }
 306   elsif ($Options{outdelim} =~ /^tab$/i) {
 307     $OutDelim = "\t";
 308   }
 309   $OutQuote = 1;
 310   if ($Options{quote} =~ /^no$/i) {
 311     $OutQuote = 0;
 312   }
 313   if ($Options{mode} =~ /^(datafields|datafieldsbyvalue|datafieldbylist|datafielduniquebylist)$/i) {
 314     if ($Options{datafields} || $Options{datafieldsfile}) {
 315       if ($Options{datafields} && $Options{datafieldsfile}) {
 316 	die "Error: For \"-m --mode\" option values datafields, datafieldsbyvalue, datafieldbylist, or datafielduniquebylist, specify only one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
 317       }
 318     }
 319     else {
 320       die "Error: For \"-m --mode\" option values datafields, datafieldsbyvalue, datafieldbylist, or datafielduniquebylist, specify one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
 321     }
 322   }
 323   $RecordNum = 0; $StartRecordNum = 0; $EndRecordNum = 0;
 324   if ($Options{mode} =~ /^(recordnum|recordrange)$/i) {
 325     if ($Options{record}) {
 326       my(@RecordSplit) = split ",", $Options{record};
 327       if ($Options{mode} =~ /^recordnum$/i ) {
 328 	if (@RecordSplit == 1) {
 329 	  $RecordNum = $RecordSplit[0];
 330 	  if ($RecordNum <= 0) {
 331 	    die "Error: The value specified, $RecordNum,  for option \"--records\" is not valid. Allowed values: > 0 \n";
 332 	  }
 333 	}
 334 	else {
 335 	  die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 1 value is allowed.\n";
 336 	}
 337       }
 338       else {
 339 	if (@RecordSplit == 2) {
 340 	  $StartRecordNum = $RecordSplit[0];
 341 	  $EndRecordNum = $RecordSplit[1];
 342 	  if ($StartRecordNum <= 0 || $EndRecordNum <= 0) {
 343 	    die "Error: The value pair specified, $Options{record},  for option \"--records\" is not valid. Allowed values: > 0 \n";
 344 	  }
 345 	}
 346 	else {
 347 	  die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 2 values is allowed.\n";
 348 	}
 349 	if ($StartRecordNum > $EndRecordNum) {
 350 	  die "Error: Start record number, $StartRecordNum, must be smaller than end record number, $EndRecordNum.\nSpecify different values using \"--record\" option.\n";
 351 	}
 352       }
 353     }
 354     else {
 355       die "Error: For \"-m --mode\" option values recordnum, or recordrange, specify \"--record\" option value.\n";
 356     }
 357   }
 358   my(@Words, $Line, $Value);
 359   if ($Options{mode} =~ /^datafields$/i) {
 360     @SpecifiedDataFieldLabels = ();
 361     if ($Options{datafields}) {
 362       @SpecifiedDataFieldLabels = split "$InDelim", $Options{datafields};
 363     }
 364     elsif ($Options{datafieldsfile}) {
 365       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
 366       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
 367 	@Words = quotewords($InDelim, 0, $Line);
 368 	if (@Words) {
 369 	  push @SpecifiedDataFieldLabels, @Words;
 370 	}
 371       }
 372       close DATAFIELDSFILE;
 373     }
 374   }
 375   elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) {
 376     my(@DataFieldsByValueTriplets);
 377     @DataFieldsByValueTriplets = ();
 378     if ($Options{datafields}) {
 379       @DataFieldsByValueTriplets = split "$InDelim", $Options{datafields};
 380     }
 381     elsif ($Options{datafieldsfile}) {
 382       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
 383       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
 384 	@Words = quotewords($InDelim, 0, $Line);
 385 	if (@Words) {
 386 	  push @DataFieldsByValueTriplets, @Words;
 387 	}
 388       }
 389       close DATAFIELDSFILE;
 390     }
 391     if ((@DataFieldsByValueTriplets % 3)) {
 392       if ($Options{datafields}) {
 393 	die "Error: Triplets not found in values specified by \"-d --datafields\" option\n";
 394       }
 395       elsif ($Options{datafieldsfile}) {
 396 	die "Error: Triplets not found in values specified by \"--datafieldsfile\" option\n";
 397       }
 398     }
 399     @SpecifiedDataFieldLabels = ();
 400     %SpecifiedDataFieldValuesMap = ();
 401     %SpecifiedDataFieldCriteriaMap = ();
 402     for ($Index = 0; $Index < @DataFieldsByValueTriplets; $Index = $Index + 3) {
 403       $Label = $DataFieldsByValueTriplets[$Index];
 404       my($Value) = $DataFieldsByValueTriplets[$Index + 1];
 405       my($Criterion) = $DataFieldsByValueTriplets[$Index + 2];
 406       if ($Criterion =~ /^(eq|le|ge)$/i) {
 407 	push @SpecifiedDataFieldLabels, $Label;
 408 	$SpecifiedDataFieldValuesMap{$Label} = $Value;
 409 	$SpecifiedDataFieldCriteriaMap{$Label} = $Criterion;
 410       }
 411       else {
 412 	warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion\n";
 413       }
 414     }
 415   }
 416   elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist)$/i) {
 417     my(@DataFieldAndValuesList);
 418     if ($Options{datafields}) {
 419       @DataFieldAndValuesList = split "$InDelim", $Options{datafields};
 420     }
 421     elsif ($Options{datafieldsfile}) {
 422       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
 423       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
 424 	@Words = quotewords($InDelim, 0, $Line);
 425 	if (@Words) {
 426 	  push @DataFieldAndValuesList, @Words;
 427 	}
 428       }
 429       close DATAFIELDSFILE;
 430     }
 431     if (@DataFieldAndValuesList < 2) {
 432       if ($Options{datafields}) {
 433 	die "Error: Invalid number of values specified by \"-d --datafields\" option\n";
 434       }
 435       elsif ($Options{datafieldsfile}) {
 436 	die "Error: Invalid number values specified by \"--datafieldsfile\" option\n";
 437       }
 438     }
 439     $SpecifiedDataFieldLabel = $DataFieldAndValuesList[0];
 440     %SpecifiedDataFieldValues = ();
 441     $SpecifiedDataFieldValuesCount = @DataFieldAndValuesList - 1;
 442     for ($Index = 1; $Index < @DataFieldAndValuesList; $Index++) {
 443       $Value = $DataFieldAndValuesList[$Index];
 444       $SpecifiedDataFieldValues{$Value} = "NotFound";
 445     }
 446   }
 447 
 448   $SDFileExt = "sdf";
 449   $TextFileExt = "csv";
 450   if ($Options{outdelim} =~ /^tab$/i) {
 451     $TextFileExt = "tsv";
 452   }
 453   if ($Options{mode} =~ /^(alldatafields|molnames)$/i) {
 454     $OutputSDFileFlag = 0;
 455     $OutputTextFileFlag = 1;
 456   }
 457   else {
 458     $OutputSDFileFlag = ($Options{output} =~ /^(SD|both)$/i) ? 1 : 0;
 459     $OutputTextFileFlag = ($Options{output} =~ /^(text|both)$/i) ? 1 : 0;
 460   }
 461 
 462   $OutoutStrDataStringFlag = ($Options{strdatastring} =~ /^Yes$/i) ? 1 : 0;
 463   $StrDataStringDelimiter = $Options{strdatastringdelimiter};
 464   if (IsEmpty($StrDataStringDelimiter)) {
 465     die "Error: No value specified for \"--StrDataStringDelimiter\" option.\n";
 466   }
 467   $StrDataStringWithFieldsFlag = $Options{strdatastringmode} =~ /^StrAndDataFields$/i ? 1 : 0;
 468 
 469   my($Nothing);
 470  SWITCH: {
 471     if ($Options{mode} =~ /^alldatafields$/i) { $FileNameMode = "AllDataDields"; last SWITCH; }
 472     if ($Options{mode} =~ /^commondatafields$/i) { $FileNameMode = "CommonDataDields"; last SWITCH; }
 473     if ($Options{mode} =~ /^datafields$/i) { $FileNameMode = "SpecifiedDataFields"; last SWITCH; }
 474     if ($Options{mode} =~ /^datafieldsbyvalue$/i) { $FileNameMode = "SpecifiedDataFieldsByValue"; last SWITCH; }
 475     if ($Options{mode} =~ /^datafieldbylist$/i) { $FileNameMode = "SpecifiedDataField"; last SWITCH; }
 476     if ($Options{mode} =~ /^datafielduniquebylist$/i) { $FileNameMode = "SpecifiedUniqueDataField"; last SWITCH; }
 477     if ($Options{mode} =~ /^molnames$/i) { $FileNameMode = "MolName"; last SWITCH; }
 478     if ($Options{mode} =~ /^randomcmpds$/i) { $FileNameMode = "RandomCmpds"; last SWITCH; }
 479     if ($Options{mode} =~ /^recordnum$/i) { $FileNameMode = "RecordNum$RecordNum"; last SWITCH; }
 480     if ($Options{mode} =~ /^recordrange$/i) { $FileNameMode = "RecordNum$StartRecordNum" . "To" . "$EndRecordNum"; last SWITCH; }
 481     $Nothing = 1;
 482   }
 483 }
 484 
 485 # Retrieve information about input SD files...
 486 sub RetrieveSDFilesInfo {
 487   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $NewFileName, $NewSDFileName, $NewTextFileName, $CmpdCount);
 488 
 489   @SDFilesOkay = ();
 490   @SDFilesCmpdCount = ();
 491   @SDFilesNewTextFileName = ();
 492   @SDFilesNewSDFileName = ();
 493 
 494   @SDFilesAllDataFieldLabels = ();
 495   @SDFilesCommonDataFieldLabels = ();
 496 
 497  FILELIST: for $Index (0 .. $#SDFilesList) {
 498     $SDFile = $SDFilesList[$Index];
 499     $SDFilesOkay[$Index] = 0;
 500     $SDFilesCmpdCount[$Index] = 0;
 501     $SDFilesNewTextFileName[$Index] = "";
 502     $SDFilesNewSDFileName[$Index] = "";
 503 
 504     @{$SDFilesAllDataFieldLabels[$Index]} = ();
 505     @{$SDFilesCommonDataFieldLabels[$Index]} = ();
 506 
 507     if (!(-e $SDFile)) {
 508       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 509       next FILELIST;
 510     }
 511     if (!CheckFileType($SDFile, "sd sdf")) {
 512       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 513       next FILELIST;
 514     }
 515     # Generate appropriate name for the new output file.
 516     $FileDir = ""; $FileName = ""; $FileExt = "";
 517     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 518     $NewFileName = $FileName;
 519     $NewFileName = $FileName  . "$FileNameMode";
 520     if ($Options{root} && (@SDFilesList == 1)) {
 521       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 522       if ($RootFileName && $RootFileExt) {
 523 	$NewFileName = $RootFileName;
 524       }
 525       else {
 526 	$NewFileName = $Options{root};
 527       }
 528     }
 529     $NewSDFileName = $NewFileName . ".$SDFileExt";
 530     $NewTextFileName = $NewFileName . ".$TextFileExt";
 531     if ($OutputSDFileFlag) {
 532       if (lc($NewSDFileName) eq lc($SDFile)) {
 533 	warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
 534 	print "Specify a different name using \"-r --root\" option or use default name.\n";
 535 	next FILELIST;
 536       }
 537     }
 538     if (!$Options{overwrite}) {
 539       if ($OutputSDFileFlag) {
 540 	if (-e $NewSDFileName) {
 541 	  warn "Warning: Ignoring file $SDFile: New file, $NewSDFileName, already exists\n";
 542 	  next FILELIST;
 543 	}
 544       }
 545       if ($OutputTextFileFlag) {
 546 	if (-e $NewTextFileName) {
 547 	  warn "Warning: Ignoring file $SDFile: New file, $NewTextFileName, already exists\n";
 548 	  next FILELIST;
 549 	}
 550       }
 551     }
 552     if (!open SDFILE, "$SDFile") {
 553       warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 554       next FILELIST;
 555     }
 556 
 557     my($CountCmpdsFlag, $CollectDataFieldsFlag);
 558     my($CmpdString, @CmpdLines, @DataFieldLabels, %DataFieldLabelsMap,@CommonDataFieldLabels);
 559 
 560     $CountCmpdsFlag = ($Options{mode} =~ /^randomcmpds$/i) ? 1 : 0;
 561 
 562     $CollectDataFieldsFlag = (($Options{mode} =~ /^(alldatafields|commondatafields|randomcmpds)$/i && $OutputTextFileFlag) || ($Options{mode} =~ /^datafieldsbyvalue$/i  && $OutputTextFileFlag) || ($Options{mode} =~ /^datafieldbylist$/i  && $OutputTextFileFlag) || ($Options{mode} =~ /^datafielduniquebylist$/i  && $OutputTextFileFlag) || ($Options{mode} =~ /^recordrange$/i && $OutputTextFileFlag)) ? 1 : 0;
 563 
 564     $CmpdCount = 0;
 565     if ($CountCmpdsFlag || $CollectDataFieldsFlag) {
 566       @DataFieldLabels = ();
 567       @CommonDataFieldLabels = ();
 568       %DataFieldLabelsMap = ();
 569       CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 570 	$CmpdCount++;
 571 	if ($Options{mode} =~ /^recordnum$/i) {
 572 	  if ($CmpdCount == $RecordNum) {
 573 	    @CmpdLines = split "\n", $CmpdString;
 574 	    @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 575 	    last CMPDSTRING;
 576 	  }
 577 	}
 578 	if ($CollectDataFieldsFlag) {
 579 	  my($Label);
 580 	  @CmpdLines = split "\n", $CmpdString;
 581 	  # Process compound data header labels and figure out which ones are present for
 582 	  # all the compounds...
 583 	  if (@DataFieldLabels) {
 584 	    my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);
 585 	    my(%CmpdDataFieldLabelsMap) = ();
 586 	    # Setup a map for the current labels...
 587 	    for $Label (@CmpdDataFieldLabels) {
 588 	      $CmpdDataFieldLabelsMap{$Label} = "PresentInSome";
 589 	    }
 590 	    # Check the presence old labels for this compound; otherwise, mark 'em new...
 591 	    for $Label (@DataFieldLabels) {
 592 	      if (!$CmpdDataFieldLabelsMap{$Label}) {
 593 		$DataFieldLabelsMap{$Label} = "PresentInSome";
 594 	      }
 595 	    }
 596 	    # Check the presence this compound in the old labels; otherwise, add 'em...
 597 	    for $Label (@CmpdDataFieldLabels ) {
 598 	      if (!$DataFieldLabelsMap{$Label}) {
 599 		# It's a new label...
 600 		push @DataFieldLabels, $Label;
 601 		$DataFieldLabelsMap{$Label} = "PresentInSome";
 602 	      }
 603 	    }
 604 	  }
 605 	  else {
 606 	    # Get the initial label set and set up a map...
 607 	    @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 608 	    for $Label (@DataFieldLabels) {
 609 	      $DataFieldLabelsMap{$Label} = "PresentInAll";
 610 	    }
 611 	  }
 612 	  # Identify the common data field labels...
 613 	  if ($Options{mode} =~ /^commondatafields$/i) {
 614 	    @CommonDataFieldLabels = ();
 615 	    for $Label (@DataFieldLabels) {
 616 	      if ($DataFieldLabelsMap{$Label} eq "PresentInAll") {
 617 		push @CommonDataFieldLabels, $Label;
 618 	      }
 619 	    }
 620 	  }
 621 	}
 622       }
 623     }
 624 
 625     $SDFilesOkay[$Index] = 1;
 626     $SDFilesNewTextFileName[$Index] = "$NewTextFileName";
 627     $SDFilesNewSDFileName[$Index] = "$NewSDFileName";
 628 
 629     $SDFilesCmpdCount[$Index] = $CmpdCount;
 630     push @{$SDFilesAllDataFieldLabels[$Index]}, @DataFieldLabels;
 631     push @{$SDFilesCommonDataFieldLabels[$Index]}, @CommonDataFieldLabels;
 632 
 633     close SDFILE;
 634   }
 635 }
 636 
 637 # Setup values for data fields...
 638 sub SetupDataValues {
 639   @DataValues = ();
 640   for $Label (@DataLabels) {
 641     if (exists $DataFieldValues{$Label}) {
 642       push @DataValues, $DataFieldValues{$Label};
 643     }
 644     else {
 645       push @DataValues, "";
 646     }
 647   }
 648 }
 649 
 650 # Write out structure data and specific data fields to SD file...
 651 sub WriteSDFileCmpdData {
 652   my($Count);
 653   if ($OutputSDFileFlag) {
 654     my($MolString) = split "M  END", $CmpdString;
 655     $MolString .= "M  END";
 656     print NEWSDFILE "$MolString\n";
 657     for $Count (0 .. $#DataLabels) {
 658       print NEWSDFILE ">  <$DataLabels[$Count]>\n$DataValues[$Count]\n\n";
 659     }
 660     print NEWSDFILE "\$\$\$\$\n";
 661   }
 662 }
 663 
 664 # Write out compound string...
 665 sub WriteSDFileCmpdString {
 666   if ($OutputSDFileFlag) {
 667     print NEWSDFILE "$CmpdString\n";
 668   }
 669 }
 670 
 671 # Write out data for text file...
 672 sub WriteTextFileCmpdData {
 673   if ($OutputTextFileFlag) {
 674     $DataValuesLine = JoinWords(\@DataValues, $OutDelim, $OutQuote);
 675 
 676     # Handle multiple lines data values for data fields by joining 'em using semicolons...
 677     if ($DataValuesLine =~ /\n/) {
 678       $DataValuesLine =~ s/\n/;/g;
 679     }
 680 
 681     if ($OutoutStrDataStringFlag) {
 682       # Append structure data string...
 683       my($StrDataString, $OutQuoteValue);
 684 
 685       if ($StrDataStringWithFieldsFlag) {
 686 	$StrDataString = $CmpdString;
 687       }
 688       else {
 689 	($StrDataString) = split "M  END", $CmpdString;
 690 	$StrDataString .= "M  END";
 691       }
 692       $StrDataString =~ s/\n/$StrDataStringDelimiter/g;
 693       $OutQuoteValue = $OutDelim ? "\"" : "";
 694 
 695       print NEWTEXTFILE "$DataValuesLine${OutDelim}${OutQuoteValue}${StrDataString}${OutQuoteValue}\n";
 696     }
 697     else {
 698       print NEWTEXTFILE "$DataValuesLine\n";
 699     }
 700   }
 701 }
 702 
 703 # Setup script usage  and retrieve command line arguments specified using various options...
 704 sub SetupScriptUsage {
 705 
 706   # Retrieve all the options...
 707   %Options = ();
 708   $Options{numofcmpds} = 1;
 709   $Options{mode} = "alldatafields";
 710   $Options{indelim} = "comma";
 711   $Options{outdelim} = "comma";
 712   $Options{output} = "SD";
 713   $Options{quote} = "yes";
 714   $Options{violations} = 0;
 715   $Options{seed} = 999999999;
 716 
 717   $Options{strdatastring} = "no";
 718   $Options{strdatastringdelimiter} = "|";
 719   $Options{strdatastringmode} = "StrOnly";
 720 
 721   if (!GetOptions(\%Options, "help|h", "datafields|d=s", "datafieldsfile=s", "indelim=s", "mode|m=s", "numofcmpds|n=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "record=s", "root|r=s", "seed|s=i", "strdatastring=s", "strdatastringdelimiter=s", "strdatastringmode=s", "violations|v=i", "workingdir|w=s")) {
 722     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 723   }
 724   if ($Options{workingdir}) {
 725     if (! -d $Options{workingdir}) {
 726       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 727     }
 728     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 729   }
 730   if ($Options{numofcmpds} < 1) {
 731     die "Error: The value specified, $Options{numofcmpds},  for option \"-n --numofcmpds\" is not valid. Allowed values: >= 1 \n";
 732   }
 733   if ($Options{violations} < 0) {
 734     die "Error: The value specified, $Options{violations},  for option \"-v --violations\" is not valid. Allowed values: >= 0 \n";
 735   }
 736   if ($Options{mode} !~ /^(alldatafields|commondatafields|datafields|datafieldsbyvalue|datafieldbylist|datafielduniquebylist|molnames|randomcmpds|recordnum|recordrange)$/i) {
 737     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, molnames, randomcmpds, recordnum, recordrange\n";
 738   }
 739   if ($Options{output} !~ /^(SD|text|both)$/i) {
 740     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
 741   }
 742   if ($Options{indelim} !~ /^(comma|semicolon|tab)$/i) {
 743     die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 744   }
 745   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 746     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 747   }
 748   if ($Options{quote} !~ /^(yes|no)$/i) {
 749     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 750   }
 751   if ($Options{strdatastring} !~ /^(yes|no)$/i) {
 752     die "Error: The value specified, $Options{strdatastring}, for option \"--StrDataString\" is not valid. Allowed values: yes or no\n";
 753   }
 754   if ($Options{strdatastringmode} !~ /^(StrOnly|StrAndDataFields)$/i) {
 755     die "Error: The value specified, $Options{strdatastringmode}, for option \"--StrDataStringMode\" is not valid. Allowed values: StrOnly or StrAndDataFields\n";
 756   }
 757 }
 758