MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: ExtractFromSDFiles.pl,v $
   4 # $Date: 2008/02/28 00:29:34 $
   5 # $Revision: 1.28 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use SDFileUtil;
  37 use FileUtil;
  38 use TextUtil;
  39 
  40 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  41 
  42 # Autoflush STDOUT
  43 $| = 1;
  44 
  45 # Starting message...
  46 $ScriptName = basename($0);
  47 print "\n$ScriptName:Starting...\n\n";
  48 $StartTime = new Benchmark;
  49 
  50 # Get the options and setup script...
  51 SetupScriptUsage();
  52 if ($Options{help} || @ARGV < 1) {
  53   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  54 }
  55 
  56 my(@SDFilesList);
  57 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  58 
  59 print "Processing options...\n";
  60 my($InDelim, $OutDelim, $OutQuote, $RecordNum, $StartRecordNum, $EndRecordNum, $FileNameMode, $SDFileExt, $TextFileExt, $OutputSDFileFlag, $OutputTextFileFlag, @SpecifiedDataFieldLabels, %SpecifiedDataFieldValuesMap, %SpecifiedDataFieldCriteriaMap, $SpecifiedDataFieldLabel, $SpecifiedDataFieldValuesCount, %SpecifiedDataFieldValues);
  61 ProcessOptions();
  62 
  63 # Collect information about SD files...
  64 print "Checking input SD file(s)...\n";
  65 my(@SDFilesOkay, @SDFilesCmpdCount, @SDFilesNewTextFileName, @SDFilesNewSDFileName, @SDFilesAllDataFieldLabels, @SDFilesCommonDataFieldLabels);
  66 RetrieveSDFilesInfo();
  67 
  68 my($SDFile, $Index, $CmpdString, @CmpdLines, @DataLabels, @DataValues, %DataFieldValues, $DataValuesLine, $ColLabelsLine, $Label, $MolName, $CmpdNum, $CmpdCount, %RandomCmpdIndexMap, $SpecifiedDataFieldValuesFoundCount, @Words, $Line, $Value);
  69 
  70 if (@SDFilesList > 1) {
  71   print "Processing SD files...\n";
  72 }
  73 SDFILE: for $Index (0 .. $#SDFilesList) {
  74   if (!$SDFilesOkay[$Index]) {
  75     next SDFILE;
  76   }
  77   $SDFile = $SDFilesList[$Index];
  78   if (@SDFilesList > 1) {
  79     print "\nProcessing file $SDFile...\n";
  80   }
  81   else {
  82     print "Processing file $SDFile...\n"
  83   }
  84   # Open output files...
  85   if ($OutputTextFileFlag && $OutputSDFileFlag) {
  86     print "Generating $SDFilesNewSDFileName[$Index] and $SDFilesNewTextFileName[$Index]...\n";
  87   }
  88   elsif ($OutputSDFileFlag) {
  89     print "Generating $SDFilesNewSDFileName[$Index] ...\n";
  90   }
  91   else {
  92     print "Generating $SDFilesNewTextFileName[$Index]...\n";
  93   }
  94   if ($OutputSDFileFlag) {
  95     open NEWSDFILE, ">$SDFilesNewSDFileName[$Index]" or die "Error: Couldn't open $SDFilesNewSDFileName[$Index]: $! \n";
  96   }
  97   if ($OutputTextFileFlag) {
  98     open NEWTEXTFILE, ">$SDFilesNewTextFileName[$Index]" or die "Error: Couldn't open $SDFilesNewTextFileName[$Index]: $! \n";
  99   }
 100   # Prepare for mode specific processing....
 101   @DataLabels = ();
 102   if ($Options{mode} =~ /^alldatafields$/i) {
 103     @DataLabels = @{$SDFilesAllDataFieldLabels[$Index]};
 104   }
 105   elsif ($Options{mode} =~ /^commondatafields$/i) {
 106     @DataLabels = @{$SDFilesCommonDataFieldLabels[$Index]};
 107   }
 108   elsif ($Options{mode} =~ /^datafields$/i) {
 109     @DataLabels = @SpecifiedDataFieldLabels;
 110   }
 111   elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist)$/i) {
 112     for $Value (keys %SpecifiedDataFieldValues) {
 113       $SpecifiedDataFieldValues{$Value} = "NotFound";
 114     }
 115     $SpecifiedDataFieldValuesFoundCount = 0;
 116     @DataLabels = @{$SDFilesAllDataFieldLabels[$Index]};
 117   }
 118   elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) {
 119     @DataLabels = @{$SDFilesAllDataFieldLabels[$Index]};
 120   }
 121   elsif ($Options{mode} =~ /^randomcmpds$/i) {
 122     my($RandomCycleCount, $RandomIndex);
 123     $CmpdCount = $SDFilesCmpdCount[$Index];
 124     %RandomCmpdIndexMap = ();
 125     srand($Options{seed});
 126     $RandomCycleCount = 0;
 127     while ($RandomCycleCount <= $CmpdCount && $RandomCycleCount <= $Options{numofcmpds}) {
 128       $RandomCycleCount++;
 129       $RandomIndex = int (rand $CmpdCount) + 1;
 130       $RandomCmpdIndexMap{$RandomIndex} = $RandomIndex;
 131     }
 132     @DataLabels = @{$SDFilesAllDataFieldLabels[$Index]};
 133   }
 134   elsif ($Options{mode} =~ /^molnames$/i) {
 135     push @DataLabels, "MolName";
 136   }
 137   elsif ($Options{mode} =~ /^recordnum$/i) {
 138     @DataLabels = @{$SDFilesAllDataFieldLabels[$Index]};
 139   }
 140   elsif ($Options{mode} =~ /^recordrange$/i) {
 141     @DataLabels = @{$SDFilesAllDataFieldLabels[$Index]};
 142   }
 143 
 144   if ($OutputTextFileFlag) {
 145     $ColLabelsLine = JoinWords(\@DataLabels, $OutDelim, $OutQuote);
 146     print NEWTEXTFILE "$ColLabelsLine\n";
 147   }
 148 
 149   open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 150   $CmpdNum = 0;
 151   CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 152     @CmpdLines = split "\n", $CmpdString;
 153     $CmpdNum++;
 154     @DataValues = ();
 155     if ($Options{mode} =~ /^(alldatafields|commondatafields|datafields)$/i) {
 156       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 157       SetupDataValues();
 158       WriteTextFileCmpdData();
 159       WriteSDFileCmpdData();
 160     }
 161     elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist)$/i) {
 162       my($CurrentValue);
 163       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 164       SetupDataValues();
 165       if (exists $DataFieldValues{$SpecifiedDataFieldLabel}) {
 166 	$CurrentValue = $DataFieldValues{$SpecifiedDataFieldLabel};
 167 	if (exists $SpecifiedDataFieldValues{$CurrentValue}) {
 168 	  if ($SpecifiedDataFieldValuesFoundCount < $SpecifiedDataFieldValuesCount) {
 169 	    if ($SpecifiedDataFieldValues{$CurrentValue} eq "NotFound") {
 170 	      $SpecifiedDataFieldValuesFoundCount++;
 171 	      $SpecifiedDataFieldValues{$CurrentValue} = "Found";
 172 	      if ($Options{mode} =~ /^datafielduniquebylist$/i) {
 173 		WriteSDFileCmpdString();
 174 		WriteTextFileCmpdData();
 175 	      }
 176 	    }
 177 	    if ($Options{mode} =~ /^datafieldbylist$/i) {
 178 	      WriteSDFileCmpdString();
 179 	      WriteTextFileCmpdData();
 180 	    }
 181 	  }
 182 	  if ($SpecifiedDataFieldValuesFoundCount >= $SpecifiedDataFieldValuesCount) {
 183 	    last CMPDSTRING;
 184 	  }
 185 	}
 186       }
 187     }
 188     elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) {
 189       my($CurrentValue, $SpecifiedCriterion, $SpecifiedValue, $ViolationCount, $Nothing);
 190       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 191       SetupDataValues();
 192       $ViolationCount = 0;
 193       for $Label (@SpecifiedDataFieldLabels) {
 194 	if (exists $DataFieldValues{$Label}) {
 195 	  $CurrentValue = $DataFieldValues{$Label};
 196 	  $SpecifiedCriterion = $SpecifiedDataFieldCriteriaMap{$Label};
 197 	  $SpecifiedValue = $SpecifiedDataFieldValuesMap{$Label};
 198 	SWITCH: {
 199 	    if ($SpecifiedCriterion =~ /^eq$/i) { if ($CurrentValue ne $SpecifiedValue) { $ViolationCount++; last SWITCH; } }
 200 	    if ($SpecifiedCriterion =~ /^le$/i) { if ($CurrentValue gt $SpecifiedValue) { $ViolationCount++; } }
 201 	    if ($SpecifiedCriterion =~ /^ge$/i) { if ($CurrentValue lt $SpecifiedValue) { $ViolationCount++; } }
 202 	    $Nothing = 1;
 203 	  }
 204 	}
 205       }
 206       if ($ViolationCount <= $Options{violations}) {
 207 	WriteSDFileCmpdString();
 208 	WriteTextFileCmpdData();
 209       }
 210     }
 211     elsif ($Options{mode} =~ /^randomcmpds$/i) {
 212       if (exists $RandomCmpdIndexMap{$CmpdNum}) {
 213 	WriteSDFileCmpdString();
 214 	if ($OutputTextFileFlag) {
 215 	  %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 216 	  SetupDataValues();
 217 	  WriteTextFileCmpdData();
 218 	}
 219       }
 220     }
 221     elsif ($Options{mode} =~ /^molnames$/i) {
 222       $MolName = QuoteAWord(ParseCmpdMolNameLine($CmpdLines[0]), $OutQuote);
 223       print NEWTEXTFILE "$MolName\n";
 224     }
 225     elsif ($Options{mode} =~ /^recordnum$/i) {
 226       if ($CmpdNum == $RecordNum) {
 227 	WriteSDFileCmpdString();
 228 	if ($OutputTextFileFlag) {
 229 	  %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 230 	  SetupDataValues();
 231 	  WriteTextFileCmpdData();
 232 	}
 233 	last CMPDSTRING;
 234       }
 235     }
 236     elsif ($Options{mode} =~ /^recordrange$/i) {
 237       if ($CmpdNum >= $StartRecordNum && $CmpdNum <= $EndRecordNum) {
 238 	WriteSDFileCmpdString();
 239 	if ($OutputTextFileFlag) {
 240 	  %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 241 	  SetupDataValues();
 242 	  WriteTextFileCmpdData();
 243 	}
 244       }
 245       elsif ($CmpdNum > $EndRecordNum) {
 246 	last CMPDSTRING;
 247       }
 248     }
 249   }
 250   close SDFILE;
 251 
 252   if ($OutputSDFileFlag) {
 253     close NEWSDFILE;
 254   }
 255   if ($OutputTextFileFlag) {
 256     close NEWTEXTFILE;
 257   }
 258 }
 259 print "$ScriptName:Done...\n\n";
 260 
 261 $EndTime = new Benchmark;
 262 $TotalTime = timediff ($EndTime, $StartTime);
 263 print "Total time: ", timestr($TotalTime), "\n";
 264 
 265 ###############################################################################
 266 
 267 # Process options...
 268 sub ProcessOptions {
 269   $InDelim = "\,";
 270   if ($Options{indelim} =~ /^semicolon$/i) {
 271     $InDelim = "\;";
 272   }
 273   elsif ($Options{indelim} =~ /^tab$/i) {
 274     $InDelim = "\t";
 275   }
 276   $OutDelim = "\,";
 277   if ($Options{outdelim} =~ /^semicolon$/i) {
 278     $OutDelim = "\;";
 279   }
 280   elsif ($Options{outdelim} =~ /^tab$/i) {
 281     $OutDelim = "\t";
 282   }
 283   $OutQuote = 1;
 284   if ($Options{quote} =~ /^no$/i) {
 285     $OutQuote = 0;
 286   }
 287   if ($Options{mode} =~ /^(datafields|datafieldsbyvalue|datafieldbylist|datafielduniquebylist)$/i) {
 288     if ($Options{datafields} || $Options{datafieldsfile}) {
 289       if ($Options{datafields} && $Options{datafieldsfile}) {
 290 	die "Error: For \"-m --mode\" option values datafields, datafieldsbyvalue, datafieldbylist, or datafielduniquebylist, specify only one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
 291       }
 292     }
 293     else {
 294       die "Error: For \"-m --mode\" option values datafields, datafieldsbyvalue, datafieldbylist, or datafielduniquebylist, specify one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
 295     }
 296   }
 297   $RecordNum = 0; $StartRecordNum = 0; $EndRecordNum = 0;
 298   if ($Options{mode} =~ /^(recordnum|recordrange)$/i) {
 299     if ($Options{record}) {
 300       my(@RecordSplit) = split ",", $Options{record};
 301       if ($Options{mode} =~ /^recordnum$/i ) {
 302 	if (@RecordSplit == 1) {
 303 	  $RecordNum = $RecordSplit[0];
 304 	  if ($RecordNum <= 0) {
 305 	    die "Error: The value specified, $RecordNum,  for option \"--records\" is not valid. Allowed values: > 0 \n";
 306 	  }
 307 	}
 308 	else {
 309 	  die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 1 value is allowed.\n";
 310 	}
 311       }
 312       else {
 313 	if (@RecordSplit == 2) {
 314 	  $StartRecordNum = $RecordSplit[0];
 315 	  $EndRecordNum = $RecordSplit[1];
 316 	  if ($StartRecordNum <= 0 || $EndRecordNum <= 0) {
 317 	    die "Error: The value pair specified, $Options{record},  for option \"--records\" is not valid. Allowed values: > 0 \n";
 318 	  }
 319 	}
 320 	else {
 321 	  die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 2 values is allowed.\n";
 322 	}
 323 	if ($StartRecordNum > $EndRecordNum) {
 324 	  die "Error: Start record number, $StartRecordNum, must be smaller than end record number, $EndRecordNum.\nSpecify different values using \"--record\" option.\n";
 325 	}
 326       }
 327     }
 328     else {
 329       die "Error: For \"-m --mode\" option values recordnum, or recordrange, specify \"--record\" option value.\n";
 330     }
 331   }
 332   my(@Words, $Line, $Value);
 333   if ($Options{mode} =~ /^datafields$/i) {
 334     @SpecifiedDataFieldLabels = ();
 335     if ($Options{datafields}) {
 336       @SpecifiedDataFieldLabels = split "$InDelim", $Options{datafields};
 337     }
 338     elsif ($Options{datafieldsfile}) {
 339       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
 340       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
 341 	@Words = quotewords($InDelim, 0, $Line);
 342 	if (@Words) {
 343 	  push @SpecifiedDataFieldLabels, @Words;
 344 	}
 345       }
 346       close DATAFIELDSFILE;
 347     }
 348   }
 349   elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) {
 350     my(@DataFieldsByValueTriplets);
 351     @DataFieldsByValueTriplets = ();
 352     if ($Options{datafields}) {
 353       @DataFieldsByValueTriplets = split "$InDelim", $Options{datafields};
 354     }
 355     elsif ($Options{datafieldsfile}) {
 356       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
 357       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
 358 	@Words = quotewords($InDelim, 0, $Line);
 359 	if (@Words) {
 360 	  push @DataFieldsByValueTriplets, @Words;
 361 	}
 362       }
 363       close DATAFIELDSFILE;
 364     }
 365     if ((@DataFieldsByValueTriplets % 3)) {
 366       if ($Options{datafields}) {
 367 	die "Error: Triplets not found in values specified by \"-d --datafields\" option\n";
 368       }
 369       elsif ($Options{datafieldsfile}) {
 370 	die "Error: Triplets not found in values specified by \"--datafieldsfile\" option\n";
 371       }
 372     }
 373     @SpecifiedDataFieldLabels = ();
 374     %SpecifiedDataFieldValuesMap = ();
 375     %SpecifiedDataFieldCriteriaMap = ();
 376     for ($Index = 0; $Index < @DataFieldsByValueTriplets; $Index = $Index + 3) {
 377       $Label = $DataFieldsByValueTriplets[$Index];
 378       my($Value) = $DataFieldsByValueTriplets[$Index + 1];
 379       my($Criterion) = $DataFieldsByValueTriplets[$Index + 2];
 380       if ($Criterion =~ /^(eq|le|ge)$/i) {
 381 	push @SpecifiedDataFieldLabels, $Label;
 382 	$SpecifiedDataFieldValuesMap{$Label} = $Value;
 383 	$SpecifiedDataFieldCriteriaMap{$Label} = $Criterion;
 384       }
 385       else {
 386 	warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion\n";
 387       }
 388     }
 389   }
 390   elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist)$/i) {
 391     my(@DataFieldAndValuesList);
 392     if ($Options{datafields}) {
 393       @DataFieldAndValuesList = split "$InDelim", $Options{datafields};
 394     }
 395     elsif ($Options{datafieldsfile}) {
 396       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
 397       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
 398 	@Words = quotewords($InDelim, 0, $Line);
 399 	if (@Words) {
 400 	  push @DataFieldAndValuesList, @Words;
 401 	}
 402       }
 403       close DATAFIELDSFILE;
 404     }
 405     if (@DataFieldAndValuesList < 2) {
 406       if ($Options{datafields}) {
 407 	die "Error: Invalid number of values specified by \"-d --datafields\" option\n";
 408       }
 409       elsif ($Options{datafieldsfile}) {
 410 	die "Error: Invalid number values specified by \"--datafieldsfile\" option\n";
 411       }
 412     }
 413     $SpecifiedDataFieldLabel = $DataFieldAndValuesList[0];
 414     %SpecifiedDataFieldValues = ();
 415     $SpecifiedDataFieldValuesCount = @DataFieldAndValuesList - 1;
 416     for ($Index = 1; $Index < @DataFieldAndValuesList; $Index++) {
 417       $Value = $DataFieldAndValuesList[$Index];
 418       $SpecifiedDataFieldValues{$Value} = "NotFound";
 419     }
 420   }
 421 
 422   $SDFileExt = "sdf";
 423   $TextFileExt = "csv";
 424   if ($Options{outdelim} =~ /^tab$/i) {
 425     $TextFileExt = "tsv";
 426   }
 427   if ($Options{mode} =~ /^(alldatafields|molnames)$/i) {
 428     $OutputSDFileFlag = 0;
 429     $OutputTextFileFlag = 1;
 430   }
 431   else {
 432     $OutputSDFileFlag = ($Options{output} =~ /^(SD|both)$/i) ? 1 : 0;
 433     $OutputTextFileFlag = ($Options{output} =~ /^(text|both)$/i) ? 1 : 0;
 434   }
 435 
 436   my($Nothing);
 437  SWITCH: {
 438     if ($Options{mode} =~ /^alldatafields$/i) { $FileNameMode = "AllDataDields"; last SWITCH; }
 439     if ($Options{mode} =~ /^commondatafields$/i) { $FileNameMode = "CommonDataDields"; last SWITCH; }
 440     if ($Options{mode} =~ /^datafields$/i) { $FileNameMode = "SpecifiedDataFields"; last SWITCH; }
 441     if ($Options{mode} =~ /^datafieldsbyvalue$/i) { $FileNameMode = "SpecifiedDataFieldsByValue"; last SWITCH; }
 442     if ($Options{mode} =~ /^datafieldbylist$/i) { $FileNameMode = "SpecifiedDataField"; last SWITCH; }
 443     if ($Options{mode} =~ /^datafielduniquebylist$/i) { $FileNameMode = "SpecifiedUniqueDataField"; last SWITCH; }
 444     if ($Options{mode} =~ /^molnames$/i) { $FileNameMode = "MolName"; last SWITCH; }
 445     if ($Options{mode} =~ /^randomcmpds$/i) { $FileNameMode = "RandomCmpds"; last SWITCH; }
 446     if ($Options{mode} =~ /^recordnum$/i) { $FileNameMode = "RecordNum$RecordNum"; last SWITCH; }
 447     if ($Options{mode} =~ /^recordrange$/i) { $FileNameMode = "RecordNum$StartRecordNum" . "To" . "$EndRecordNum"; last SWITCH; }
 448     $Nothing = 1;
 449   }
 450 }
 451 
 452 # Retrieve information about input SD files...
 453 sub RetrieveSDFilesInfo {
 454   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $NewFileName, $NewSDFileName, $NewTextFileName, $CmpdCount);
 455 
 456   @SDFilesOkay = ();
 457   @SDFilesCmpdCount = ();
 458   @SDFilesNewTextFileName = ();
 459   @SDFilesNewSDFileName = ();
 460 
 461   @SDFilesAllDataFieldLabels = ();
 462   @SDFilesCommonDataFieldLabels = ();
 463 
 464  FILELIST: for $Index (0 .. $#SDFilesList) {
 465     $SDFile = $SDFilesList[$Index];
 466     $SDFilesOkay[$Index] = 0;
 467     $SDFilesCmpdCount[$Index] = 0;
 468     $SDFilesNewTextFileName[$Index] = "";
 469     $SDFilesNewSDFileName[$Index] = "";
 470 
 471     @{$SDFilesAllDataFieldLabels[$Index]} = ();
 472     @{$SDFilesCommonDataFieldLabels[$Index]} = ();
 473 
 474     if (!(-e $SDFile)) {
 475       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 476       next FILELIST;
 477     }
 478     if (!CheckFileType($SDFile, "sd sdf")) {
 479       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 480       next FILELIST;
 481     }
 482     # Generate appropriate name for the new output file.
 483     $FileDir = ""; $FileName = ""; $FileExt = "";
 484     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 485     $NewFileName = $FileName;
 486     $NewFileName = $FileName  . "$FileNameMode";
 487     if ($Options{root} && (@SDFilesList == 1)) {
 488       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 489       if ($RootFileName && $RootFileExt) {
 490 	$NewFileName = $RootFileName;
 491       }
 492       else {
 493 	$NewFileName = $Options{root};
 494       }
 495     }
 496     $NewSDFileName = $NewFileName . ".$SDFileExt";
 497     $NewTextFileName = $NewFileName . ".$TextFileExt";
 498     if ($OutputSDFileFlag) {
 499       if (lc($NewSDFileName) eq lc($SDFile)) {
 500 	warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
 501 	print "Specify a different name using \"-r --root\" option or use default name.\n";
 502 	next FILELIST;
 503       }
 504     }
 505     if (!$Options{overwrite}) {
 506       if ($OutputSDFileFlag) {
 507 	if (-e $NewSDFileName) {
 508 	  warn "Warning: Ignoring file $SDFile: New file, $NewSDFileName, already exists\n";
 509 	  next FILELIST;
 510 	}
 511       }
 512       if ($OutputTextFileFlag) {
 513 	if (-e $NewTextFileName) {
 514 	  warn "Warning: Ignoring file $SDFile: New file, $NewTextFileName, already exists\n";
 515 	  next FILELIST;
 516 	}
 517       }
 518     }
 519     if (!open SDFILE, "$SDFile") {
 520       warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 521       next FILELIST;
 522     }
 523 
 524     my($CountCmpdsFlag, $CollectDataFieldsFlag);
 525     my($CmpdString, @CmpdLines, @DataFieldLabels, %DataFieldLabelsMap,@CommonDataFieldLabels);
 526 
 527     $CountCmpdsFlag = ($Options{mode} =~ /^(randomcmpds|recordnum|recordrange)$/i) ? 1 : 0;
 528     $CollectDataFieldsFlag = (($Options{mode} =~ /^(alldatafields|commondatafields|randomcmpds)$/i && $OutputTextFileFlag) || ($Options{mode} =~ /^datafieldsbyvalue$/i  && $OutputTextFileFlag) || ($Options{mode} =~ /^datafieldbylist$/i  && $OutputTextFileFlag) || ($Options{mode} =~ /^datafielduniquebylist$/i  && $OutputTextFileFlag) || ($Options{mode} =~ /^recordrange$/i && $OutputTextFileFlag)) ? 1 : 0;
 529 
 530     $CmpdCount = 0;
 531     if ($CountCmpdsFlag || $CollectDataFieldsFlag) {
 532       @DataFieldLabels = ();
 533       @CommonDataFieldLabels = ();
 534       %DataFieldLabelsMap = ();
 535       CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 536 	$CmpdCount++;
 537 	if ($Options{mode} =~ /^recordnum$/i) {
 538 	  if ($CmpdCount == $RecordNum) {
 539 	    @CmpdLines = split "\n", $CmpdString;
 540 	    @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 541 	    last CMPDSTRING;
 542 	  }
 543 	}
 544 	if ($CollectDataFieldsFlag) {
 545 	  my($Label);
 546 	  @CmpdLines = split "\n", $CmpdString;
 547 	  # Process compound data header labels and figure out which ones are present for
 548 	  # all the compounds...
 549 	  if (@DataFieldLabels) {
 550 	    my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);
 551 	    my(%CmpdDataFieldLabelsMap) = ();
 552 	    # Setup a map for the current labels...
 553 	    for $Label (@CmpdDataFieldLabels) {
 554 	      $CmpdDataFieldLabelsMap{$Label} = "PresentInSome";
 555 	    }
 556 	    # Check the presence old labels for this compound; otherwise, mark 'em new...
 557 	    for $Label (@DataFieldLabels) {
 558 	      if (!$CmpdDataFieldLabelsMap{$Label}) {
 559 		$DataFieldLabelsMap{$Label} = "PresentInSome";
 560 	      }
 561 	    }
 562 	    # Check the presence this compound in the old labels; otherwise, add 'em...
 563 	    for $Label (@CmpdDataFieldLabels ) {
 564 	      if (!$DataFieldLabelsMap{$Label}) {
 565 		# It's a new label...
 566 		push @DataFieldLabels, $Label;
 567 		$DataFieldLabelsMap{$Label} = "PresentInSome";
 568 	      }
 569 	    }
 570 	  }
 571 	  else {
 572 	    # Get the initial label set and set up a map...
 573 	    @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 574 	    for $Label (@DataFieldLabels) {
 575 	      $DataFieldLabelsMap{$Label} = "PresentInAll";
 576 	    }
 577 	  }
 578 	  # Identify the common data field labels...
 579 	  if ($Options{mode} =~ /^commondatafields$/i) {
 580 	    @CommonDataFieldLabels = ();
 581 	    for $Label (@DataFieldLabels) {
 582 	      if ($DataFieldLabelsMap{$Label} eq "PresentInAll") {
 583 		push @CommonDataFieldLabels, $Label;
 584 	      }
 585 	    }
 586 	  }
 587 	}
 588       }
 589     }
 590     if ($Options{mode} =~ /^recordnum$/i) {
 591       if ($RecordNum > $CmpdCount) {
 592 	warn "Warning: Ignoring file $SDFile: The record specified, $RecordNum, using option \"--records\" doesn't exist \n";
 593 	next FILELIST;
 594       }
 595     }
 596     elsif ($Options{mode} =~ /^recordrange$/i) {
 597       if ($StartRecordNum > $CmpdCount && $EndRecordNum > $CmpdCount) {
 598 	warn "Warning: Ignoring file $SDFile: The record range specified, $StartRecordNum to $EndRecordNum, using option \"--records\" doesn't exist \n";
 599 	next FILELIST;
 600       }
 601     }
 602 
 603     $SDFilesOkay[$Index] = 1;
 604     $SDFilesNewTextFileName[$Index] = "$NewTextFileName";
 605     $SDFilesNewSDFileName[$Index] = "$NewSDFileName";
 606 
 607     $SDFilesCmpdCount[$Index] = $CmpdCount;
 608     push @{$SDFilesAllDataFieldLabels[$Index]}, @DataFieldLabels;
 609     push @{$SDFilesCommonDataFieldLabels[$Index]}, @CommonDataFieldLabels;
 610 
 611     close SDFILE;
 612   }
 613 }
 614 
 615 # Setup values for data fields...
 616 sub SetupDataValues {
 617   @DataValues = ();
 618   for $Label (@DataLabels) {
 619     if (exists $DataFieldValues{$Label}) {
 620       push @DataValues, $DataFieldValues{$Label};
 621     }
 622     else {
 623       push @DataValues, "";
 624     }
 625   }
 626 }
 627 
 628 # Write out structure data and specific data fields to SD file...
 629 sub WriteSDFileCmpdData {
 630   my($Count);
 631   if ($OutputSDFileFlag) {
 632     my($MolString) = split "M  END", $CmpdString;
 633     $MolString .= "M  END";
 634     print NEWSDFILE "$MolString\n";
 635     for $Count (0 .. $#DataLabels) {
 636       print NEWSDFILE ">  <$DataLabels[$Count]>\n$DataValues[$Count]\n\n";
 637     }
 638     print NEWSDFILE "\$\$\$\$\n";
 639   }
 640 }
 641 
 642 # Write out compound string...
 643 sub WriteSDFileCmpdString {
 644   if ($OutputSDFileFlag) {
 645     print NEWSDFILE "$CmpdString\n";
 646   }
 647 }
 648 
 649 # Write out data for text file...
 650 sub WriteTextFileCmpdData {
 651   if ($OutputTextFileFlag) {
 652     $DataValuesLine = JoinWords(\@DataValues, $OutDelim, $OutQuote);
 653     # Handle multiple lines data values for data fields by joining 'em using hyphen...
 654     if ($DataValuesLine =~ /\n/) {
 655       $DataValuesLine =~ s/\n/ /g;
 656     }
 657     print NEWTEXTFILE "$DataValuesLine\n";
 658   }
 659 }
 660 
 661 # Setup script usage  and retrieve command line arguments specified using various options...
 662 sub SetupScriptUsage {
 663 
 664   # Retrieve all the options...
 665   %Options = ();
 666   $Options{numofcmpds} = 1;
 667   $Options{mode} = "alldatafields";
 668   $Options{indelim} = "comma";
 669   $Options{outdelim} = "comma";
 670   $Options{output} = "SD";
 671   $Options{quote} = "yes";
 672   $Options{violations} = 0;
 673   $Options{seed} = 999999999;
 674   if (!GetOptions(\%Options, "help|h", "datafields|d=s", "datafieldsfile=s", "indelim=s", "mode|m=s", "numofcmpds|n=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "record=s", "root|r=s", "seed|s=i", "violations|v=i", "workingdir|w=s")) {
 675     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 676   }
 677   if ($Options{workingdir}) {
 678     if (! -d $Options{workingdir}) {
 679       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 680     }
 681     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 682   }
 683   if ($Options{numofcmpds} < 1) {
 684     die "Error: The value specified, $Options{numofcmpds},  for option \"-n --numofcmpds\" is not valid. Allowed values: >= 1 \n";
 685   }
 686   if ($Options{violations} < 0) {
 687     die "Error: The value specified, $Options{violations},  for option \"-v --violations\" is not valid. Allowed values: >= 0 \n";
 688   }
 689   if ($Options{mode} !~ /(^(alldatafields|commondatafields|datafields|datafieldsbyvalue|datafieldbylist|datafielduniquebylist|molnames|randomcmpds|recordnum|recordrange)$)/i) {
 690     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, molnames, randomcmpds, recordnum, recordrange\n";
 691   }
 692   if ($Options{output} !~ /(^(SD|text|both)$)/i) {
 693     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
 694   }
 695   if ($Options{indelim} !~ /^(comma|semicolon|tab)$/i) {
 696     die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 697   }
 698   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 699     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 700   }
 701   if ($Options{quote} !~ /^(yes|no)$/i) {
 702     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 703   }
 704 }
 705