MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: SplitSDFiles.pl,v $
   4 # $Date: 2008/01/30 21:45:03 $
   5 # $Revision: 1.19 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Benchmark;
  35 use SDFileUtil;
  36 use FileUtil;
  37 
  38 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  39 my($SDFile, @SDFilesList, $CmpdCount, $Index, $NewFileIndex, @NewSDFilesList, $IgnoreFile, $FileDir, $FileName, $FileExt, $NewFullFileName, $MaxCmpdsCount, $MaxCmpdsPerFile);
  40 
  41 # Autoflush STDOUT
  42 $| = 1;
  43 
  44 # Starting message...
  45 $ScriptName = basename $0;
  46 print "\n$ScriptName:Starting...\n\n";
  47 $StartTime = new Benchmark;
  48 
  49 # Get the options and setup script...
  50 SetupScriptUsage();
  51 if ($Options{help} || @ARGV < 1) {
  52   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  53 }
  54 
  55 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  56 
  57 # Go over each file quickly and make sure: it's a SD file; number of compounds
  58 # in each file is larger than the number of new files being generated; the new
  59 # file names don't already exist. And then split 'em up...
  60 if (@SDFilesList > 1) {
  61   print "Processing SD files...\n";
  62 }
  63 FILELIST: for $Index (0 .. $#SDFilesList) {
  64   $SDFile = $SDFilesList[$Index];
  65   if (@SDFilesList > 1) {
  66     print "\nProcessing file $SDFile...\n";
  67   }
  68   else {
  69     print "Processing file $SDFile...\n"
  70   }
  71   if (!(-e $SDFile)) {
  72     warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
  73     next FILELIST;
  74   }
  75   if (!CheckFileType($SDFile, "sd sdf")) {
  76     warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
  77     next FILELIST;
  78   }
  79   if (!open SDFILE, "$SDFile") {
  80     warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
  81     next FILELIST;
  82   }
  83   $CmpdCount = 0;
  84   while (<SDFILE>) {
  85     if (/\$\$\$\$/) {
  86       $CmpdCount++;
  87     }
  88   }
  89   close SDFILE;
  90   if ($CmpdCount < $Options{numfiles}) {
  91     warn "Warning: Ignoring file $SDFile: Total number of compounds, $CmpdCount, is smaller than\nnumber of new files, $Options{numfiles}\n";
  92     next FILELIST;
  93   }
  94   $FileDir = ""; $FileName = ""; $FileExt = "";
  95   ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
  96   $IgnoreFile = 0;
  97   @NewSDFilesList = ();
  98  NEWFILELIST: for $NewFileIndex (1 .. $Options{numfiles}) {
  99     $NewFullFileName = $FileName;
 100     if ($Options{root} && (@SDFilesList == 1)) {
 101       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 102       if ($RootFileName && $RootFileExt) {
 103 	$NewFullFileName = $RootFileName;
 104       }
 105       else {
 106 	$NewFullFileName = $Options{root};
 107       }
 108     }
 109     $NewFullFileName .= "Part" . "$NewFileIndex" . ".$FileExt";
 110     push @NewSDFilesList, $NewFullFileName;
 111     if (!$Options{overwrite}) {
 112       if (-e $NewFullFileName) {
 113 	$IgnoreFile = 1;
 114 	warn "Warning: Ignoring file $SDFile: New SD file, $NewFullFileName, already exists\n";
 115 	last NEWFILELIST;
 116       }
 117     }
 118   }
 119   if ($IgnoreFile) {
 120     next FILELIST;
 121   }
 122   $MaxCmpdsPerFile = int $CmpdCount / $Options{numfiles};
 123   $MaxCmpdsCount = $MaxCmpdsPerFile;
 124   $CmpdCount = 0;
 125   $NewFileIndex = 1;
 126   open NEWSDFILE, ">$NewSDFilesList[$NewFileIndex - 1]" or die "Error: Can't open $NewSDFilesList[$NewFileIndex -1]: $! \n";
 127   print "Generating $NewSDFilesList[$NewFileIndex - 1] file\n";
 128   open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n";
 129   while (<SDFILE>) {
 130     s/(\r\n)|(\r)/\n/g;
 131     print NEWSDFILE;
 132     if ( /\$\$\$\$/ ) {
 133       $CmpdCount++;
 134       if ($NewFileIndex <= $Options{numfiles}) {
 135 	if ($CmpdCount >= $MaxCmpdsCount) {
 136 	  if ($NewFileIndex < $Options{numfiles}) {
 137 	    close NEWSDFILE;
 138 	  }
 139 	  $NewFileIndex++;
 140 	  $MaxCmpdsCount = $MaxCmpdsPerFile * $NewFileIndex;
 141 	  if ($NewFileIndex <= $Options{numfiles}) {
 142 	    open NEWSDFILE, ">$NewSDFilesList[$NewFileIndex - 1]" or die "Error: Can't open $NewSDFilesList[$NewFileIndex - 1]: $! \n";
 143 	    print "Generating $NewSDFilesList[$NewFileIndex - 1] file\n";
 144 	  }
 145 	}
 146       }
 147     }
 148   }
 149   close NEWSDFILE;
 150   close SDFILE;
 151 }
 152 
 153 print "$ScriptName:Done...\n\n";
 154 
 155 $EndTime = new Benchmark;
 156 $TotalTime = timediff ($EndTime, $StartTime);
 157 print "Total time: ", timestr($TotalTime), "\n";
 158 
 159 ###############################################################################
 160 
 161 # Setup script usage  and retrieve command line arguments specified using various options...
 162 sub SetupScriptUsage {
 163 
 164   # Retrieve all the options...
 165   %Options = ();
 166   $Options{numfiles} = 2;
 167   if (!GetOptions(\%Options, "help|h", "numfiles|n=i",  "overwrite|o", "root|r=s", "workingdir|w=s")) {
 168     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 169   }
 170   if ($Options{workingdir}) {
 171     if (! -d $Options{workingdir}) {
 172       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 173     }
 174     chdir $Options{workingdir} or die "Error: Error: Couldn't chdir $Options{workingdir}: $! \n";
 175   }
 176   if ($Options{numfiles} < 2) {
 177     die "Error: The value specified, $Options{numfiles}, for option \"-n --numfiles\" is not valid. Allowed values: >= 2 \n";
 178   }
 179 }
 180