MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: FilterSDFiles.pl,v $
   4 # $Date: 2008/01/30 21:44:46 $
   5 # $Revision: 1.19 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Benchmark;
  35 use SDFileUtil;
  36 use FileUtil;
  37 
  38 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  39 my($SDFile, @SDFilesList, $NewSDFile, $NewKeepSDFile, $NewFileName, $Index, $FileDir, $FileName, $FileExt, $CmpdCount, $FilteredCmpdCount, $KeepCmpdCount, $CtabLinesCount, @CmpdLines, $CmpdString, $FilterCmpd, $PrintCmpdCounterHeader);
  40 
  41 # Autoflush STDOUT
  42 $| = 1;
  43 
  44 # Starting message...
  45 $ScriptName = basename $0;
  46 print "\n$ScriptName:Starting...\n\n";
  47 $StartTime = new Benchmark;
  48 
  49 # Get the options and setup script...
  50 SetupScriptUsage();
  51 if ($Options{help} || @ARGV < 1) {
  52   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  53 }
  54 
  55 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  56 
  57 # Process all the SD files...
  58 if (@SDFilesList > 1) {
  59   print "Processing SD files...\n";
  60 }
  61 
  62 FILELIST: for $Index (0 .. $#SDFilesList) {
  63   $SDFile = $SDFilesList[$Index];
  64   if (@SDFilesList > 1) {
  65     print "\nProcessing file $SDFile...\n";
  66   }
  67   else {
  68     print "Processing file $SDFile...\n"
  69   }
  70   if (!(-e $SDFile)) {
  71     warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
  72     next FILELIST;
  73   }
  74   if (!CheckFileType($SDFile, "sd sdf")) {
  75     warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
  76     next FILELIST;
  77   }
  78   if (!open SDFILE, "$SDFile") {
  79     warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
  80     next FILELIST;
  81   }
  82 
  83   # Setup new file name...
  84   $FileDir = ""; $FileName = ""; $FileExt = "";
  85   ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
  86   if ($Options{root} && (@SDFilesList == 1)) {
  87     my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
  88     if ($RootFileName && $RootFileExt) {
  89       $NewFileName = $RootFileName;
  90     }
  91     else {
  92       $NewSDFile = $Options{root};
  93     }
  94     $NewKeepSDFile = $NewSDFile;
  95   }
  96   else {
  97     $NewSDFile = $FileName . "Filtered";
  98     $NewKeepSDFile = $FileName;
  99   }
 100   $NewSDFile .= ".$FileExt";
 101   $NewKeepSDFile .= "Ignored" . ".$FileExt";
 102   if (!$Options{overwrite}) {
 103     if (-e $NewSDFile) {
 104       warn "Warning: Ignoring file $SDFile: New SD file, $NewSDFile, already exists\n";
 105       next FILELIST;
 106     }
 107     if ($Options{keep}) {
 108       if (-e $NewKeepSDFile) {
 109 	warn "Warning: Ignoring file $SDFile: New SD file, $NewKeepSDFile, already exists\n";
 110 	next FILELIST;
 111       }
 112     }
 113   }
 114   if (lc($NewSDFile) eq lc($SDFile)) {
 115       warn "Warning: Ignoring file $SDFile: Same output, $NewSDFile, and input file name\n";
 116       print "Specify a different name using \"-r --root\" option or use default name.\n";
 117       next FILELIST;
 118   }
 119   if (!open NEWSDFILE, ">$NewSDFile") {
 120     warn "Warning: Ignoring file $SDFile: Couldn't open $NewSDFile: $! \n";
 121     next FILELIST;
 122   }
 123   if ($Options{keep}) {
 124     if (!open NEWKEEPSDFILE, ">$NewKeepSDFile") {
 125       warn "Warning: Ignoring file $SDFile: Couldn't open $NewKeepSDFile: $! \n";
 126       next FILELIST;
 127     }
 128   }
 129   print "\nGenerating file $NewSDFile...\n";
 130   if ($Options{keep}) {
 131     print "Generating file $NewKeepSDFile...\n";
 132   }
 133 
 134   $CmpdCount = 0; $FilteredCmpdCount = 0; $KeepCmpdCount = 0;
 135   $PrintCmpdCounterHeader = 1;
 136  CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 137     $CmpdCount++;
 138     $FilterCmpd = 0;
 139     if (($CmpdCount % 5000) == 0) {
 140       if ($PrintCmpdCounterHeader) {
 141 	$PrintCmpdCounterHeader = 0;
 142 	print "\nProcessing compounds:";
 143       }
 144       print "$CmpdCount...";
 145     }
 146     @CmpdLines = split "\n", $CmpdString;
 147     $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);
 148     if ($CtabLinesCount <= 0) {
 149       $FilterCmpd = 1;
 150       WriteOutCmpdString();
 151       next CMPDSTRING;
 152     }
 153     my ($AtomCount, $BondCount) = ParseCmpdCountsLine($CmpdLines[3]);
 154     if ($Options{all} || $Options{mismatch}) {
 155       if ($CtabLinesCount != ($AtomCount + $BondCount)) {
 156 	$FilterCmpd = 1;
 157 	WriteOutCmpdString();
 158 	next CMPDSTRING;
 159       }
 160     }
 161     if ($CtabLinesCount == ($AtomCount + $BondCount)) {
 162       if ($Options{all} || $Options{unknownatoms}) {
 163 	my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines) = GetUnknownAtoms(\@CmpdLines);
 164 	if ($UnknownAtomCount) {
 165 	  $FilterCmpd = 1;
 166 	  WriteOutCmpdString();
 167 	  next CMPDSTRING;
 168 	}
 169       }
 170       if ($Options{all} || $Options{cleansalts} || $Options{salts}) {
 171 	my ($FragmentsCount, $Fragments, $WashedCmpdString) = WashCmpd(\@CmpdLines);
 172 	if ($FragmentsCount > 1) {
 173 	  if ($Options{all} || $Options{cleansalts}) {
 174 	    $CmpdString = $WashedCmpdString;
 175 	  }
 176 	  else {
 177 	    $FilterCmpd = 1;
 178 	  }
 179 	  WriteOutCmpdString();
 180 	  next CMPDSTRING;
 181 	}
 182       }
 183     }
 184     WriteOutCmpdString();
 185   }
 186   if (!$PrintCmpdCounterHeader) {
 187     print "\n";
 188   }
 189   close NEWSDFILE;
 190   if ($Options{keep}) {
 191     close NEWKEEPSDFILE;
 192   }
 193   close SDFILE;
 194   print "\nTotal Number of compounds: $CmpdCount\n";
 195   print "Number of compounds left after filtering: $FilteredCmpdCount\n";
 196   print "Number of compounds ignored: $KeepCmpdCount\n";
 197 }
 198 
 199 print "\n$ScriptName:Done...\n\n";
 200 
 201 $EndTime = new Benchmark;
 202 $TotalTime = timediff ($EndTime, $StartTime);
 203 print "Total time: ", timestr($TotalTime), "\n";
 204 
 205 ###############################################################################
 206 
 207 # Write out the compound data...
 208 sub WriteOutCmpdString {
 209     if ($FilterCmpd) {
 210       $KeepCmpdCount++;
 211       if ($Options{keep}) {
 212 	print NEWKEEPSDFILE "$CmpdString\n";
 213       }
 214     }
 215     else {
 216       $FilteredCmpdCount++;
 217       print NEWSDFILE "$CmpdString\n";
 218     }
 219 }
 220 
 221 # Setup script usage  and retrieve command line arguments specified using various options...
 222 sub SetupScriptUsage {
 223 
 224   # Retrieve all the options...
 225   %Options = ();
 226   if (!GetOptions(\%Options, "all|a", "cleansalts|c", "empty|e", "help|h", "keep|k", "mismatch|m", "overwrite|o", "root|r=s", "salts|s", "unknownatoms|u", "workingdir|w=s")) {
 227     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 228   }
 229   if ($Options{workingdir}) {
 230     if (! -d $Options{workingdir}) {
 231       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 232     }
 233     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 234   }
 235 }
 236