MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: FilterSDFiles.pl,v $
   4 # $Date: 2010/01/03 00:59:51 $
   5 # $Revision: 1.23 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2010 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Benchmark;
  34 use SDFileUtil;
  35 use FileUtil;
  36 
  37 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  38 my($SDFile, @SDFilesList, $NewSDFile, $NewKeepSDFile, $NewFileName, $Index, $FileDir, $FileName, $FileExt, $CmpdCount, $FilteredCmpdCount, $KeepCmpdCount, $CtabLinesCount, @CmpdLines, $CmpdString, $FilterCmpd, $PrintCmpdCounterHeader);
  39 
  40 # Autoflush STDOUT
  41 $| = 1;
  42 
  43 # Starting message...
  44 $ScriptName = basename $0;
  45 print "\n$ScriptName:Starting...\n\n";
  46 $StartTime = new Benchmark;
  47 
  48 # Get the options and setup script...
  49 SetupScriptUsage();
  50 if ($Options{help} || @ARGV < 1) {
  51   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  52 }
  53 
  54 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  55 
  56 # Process all the SD files...
  57 if (@SDFilesList > 1) {
  58   print "Processing SD files...\n";
  59 }
  60 
  61 FILELIST: for $Index (0 .. $#SDFilesList) {
  62   $SDFile = $SDFilesList[$Index];
  63   if (@SDFilesList > 1) {
  64     print "\nProcessing file $SDFile...\n";
  65   }
  66   else {
  67     print "Processing file $SDFile...\n"
  68   }
  69   if (!(-e $SDFile)) {
  70     warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
  71     next FILELIST;
  72   }
  73   if (!CheckFileType($SDFile, "sd sdf")) {
  74     warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
  75     next FILELIST;
  76   }
  77   if (!open SDFILE, "$SDFile") {
  78     warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
  79     next FILELIST;
  80   }
  81 
  82   # Setup new file name...
  83   $FileDir = ""; $FileName = ""; $FileExt = "";
  84   ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
  85   if ($Options{root} && (@SDFilesList == 1)) {
  86     my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
  87     if ($RootFileName && $RootFileExt) {
  88       $NewFileName = $RootFileName;
  89     }
  90     else {
  91       $NewSDFile = $Options{root};
  92     }
  93     $NewKeepSDFile = $NewSDFile;
  94   }
  95   else {
  96     $NewSDFile = $FileName . "Filtered";
  97     $NewKeepSDFile = $FileName;
  98   }
  99   $NewSDFile .= ".$FileExt";
 100   $NewKeepSDFile .= "Ignored" . ".$FileExt";
 101   if (!$Options{overwrite}) {
 102     if (-e $NewSDFile) {
 103       warn "Warning: Ignoring file $SDFile: New SD file, $NewSDFile, already exists\n";
 104       next FILELIST;
 105     }
 106     if ($Options{keep}) {
 107       if (-e $NewKeepSDFile) {
 108 	warn "Warning: Ignoring file $SDFile: New SD file, $NewKeepSDFile, already exists\n";
 109 	next FILELIST;
 110       }
 111     }
 112   }
 113   if (lc($NewSDFile) eq lc($SDFile)) {
 114       warn "Warning: Ignoring file $SDFile: Same output, $NewSDFile, and input file name\n";
 115       print "Specify a different name using \"-r --root\" option or use default name.\n";
 116       next FILELIST;
 117   }
 118   if (!open NEWSDFILE, ">$NewSDFile") {
 119     warn "Warning: Ignoring file $SDFile: Couldn't open $NewSDFile: $! \n";
 120     next FILELIST;
 121   }
 122   if ($Options{keep}) {
 123     if (!open NEWKEEPSDFILE, ">$NewKeepSDFile") {
 124       warn "Warning: Ignoring file $SDFile: Couldn't open $NewKeepSDFile: $! \n";
 125       next FILELIST;
 126     }
 127   }
 128   print "\nGenerating file $NewSDFile...\n";
 129   if ($Options{keep}) {
 130     print "Generating file $NewKeepSDFile...\n";
 131   }
 132 
 133   $CmpdCount = 0; $FilteredCmpdCount = 0; $KeepCmpdCount = 0;
 134   $PrintCmpdCounterHeader = 1;
 135  CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 136     $CmpdCount++;
 137     $FilterCmpd = 0;
 138     if (($CmpdCount % 5000) == 0) {
 139       if ($PrintCmpdCounterHeader) {
 140 	$PrintCmpdCounterHeader = 0;
 141 	print "\nProcessing compounds:";
 142       }
 143       print "$CmpdCount...";
 144     }
 145     @CmpdLines = split "\n", $CmpdString;
 146     $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);
 147     if ($CtabLinesCount <= 0) {
 148       $FilterCmpd = 1;
 149       WriteOutCmpdString();
 150       next CMPDSTRING;
 151     }
 152     my ($AtomCount, $BondCount) = ParseCmpdCountsLine($CmpdLines[3]);
 153     if ($Options{all} || $Options{mismatch}) {
 154       if ($CtabLinesCount != ($AtomCount + $BondCount)) {
 155 	$FilterCmpd = 1;
 156 	WriteOutCmpdString();
 157 	next CMPDSTRING;
 158       }
 159     }
 160     if ($CtabLinesCount == ($AtomCount + $BondCount)) {
 161       if ($Options{all} || $Options{unknownatoms}) {
 162 	my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines) = GetUnknownAtoms(\@CmpdLines);
 163 	if ($UnknownAtomCount) {
 164 	  $FilterCmpd = 1;
 165 	  WriteOutCmpdString();
 166 	  next CMPDSTRING;
 167 	}
 168       }
 169       if ($Options{all} || $Options{cleansalts} || $Options{salts}) {
 170 	my ($FragmentsCount, $Fragments, $WashedCmpdString) = WashCmpd(\@CmpdLines);
 171 	if ($FragmentsCount > 1) {
 172 	  if ($Options{all} || $Options{cleansalts}) {
 173 	    $CmpdString = $WashedCmpdString;
 174 	  }
 175 	  else {
 176 	    $FilterCmpd = 1;
 177 	  }
 178 	  WriteOutCmpdString();
 179 	  next CMPDSTRING;
 180 	}
 181       }
 182     }
 183     WriteOutCmpdString();
 184   }
 185   if (!$PrintCmpdCounterHeader) {
 186     print "\n";
 187   }
 188   close NEWSDFILE;
 189   if ($Options{keep}) {
 190     close NEWKEEPSDFILE;
 191   }
 192   close SDFILE;
 193   print "\nTotal Number of compounds: $CmpdCount\n";
 194   print "Number of compounds left after filtering: $FilteredCmpdCount\n";
 195   print "Number of compounds ignored: $KeepCmpdCount\n";
 196 }
 197 
 198 print "\n$ScriptName:Done...\n\n";
 199 
 200 $EndTime = new Benchmark;
 201 $TotalTime = timediff ($EndTime, $StartTime);
 202 print "Total time: ", timestr($TotalTime), "\n";
 203 
 204 ###############################################################################
 205 
 206 # Write out the compound data...
 207 sub WriteOutCmpdString {
 208     if ($FilterCmpd) {
 209       $KeepCmpdCount++;
 210       if ($Options{keep}) {
 211 	print NEWKEEPSDFILE "$CmpdString\n";
 212       }
 213     }
 214     else {
 215       $FilteredCmpdCount++;
 216       print NEWSDFILE "$CmpdString\n";
 217     }
 218 }
 219 
 220 # Setup script usage  and retrieve command line arguments specified using various options...
 221 sub SetupScriptUsage {
 222 
 223   # Retrieve all the options...
 224   %Options = ();
 225   if (!GetOptions(\%Options, "all|a", "cleansalts|c", "empty|e", "help|h", "keep|k", "mismatch|m", "overwrite|o", "root|r=s", "salts|s", "unknownatoms|u", "workingdir|w=s")) {
 226     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 227   }
 228   if ($Options{workingdir}) {
 229     if (! -d $Options{workingdir}) {
 230       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 231     }
 232     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 233   }
 234 }
 235