MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitEnumerateTautomers.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import multiprocessing as mp
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem.MolStandardize import rdMolStandardize
  43     from rdkit.Chem import AllChem
  44 except ImportError as ErrMsg:
  45     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  46     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  47     sys.exit(1)
  48 
  49 # MayaChemTools imports...
  50 try:
  51     from docopt import docopt
  52     import MiscUtil
  53     import RDKitUtil
  54 except ImportError as ErrMsg:
  55     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  56     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  57     sys.exit(1)
  58 
# Base name of the script as invoked; used in the progress messages printed by main()...
ScriptName = os.path.basename(sys.argv[0])
# Raw command line options; replaced by the docopt() result in RetrieveOptions()...
Options = {}
# Processed option values; filled in by ProcessOptions() and read by the rest of the script...
OptionsInfo = {}
  62 
  63 def main():
  64     """Start execution of the script."""
  65     
  66     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  67     
  68     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  69     
  70     # Retrieve command line arguments and options...
  71     RetrieveOptions()
  72     
  73     # Process and validate command line arguments and options...
  74     ProcessOptions()
  75     
  76     # Perform actions required by the script...
  77     EnumerateTautomers()
  78     
  79     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  80     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  81 
  82 def EnumerateTautomers():
  83     """Enunmerate tautomers."""
  84     
  85     # Setup a molecule reader...
  86     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
  87     Mols  = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
  88     
  89     # Set up a molecule writer...
  90     Writer = SetupMoleculeWriter()
  91 
  92     MolCount, ValidMolCount, TautomerizationFailedCount, TautomersCount, MinTautomersCount, MaxTautomersCount = ProcessMolecules(Mols, Writer)
  93 
  94     if Writer is not None:
  95         Writer.close()
  96     
  97     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
  98     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
  99     MiscUtil.PrintInfo("Number of molecules failed during tautomerization: %d" % TautomerizationFailedCount)
 100     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount + TautomerizationFailedCount))
 101     
 102     MiscUtil.PrintInfo("\nNumber of tautomerized molecules: %d" % (ValidMolCount - TautomerizationFailedCount))
 103     
 104     MiscUtil.PrintInfo("\nTotal number of tautomers for molecules: %d" % TautomersCount)
 105     MiscUtil.PrintInfo("Minumum number of tautomers for a molecule: %d" % MinTautomersCount)
 106     MiscUtil.PrintInfo("Maxiumum number of tautomers for a molecule: %d" % MaxTautomersCount)
 107     MiscUtil.PrintInfo("Average number of tautomers for a molecule: %.1f" % (TautomersCount/(ValidMolCount - TautomerizationFailedCount)))
 108 
 109 def ProcessMolecules(Mols, Writer):
 110     """Process molecules."""
 111 
 112     if OptionsInfo["MPMode"]:
 113         return ProcessMoleculesUsingMultipleProcesses(Mols, Writer)
 114     else:
 115         return ProcessMoleculesUsingSingleProcess(Mols, Writer)
 116 
 117 def ProcessMoleculesUsingSingleProcess(Mols,  Writer):
 118     """Process and generate tautomers for molecules using a single process."""
 119 
 120     MiscUtil.PrintInfo("\nEnumerating tatutomers...")
 121     
 122     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 123     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 124 
 125     # Set up tautomer enumerator...
 126     TautomerEnumerator = SetupTautomerEnumerator()
 127 
 128     (MolCount, ValidMolCount, TautomerizationFailedCount, TautomersCount) = [0] * 4
 129     (MinTautomersCount, MaxTautomersCount) = [sys.maxsize, 0]
 130     FirstTautomerMol = True
 131     for Mol in Mols:
 132         MolCount += 1
 133         
 134         if Mol is None:
 135             continue
 136 
 137         if RDKitUtil.IsMolEmpty(Mol):
 138             if not OptionsInfo["QuietMode"]:
 139                 MolName = RDKitUtil.GetMolName(Mol, MolCount)
 140                 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 141             continue
 142         
 143         ValidMolCount += 1
 144         
 145         TautomerMols,  TautomerizationStatus = EnumerateMolTautomers(Mol, TautomerEnumerator, MolCount)
 146         if not TautomerizationStatus:
 147             if not OptionsInfo["QuietMode"]:
 148                 MolName = RDKitUtil.GetMolName(Mol, MolCount)
 149                 MiscUtil.PrintWarning("Failed to tautomerize molecule %s" % MolName)
 150             
 151             TautomerizationFailedCount += 1
 152             continue
 153         
 154         if FirstTautomerMol:
 155             FirstTautomerMol = False
 156             if SetSMILESMolProps:
 157                 RDKitUtil.SetWriterMolProps(Writer, TautomerMols[0])
 158 
 159         # Track tautomer count...
 160         TautomerMolsCount = len(TautomerMols)
 161         TautomersCount += TautomerMolsCount
 162         if TautomerMolsCount < MinTautomersCount:
 163             MinTautomersCount = TautomerMolsCount
 164         if TautomerMolsCount > MaxTautomersCount:
 165             MaxTautomersCount = TautomerMolsCount
 166         
 167         WriteMolTautomers(Writer, Mol, MolCount, Compute2DCoords, TautomerMols)
 168     
 169     return (MolCount, ValidMolCount, TautomerizationFailedCount, TautomersCount, MinTautomersCount, MaxTautomersCount)
 170     
 171 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer):
 172     """Process and enumerate tautomer of molecules using  multiprocessing."""
 173     
 174     MiscUtil.PrintInfo("\nEnumerating tatutomers using multiprocessing...")
 175     
 176     MPParams = OptionsInfo["MPParams"]
 177     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 178     
 179     # Setup data for initializing a worker process...
 180     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))
 181 
 182     # Setup a encoded mols data iterable for a worker process by pickling only public
 183     # and private molecule properties...
 184     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 185 
 186     # Setup process pool along with data initialization for each process...
 187     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
 188     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
 189     
 190     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 191     
 192     # Start processing...
 193     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 194         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 195     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 196         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 197     else:
 198         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
 199     
 200     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 201     
 202     (MolCount, ValidMolCount, TautomerizationFailedCount, TautomersCount) = [0] * 4
 203     (MinTautomersCount, MaxTautomersCount) = [sys.maxsize, 0]
 204     FirstTautomerMol = True
 205     for Result in Results:
 206         MolCount += 1
 207         MolIndex, EncodedMol, TautomerizationStatus, EncodedTautomerMols = Result
 208         
 209         if EncodedMol is None:
 210             continue
 211         ValidMolCount += 1
 212         
 213         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 214         
 215         TautomerMols = []
 216         if EncodedTautomerMols is not None:
 217             TautomerMols = [RDKitUtil.MolFromBase64EncodedMolString(EncodedTautomerMol) for EncodedTautomerMol in EncodedTautomerMols]
 218             
 219         if not TautomerizationStatus:
 220             if not OptionsInfo["QuietMode"]:
 221                 MolName = RDKitUtil.GetMolName(Mol, MolCount)
 222                 MiscUtil.PrintWarning("Failed to tautomerize molecule %s" % MolName)
 223             
 224             TautomerizationFailedCount += 1
 225             continue
 226 
 227         if FirstTautomerMol:
 228             FirstTautomerMol = False
 229             if SetSMILESMolProps:
 230                 RDKitUtil.SetWriterMolProps(Writer, TautomerMols[0])
 231 
 232         # Track tautomer count...
 233         TautomerMolsCount = len(TautomerMols)
 234         TautomersCount += TautomerMolsCount
 235         if TautomerMolsCount < MinTautomersCount:
 236             MinTautomersCount = TautomerMolsCount
 237         if TautomerMolsCount > MaxTautomersCount:
 238             MaxTautomersCount = TautomerMolsCount
 239         
 240         WriteMolTautomers(Writer, Mol, MolCount, Compute2DCoords, TautomerMols)
 241     
 242     return (MolCount, ValidMolCount, TautomerizationFailedCount, TautomersCount, MinTautomersCount, MaxTautomersCount)
 243 
 244 def InitializeWorkerProcess(*EncodedArgs):
 245     """Initialize data for a worker process."""
 246 
 247     global Options, OptionsInfo
 248     
 249     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 250 
 251     # Decode Options and OptionInfo...
 252     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 253     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 254 
 255     # Set up tautomer enumerator...
 256     OptionsInfo["TautomerEnumerator"] = SetupTautomerEnumerator()
 257 
 258 def WorkerProcess(EncodedMolInfo):
 259     """Process data for a worker process."""
 260     
 261     MolIndex, EncodedMol = EncodedMolInfo
 262     
 263     if EncodedMol is None:
 264         return [MolIndex, None, False, None]
 265         
 266     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 267     if RDKitUtil.IsMolEmpty(Mol):
 268         if not OptionsInfo["QuietMode"]:
 269             MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 270             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 271         return [MolIndex, None, False, None]
 272     
 273     TautomerMols,  TautomerizationStatus = EnumerateMolTautomers(Mol, OptionsInfo["TautomerEnumerator"], (MolIndex + 1))
 274     
 275     EncodedTautomerMols = None
 276     if TautomerMols is not None:
 277         EncodedTautomerMols = [RDKitUtil.MolToBase64EncodedMolString(TautomerMol, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.AtomProps | Chem.PropertyPickleOptions.BondProps | Chem.PropertyPickleOptions.PrivateProps) for TautomerMol in TautomerMols]
 278     
 279     return [MolIndex, EncodedMol, TautomerizationStatus, EncodedTautomerMols]
 280     
 281 def EnumerateMolTautomers(Mol, TautomerEnumerator, MolNum):
 282     """Enumerate tautomers of a molecule and return a list of tatutomers
 283     along with the status of tautomerization."""
 284 
 285     TautomerMols, Status, TautomerScores  = [None, False, None]
 286     try:
 287         TautomerMols = [TautomerMol for TautomerMol in TautomerEnumerator.Enumerate(Mol)]
 288         
 289         if OptionsInfo["ScoreTautomers"]:
 290             TautomerScores = [TautomerEnumerator.ScoreTautomer(TautomerMol) for TautomerMol in TautomerMols]
 291 
 292         if OptionsInfo["SortTautomers"]:
 293             TautomerMols, TautomerScores = SortMolTautomers(Mol, TautomerEnumerator, TautomerMols, TautomerScores)
 294 
 295         # Set tautomer score...
 296         if TautomerScores is not None:
 297             for Index, TautomerMol in enumerate(TautomerMols):
 298                 TautomerMol.SetProp("Tautomer_Score", "%.1f" % TautomerScores[Index])
 299                 
 300         Status = True
 301     except Exception as ErrMsg:
 302         if not OptionsInfo["QuietMode"]:
 303             MiscUtil.PrintWarning("Failed to tautomerize molecule %s: %s" % (RDKitUtil.GetMolName(Mol, MolNum), ErrMsg))
 304         TautomerMols, Status = [None, False]
 305 
 306     return (TautomerMols,  Status)
 307 
 308 def SortMolTautomers(Mol, TautomerEnumerator, TautomerMols, TautomerScores = None):
 309     """Sort tatutomers by SMILES string and place canonical tautomer at the top
 310     of the list."""
 311 
 312     CanonicalTautomer = TautomerEnumerator.Canonicalize(Mol)
 313     CanonicalTautomerSmiles = Chem.MolToSmiles(CanonicalTautomer)
 314     if TautomerScores is None:
 315         CanonicalTautomerScore = None
 316     else:
 317         CanonicalTautomerScore = TautomerEnumerator.ScoreTautomer(CanonicalTautomer)
 318 
 319     TautomerSmiles = [Chem.MolToSmiles(TautomerMol) for TautomerMol in TautomerMols]
 320     if TautomerScores is None:
 321         SortedResults = sorted((Smiles,  TautomerMol) for Smiles,  TautomerMol in zip(TautomerSmiles, TautomerMols) if Smiles != CanonicalTautomerSmiles)
 322     else:
 323         SortedResults = sorted((Smiles,  TautomerMol, TautomerScore) for Smiles,  TautomerMol, TautomerScore in zip(TautomerSmiles, TautomerMols, TautomerScores) if Smiles != CanonicalTautomerSmiles)
 324     
 325     SortedTautomerMols = [CanonicalTautomer]
 326     if TautomerScores is None:
 327         SortedTautomerMols += [TautomerMol for Smiles,  TautomerMol in SortedResults]
 328     else:
 329         SortedTautomerMols += [TautomerMol for Smiles,  TautomerMol, TautomerScore in SortedResults]
 330     
 331     if TautomerScores is None:
 332         SortedTautomerScores = None
 333     else:
 334         SortedTautomerScores = [CanonicalTautomerScore]
 335         SortedTautomerScores += [TautomerScore for Smiles,  TautomerMol, TautomerScore in SortedResults]
 336 
 337     return (SortedTautomerMols, SortedTautomerScores)
 338 
 339 def WriteMolTautomers(Writer, Mol, MolNum, Compute2DCoords, TautomerMols):
 340     """Write out tautomers of a  molecule."""
 341 
 342     if TautomerMols is None:
 343         return
 344     
 345     MolName = RDKitUtil.GetMolName(Mol, MolNum)
 346     
 347     for Index, TautomerMol in enumerate(TautomerMols):
 348         SetupTautomerMolName(TautomerMol, MolName, (Index + 1))
 349 
 350         if Compute2DCoords:
 351             AllChem.Compute2DCoords(Mol)
 352     
 353         Writer.write(TautomerMol)
 354 
 355 def SetupTautomerMolName(Mol, MolName, TautomerCount):
 356     """Set tautomer mol name."""
 357 
 358     TautomerName = "%s_Taut%d" % (MolName, TautomerCount)
 359     Mol.SetProp("_Name", TautomerName)
 360 
 361 def SetupTautomerEnumerator():
 362     """Setup tautomer enumerator. """
 363     
 364     TautomerParams  = SetupTautomerizationParameters()
 365     
 366     return rdMolStandardize.TautomerEnumerator(TautomerParams)
 367     
 368 def SetupTautomerizationParameters():
 369     """Setup tautomerization parameters for RDKit using cleanup parameters."""
 370 
 371     Params = rdMolStandardize.CleanupParameters()
 372     TautomerizationParams = OptionsInfo["TautomerizationParams"]
 373     
 374     if TautomerizationParams["TautomerTransformsFile"] is not None:
 375         Params.tautomerTransformsFile = TautomerizationParams["TautomerTransformsFile"]
 376     
 377     Params.maxTautomers = TautomerizationParams["MaxTautomers"]
 378     Params.maxTransforms = TautomerizationParams["MaxTransforms"]
 379     Params.tautomerRemoveBondStereo = TautomerizationParams["TautomerRemoveBondStereo"]
 380     Params.tautomerRemoveIsotopicHs = TautomerizationParams["TautomerRemoveIsotopicHs"]
 381     Params.tautomerRemoveSp3Stereo = TautomerizationParams["TautomerRemoveSp3Stereo"]
 382     Params.tautomerReassignStereo = TautomerizationParams["TautomerReassignStereo"]
 383     
 384     return Params
 385 
 386 def SetupMoleculeWriter():
 387     """Setup a molecule writer."""
 388     
 389     Writer = None
 390 
 391     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
 392     if Writer is None:
 393         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
 394     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
 395     
 396     return Writer
 397 
 398 def ProcessTautomerizationParameters():
 399     """Process tautomerizationparameters. """
 400 
 401     ParamsDefaultInfo = {"TautomerTransformsFile": ["file", None], "MaxTautomers": ["int", 1000], "MaxTransforms": ["int", 1000], "TautomerRemoveBondStereo": ["bool", True], "TautomerRemoveIsotopicHs": ["bool", True], "TautomerRemoveSp3Stereo": ["bool", True], "TautomerReassignStereo": ["bool", True]}
 402 
 403     OptionsInfo["TautomerizationParams"] = MiscUtil.ProcessOptionNameValuePairParameters("--tautomerizationParams", Options["--tautomerizationParams"], ParamsDefaultInfo)
 404     
 405     #  Validate numerical values...
 406     for ParamName in ["MaxTautomers", "MaxTransforms"]:
 407         ParamValue = OptionsInfo["TautomerizationParams"][ParamName]
 408         if  ParamValue <= 0:
 409             MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"-t, --tautomerizationParams\" option is not a valid value. Supported values: > 0" % (ParamValue, ParamName))
 410 
 411 def ProcessOptions():
 412     """Process and validate command line arguments and options."""
 413     
 414     MiscUtil.PrintInfo("Processing options...")
 415 
 416     # Validate options...
 417     ValidateOptions()
 418     
 419     OptionsInfo["Infile"] = Options["--infile"]
 420     ParamsDefaultInfoOverride = {'RemoveHydrogens': False}
 421     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"], ParamsDefaultInfo = ParamsDefaultInfoOverride)
 422     
 423     OptionsInfo["Outfile"] = Options["--outfile"]
 424     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 425 
 426     OptionsInfo["Overwrite"] = Options["--overwrite"]
 427 
 428     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 429     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 430 
 431     OptionsInfo["QuietMode"] = True if re.match("^yes$", Options["--quiet"], re.I) else False
 432     
 433     OptionsInfo["ScoreTautomers"] = True if re.match("^yes$", Options["--scoreTautomers"], re.I) else False
 434     OptionsInfo["SortTautomers"] = True if re.match("^yes$", Options["--sortTautomers"], re.I) else False
 435 
 436     ProcessTautomerizationParameters()
 437     
 438 def RetrieveOptions():
 439     """Retrieve command line arguments and options."""
 440     
 441     # Get options...
 442     global Options
 443     Options = docopt(_docoptUsage_)
 444     
 445     # Set current working directory to the specified directory...
 446     WorkingDir = Options["--workingdir"]
 447     if WorkingDir:
 448         os.chdir(WorkingDir)
 449     
 450     # Handle examples option...
 451     if "--examples" in Options and Options["--examples"]:
 452         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 453         sys.exit(0)
 454 
 455 def ValidateOptions():
 456     """Validate option values."""
 457 
 458     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 459     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi txt csv tsv")
 460     
 461     if Options["--outfile"]:
 462         MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 463         MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 464         MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 465 
 466     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 467     MiscUtil.ValidateOptionTextValue("-q, --quiet", Options["--quiet"], "yes no")
 468     
 469     MiscUtil.ValidateOptionTextValue("--scoreTautomers", Options["--scoreTautomers"], "yes no")
 470     MiscUtil.ValidateOptionTextValue("--sortTautomers", Options["--sortTautomers"], "yes no")
 471     
 472 # Setup a usage string for docopt...
 473 _docoptUsage_ = """
 474 RDKitEnumerateTautomers.py - Enumerate tautomers of molecules
 475 
 476 Usage:
 477     RDKitEnumerateTautomers.py [--infileParams <Name,Value,...>] [--mp <yes or no>] [--mpParams <Name,Value,...>]
 478                                [--outfileParams <Name,Value,...> ] [--overwrite] [--quiet <yes or no>] [--scoreTautomers <yes or no>]
 479                                [--sortTautomers <yes or no>] [--tautomerizationParams <Name,Value,...>] [-w <dir>] -i <infile> -o <outfile>
 480     RDKitEnumerateTautomers.py -h | --help | -e | --examples
 481 
 482 Description:
 483     Enumerate tautomers for molecules and write them out to an output file.
 484     The tautomer enumerator generates both protomers and valence tautomers. You
 485     may optionally calculate tautomer scores and sort tautomers by SMILES string. The
 486     canonical tautomer is placed at the top during sorting.
 487 
 488     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi., csv, .tsv, .txt)
 489 
 490     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 491 
 492 Options:
 493     -e, --examples
 494         Print examples.
 495     -h, --help
 496         Print this help message.
 497     -i, --infile <infile>
 498         Input file name.
 499     --infileParams <Name,Value,...>  [default: auto]
 500         A comma delimited list of parameter name and value pairs for reading
 501         molecules from files. The supported parameter names for different file
 502         formats, along with their default values, are shown below:
 503             
 504             SD, MOL: removeHydrogens,no,sanitize,yes,strictParsing,yes
 505             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 506                 smilesTitleLine,auto,sanitize,yes
 507             
 508         Possible values for smilesDelimiter: space, comma or tab.
 509     --mp <yes or no>  [default: no]
 510         Use multiprocessing.
 511          
 512         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 513         function employing lazy RDKit data iterable. This allows processing of
 514         arbitrary large data sets without any additional requirements memory.
 515         
 516         All input data may be optionally loaded into memory by mp.Pool.map()
 517         before starting worker processes in a process pool by setting the value
 518         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 519         
 520         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 521         data mode may adversely impact the performance. The '--mpParams' section
 522         provides additional information to tune the value of 'chunkSize'.
 523     --mpParams <Name,Value,...>  [default: auto]
 524         A comma delimited list of parameter name and value pairs to configure
 525         multiprocessing.
 526         
 527         The supported parameter names along with their default and possible
 528         values are shown below:
 529         
 530             chunkSize, auto
 531             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 532             numProcesses, auto   [ Default: mp.cpu_count() ]
 533         
 534         These parameters are used by the following functions to configure and
 535         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 536         mp.Pool.imap().
 537         
 538         The chunkSize determines chunks of input data passed to each worker
 539         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 540         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 541         
 542         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 543         automatically converts RDKit data iterable into a list, loads all data into
 544         memory, and calculates the default chunkSize using the following method
 545         as shown in its code:
 546         
 547             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 548             if extra: chunkSize += 1
 549         
 550         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 551         and 100 data items.
 552         
 553         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 554         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 555         data into memory. Consequently, the size of input data is not known a priori.
 556         It's not possible to estimate an optimal value for the chunkSize. The default 
 557         chunkSize is set to 1.
 558         
 559         The default value for the chunkSize during 'Lazy' data mode may adversely
 560         impact the performance due to the overhead associated with exchanging
 561         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 562         a larger value during 'Lazy' input data mode, based on the size of your input
 563         data and number of processes in the process pool.
 564         
 565         The mp.Pool.map() function waits for all worker processes to process all
 566         the data and return the results. The mp.Pool.imap() function, however,
 567         returns the the results obtained from worker processes as soon as the
 568         results become available for specified chunks of data.
 569         
 570         The order of data in the results returned by both mp.Pool.map() and 
 571         mp.Pool.imap() functions always corresponds to the input data.
 572     -o, --outfile <outfile>
 573         Output file name.
 574     --outfileParams <Name,Value,...>  [default: auto]
 575         A comma delimited list of parameter name and value pairs for writing
 576         molecules to files. The supported parameter names for different file
 577         formats, along with their default values, are shown below:
 578             
 579             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 580             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 581                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
 582             
 583         Default value for compute2DCoords: yes for SMILES input file; no for all other
 584         file types.
 585     --overwrite
 586         Overwrite existing files.
 587     -q, --quiet <yes or no>  [default: no]
 588         Use quiet mode. The warning and information messages will not be printed.
 589     --scoreTautomers <yes or no>  [default: no]
 590         Calculate and write out tautomer scores [ Ref 159 ].
 591     --sortTautomers <yes or no>  [default: no]
 592         Sort tatutomers of a molecule by SMILES string and place canonical tautomer
 593         at the top of the list.
 594     -t, --tautomerizationParams <Name,Value,...>  [default: auto]
 595         A comma delimited list of parameter name and value pairs for enumerating
 596         tautomers of molecules. The supported parameter names along with their
 597         default values are shown below:
 598             
 599             tautomerTransformsFile,none,
 600             maxTautomers,1000,maxTransforms,1000,
 601             tautomerRemoveBondStereo,yes,tautomerRemoveIsotopicHs,yes
 602             tautomerRemoveSp3Stereo,yes,tautomerReassignStereo,yes
 603             
 604         A brief description of the tatutomerization parameters, taken from RDKit
 605         documentation, is as follows:
 606             
 607             tautomerTransformsFile - File containing tautomer transformations
 608             
 609             maxTautomers - Maximum number of tautomers to generate
 610             maxTransforms - Maximum number of transforms to apply during
 611                 tautomer enumeration
 612             tautomerRemoveBondStereo - Remove stereochemistry from double bonds
 613                 involved in tautomerism
 614             tautomerRemoveIsotopicHs: Remove isotopic Hs from centers involved in tautomerism
 615             tautomerRemoveSp3Stereo - Remove stereochemistry from sp3 centers
 616                 involved in tautomerism
 617             tautomerReassignStereo - AssignStereochemistry on all generated tautomers
 618             
 619         The default value is set to none for the 'tautomerTransformsFile' parameter. The
 620         script relies on RDKit to automatically load appropriate tautomer transformations
 621         from a set of internal catalog.
 622         
 623         The contents  of transformation file are described below:
 624             
 625             tautomerTransformsFile - File containing tautomer transformations
 626             
 627                 // Name                SMARTS   Bonds  Charges
 628                 1,3 (thio)keto/enol f  [CX4!H0]-[C]=[O,S,Se,Te;X1]
 629                 1,3 (thio)keto/enol r  [O,S,Se,Te;X2!H0]-[C]=[C]
 630                 1,5 (thio)keto/enol f  [CX4,NX3;!H0]-[C]=[C][CH0]=[O,S,Se,Te;X1]
 631                 ... ... ...
 632             
 633     -w, --workingdir <dir>
 634         Location of working directory which defaults to the current directory.
 635 
 636 Examples:
 637     To enumerate tautomers of molecules in a SMILES file and write out a SMILES
 638     file, type: 
 639 
 640         % RDKitEnumerateTautomers.py -i Sample.smi -o SampleOut.smi
 641 
 642     To enumerate tautomers of molecules in a SD file, calculate tautomer scores,
 643     sort tautomers, and write out a SD file, type:
 644 
 645         % RDKitEnumerateTautomers.py --scoreTautomers yes --sortTautomers yes
 646           -i Sample.sdf -o SampleOut.sdf
 647 
 648     To enumerate tautomers of molecules in a SD fie , calculate tautomer
 649     scores, sort tautomers, and write out a SMILES file, type:
 650 
 651         % RDKitEnumerateTautomers.py --scoreTautomers yes  --sortTautomers yes
 652           --outfileParams "smilesMolProps,yes" -i Sample.smi -o SampleOut.smi
 653 
 654     To enumerate tautomers of  molecules in a SD file, performing enumeration in
 655     multiprocessing mode on all available CPUs without loading all data into
 656     memory, and write out a SD file, type:
 657 
 658         % RDKitEnumerateTautomers.py --mp yes -i Sample.sdf -o SampleOut.sdf
 659 
 660     To enumerate tautomers of  molecules in a SD file, performing enumeration in
 661     multiprocessing mode on specific number of CPUs and chunk size without loading
 662     all data into memory, and write out a SD file, type:
 663 
 664         % RDKitEnumerateTautomers.py --mp yes --mpParams "inputDataMode,Lazy,
 665           numProcesses,4,chunkSize,8" -i Sample.sdf -o SampleOut.sdf
 666 
 667     To enumerate tautomers of  molecules in a SD file using specific values of
 668     parameters to contol the enumeration behavior, and write out a SD file, type:
 669 
 670         % RDKitEnumerateTautomers.py  -t "maxTautomers,1000,maxTransforms,1000,
 671           tautomerRemoveBondStereo,yes,tautomerRemoveIsotopicHs,yes,
 672           tautomerRemoveSp3Stereo,yes,tautomerReassignStereo,yes"
 673           --scoreTautomers yes --sortTautomers yes -i Sample.sdf -o SampleOut.sdf
 674 
 675     To enumerate tautomers for molecules in a CSV SMILES file, SMILES strings in column 1,
 676     name in column 2, and generate output SD file, type:
 677 
 678         % RDKitEnumerateTautomers.py --infileParams 
 679           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 680           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 681           -i SampleSMILES.csv -o SampleOut.sdf
 682 
 683 Author:
 684     Manish Sud(msud@san.rr.com)
 685 
 686 See also:
 687     RDKitConvertFileFormat.py, RDKitRemoveDuplicateMolecules.py,
 688     RDKitRemoveInvalidMolecules.py, RDKitRemoveSalts.py,
 689     RDKitSearchFunctionalGroups.py, RDKitSearchSMARTS.py,
 690     RDKitStandardizeMolecules.py
 691 
 692 Copyright:
 693     Copyright (C) 2024 Manish Sud. All rights reserved.
 694 
 695     The functionality available in this script is implemented using RDKit, an
 696     open source toolkit for cheminformatics developed by Greg Landrum.
 697 
 698     This file is part of MayaChemTools.
 699 
 700     MayaChemTools is free software; you can redistribute it and/or modify it under
 701     the terms of the GNU Lesser General Public License as published by the Free
 702     Software Foundation; either version 3 of the License, or (at your option) any
 703     later version.
 704 
 705 """
 706 
# Start the script only when this file is executed directly and not when it is imported...
if __name__ == "__main__":
    main()