MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitConvertFileFormat.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2026 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 import os
  32 import sys
  33 import time
  34 
  35 # RDKit imports...
  36 try:
  37     from rdkit import rdBase
  38 except ImportError as ErrMsg:
  39     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  40     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  41     sys.exit(1)
  42 
  43 # MayaChemTools imports...
  44 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  45 try:
  46     from docopt import docopt
  47     import MiscUtil
  48     import RDKitUtil
  49 except ImportError as ErrMsg:
  50     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  51     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  52     sys.exit(1)
  53 
# Base name of the script as invoked; used in the messages printed by main()...
ScriptName = os.path.basename(sys.argv[0])
# Command line options as returned by docopt; populated by RetrieveOptions()...
Options = {}
# Processed option values used during conversion; populated by ProcessOptions()...
OptionsInfo = {}
  57 
  58 
  59 def main():
  60     """Start execution of the script."""
  61 
  62     MiscUtil.PrintInfo(
  63         "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n"
  64         % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())
  65     )
  66 
  67     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  68 
  69     # Retrieve command line arguments and options...
  70     RetrieveOptions()
  71 
  72     # Process and validate command line arguments and options...
  73     ProcessOptions()
  74 
  75     # Perform actions required by the script...
  76     ConvertFileFormat()
  77 
  78     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  79     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  80 
  81 
  82 def ConvertFileFormat():
  83     """Convert between  file formats."""
  84 
  85     Infile = OptionsInfo["Infile"]
  86     Outfile = OptionsInfo["Outfile"]
  87 
  88     # Read molecules...
  89     MiscUtil.PrintInfo("\nReading file %s..." % Infile)
  90     Mols = RDKitUtil.ReadMolecules(Infile, **OptionsInfo["InfileParams"])
  91 
  92     # Write molecules...
  93     MiscUtil.PrintInfo("\nGenerating file %s..." % Outfile)
  94     MolCount, ProcessedMolCount = RDKitUtil.WriteMolecules(Outfile, Mols, **OptionsInfo["OutfileParams"])
  95 
  96     MiscUtil.PrintInfo("Total number of molecules: %d" % MolCount)
  97     MiscUtil.PrintInfo("Number of molecules processed: %d" % ProcessedMolCount)
  98     MiscUtil.PrintInfo("Number of molecules ignored: %d" % (MolCount - ProcessedMolCount))
  99 
 100 
 101 def ProcessOptions():
 102     """Process and validate command line arguments and options."""
 103 
 104     MiscUtil.PrintInfo("Processing options...")
 105 
 106     # Validate options...
 107     ValidateOptions()
 108 
 109     # Process and setup options for RDKit functions...
 110     OptionsInfo["Infile"] = Options["--infile"]
 111     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters(
 112         "--infileParams", Options["--infileParams"], Options["--infile"]
 113     )
 114 
 115     OptionsInfo["Outfile"] = Options["--outfile"]
 116     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters(
 117         "--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]
 118     )
 119 
 120     OptionsInfo["Overwrite"] = Options["--overwrite"]
 121 
 122 
 123 def RetrieveOptions():
 124     """Retrieve command line arguments and options."""
 125 
 126     # Get options...
 127     global Options
 128     Options = docopt(_docoptUsage_)
 129 
 130     # Set current working directory to the specified directory...
 131     WorkingDir = Options["--workingdir"]
 132     if WorkingDir:
 133         os.chdir(WorkingDir)
 134 
 135     # Handle examples option...
 136     if "--examples" in Options and Options["--examples"]:
 137         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 138         sys.exit(0)
 139 
 140 
 141 def ValidateOptions():
 142     """Validate option values."""
 143 
 144     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 145     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi txt csv tsv mol2 pdb")
 146 
 147     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd mol smi pdb")
 148     MiscUtil.ValidateOptionsOutputFileOverwrite(
 149         "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]
 150     )
 151     MiscUtil.ValidateOptionsDistinctFileNames(
 152         "-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]
 153     )
 154 
 155 
# Setup a usage string for docopt...
#
# This text is passed to docopt() in RetrieveOptions() to parse the command line
# into the Options dictionary, so changes to the "Usage:" and "Options:" sections
# change the command line interface of the script. The same text is passed to
# MiscUtil.GetExamplesTextFromDocOptText() to print examples for -e, --examples.
_docoptUsage_ = """
RDKitConvertFileFormat.py - Convert between molecular file formats

Usage:
    RDKitConvertFileFormat.py [--infileParams <Name,Value,...>]
                              [ --outfileParams <Name,Value,...> ] [--overwrite]
                              [-w <dir>] -i <infile> -o <outfile>
    RDKitConvertFileFormat.py -h | --help | -e | --examples

Description:
    Convert between molecular file formats.

    The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi,
    .txt, .csv, .tsv), MOL2 (.mol2), PDB (.pdb)

    The supported output file formats are: SD (.sdf, .sd), SMILES (.smi), PDB (.pdb)

Options:
    -e, --examples
        Print examples.
    -h, --help
        Print this help message.
    -i, --infile <infile>
        Input file name.
    --infileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for reading
        molecules from files. The supported parameter names for different file
        formats, along with their default values, are shown below:
            
            SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
            MOL2: removeHydrogens,yes,sanitize,yes
            SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
                smilesTitleLine,auto,sanitize,yes
            PDB: removeHydrogens,yes,sanitize,yes
            
        Possible values for smilesDelimiter: space, comma or tab.
    -o, --outfile <outfile>
        Output file name.
    --outfileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for writing
        molecules to files. The supported parameter names for different file
        formats, along with their default values, are shown below:
            
            SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
            SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
                smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
            
        Default value for compute2DCoords: yes for SMILES input file; no for all other
        file types.
    --overwrite
        Overwrite existing files.
    -w, --workingdir <dir>
        Location of working directory which defaults to the current directory.

Examples:
    To convert a SD file  into a isomeric SMILES file, type:

        % RDKitConvertFileFormat.py -i Sample.sdf -o SampleOut.smi

    To convert a SD file into a non isomeric SMILES file, type

        % RDKitConvertFileFormat.py --outfileParams "smilesIsomeric,no"
          -i Sample.sdf -o SampleOut.smi

    To convert a SMILES file into a SD file along with calculation of 2D
    coordinates, type:

        % RDKitConvertFileFormat.py -i Sample.smi -o SampleOut.sdf

    To convert a MDL MOL file into a PDB file, type:

        % RDKitConvertFileFormat.py -i Sample.mol -o SampleOut.pdb

    To convert a CSV SMILES file  with column headers, SMILES strings
    in column 1, and name in column 2 into a SD file containing 2D coordinates, type:

        % RDKitConvertFileFormat.py --infileParams "smilesDelimiter,comma,
          smilesTitleLine,yes,smilesColumn,1,smilesNameColumn,2" -i Sample.csv
          -o SampleOut.sdf

Author:
    Manish Sud(msud@san.rr.com)

See also:
    RDKitDrawMolecules.py, RDKitRemoveDuplicateMolecules.py, RDKitSearchFunctionalGroups.py,
    RDKitSearchSMARTS.py

Copyright:
    Copyright (C) 2026 Manish Sud. All rights reserved.

    The functionality available in this script is implemented using RDKit, an
    open source toolkit for cheminformatics developed by Greg Landrum.

    This file is part of MayaChemTools.

    MayaChemTools is free software; you can redistribute it and/or modify it under
    the terms of the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your option) any
    later version.

"""
 258 
# Start execution only when the file is run as a script and not when it is imported...
if __name__ == "__main__":
    main()