MayaChemTools

    1 #!/bin/env python
    2 # File: RDKitUtil.py
    3 # Author: Manish Sud <msud@san.rr.com>
    4 #
    5 # Copyright (C) 2024 Manish Sud. All rights reserved.
    6 #
    7 # The functionality available in this file is implemented using RDKit, an
    8 # open source toolkit for cheminformatics developed by Greg Landrum.
    9 #
   10 # This file is part of MayaChemTools.
   11 #
   12 # MayaChemTools is free software; you can redistribute it and/or modify it under
   13 # the terms of the GNU Lesser General Public License as published by the Free
   14 # Software Foundation; either version 3 of the License, or (at your option) any
   15 # later version.
   16 #
   17 # MayaChemTools is distributed in the hope that it will be useful, but without
   18 # any warranty; without even the implied warranty of merchantability of fitness
   19 # for a particular purpose.  See the GNU Lesser General Public License for more
   20 # details.
   21 #
   22 # You should have received a copy of the GNU Lesser General Public License
   23 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
   24 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
   25 # Boston, MA, 02111-1307, USA.
   26 #
   27 
   28 from __future__ import print_function
   29 
   30 import os
   31 import sys
   32 import re
   33 import base64
   34 import pickle
   35 
   36 from rdkit import Chem
   37 from rdkit.Chem import AllChem
   38 from rdkit.Chem import Draw
   39 
   40 import MiscUtil
   41 
# Public names of this module: the names bound by "from RDKitUtil import *".
# Private helpers (names starting with an underscore) are intentionally absent.
__all__ = ["AreAtomIndicesSequentiallyConnected", "AreAtomMapNumbersPresentInMol", "AreHydrogensMissingInMolecule", "CalculateFormalCharge", "CalculateSpinMultiplicity", "ClearAtomMapNumbers", "ConstrainAndEmbed", "FilterSubstructureMatchByAtomMapNumbers", "FilterSubstructureMatchesByAtomMapNumbers", "GetAtomIndices", "GetAtomMapIndices", "GetAtomMapIndicesAndMapNumbers", "GetAtomSymbols", "GetAtomPositions", "GetFormalCharge", "GetHeavyAtomNeighbors", "GetInlineSVGForMolecule", "GetInlineSVGForMolecules", "GetMolName", "GetNumFragments", "GetNumHeavyAtomNeighbors", "GetSpinMultiplicity", "GetSVGForMolecule", "GetSVGForMolecules", "GetPsi4XYZFormatString", "GenerateBase64EncodedMolStrings", "GenerateBase64EncodedMolStringWithConfIDs", "IsAtomSymbolPresentInMol", "IsMolEmpty", "IsValidElementSymbol", "IsValidAtomIndex", "MolFromBase64EncodedMolString", "GenerateBase64EncodedMolStringsWithIDs", "MolToBase64EncodedMolString", "MolFromSubstructureMatch", "MolsFromSubstructureMatches", "ReadMolecules", "ReadAndValidateMolecules", "ReadMoleculesFromSDFile", "ReadMoleculesFromMolFile", "ReadMoleculesFromMol2File", "ReadMoleculesFromPDBFile", "ReadMoleculesFromSMILESFile", "ReorderAtomIndicesInSequentiallyConnectedManner", "SetAtomPositions", "SetWriterMolProps", "ValidateElementSymbols", "WriteMolecules"]
   43 
   44 def GetMolName(Mol, MolNum = None):
   45     """Get molecule name.
   46     
   47     Arguments:
   48         Mol (object): RDKit molecule object.
   49         MolNum (int or None): Molecule number in input file.
   50 
   51     Returns:
   52         str : Molname corresponding to _Name property of a molecule, generated
   53             from specieid MolNum using the format "Mol%d" % MolNum, or an
   54             empty string.
   55 
   56     """
   57     
   58     MolName = ''
   59     if Mol.HasProp("_Name"):
   60         MolName = Mol.GetProp("_Name")
   61 
   62     if not len(MolName):
   63         if MolNum is not None:
   64             MolName = "Mol%d" % MolNum
   65     
   66     return MolName
   67 
   68 def GetInlineSVGForMolecule(Mol, Width, Height, Legend = None, AtomListToHighlight = None, BondListToHighlight = None, BoldText = True, Base64Encoded = True):
   69     """Get SVG image text for a molecule suitable for inline embedding into a HTML page.
   70     
   71     Arguments:
   72         Mol (object): RDKit molecule object.
   73         Width (int): Width of a molecule image in pixels.
   74         Height (int): Height of a molecule image in pixels.
   75         Legend (str): Text to display under the image.
   76         AtomListToHighlight (list): List of atoms to highlight.
   77         BondListToHighlight (list): List of bonds to highlight.
   78         BoldText (bool): Flag to make text bold in the image of molecule. 
   79         Base64Encoded (bool): Flag to return base64 encoded string. 
   80 
   81     Returns:
   82         str : SVG image text for inline embedding into a HTML page using "img"
   83             tag: <img src="data:image/svg+xml;charset=UTF-8,SVGImageText> or
   84             tag: <img src="data:image/svg+xml;base64,SVGImageText>
   85 
   86     """
   87 
   88     SVGText = GetSVGForMolecule(Mol, Width, Height, Legend, AtomListToHighlight, BondListToHighlight, BoldText)
   89     return _ModifySVGForInlineEmbedding(SVGText, Base64Encoded)
   90     
   91 def GetInlineSVGForMolecules(Mols, MolsPerRow, MolWidth, MolHeight, Legends = None, AtomListsToHighlight = None, BondListsToHighLight = None, BoldText = True, Base64Encoded = True):
   92     """Get SVG image text for  molecules suitable for inline embedding into a HTML page.
   93     
   94     Arguments:
   95         Mols (list): List of RDKit molecule objects.
   96         MolsPerRow (int): Number of molecules per row.
   97         Width (int): Width of a molecule image in pixels.
   98         Height (int): Height of a molecule image in pixels.
   99         Legends (list): List containing strings to display under images.
  100         AtomListsToHighlight (list): List of lists containing atoms to highlight
  101             for molecules.
  102         BondListsToHighlight (list): List of lists containing bonds to highlight
  103             for molecules
  104         BoldText (bool): Flag to make text bold in the image of molecules. 
  105         Base64Encoded (bool): Flag to return base64 encoded string. 
  106 
  107     Returns:
  108         str : SVG image text for inline embedding into a HTML page using "img"
  109             tag: <img src="data:image/svg+xml;charset=UTF-8,SVGImageText> or
  110             tag: <img src="data:image/svg+xml;base64,SVGImageText>
  111 
  112     """
  113     
  114     SVGText = GetSVGForMolecules(Mols, MolsPerRow, MolWidth, MolHeight, Legends, AtomListsToHighlight, BondListsToHighLight, BoldText)
  115     return _ModifySVGForInlineEmbedding(SVGText, Base64Encoded)
  116 
  117 def _ModifySVGForInlineEmbedding(SVGText, Base64Encoded):
  118     """Modify SVG for inline embedding into a HTML page using "img" tag
  119     along with performing base64 encoding.
  120     """
  121     
  122     # Take out all tags till the start of '<svg' tag...
  123     Pattern = re.compile("^.*<svg", re.I | re.S)
  124     SVGText = Pattern.sub("<svg", SVGText)
  125     
  126     # Add an extra space before the "width=..." tag. Otherwise, inline embedding may
  127     # cause the following XML error on some browsers due to start of the "width=..."
  128     # at the begining of the line in <svg ...> tag:
  129     #
  130     #  XML5607: Whitespace expected.
  131     #
  132     SVGText = re.sub("width='", " width='", SVGText, flags = re.I)
  133     
  134     # Take out trailing new line...
  135     SVGText = SVGText.strip()
  136 
  137     # Perform base64 encoding by turning text into byte stream using string
  138     # encode and transform byte stream returned by b64encode into a string
  139     # by string decode...
  140     #
  141     if Base64Encoded:
  142         SVGText = base64.b64encode(SVGText.encode()).decode()
  143 
  144     return SVGText
  145 
  146 def GetSVGForMolecule(Mol, Width, Height, Legend = None, AtomListToHighlight = None, BondListToHighlight = None, BoldText = True):
  147     """Get SVG image text for a molecule suitable for viewing in a browser.
  148     
  149     Arguments:
  150         Mol (object): RDKit molecule object.
  151         Width (int): Width of a molecule image in pixels.
  152         Height (int): Height of a molecule image in pixels.
  153         Legend (str): Text to display under the image.
  154         AtomListToHighlight (list): List of atoms to highlight.
  155         BondListToHighlight (list): List of bonds to highlight.
  156         BoldText (bool): Flag to make text bold in the image of molecule. 
  157 
  158     Returns:
  159         str : SVG image text for writing to a SVG file for viewing in a browser.
  160 
  161     """
  162     
  163     Mols = [Mol]
  164     
  165     MolsPerRow = 1
  166     MolWidth = Width
  167     MolHeight = Height
  168     
  169     Legends = [Legend] if Legend is not None else None
  170     AtomListsToHighlight = [AtomListToHighlight] if AtomListToHighlight is not None else None
  171     BondListsToHighLight = [BondListsToHighLight] if BondListToHighlight is not None else None
  172     
  173     return GetSVGForMolecules(Mols, MolsPerRow, MolWidth, MolHeight, Legends, AtomListsToHighlight, BondListsToHighLight, BoldText)
  174 
  175 def GetSVGForMolecules(Mols, MolsPerRow, MolWidth, MolHeight, Legends = None, AtomListsToHighlight = None, BondListsToHighlight = None, BoldText = True):
  176     """Get SVG image text for molecules suitable for viewing in a browser.
  177     
  178     Arguments:
  179         Mols (list): List of RDKit molecule objects.
  180         MolsPerRow (int): Number of molecules per row.
  181         Width (int): Width of a molecule image in pixels.
  182         Height (int): Height of a molecule image in pixels.
  183         Legends (list): List containing strings to display under images.
  184         AtomListsToHighlight (list): List of lists containing atoms to highlight
  185             for molecules.
  186         BondListsToHighlight (list): List of lists containing bonds to highlight
  187             for molecules
  188         BoldText (bool): Flag to make text bold in the image of molecules. 
  189 
  190     Returns:
  191         str : SVG image text for writing to a SVG file for viewing in a browser.
  192 
  193     """
  194     
  195     SVGText = Draw.MolsToGridImage(Mols, molsPerRow = MolsPerRow, subImgSize = (MolWidth,MolHeight), legends = Legends, highlightAtomLists = AtomListsToHighlight, highlightBondLists = BondListsToHighlight, useSVG = True)
  196     
  197     return _ModifySVGForBrowserViewing(SVGText, BoldText)
  198 
  199 def _ModifySVGForBrowserViewing(SVGText, BoldText = True):
  200     """Modify SVG for loading into a browser."""
  201     
  202     # It appears that the string 'xmlns:svg' needs to be replaced with 'xmlns' in the
  203     # SVG image string generated by older versions of RDKit. Otherwise, the image
  204     # doesn't load in web browsers.
  205     #
  206     if re.search("xmlns:svg", SVGText, re.I):
  207         SVGText = re.sub("xmlns:svg", "xmlns", SVGText, flags = re.I)
  208     
  209     # Make text bold...
  210     if BoldText:
  211         SVGText = re.sub("font-weight:normal;", "font-weight:bold;", SVGText, flags = re.I)
  212     
  213     return SVGText
  214 
  215 def IsMolEmpty(Mol):
  216     """Check for the presence of atoms in a molecule.
  217     
  218     Arguments:
  219         Mol (object): RDKit molecule object.
  220 
  221     Returns:
  222         bool : True - No atoms in molecule; Otherwise, false. 
  223 
  224     """
  225 
  226     Status = False if Mol.GetNumAtoms() else True
  227     
  228     return Status
  229 
  230 def IsAtomSymbolPresentInMol(Mol, AtomSymbol, IgnoreCase = True):
  231     """ Check for the presence of an atom symbol in a molecule.
  232     
  233     Arguments:
  234         Mol (object): RDKit molecule object.
  235         AtomSymbol (str): Atom symbol.
  236 
  237     Returns:
  238         bool : True - Atom symbol in molecule; Otherwise, false. 
  239 
  240     """
  241     
  242     for Atom in Mol.GetAtoms():
  243         Symbol = Atom.GetSymbol()
  244         if IgnoreCase:
  245             if re.match("^%s$" % AtomSymbol, Symbol, re.I):
  246                 return True
  247         else:
  248             if re.match("^%s$" % AtomSymbol, Symbol):
  249                 return True
  250     
  251     return False
  252 
  253 def ValidateElementSymbols(ElementSymbols):
  254     """Validate element symbols.
  255     
  256     Arguments:
  257         ElementSymbols (list): List of element symbols to validate.
  258 
  259     Returns:
  260         bool : True - All element symbols are valid; Otherwise, false. 
  261 
  262     """
  263     for ElementSymbol in ElementSymbols:
  264         if not IsValidElementSymbol(ElementSymbol):
  265             return False
  266     
  267     return True
  268 
  269 def GetAtomPositions(Mol, ConfID = -1):
  270     """Retrieve a list of lists containing coordinates of all atoms in a
  271     molecule.
  272     
  273     Arguments:
  274         Mol (object): RDKit molecule object.
  275         ConfID (int): Conformer number.
  276 
  277     Returns:
  278         list : List of lists containing atom positions.
  279 
  280     Examples:
  281 
  282         for AtomPosition in RDKitUtil.GetAtomPositions(Mol):
  283             print("X: %s; Y: %s; Z: %s" % (AtomPosition[0], AtomPosition[1], AtomPosition[2]))
  284 
  285     """
  286 
  287     return Mol.GetConformer(id = ConfID).GetPositions().tolist()
  288 
  289 def SetAtomPositions(Mol, AtomPositions, ConfID = -1):
  290     """Set atom positions of all atoms in a molecule.
  291     
  292     Arguments:
  293         Mol (object): RDKit molecule object.
  294         AtomPositions (object): List of lists containing atom positions.
  295         ConfID (int): Conformer number.
  296 
  297     Returns:
  298         object : RDKit molecule object.
  299 
  300     """
  301     
  302     MolConf = Mol.GetConformer(ConfID)
  303 
  304     for Index in range(len(AtomPositions)):
  305             MolConf.SetAtomPosition(Index, tuple(AtomPositions[Index]))
  306     
  307     return Mol
  308 
  309 def GetAtomSymbols(Mol):
  310     """Retrieve a list containing atom symbols of all atoms a molecule.
  311     
  312     Arguments:
  313         Mol (object): RDKit molecule object.
  314 
  315     Returns:
  316         list : List of atom symbols.
  317 
  318     """
  319 
  320     return [Atom.GetSymbol() for Atom in Mol.GetAtoms()]
  321 
  322 def GetAtomIndices(Mol):
  323     """Retrieve a list containing atom indices of all atoms a molecule.
  324     
  325     Arguments:
  326         Mol (object): RDKit molecule object.
  327 
  328     Returns:
  329         list : List of atom indices.
  330 
  331     """
  332 
  333     return [Atom.GetIdx() for Atom in Mol.GetAtoms()]
  334 
  335 def GetFormalCharge(Mol, CheckMolProp = True):
  336     """Get formal charge of a molecule. The formal charge is either retrieved
  337     from 'FormalCharge' molecule property or calculated using RDKit function
  338     Chem.GetFormalCharge(Mol).
  339     
  340     The 'FormalCharge' molecule property may contain multiple space delimited
  341     values. The total formal charge corresponds to the sum of the specified formal
  342     charge values.
  343 
  344     Arguments:
  345         Mol (object): RDKit molecule object.
  346         CheckMolProp (bool): Check 'FormalCharge' molecule property to
  347             retrieve formal charge.
  348 
  349     Returns:
  350         int : Formal charge.
  351 
  352     """
  353     
  354     Name = 'FormalCharge'
  355     if (CheckMolProp and Mol.HasProp(Name)):
  356         FormalCharge = Mol.GetProp(Name)
  357         Values = FormalCharge.split()
  358         if len(Values) > 1:
  359             MiscUtil.PrintWarning("RDKitUtil.GetFormalCharge: Molecule property, %s, contains multiple values, %s. Formal charge corresponds to sum of the specified values..." % (Name, FormalCharge))
  360             FormalCharge = 0.0
  361             for Value in Values:
  362                 FormalCharge += float(Value)
  363             FormalCharge = int(FormalCharge)
  364         else:
  365             FormalCharge = int(float(FormalCharge))
  366     else:
  367         FormalCharge =  CalculateFormalCharge(Mol)
  368 
  369     return int(FormalCharge)
  370 
  371 def CalculateFormalCharge(Mol):
  372     """Calculate formal charge of a molecule. The formal charge is calculated
  373     using RDKit function Chem.GetFormalCharge(Mol).
  374 
  375     Arguments:
  376         Mol (object): RDKit molecule object.
  377             retrieve formal charge.
  378 
  379     Returns:
  380         int : Formal charge.
  381 
  382     """
  383     
  384     return int(Chem.GetFormalCharge(Mol))
  385     
  386 def GetSpinMultiplicity(Mol, CheckMolProp = True):
  387     """Get spin multiplicity of a molecule. The spin multiplicity is either
  388     retrieved from 'SpinMultiplicity' molecule property or calculated
  389     from the number of free radical electrons using Hund's rule of maximum
  390     multiplicity defined as 2S + 1 where S is the total electron spin. The
  391     total spin is 1/2 the number of free radical electrons in a molecule.
  392     
  393     The 'SpinMultiplicity' molecule property may contain multiple space delimited
  394     values. The total spin multiplicity corresponds to the total number of free radical
  395     electrons which are calculated for each specified value.
  396 
  397     Arguments:
  398         Mol (object): RDKit molecule object.
  399         CheckMolProp (bool): Check 'SpinMultiplicity' molecule property to
  400             retrieve spin multiplicity.
  401 
  402     Returns:
  403         int : Spin multiplicity.
  404 
  405     """
  406     
  407     Name = 'SpinMultiplicity'
  408     if (CheckMolProp and Mol.HasProp(Name)):
  409         SpinMultiplicity = Mol.GetProp(Name)
  410         Values = SpinMultiplicity.split()
  411         if len(Values) > 1:
  412             MiscUtil.PrintWarning("RDKitUtil.GetSpinMultiplicity: Molecule property, %s, contains multiple values, %s. Calculating spin multiplicity corresponding to total number of free radical electrons for each specified value..." % (Name, SpinMultiplicity))
  413             NumRadicalElectrons = 0
  414             for Value in Values:
  415                 NumRadicalElectrons += int(float(Value)) - 1
  416             
  417             TotalElectronicSpin = NumRadicalElectrons/2
  418             SpinMultiplicity = 2 * TotalElectronicSpin + 1
  419         else:
  420             SpinMultiplicity = int(float(SpinMultiplicity))
  421     else:
  422         SpinMultiplicity = CalculateSpinMultiplicity(Mol)
  423 
  424     return int(SpinMultiplicity)
  425 
  426 def CalculateSpinMultiplicity(Mol):
  427     """Calculate spin multiplicity of a molecule. The spin multiplicity is calculated
  428     from the number of free radical electrons using Hund's rule of maximum
  429     multiplicity defined as 2S + 1 where S is the total electron spin. The
  430     total spin is 1/2 the number of free radical electrons in a molecule.
  431 
  432     Arguments:
  433         Mol (object): RDKit molecule object.
  434 
  435     Returns:
  436         int : Spin multiplicity.
  437 
  438     """
  439     
  440     # Calculate spin multiplicity using Hund's rule of maximum multiplicity...
  441     NumRadicalElectrons = 0
  442     for Atom in Mol.GetAtoms():
  443         NumRadicalElectrons += Atom.GetNumRadicalElectrons()
  444 
  445     TotalElectronicSpin = NumRadicalElectrons/2
  446     SpinMultiplicity = 2 * TotalElectronicSpin + 1
  447 
  448     return int(SpinMultiplicity)
  449     
  450 def GetPsi4XYZFormatString(Mol, ConfID = -1, FormalCharge = "auto", SpinMultiplicity = "auto", Symmetry = "auto", NoCom = False, NoReorient = False, CheckFragments = False):
  451     """Retrieve geometry string of a molecule in Psi4ish XYZ format to perform
  452     Psi4 quantum chemistry calculations.
  453     
  454     You may explicit specify multiple space delimited values for formal charge
  455     and spin multiplicity. Otherwise, these values are either automatically
  456     retrieved from 'FormalCharge' and 'SpinMultiplicity' molecule properties or
  457     calculated using RDKit. The number of specified values for these properties
  458     must match the number of fragments in the molecule during the processing
  459     of the fragments.
  460 
  461     Arguments:
  462         Mol (object): RDKit molecule object.
  463         ConfID (int): Conformer number.
  464         FormalCharge (str): Specified formal charge or 'auto' to calculate
  465            its value by RDKit.
  466         SpinMultiplicity (str): Specified spin multiplicity or 'auto' to calculate
  467            its value by RDKit.
  468         Symmetry (str): Specified symmetry or 'auto' to calculate its value by
  469            Psi4.
  470         NoCom (bool): Flag to disable recentering of a molecule by Psi4.
  471         NoReorient (bool): Flag to disable reorientation of a molecule by Psi4.
  472         CheckFragments (bool): Check for fragments and setup geometry string
  473            using  -- separator between fragments.
  474 
  475     Returns:
  476         str : Geometry string of a molecule in Psi4ish XYZ format.
  477 
  478     """
  479 
  480     # Check for fragments...
  481     Mols = [Mol]
  482     if CheckFragments:
  483         Fragments = list(Chem.rdmolops.GetMolFrags(Mol, asMols = True))
  484         if len(Fragments) > 1:
  485             Mols = Fragments
  486     
  487     FragMolFormalCharges = _SetFormalChargesForPsi4XYZFormatString(Mol, Mols, FormalCharge, CheckFragments)
  488     FragMolSpinMultiplicities = _SetSpinMultiplicitiesForPsi4XYZFormatString(Mol, Mols, SpinMultiplicity, CheckFragments)
  489     
  490     # Setup geometry string for Ps4...
  491     GeometryList = []
  492     FragMolCount = 0
  493     
  494     for FragMolIndex, FragMol in enumerate(Mols):
  495         FragMolCount += 1
  496         if FragMolCount > 1:
  497             GeometryList.append("--")
  498         
  499         FragMolFormalCharge = FragMolFormalCharges[FragMolIndex]
  500         FragMolSpinMultiplicity = FragMolSpinMultiplicities[FragMolIndex]
  501         if FragMolFormalCharge is None or FragMolSpinMultiplicity is None:
  502             MiscUtil.PrintInfo("")
  503             MiscUtil.PrintWarning("RDKitUtil.GetPsi4XYZFormatString: Failed to set formal charge and spin multiplicity values. Both formal charge, %s, and spin multiplicity, %s, must be valid values. These values are either specified explicitly or automatically calculated..." % (FragMolFormalCharge, FragMolSpinMultiplicity))
  504         else:
  505             GeometryList.append("%s %s" % (FragMolFormalCharge, FragMolSpinMultiplicity))
  506         
  507         AtomSymbols = GetAtomSymbols(FragMol)
  508         AtomPositions = GetAtomPositions(FragMol, ConfID)
  509         
  510         for AtomSymbol, AtomPosition in zip(AtomSymbols, AtomPositions):
  511             GeometryList.append("%s %s %s %s" % (AtomSymbol, AtomPosition[0], AtomPosition[1], AtomPosition[2]))
  512 
  513     GeometryList.append("units angstrom")
  514     
  515     if not re.match("^auto$", Symmetry, re.I):
  516         Name = 'Symmetry'
  517         if (Mol.HasProp(Name)):
  518             Symmetry =  Mol.GetProp(Name)
  519         GeometryList.append("symmetry %s" % Symmetry)
  520     
  521     if NoCom:
  522         GeometryList.append("no_com")
  523         
  524     if NoReorient:
  525         GeometryList.append("no_reorient")
  526     
  527     Geometry = "\n".join(GeometryList)
  528     
  529     return Geometry
  530 
  531 def _SetFormalChargesForPsi4XYZFormatString(Mol, FragMols, FormalCharge, CheckFragments):
  532     """Setup formal charges for Psi4 XYZ format string. """
  533 
  534     if not CheckFragments:
  535         if re.match("^auto$", FormalCharge, re.I):
  536             MolFormalCharge = GetFormalCharge(Mol)
  537         else:
  538             MolFormalCharge = int(FormalCharge)
  539         return [MolFormalCharge]
  540     
  541     FragMolsCount = len(FragMols)
  542     FormalCharges = [None] * FragMolsCount
  543     
  544     if re.match("^auto$", FormalCharge, re.I):
  545         PropName = "FormalCharge"
  546         if Mol.HasProp(PropName):
  547             FormalCharge = Mol.GetProp(PropName)
  548             FormalChargeWords = FormalCharge.split()
  549             if len(FormalChargeWords) == FragMolsCount:
  550                 FormalCharges = [int(float(FormalCharge)) for FormalCharge in FormalChargeWords]
  551             else:
  552                 MiscUtil.PrintWarning("RDKitUtil.GetPsi4XYZFormatString: Ignoring specified value, %s, for FormalCharge molecule property. The number of space delimted specified values, %s,  must match number of fragments, %s, in the molecule..." % (FormalCharge, len(FormalChargeWords), FragMolsCount))
  553         else:
  554             FormalCharges = [CalculateFormalCharge(FragMol) for FragMol in FragMols]
  555     else:
  556         FormalChargeWords = FormalCharge.split()
  557         if len(FormalChargeWords) != FragMolsCount:
  558             MiscUtil.PrintWarning("RDKitUtil.GetPsi4XYZFormatString: Ignoring specified value, %s, for FormalCharge paramater. The number of space delimted specified values, %s,  must match number of fragments, %s, in the molecule..." % (FormalCharge, len(FormalChargeWords), FragMolsCount))
  559         else:
  560             FormalCharges = [int(FormalCharge) for FormalCharge in FormalChargeWords]
  561 
  562     return FormalCharges
  563 
  564 def _SetSpinMultiplicitiesForPsi4XYZFormatString(Mol, FragMols, SpinMultiplicity, CheckFragments):
  565     """Setup spin multiplicites for Psi4 XYZ format string. """
  566     
  567     if not CheckFragments:
  568         if re.match("^auto$", SpinMultiplicity, re.I):
  569             MolSpinMultiplicity = GetSpinMultiplicity(Mol)
  570         else:
  571             MolSpinMultiplicity = int(SpinMultiplicity)
  572         return [MolSpinMultiplicity]
  573         
  574     FragMolsCount = len(FragMols)
  575     SpinMultiplicities = [None] * FragMolsCount
  576     
  577     if re.match("^auto$", SpinMultiplicity, re.I):
  578         PropName = "SpinMultiplicity"
  579         if Mol.HasProp(PropName):
  580             SpinMultiplicity = Mol.GetProp(PropName)
  581             SpinMultiplicityWords = SpinMultiplicity.split()
  582             if len(SpinMultiplicityWords) == FragMolsCount:
  583                 SpinMultiplicities = [int(float(SpinMultiplicity)) for SpinMultiplicity in SpinMultiplicityWords]
  584             else:
  585                 MiscUtil.PrintWarning("RDKitUtil.GetPsi4XYZFormatString: Ignoring specified value, %s, for SpinMultiplicity molecule property. The number of space delimted specified values, %s,  must match number of fragments, %s, in the molecule..." % (SpinMultiplicity, len(SpinMultiplicityWords), FragMolsCount))
  586         else:
  587             SpinMultiplicities = [CalculateSpinMultiplicity(FragMol) for FragMol in FragMols]
  588     else:
  589         SpinMultiplicityWords = SpinMultiplicity.split()
  590         if len(SpinMultiplicityWords) != FragMolsCount:
  591             MiscUtil.PrintWarning("RDKitUtil.GetPsi4XYZFormatString: Ignoring specified value, %s, for SpinMultiplicity paramater. The number of space delimted specified values, %s,  must match number of fragments, %s, in the molecule..." % (SpinMultiplicity, len(SpinMultiplicityWords), FragMolsCount))
  592         else:
  593             SpinMultiplicities = [int(SpinMultiplicity) for SpinMultiplicity in SpinMultiplicityWords]
  594 
  595     return SpinMultiplicities
  596 
  597 def GetNumFragments(Mol):
  598     """Get number of fragment in a molecule.
  599 
  600     Arguments:
  601         Atom (object): RDKit molecule object.
  602 
  603     Returns:
  604         int : Number of fragments.
  605 
  606     """
  607     
  608     Fragments = Chem.rdmolops.GetMolFrags(Mol, asMols = False)
  609     
  610     return len(Fragments) if Fragments is not None else 0
  611 
  612 def GetNumHeavyAtomNeighbors(Atom):
  613     """Get number of heavy atom neighbors.
  614 
  615     Arguments:
  616         Atom (object): RDKit atom object.
  617 
  618     Returns:
  619         int : Number of neighbors.
  620 
  621     """
  622     
  623     NbrCount = 0
  624     for AtomNbr in Atom.GetNeighbors():
  625         if AtomNbr.GetAtomicNum() > 1:
  626             NbrCount += 1
  627     
  628     return NbrCount
  629 
  630 def GetHeavyAtomNeighbors(Atom):
  631     """Get a list of heavy atom neighbors.
  632 
  633     Arguments:
  634         Atom (object): RDKit atom object.
  635 
  636     Returns:
  637         list : List of heavy atom neighbors.
  638 
  639     """
  640     
  641     AtomNeighbors = []
  642     for AtomNbr in Atom.GetNeighbors():
  643         if AtomNbr.GetAtomicNum() > 1:
  644             AtomNeighbors.append(AtomNbr)
  645     
  646     return AtomNeighbors
  647 
  648 def IsValidElementSymbol(ElementSymbol):
  649     """Validate element symbol.
  650     
  651     Arguments:
  652         ElementSymbol (str): Element symbol
  653 
  654     Returns:
  655         bool : True - Valid element symbol; Otherwise, false. 
  656 
  657     """
  658 
  659     try:
  660         AtomicNumber = Chem.GetPeriodicTable().GetAtomicNumber(ElementSymbol)
  661         Status = True if AtomicNumber > 0  else False
  662     except Exception as ErrMsg:
  663         Status = False
  664     
  665     return Status
  666 
  667 def IsValidAtomIndex(Mol, AtomIndex):
  668     """Validate presence  atom index in a molecule.
  669     
  670     Arguments:
  671         Mol (object): RDKit molecule object.
  672         AtomIndex (int): Atom index.
  673 
  674     Returns:
  675         bool : True - Valid atom index; Otherwise, false. 
  676 
  677     """
  678     for Atom in Mol.GetAtoms():
  679         if AtomIndex == Atom.GetIdx():
  680             return True
  681     
  682     return False
  683 
  684 def AreHydrogensMissingInMolecule(Mol):
  685     """Check for any missing hydrogens in  in a molecue.
  686 
  687     Arguments:
  688         Mol (object): RDKit molecule object.
  689 
  690     Returns:
  691         bool : True - Missing hydrogens; Otherwise, false. 
  692 
  693     """
  694 
  695     for Atom in Mol.GetAtoms():
  696         NumExplicitAndImplicitHs = Atom.GetNumExplicitHs() + Atom.GetNumImplicitHs()
  697         if NumExplicitAndImplicitHs > 0:
  698             return True
  699 
  700     return False
  701 
  702 def AreAtomIndicesSequentiallyConnected(Mol, AtomIndices):
  703     """Check for the presence bonds between sequential pairs of atoms in a
  704     molecule.
  705     
  706     Arguments:
  707         Mol (object): RDKit molecule object.
  708         AtomIndices (list): List of atom indices.
  709 
  710     Returns:
  711         bool : True - Sequentially connected; Otherwise, false. 
  712 
  713     """
  714 
  715     for Index in range(0, (len(AtomIndices) -1)):
  716         Bond = Mol.GetBondBetweenAtoms(AtomIndices[Index], AtomIndices[Index + 1])
  717         if Bond is None:
  718             return False
  719         
  720         if Bond.GetIdx() is None:
  721             return False
  722     
  723     return True
  724     
  725 def ReorderAtomIndicesInSequentiallyConnectedManner(Mol, AtomIndices):
  726     """Check for the presence of sequentially connected list of atoms in an
  727     arbitray list of atoms in molecule.
  728    
  729     Arguments:
  730         Mol (object): RDKit molecule object.
  731         AtomIndices (list): List of atom indices.
  732 
  733     Returns:
  734         bool : True - Sequentially connected list found; Otherwise, false. 
  735         list : List of seqeuntially connected atoms or None.
  736 
  737     """
  738     
  739     # Count the number of neighbors for specified atom indices ensuring
  740     # that the neighbors are also part of atom indices...
  741     AtomNbrsCount = {}
  742     for AtomIndex in AtomIndices:
  743         Atom = Mol.GetAtomWithIdx(AtomIndex)
  744         
  745         AtomNbrsCount[AtomIndex] = 0
  746         for AtomNbr in Atom.GetNeighbors():
  747             AtomNbrIndex = AtomNbr.GetIdx()
  748             if AtomNbrIndex not in AtomIndices:
  749                 continue
  750             AtomNbrsCount[AtomIndex] += 1
  751     
  752     # Number of neighbors for each specified atom indices must be 1 or 2
  753     # for sequentially connected list of atom indices...
  754     AtomsWithOneNbr = []
  755     for AtomIndex, NbrsCount  in AtomNbrsCount.items():
  756         if not (NbrsCount == 1 or NbrsCount ==2):
  757             return (False, None)
  758         
  759         if NbrsCount == 1:
  760             AtomsWithOneNbr.append(AtomIndex)
  761 
  762     # A sequentially connected list of indices must have two atom indices with
  763     # exactly # one neighbor...
  764     if len(AtomsWithOneNbr) != 2:
  765             return (False, None)
  766 
  767     # Setup a reordered list of sequentially connected atoms...
  768     ReorderedAtomIndices = []
  769     
  770     AtomIndex1, AtomIndex2 = AtomsWithOneNbr
  771     AtomIndex = AtomIndex1 if AtomIndex1 < AtomIndex2 else AtomIndex2
  772     ReorderedAtomIndices.append(AtomIndex)
  773 
  774     while (len(ReorderedAtomIndices) < len(AtomIndices)):
  775         Atom = Mol.GetAtomWithIdx(AtomIndex)
  776         
  777         for AtomNbr in Atom.GetNeighbors():
  778             AtomNbrIndex = AtomNbr.GetIdx()
  779             if AtomNbrIndex not in AtomIndices:
  780                 continue
  781             
  782             if AtomNbrIndex in ReorderedAtomIndices:
  783                 continue
  784             
  785             # Treat neighbor as next connected atom...
  786             AtomIndex = AtomNbrIndex
  787             ReorderedAtomIndices.append(AtomIndex)
  788             break
  789 
  790     # Check reorderd list size...
  791     if (len(ReorderedAtomIndices) != len(AtomIndices)):
  792         return (False, None)
  793 
  794     # A final check to validate reorderd list...
  795     if not AreAtomIndicesSequentiallyConnected(Mol, ReorderedAtomIndices):
  796         return (False, None)
  797     
  798     return (True, ReorderedAtomIndices)
  799 
  800 def MolToBase64EncodedMolString(Mol, PropertyPickleFlags = Chem.PropertyPickleOptions.AllProps):
  801     """Encode RDkit molecule object into a base64 encoded string. The properties
  802     can be optionally excluded.
  803     
  804     The molecule is pickled using RDKit Mol.ToBinary() function before
  805     their encoding.
  806    
  807     Arguments:
  808         Mol (object): RDKit molecule object.
  809         PropertyPickleFlags: RDKit property pickle options.
  810 
  811     Returns:
  812         str : Base64 encode molecule string or None.
  813 
  814     Notes:
  815         The following property pickle flags are currently available in RDKit:
  816             
  817             Chem.PropertyPickleOptions.NoProps
  818             Chem.PropertyPickleOptions.MolProps
  819             Chem.PropertyPickleOptions.AtomProps
  820             Chem.PropertyPickleOptions.BondProps
  821             Chem.PropertyPickleOptions.PrivateProps
  822             Chem.PropertyPickleOptions.AllProps
  823 
  824     """
  825 
  826     return None if Mol is None else base64.b64encode(Mol.ToBinary(PropertyPickleFlags)).decode()
  827 
  828 def MolFromBase64EncodedMolString(EncodedMol):
  829     """Generate a RDKit molecule object from a base64 encoded string.
  830     
  831     Arguments:
  832         str: Base64 encoded molecule string.
  833 
  834     Returns:
  835         object : RDKit molecule object or None.
  836 
  837     """
  838 
  839     return None if EncodedMol is None else Chem.Mol(base64.b64decode(EncodedMol))
  840 
  841 def GenerateBase64EncodedMolStrings(Mols, PropertyPickleFlags = Chem.PropertyPickleOptions.AllProps):
  842     """Setup an iterator for generating base64 encoded molecule string
  843     from a RDKit molecule iterator. The iterator returns a list containing
  844     a molecule index and encoded molecule string or None.
  845     
  846     The molecules are pickled using RDKit Mol.ToBinary() function
  847     before their encoding.
  848     
  849     Arguments:
  850         iterator: RDKit molecules iterator.
  851         PropertyFlags: RDKit property pickle options.
  852 
  853     Returns:
  854         object : Base64 endcoded molecules iterator. The iterator returns a
  855             list containing a molecule index and an encoded molecule string
  856             or None.
  857 
  858     Notes:
  859         The following property pickle flags are currently available in RDKit:
  860             
  861             Chem.PropertyPickleOptions.NoProps
  862             Chem.PropertyPickleOptions.MolProps
  863             Chem.PropertyPickleOptions.AtomProps
  864             Chem.PropertyPickleOptions.BondProps
  865             Chem.PropertyPickleOptions.PrivateProps
  866             Chem.PropertyPickleOptions.AllProps
  867 
  868     Examples:
  869 
  870         EncodedMolsInfo = GenerateBase64EncodedMolStrings(Mols)
  871         for MolIndex, EncodedMol in EncodedMolsInfo:
  872             if EncodeMol is not None:
  873                 Mol = MolFromBase64EncodedMolString(EncodedMol)
  874 
  875     """
  876     for MolIndex, Mol in enumerate(Mols):
  877         yield [MolIndex, None] if Mol is None else [MolIndex, MolToBase64EncodedMolString(Mol, PropertyPickleFlags)]
  878 
  879 def GenerateBase64EncodedMolStringsWithIDs(Mols, MolIDs, PropertyPickleFlags = Chem.PropertyPickleOptions.AllProps):
  880     """Setup an iterator for generating base64 encoded molecule string
  881     from a RDKit molecule iterator. The iterator returns a list containing
  882     a molecule ID and encoded molecule string or None.
  883     
  884     The molecules are pickled using RDKit Mol.ToBinary() function
  885     before their encoding.
  886     
  887     Arguments:
  888         iterator: RDKit molecules iterator.
  889         MolIDs (list): Molecule IDs.
  890         PropertyFlags: RDKit property pickle options.
  891 
  892     Returns:
  893         object : Base64 endcoded molecules iterator. The iterator returns a
  894             list containing a molecule ID and an encoded molecule string
  895             or None.
  896 
  897     Notes:
  898         The following property pickle flags are currently available in RDKit:
  899             
  900             Chem.PropertyPickleOptions.NoProps
  901             Chem.PropertyPickleOptions.MolProps
  902             Chem.PropertyPickleOptions.AtomProps
  903             Chem.PropertyPickleOptions.BondProps
  904             Chem.PropertyPickleOptions.PrivateProps
  905             Chem.PropertyPickleOptions.AllProps
  906 
  907     Examples:
  908 
  909         EncodedMolsInfo = GenerateBase64EncodedMolStringsWithIDs(Mols)
  910         for MolID, EncodedMol in EncodedMolsInfo:
  911             if EncodeMol is not None:
  912                 Mol = MolFromBase64EncodedMolString(EncodedMol)
  913 
  914     """
  915     for MolIndex, Mol in enumerate(Mols):
  916         yield [MolIDs[MolIndex], None] if Mol is None else [MolIDs[MolIndex], MolToBase64EncodedMolString(Mol, PropertyPickleFlags)]
  917 
  918 def GenerateBase64EncodedMolStringWithConfIDs(Mol, MolIndex, ConfIDs, PropertyPickleFlags = Chem.PropertyPickleOptions.AllProps):
  919     """Setup an iterator generating base64 encoded molecule string for a 
  920     molecule. The iterator returns a list containing a molecule index, an encoded
  921     molecule string, and conf ID.
  922     
  923     The molecules are pickled using RDKit Mol.ToBinary() function
  924     before their encoding.
  925     
  926     Arguments:
  927         Mol (object): RDKit molecule object.
  928         MolIndex (int): Molecule index.
  929         ConfIDs (list): Conformer IDs.
  930         PropertyFlags: RDKit property pickle options.
  931 
  932     Returns:
  933         object : Base64 endcoded molecules iterator. The iterator returns a
  934             list containing a molecule index, an encoded molecule string, and
  935             conf ID.
  936 
  937     Notes:
  938         The following property pickle flags are currently available in RDKit:
  939             
  940             Chem.PropertyPickleOptions.NoProps
  941             Chem.PropertyPickleOptions.MolProps
  942             Chem.PropertyPickleOptions.AtomProps
  943             Chem.PropertyPickleOptions.BondProps
  944             Chem.PropertyPickleOptions.PrivateProps
  945             Chem.PropertyPickleOptions.AllProps
  946 
  947     Examples:
  948 
  949         EncodedMolsInfo = GenerateBase64EncodedMolStringWithConfIDs(Mol, MolIndex, ConfIDs)
  950         for MolIndex, EncodedMol, ConfID in EncodedMolsInfo:
  951             if EncodeMol is not None:
  952                 Mol = MolFromBase64EncodedMolString(EncodedMol)
  953 
  954     """
  955     for ConfID in ConfIDs:
  956         yield [MolIndex, None, ConfID] if Mol is None else [MolIndex, MolToBase64EncodedMolString(Mol, PropertyPickleFlags), ConfID]
  957 
  958 def AreAtomMapNumbersPresentInMol(Mol):
  959     """Check for the presence of atom map numbers in a molecue.
  960     
  961     Arguments:
  962         Mol (object): RDKit molecule object.
  963 
  964     Returns:
  965         bool : True - Atom map numbers present; Otherwise, false. 
  966 
  967     """
  968 
  969     return False if _GetAtomMapIndices(Mol) is None else True
  970 
  971 def ClearAtomMapNumbers(Mol, AllowImplicitValence = True, ClearRadicalElectrons = True):
  972     """Check and clear atom map numbers in a molecule. In addition, allow implicit
  973     valence and clear radical electrons for atoms with associated map numbers.
  974     
  975     For example, the following atomic properties are assigned by RDKit to atom
  976     map number 1 in a molecule corresponding to SMILES C[C:1](C)C:
  977     
  978     NoImplicit: True; ImplicitValence: 0; ExplicitValence: 3; NumExplicitHs: 0;
  979     NumImplicitHs: 0; NumRadicalElectrons: 1
  980     
  981     This function clears atoms map numbers in the molecule leading to SMILES 
  982     CC(C)C, along with optionally updating atomic properties as shown below:
  983     
  984     NoImplicit: False; ImplicitValence: 1; ExplicitValence: 3; NumExplicitHs: 0;
  985     NumImplicitHs: 1; NumRadicalElectrons: 0
  986     
  987     Arguments:
  988         Mol (object): RDKit molecule object.
  989 
  990     Returns:
  991         Mol (object): RDKit molecule object.
  992 
  993     """
  994     
  995     AtomMapIndices = GetAtomMapIndices(Mol)
  996     
  997     if AtomMapIndices is None:
  998         return Mol
  999     
 1000     for AtomMapIndex in AtomMapIndices:
 1001         Atom = Mol.GetAtomWithIdx(AtomMapIndex)
 1002         
 1003         # Clear map number property 'molAtomMapNumber'...
 1004         Atom.SetAtomMapNum(0)
 1005         
 1006         # Allow implit valence...
 1007         if AllowImplicitValence:
 1008             Atom.SetNoImplicit(False)
 1009         
 1010         # Set number of electrons to 0...
 1011         if ClearRadicalElectrons:
 1012             Atom.SetNumRadicalElectrons(0)
 1013         
 1014         Atom.UpdatePropertyCache()
 1015     
 1016     Mol.UpdatePropertyCache()
 1017 
 1018     return Mol
 1019 
 1020 def GetAtomMapIndices(Mol):
 1021     """Get a list of available atom indices corresponding to atom map numbers
 1022     present in a SMILES/SMARTS pattern used for creating a molecule. The list of
 1023     atom indices is sorted in ascending order by atom map numbers.
 1024     
 1025     Arguments:
 1026         Mol (object): RDKit molecule object.
 1027 
 1028     Returns:
 1029         list : List of atom indices sorted in the ascending order of atom map
 1030             numbers or None.
 1031 
 1032     """
 1033     
 1034     return _GetAtomMapIndices(Mol)
 1035 
 1036 def GetAtomMapIndicesAndMapNumbers(Mol):
 1037     """Get lists of available atom indices and atom map numbers present in a
 1038     SMILES/SMARTS pattern used for creating a molecule. Both lists are sorted
 1039     in ascending order by atom map numbers.
 1040     
 1041     Arguments:
 1042         Mol (object): RDKit molecule object.
 1043 
 1044     Returns:
 1045         list : List of atom indices sorted in the ascending order of atom map
 1046             numbers or None.
 1047         list : List of atom map numbers sorted in the ascending order or None.
 1048 
 1049     """
 1050     
 1051     return (_GetAtomMapIndicesAndMapNumbers(Mol))
 1052 
 1053 def MolFromSubstructureMatch(Mol, PatternMol, AtomIndices, FilterByAtomMapNums = False):
 1054     """Generate a RDKit molecule object for a list of matched atom indices
 1055     present in a pattern molecule. The list of atom indices correspond to a
 1056     list retrieved by RDKit function GetSubstructureMatches using SMILES/SMARTS
 1057     pattern. The atom indices are optionally filtered by mapping atom numbers
 1058     to appropriate atom indices during the generation of the molecule.
 1059     For example: [O:1]=[S:2](=[O])[C:3][C:4].
 1060 
 1061     Arguments:
 1062         Mol (object): RDKit molecule object.
 1063         PatternMol (object): RDKit molecule object for a SMILES/SMARTS pattern.
 1064         AtomIndices (list): Atom indices.
 1065         FilterByAtomMapNums (bool): Filter matches by atom map numbers.
 1066 
 1067     Returns:
 1068         object : RDKit molecule object or None.
 1069 
 1070     """
 1071 
 1072     AtomMapIndices = _GetAtomMapIndices(PatternMol) if FilterByAtomMapNums else None
 1073 
 1074     return (_MolFromSubstructureMatch(Mol, PatternMol, AtomIndices, AtomMapIndices))
 1075 
 1076 def MolsFromSubstructureMatches(Mol, PatternMol, AtomIndicesList, FilterByAtomMapNums = False):
 1077     """Generate  a list of RDKit molecule objects for a list containing lists of
 1078     matched atom indices present in a pattern molecule. The list of atom indices
 1079     correspond to a list retrieved by RDKit function GetSubstructureMatches using
 1080     SMILES/SMARTS pattern. The atom indices are optionally filtered by mapping
 1081     atom numbers to appropriate atom indices during the generation of the
 1082     molecule. For example: [O:1]=[S:2](=[O])[C:3][C:4].
 1083 
 1084     Arguments:
 1085         Mol (object): RDKit molecule object.
 1086         PatternMol (object): RDKit molecule object for a SMILES/SMARTS pattern.
 1087         AtomIndicesList (list): A list of lists containing atom indices.
 1088         FilterByAtomMapNums (bool): Filter matches by atom map numbers.
 1089 
 1090     Returns:
 1091         list : A list of lists containg RDKit molecule objects or None.
 1092 
 1093     """
 1094 
 1095     AtomMapIndices = _GetAtomMapIndices(PatternMol) if FilterByAtomMapNums else None
 1096 
 1097     Mols = []
 1098     for AtomIndices in AtomIndicesList:
 1099         Mols.append(_MolFromSubstructureMatch(Mol, PatternMol, AtomIndices, AtomMapIndices))
 1100     
 1101     return Mols if len(Mols) else None
 1102 
 1103 def FilterSubstructureMatchByAtomMapNumbers(Mol, PatternMol, AtomIndices):
 1104     """Filter a list of matched atom indices by map atom numbers present in a
 1105     pattern molecule. The list of atom indices correspond to a list retrieved by
 1106     RDKit function GetSubstructureMatches using SMILES/SMARTS pattern. The
 1107     atom map numbers are mapped to appropriate atom indices during the generation
 1108     of molecules. For example: [O:1]=[S:2](=[O])[C:3][C:4].
 1109     
 1110     Arguments:
 1111         Mol (object): RDKit molecule object.
 1112         PatternMol (object): RDKit molecule object for a SMILES/SMARTS pattern.
 1113         AtomIndices (list): Atom indices.
 1114 
 1115     Returns:
 1116         list : A list of filtered atom indices.
 1117 
 1118     """
 1119     AtomMapIndices = _GetAtomMapIndices(PatternMol)
 1120 
 1121     return _FilterSubstructureMatchByAtomMapNumbers(Mol, PatternMol, AtomIndices, AtomMapIndices)
 1122 
 1123 def FilterSubstructureMatchesByAtomMapNumbers(Mol, PatternMol, AtomIndicesList):
 1124     """Filter a list of lists containing matched atom indices by map atom numbers
 1125     present in a pattern molecule. The list of atom indices correspond to a list retrieved by
 1126     RDKit function GetSubstructureMatches using SMILES/SMARTS pattern. The
 1127     atom map numbers are mapped to appropriate atom indices during the generation
 1128     of molecules. For example: [O:1]=[S:2](=[O])[C:3][C:4].
 1129      
 1130     Arguments:
 1131         Mol (object): RDKit molecule object.
 1132         PatternMol (object): RDKit molecule object for a SMILES/SMARTS pattern.
 1133         AtomIndicesList (list): A list of lists containing atom indices.
 1134 
 1135     Returns:
 1136         list : A list of lists containing filtered atom indices.
 1137 
 1138     """
 1139     AtomMapIndices = _GetAtomMapIndices(PatternMol)
 1140 
 1141     MatchedAtomIndicesList = []
 1142     for AtomIndices in AtomIndicesList:
 1143         MatchedAtomIndicesList.append(_FilterSubstructureMatchByAtomMapNumbers(Mol, PatternMol, AtomIndices, AtomMapIndices))
 1144     
 1145     return MatchedAtomIndicesList
 1146 
 1147 def _MolFromSubstructureMatch(Mol, PatternMol, AtomIndices, AtomMapIndices):
 1148     """Generate a RDKit molecule object for a list of matched atom indices and available
 1149    atom map indices.
 1150     """
 1151 
 1152     if AtomMapIndices is not None:
 1153         MatchedAtomIndices = [AtomIndices[Index] for Index in AtomMapIndices]
 1154     else:
 1155         MatchedAtomIndices = list(AtomIndices)
 1156 
 1157     return _GetMolFromAtomIndices(Mol, MatchedAtomIndices)
 1158 
 1159 def _GetAtomMapIndices(Mol):
 1160     """Get a list of available atom indices corresponding to sorted atom map
 1161     numbers present in a SMILES/SMARTS pattern used for creating a molecule.
 1162     """
 1163     
 1164     AtomMapIndices, AtomMapNumbers = _GetAtomMapIndicesAndMapNumbers(Mol)
 1165     
 1166     return AtomMapIndices
 1167     
 1168 def _GetAtomMapIndicesAndMapNumbers(Mol):
 1169     """Get a list of available atom indices and atom map numbers present
 1170     in  a SMILES/SMARTS pattern used for creating a molecule. Both lists
 1171     are sorted in ascending order by atom map numbers.
 1172     """
 1173 
 1174     # Setup a atom map number to atom indices map..
 1175     AtomMapNumToIndices = {}
 1176     for Atom in Mol.GetAtoms():
 1177         AtomMapNum = Atom.GetAtomMapNum()
 1178         
 1179         if AtomMapNum:
 1180             AtomMapNumToIndices[AtomMapNum] = Atom.GetIdx()
 1181     
 1182     # Setup atom indices corresponding to sorted atom map numbers...
 1183     AtomMapIndices = None
 1184     AtomMapNumbers = None
 1185     if len(AtomMapNumToIndices):
 1186         AtomMapNumbers = sorted(AtomMapNumToIndices)
 1187         AtomMapIndices = [AtomMapNumToIndices[AtomMapNum] for AtomMapNum in AtomMapNumbers]
 1188 
 1189     return (AtomMapIndices, AtomMapNumbers)
 1190 
 1191 def _FilterSubstructureMatchByAtomMapNumbers(Mol, PatternMol, AtomIndices, AtomMapIndices):
 1192     """Filter substructure match atom indices by atom map indices corresponding to
 1193     atom map numbers.
 1194     """
 1195     
 1196     if AtomMapIndices is None:
 1197         return list(AtomIndices)
 1198                                                
 1199     return [AtomIndices[Index] for Index in AtomMapIndices]
 1200 
 1201 def _GetMolFromAtomIndices(Mol, AtomIndices):
 1202     """Generate a RDKit molecule object from atom indices returned by
 1203    substructure search.
 1204     """
 1205 
 1206     BondIndices = []
 1207     for AtomIndex in AtomIndices:
 1208         Atom = Mol.GetAtomWithIdx(AtomIndex)
 1209         
 1210         for AtomNbr in Atom.GetNeighbors():
 1211             AtomNbrIndex = AtomNbr.GetIdx()
 1212             if AtomNbrIndex not in AtomIndices:
 1213                 continue
 1214             
 1215             BondIndex = Mol.GetBondBetweenAtoms(AtomIndex, AtomNbrIndex).GetIdx()
 1216             if BondIndex in BondIndices:
 1217                 continue
 1218                 
 1219             BondIndices.append(BondIndex)
 1220             
 1221     MatchedMol = Chem.PathToSubmol(Mol, BondIndices) if len(BondIndices) else None
 1222     
 1223     return MatchedMol
 1224 
 1225 def ConstrainAndEmbed(mol, core, coreMatchesMol=None, useTethers=True, coreConfId=-1, randomseed=2342, getForceField=AllChem.UFFGetMoleculeForceField, **kwargs):
 1226     """
 1227     The function is a local copy of RDKit fucntion AllChem.ConstrainedEmbed().
 1228     It has been enhanced to support an explicit list of core matches corresponding
 1229     to the matched atom indices in the molecule. The number of matched atom indices
 1230     must be equal to the number of atoms in core molecule.
 1231 
 1232     Arguments:
 1233         mol (object): RDKit molecule object to embed.
 1234         core (object): RDKit molecule to use as a source of constraints.
 1235         coreMatchesMol (list): A list matches atom indices in mol.
 1236         useTethers: (bool) if True, the final conformation will be optimized
 1237             subject to a series of extra forces that pull the matching atoms to
 1238             the positions of the core atoms. Otherwise simple distance
 1239             constraints based on the core atoms will be used in the
 1240             optimization.
 1241         coreConfId (int): ID of the core conformation to use.
 1242         randomSeed (int): Seed for the random number generator
 1243 
 1244     Returns:
 1245         mol (object): RDKit molecule object.
 1246 
 1247     """
 1248     if coreMatchesMol is None:
 1249         match = mol.GetSubstructMatch(core)
 1250         if not match:
 1251             raise ValueError("Molecule doesn't match the core.")
 1252     else:
 1253         if core.GetNumAtoms() != len(coreMatchesMol):
 1254             raise ValueError("Number of atoms, %s, in core molecule must match number of atom indices, %s, specified in the list coreMatchesMol." % (core.GetNumAtoms(), len(coreMatchesMol)))
 1255         # Check specified matched atom indices in  coreMatchesMol and use the match atom
 1256         # indices returned by GetSubstructMatches() for embedding...
 1257         coreMatch = None
 1258         matches = mol.GetSubstructMatches(core)
 1259         for match in matches:
 1260             if len(match) != len(coreMatchesMol):
 1261                 continue
 1262             matchFound = True
 1263             for atomIndex in match:
 1264                 if atomIndex not in coreMatchesMol:
 1265                     matchFound = False
 1266                     break
 1267             if matchFound:
 1268                 coreMatch = match
 1269                 break
 1270         if coreMatch is None:
 1271             raise ValueError("Molecule doesn't match the atom indices specified in the list coreMatchesMol.")
 1272         match = coreMatch
 1273     
 1274     coordMap = {}
 1275     coreConf = core.GetConformer(coreConfId)
 1276     for i, idxI in enumerate(match):
 1277         corePtI = coreConf.GetAtomPosition(i)
 1278         coordMap[idxI] = corePtI
 1279     
 1280     ci = AllChem.EmbedMolecule(mol, coordMap=coordMap, randomSeed=randomseed, **kwargs)
 1281     if ci < 0:
 1282         raise ValueError('Could not embed molecule.')
 1283     
 1284     algMap = [(j, i) for i, j in enumerate(match)]
 1285     
 1286     if not useTethers:
 1287         # clean up the conformation
 1288         ff = getForceField(mol, confId=0)
 1289         for i, idxI in enumerate(match):
 1290             for j in range(i + 1, len(match)):
 1291                 idxJ = match[j]
 1292                 d = coordMap[idxI].Distance(coordMap[idxJ])
 1293                 ff.AddDistanceConstraint(idxI, idxJ, d, d, 100.)
 1294         ff.Initialize()
 1295         n = 4
 1296         more = ff.Minimize()
 1297         while more and n:
 1298             more = ff.Minimize()
 1299             n -= 1
 1300         # rotate the embedded conformation onto the core:
 1301         rms = AllChem.AlignMol(mol, core, atomMap=algMap)
 1302     else:
 1303         # rotate the embedded conformation onto the core:
 1304         rms = AllChem.AlignMol(mol, core, atomMap=algMap)
 1305         ff = getForceField(mol, confId=0)
 1306         conf = core.GetConformer()
 1307         for i in range(core.GetNumAtoms()):
 1308             p = conf.GetAtomPosition(i)
 1309             pIdx = ff.AddExtraPoint(p.x, p.y, p.z, fixed=True) - 1
 1310             ff.AddDistanceConstraint(pIdx, match[i], 0, 0, 100.)
 1311         ff.Initialize()
 1312         n = 4
 1313         more = ff.Minimize(energyTol=1e-4, forceTol=1e-3)
 1314         while more and n:
 1315             more = ff.Minimize(energyTol=1e-4, forceTol=1e-3)
 1316             n -= 1
 1317         # realign
 1318         rms = AllChem.AlignMol(mol, core, atomMap=algMap)
 1319 
 1320     mol.SetProp('EmbedRMS', str(rms))
 1321     return mol
 1322 
 1323 def ReadAndValidateMolecules(FileName, **KeyWordArgs):
 1324     """Read molecules from an input file, validate all molecule objects, and return
 1325     a list of valid and non-valid molecule objects along with their counts.
 1326     
 1327     Arguments:
 1328         FileName (str): Name of a file with complete path.
 1329         **KeyWordArgs (dictionary) : Parameter name and value pairs for reading and
 1330             processing molecules.
 1331 
 1332     Returns:
 1333         list : List of valid RDKit molecule objects.
 1334         int : Number of total molecules in input file. 
 1335         int : Number of valid molecules in input file. 
 1336 
 1337     Notes:
 1338         The file extension is used to determine type of the file and set up an appropriate
 1339         file reader.
 1340 
 1341     """
 1342 
 1343     AllowEmptyMols = True
 1344     if "AllowEmptyMols" in KeyWordArgs:
 1345         AllowEmptyMols = KeyWordArgs["AllowEmptyMols"]
 1346     
 1347     Mols = ReadMolecules(FileName, **KeyWordArgs)
 1348 
 1349     if AllowEmptyMols:
 1350         ValidMols = [Mol for Mol in Mols if Mol is not None]
 1351     else:
 1352         ValidMols = []
 1353         MolCount = 0
 1354         for Mol in Mols:
 1355             MolCount += 1
 1356             if Mol is None:
 1357                 continue
 1358             
 1359             if IsMolEmpty(Mol):
 1360                 MolName = GetMolName(Mol, MolCount)
 1361                 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 1362                 continue
 1363             
 1364             ValidMols.append(Mol)
 1365             
 1366     MolCount = len(Mols)
 1367     ValidMolCount = len(ValidMols)
 1368 
 1369     return (ValidMols, MolCount, ValidMolCount)
 1370 
 1371 def ReadMolecules(FileName, **KeyWordArgs):
 1372     """Read molecules from an input file without performing any validation
 1373     and creation of molecule objects.
 1374     
 1375     Arguments:
 1376         FileName (str): Name of a file with complete path.
 1377         **KeyWordArgs (dictionary) : Parameter name and value pairs for reading and
 1378             processing molecules.
 1379 
 1380     Returns:
 1381         list : List of RDKit molecule objects.
 1382 
 1383     Notes:
 1384         The file extension is used to determine type of the file and set up an appropriate
 1385         file reader.
 1386 
 1387     """
 1388 
 1389     # Set default values for possible arguments...
 1390     ReaderArgs = {"Sanitize": True, "RemoveHydrogens": True, "StrictParsing": True,  "SMILESDelimiter" : ' ', "SMILESColumn": 1, "SMILESNameColumn": 2, "SMILESTitleLine": True }
 1391 
 1392     # Set specified values for possible arguments...
 1393     for Arg in ReaderArgs:
 1394         if Arg in KeyWordArgs:
 1395             ReaderArgs[Arg] = KeyWordArgs[Arg]
 1396 
 1397     # Modify specific valeus for SMILES...
 1398     if MiscUtil.CheckFileExt(FileName, "smi csv tsv txt"):
 1399         Args = ["Sanitize", "SMILESTitleLine"]
 1400         for Arg in Args:
 1401             if ReaderArgs[Arg] is True:
 1402                 ReaderArgs[Arg] = 1
 1403             else:
 1404                 ReaderArgs[Arg] = 0
 1405     
 1406     Mols = []
 1407     if MiscUtil.CheckFileExt(FileName, "sdf sd"):
 1408         return ReadMoleculesFromSDFile(FileName, ReaderArgs["Sanitize"], ReaderArgs["RemoveHydrogens"], ReaderArgs['StrictParsing'])
 1409     elif MiscUtil.CheckFileExt(FileName, "mol"):
 1410         return ReadMoleculesFromMolFile(FileName, ReaderArgs["Sanitize"], ReaderArgs["RemoveHydrogens"], ReaderArgs['StrictParsing'])
 1411     elif MiscUtil.CheckFileExt(FileName, "mol2"):
 1412         return ReadMoleculesFromMol2File(FileName, ReaderArgs["Sanitize"], ReaderArgs["RemoveHydrogens"])
 1413     elif MiscUtil.CheckFileExt(FileName, "pdb"):
 1414         return ReadMoleculesFromPDBFile(FileName, ReaderArgs["Sanitize"], ReaderArgs["RemoveHydrogens"])
 1415     elif MiscUtil.CheckFileExt(FileName, "smi txt csv tsv"):
 1416         SMILESColumnIndex = ReaderArgs["SMILESColumn"] - 1
 1417         SMILESNameColumnIndex = ReaderArgs["SMILESNameColumn"] - 1
 1418         return ReadMoleculesFromSMILESFile(FileName, ReaderArgs["SMILESDelimiter"], SMILESColumnIndex, SMILESNameColumnIndex, ReaderArgs["SMILESTitleLine"], ReaderArgs["Sanitize"])
 1419     else:
 1420         MiscUtil.PrintWarning("RDKitUtil.ReadMolecules: Non supported file type: %s" % FileName)
 1421     
 1422     return Mols
 1423 
 1424 def ReadMoleculesFromSDFile(FileName, Sanitize = True, RemoveHydrogens = True, StrictParsing = True):
 1425     """Read molecules from a SD file.
 1426     
 1427     Arguments:
 1428         FileName (str): Name of a file with complete path.
 1429         Sanitize (bool): Sanitize molecules.
 1430         RemoveHydrogens (bool): Remove hydrogens from molecules.
 1431         StrictParsing (bool): Perform strict parsing.
 1432 
 1433     Returns:
 1434         list : List of RDKit molecule objects.
 1435 
 1436     """
 1437     return  Chem.SDMolSupplier(FileName, sanitize = Sanitize, removeHs = RemoveHydrogens, strictParsing = StrictParsing)
 1438 
 1439 def ReadMoleculesFromMolFile(FileName, Sanitize = True, RemoveHydrogens = True, StrictParsing = True):
 1440     """Read molecule from a MDL Mol file.
 1441     
 1442     Arguments:
 1443         FileName (str): Name of a file with complete path.
 1444         Sanitize (bool): Sanitize molecules.
 1445         RemoveHydrogens (bool): Remove hydrogens from molecules.
 1446         StrictParsing (bool): Perform strict parsing.
 1447 
 1448     Returns:
 1449         list : List of RDKit molecule objects.
 1450 
 1451     """
 1452     
 1453     Mols = []
 1454     Mols.append(Chem.MolFromMolFile(FileName, sanitize = Sanitize, removeHs = RemoveHydrogens, strictParsing = StrictParsing))
 1455     return Mols
 1456 
 1457 
 1458 def ReadMoleculesFromMol2File(FileName, Sanitize = True, RemoveHydrogens = True):
 1459     """Read molecules from a Tripos Mol2  file. The first call to the function
 1460     creates and returns  a generator object using Python yield statement. The
 1461     molecules are created during the subsequent iteration by the generator object.
 1462 
 1463     Arguments:
 1464         FileName (str): Name of a file with complete path.
 1465         Sanitize (bool): Sanitize molecules.
 1466         RemoveHydrogens (bool): Remove hydrogens from molecules.
 1467 
 1468     Returns:
 1469         list : A Python generator object for iterating over the molecules.
 1470 
 1471     """
 1472 
 1473     return _Mol2MolSupplier(FileName, Sanitize, RemoveHydrogens)
 1474 
 1475 def _Mol2MolSupplier(FileName, Sanitize = True, RemoveHydrogens = True):
 1476     """Read molecules from a Tripos Mol2  file."""
 1477 
 1478     fh = open(FileName, 'r')
 1479     
 1480     FirstMol = True
 1481     ProcessingMol = False
 1482     
 1483     for Line in fh:
 1484         if re.match("^#", Line, re.I):
 1485             continue
 1486         
 1487         if re.match("^@<TRIPOS>MOLECULE", Line, re.I):
 1488             ProcessingMol = True
 1489             
 1490             if FirstMol:
 1491                 FirstMol = False
 1492                 
 1493                 MolLines = []
 1494                 MolLines.append(Line)
 1495                 continue
 1496             
 1497             # Process lines for existing molecule...
 1498             MolBlock = "".join(MolLines)
 1499             
 1500             Mol = Chem.MolFromMol2Block(MolBlock, sanitize = Sanitize, removeHs = RemoveHydrogens)
 1501             yield Mol
 1502             
 1503             # Track lines for next molecule...
 1504             MolLines = []
 1505             MolLines.append(Line)
 1506             continue
 1507         
 1508         if not ProcessingMol:
 1509             continue
 1510         
 1511         MolLines.append(Line)
 1512         
 1513     fh.close
 1514 
 1515     # Process last molecule...
 1516     if len(MolLines):
 1517         MolBlock = "".join(MolLines)
 1518         Mol = Chem.MolFromMol2Block(MolBlock, sanitize = Sanitize, removeHs = RemoveHydrogens)
 1519         yield Mol
 1520     
 1521 def ReadMoleculesFromPDBFile(FileName, Sanitize = True, RemoveHydrogens = True):
 1522     """Read molecule from a PDB  file.
 1523     
 1524     Arguments:
 1525         FileName (str): Name of a file with complete path.
 1526         Sanitize (bool): Sanitize molecules.
 1527         RemoveHydrogens (bool): Remove hydrogens from molecules.
 1528 
 1529     Returns:
 1530         list : List of RDKit molecule objects.
 1531 
 1532     """
 1533     
 1534     Mols = []
 1535     Mols.append(Chem.MolFromPDBFile(FileName,  sanitize = Sanitize, removeHs = RemoveHydrogens))
 1536     return Mols
 1537 
 1538 def ReadMoleculesFromSMILESFile(FileName, SMILESDelimiter = ' ', SMILESColIndex = 0, SMILESNameColIndex = 1, SMILESTitleLine = 1, Sanitize = 1):
 1539     """Read molecules from a SMILES file.
 1540     
 1541     Arguments:
 1542         SMILESDelimiter (str): Delimiter for parsing SMILES line
 1543         SMILESColIndex (int): Column index containing SMILES string.
 1544         SMILESNameColIndex (int): Column index containing molecule name.
 1545         SMILESTitleLine (int): Flag to indicate presence of title line.
 1546         Sanitize (int): Sanitize molecules.
 1547 
 1548     Returns:
 1549         list : List of RDKit molecule objects.
 1550 
 1551     """
 1552     
 1553     return  Chem.SmilesMolSupplier(FileName, delimiter = SMILESDelimiter, smilesColumn = SMILESColIndex, nameColumn = SMILESNameColIndex, titleLine = SMILESTitleLine, sanitize = Sanitize)
 1554 
 1555 def MoleculesWriter(FileName, **KeyWordArgs):
 1556     """Set up a molecule writer.
 1557     
 1558     Arguments:
 1559         FileName (str): Name of a file with complete path.
 1560         **KeyWordArgs (dictionary) : Parameter name and value pairs for writing and
 1561             processing molecules.
 1562 
 1563     Returns:
 1564         RDKit object : Molecule writer.
 1565 
 1566     Notes:
 1567         The file extension is used to determine type of the file and set up an appropriate
 1568         file writer.
 1569 
 1570     """
 1571     
 1572     # Set default values for possible arguments...
 1573     WriterArgs = {"Compute2DCoords" : False, "Kekulize": True, "ForceV3000": False, "SMILESKekulize": False, "SMILESDelimiter" : ' ', "SMILESIsomeric": True, "SMILESTitleLine": True, "SMILESMolName": True}
 1574 
 1575     # Set specified values for possible arguments...
 1576     for Arg in WriterArgs:
 1577         if Arg in KeyWordArgs:
 1578             WriterArgs[Arg] = KeyWordArgs[Arg]
 1579     
 1580     Writer = None
 1581     if MiscUtil.CheckFileExt(FileName, "sdf sd"):
 1582         Writer = Chem.SDWriter(FileName)
 1583         Writer.SetKekulize(WriterArgs["Kekulize"])
 1584         Writer.SetForceV3000(WriterArgs["ForceV3000"])
 1585     elif MiscUtil.CheckFileExt(FileName, "pdb"):
 1586         Writer = Chem.PDBWriter(FileName)
 1587     elif MiscUtil.CheckFileExt(FileName, "smi"):
 1588         # Text for the name column in the title line. Blank indicates not to include name column
 1589         # in the output file...
 1590         NameHeader = 'Name' if WriterArgs["SMILESMolName"] else ''
 1591         Writer = Chem.SmilesWriter(FileName, delimiter = WriterArgs["SMILESDelimiter"], nameHeader = NameHeader, includeHeader = WriterArgs["SMILESTitleLine"],  isomericSmiles = WriterArgs["SMILESIsomeric"], kekuleSmiles = WriterArgs["SMILESKekulize"])
 1592     else:
 1593         MiscUtil.PrintWarning("RDKitUtil.WriteMolecules: Non supported file type: %s" % FileName)
 1594     
 1595     return Writer
 1596     
 1597 def WriteMolecules(FileName, Mols, **KeyWordArgs):
 1598     """Write molecules to an output file.
 1599     
 1600     Arguments:
 1601         FileName (str): Name of a file with complete path.
 1602         Mols (list): List of RDKit molecule objects. 
 1603         **KeyWordArgs (dictionary) : Parameter name and value pairs for writing and
 1604             processing molecules.
 1605 
 1606     Returns:
 1607         int : Number of total molecules.
 1608         int : Number of processed molecules written to output file.
 1609 
 1610     Notes:
 1611         The file extension is used to determine type of the file and set up an appropriate
 1612         file writer.
 1613 
 1614     """
 1615     
 1616     Compute2DCoords = False
 1617     if "Compute2DCoords" in KeyWordArgs:
 1618         Compute2DCoords = KeyWordArgs["Compute2DCoords"]
 1619     
 1620     SetSMILESMolProps = KeyWordArgs["SetSMILESMolProps"] if "SetSMILESMolProps" in KeyWordArgs else False
 1621         
 1622     MolCount = 0
 1623     ProcessedMolCount = 0
 1624     
 1625     Writer = MoleculesWriter(FileName, **KeyWordArgs)
 1626     
 1627     if Writer is None:
 1628         return (MolCount, ProcessedMolCount)
 1629     
 1630     FirstMol = True
 1631     for Mol in Mols:
 1632         MolCount += 1
 1633         if Mol is None:
 1634             continue
 1635 
 1636         if FirstMol:
 1637             FirstMol = False
 1638             if SetSMILESMolProps:
 1639                 SetWriterMolProps(Writer, Mol)
 1640                 
 1641         ProcessedMolCount += 1
 1642         if Compute2DCoords:
 1643             AllChem.Compute2DCoords(Mol)
 1644         
 1645         Writer.write(Mol)
 1646     
 1647     Writer.close()
 1648     
 1649     return (MolCount, ProcessedMolCount)
 1650 
 1651 def SetWriterMolProps(Writer, Mol):
 1652     """Setup molecule properties for a writer to output.
 1653     
 1654     Arguments:
 1655         Writer (object): RDKit writer object.
 1656         Mol (object): RDKit molecule object.
 1657 
 1658     Returns:
 1659         object : Writer object.
 1660 
 1661     """
 1662     PropNames = list(Mol.GetPropNames())
 1663     if len(PropNames):
 1664         Writer.SetProps(PropNames)
 1665         
 1666     return Writer
 1667