1 #!/bin/env python 2 # File: RDKitUtil.py 3 # Author: Manish Sud <msud@san.rr.com> 4 # 5 # Copyright (C) 2024 Manish Sud. All rights reserved. 6 # 7 # The functionality available in this file is implemented using RDKit, an 8 # open source toolkit for cheminformatics developed by Greg Landrum. 9 # 10 # This file is part of MayaChemTools. 11 # 12 # MayaChemTools is free software; you can redistribute it and/or modify it under 13 # the terms of the GNU Lesser General Public License as published by the Free 14 # Software Foundation; either version 3 of the License, or (at your option) any 15 # later version. 16 # 17 # MayaChemTools is distributed in the hope that it will be useful, but without 18 # any warranty; without even the implied warranty of merchantability of fitness 19 # for a particular purpose. See the GNU Lesser General Public License for more 20 # details. 21 # 22 # You should have received a copy of the GNU Lesser General Public License 23 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 24 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 25 # Boston, MA, 02111-1307, USA. 26 # 27 28 from __future__ import print_function 29 30 import os 31 import sys 32 import re 33 import base64 34 import pickle 35 36 from rdkit import Chem 37 from rdkit.Chem import AllChem 38 from rdkit.Chem import Draw 39 40 import MiscUtil 41 42 __all__ = ["AreAtomIndicesSequentiallyConnected", "AreAtomMapNumbersPresentInMol", "AreHydrogensMissingInMolecule", "CalculateFormalCharge", "CalculateSpinMultiplicity", "ClearAtomMapNumbers", "ConstrainAndEmbed", "FilterSubstructureMatchByAtomMapNumbers", "FilterSubstructureMatchesByAtomMapNumbers", "GetAtomIndices", "GetAtomMapIndices", "GetAtomMapIndicesAndMapNumbers", "GetAtomSymbols", "GetAtomPositions", "GetFormalCharge", "GetHeavyAtomNeighbors", "GetInlineSVGForMolecule", "GetInlineSVGForMolecules", "GetMolName", "GetNumFragments", "GetNumHeavyAtomNeighbors", "GetSpinMultiplicity", "GetSVGForMolecule", "GetSVGForMolecules", "GetPsi4XYZFormatString", "GenerateBase64EncodedMolStrings", "GenerateBase64EncodedMolStringWithConfIDs", "IsAtomSymbolPresentInMol", "IsMolEmpty", "IsValidElementSymbol", "IsValidAtomIndex", "MolFromBase64EncodedMolString", "GenerateBase64EncodedMolStringsWithIDs", "MolToBase64EncodedMolString", "MolFromSubstructureMatch", "MolsFromSubstructureMatches", "ReadMolecules", "ReadAndValidateMolecules", "ReadMoleculesFromSDFile", "ReadMoleculesFromMolFile", "ReadMoleculesFromMol2File", "ReadMoleculesFromPDBFile", "ReadMoleculesFromSMILESFile", "ReorderAtomIndicesInSequentiallyConnectedManner", "SetAtomPositions", "SetWriterMolProps", "ValidateElementSymbols", "WriteMolecules"] 43 44 def GetMolName(Mol, MolNum = None): 45 """Get molecule name. 46 47 Arguments: 48 Mol (object): RDKit molecule object. 49 MolNum (int or None): Molecule number in input file. 50 51 Returns: 52 str : Molname corresponding to _Name property of a molecule, generated 53 from specieid MolNum using the format "Mol%d" % MolNum, or an 54 empty string. 55 56 """ 57 58 MolName = '' 59 if Mol.HasProp("_Name"): 60 MolName = Mol.GetProp("_Name") 61 62 if not len(MolName): 63 if MolNum is not None: 64 MolName = "Mol%d" % MolNum 65 66 return MolName 67 68 def GetInlineSVGForMolecule(Mol, Width, Height, Legend = None, AtomListToHighlight = None, BondListToHighlight = None, BoldText = True, Base64Encoded = True): 69 """Get SVG image text for a molecule suitable for inline embedding into a HTML page. 70 71 Arguments: 72 Mol (object): RDKit molecule object. 73 Width (int): Width of a molecule image in pixels. 74 Height (int): Height of a molecule image in pixels. 75 Legend (str): Text to display under the image. 76 AtomListToHighlight (list): List of atoms to highlight. 77 BondListToHighlight (list): List of bonds to highlight. 78 BoldText (bool): Flag to make text bold in the image of molecule. 79 Base64Encoded (bool): Flag to return base64 encoded string. 80 81 Returns: 82 str : SVG image text for inline embedding into a HTML page using "img" 83 tag: <img src="data:image/svg+xml;charset=UTF-8,SVGImageText> or 84 tag: <img src="> 85 86 """ 87 88 SVGText = GetSVGForMolecule(Mol, Width, Height, Legend, AtomListToHighlight, BondListToHighlight, BoldText) 89 return _ModifySVGForInlineEmbedding(SVGText, Base64Encoded) 90 91 def GetInlineSVGForMolecules(Mols, MolsPerRow, MolWidth, MolHeight, Legends = None, AtomListsToHighlight = None, BondListsToHighLight = None, BoldText = True, Base64Encoded = True): 92 """Get SVG image text for molecules suitable for inline embedding into a HTML page. 93 94 Arguments: 95 Mols (list): List of RDKit molecule objects. 96 MolsPerRow (int): Number of molecules per row. 97 Width (int): Width of a molecule image in pixels. 98 Height (int): Height of a molecule image in pixels. 99 Legends (list): List containing strings to display under images. 100 AtomListsToHighlight (list): List of lists containing atoms to highlight 101 for molecules. 102 BondListsToHighlight (list): List of lists containing bonds to highlight 103 for molecules 104 BoldText (bool): Flag to make text bold in the image of molecules. 105 Base64Encoded (bool): Flag to return base64 encoded string. 106 107 Returns: 108 str : SVG image text for inline embedding into a HTML page using "img" 109 tag: <img src="data:image/svg+xml;charset=UTF-8,SVGImageText> or 110 tag: <img src="> 111 112 """ 113 114 SVGText = GetSVGForMolecules(Mols, MolsPerRow, MolWidth, MolHeight, Legends, AtomListsToHighlight, BondListsToHighLight, BoldText) 115 return _ModifySVGForInlineEmbedding(SVGText, Base64Encoded) 116 117 def _ModifySVGForInlineEmbedding(SVGText, Base64Encoded): 118 """Modify SVG for inline embedding into a HTML page using "img" tag 119 along with performing base64 encoding. 120 """ 121 122 # Take out all tags till the start of '<svg' tag... 123 Pattern = re.compile("^.*<svg", re.I | re.S) 124 SVGText = Pattern.sub("<svg", SVGText) 125 126 # Add an extra space before the "width=..." tag. Otherwise, inline embedding may 127 # cause the following XML error on some browsers due to start of the "width=..." 128 # at the begining of the line in <svg ...> tag: 129 # 130 # XML5607: Whitespace expected. 131 # 132 SVGText = re.sub("width='", " width='", SVGText, flags = re.I) 133 134 # Take out trailing new line... 135 SVGText = SVGText.strip() 136 137 # Perform base64 encoding by turning text into byte stream using string 138 # encode and transform byte stream returned by b64encode into a string 139 # by string decode... 140 # 141 if Base64Encoded: 142 SVGText = base64.b64encode(SVGText.encode()).decode() 143 144 return SVGText 145 146 def GetSVGForMolecule(Mol, Width, Height, Legend = None, AtomListToHighlight = None, BondListToHighlight = None, BoldText = True): 147 """Get SVG image text for a molecule suitable for viewing in a browser. 148 149 Arguments: 150 Mol (object): RDKit molecule object. 151 Width (int): Width of a molecule image in pixels. 152 Height (int): Height of a molecule image in pixels. 153 Legend (str): Text to display under the image. 154 AtomListToHighlight (list): List of atoms to highlight. 155 BondListToHighlight (list): List of bonds to highlight. 156 BoldText (bool): Flag to make text bold in the image of molecule. 157 158 Returns: 159 str : SVG image text for writing to a SVG file for viewing in a browser. 160 161 """ 162 163 Mols = [Mol] 164 165 MolsPerRow = 1 166 MolWidth = Width 167 MolHeight = Height 168 169 Legends = [Legend] if Legend is not None else None 170 AtomListsToHighlight = [AtomListToHighlight] if AtomListToHighlight is not None else None 171 BondListsToHighLight = [BondListsToHighLight] if BondListToHighlight is not None else None 172 173 return GetSVGForMolecules(Mols, MolsPerRow, MolWidth, MolHeight, Legends, AtomListsToHighlight, BondListsToHighLight, BoldText) 174 175 def GetSVGForMolecules(Mols, MolsPerRow, MolWidth, MolHeight, Legends = None, AtomListsToHighlight = None, BondListsToHighlight = None, BoldText = True): 176 """Get SVG image text for molecules suitable for viewing in a browser. 177 178 Arguments: 179 Mols (list): List of RDKit molecule objects. 180 MolsPerRow (int): Number of molecules per row. 181 Width (int): Width of a molecule image in pixels. 182 Height (int): Height of a molecule image in pixels. 183 Legends (list): List containing strings to display under images. 184 AtomListsToHighlight (list): List of lists containing atoms to highlight 185 for molecules. 186 BondListsToHighlight (list): List of lists containing bonds to highlight 187 for molecules 188 BoldText (bool): Flag to make text bold in the image of molecules. 189 190 Returns: 191 str : SVG image text for writing to a SVG file for viewing in a browser. 192 193 """ 194 195 SVGText = Draw.MolsToGridImage(Mols, molsPerRow = MolsPerRow, subImgSize = (MolWidth,MolHeight), legends = Legends, highlightAtomLists = AtomListsToHighlight, highlightBondLists = BondListsToHighlight, useSVG = True) 196 197 return _ModifySVGForBrowserViewing(SVGText, BoldText) 198 199 def _ModifySVGForBrowserViewing(SVGText, BoldText = True): 200 """Modify SVG for loading into a browser.""" 201 202 # It appears that the string 'xmlns:svg' needs to be replaced with 'xmlns' in the 203 # SVG image string generated by older versions of RDKit. Otherwise, the image 204 # doesn't load in web browsers. 205 # 206 if re.search("xmlns:svg", SVGText, re.I): 207 SVGText = re.sub("xmlns:svg", "xmlns", SVGText, flags = re.I) 208 209 # Make text bold... 210 if BoldText: 211 SVGText = re.sub("font-weight:normal;", "font-weight:bold;", SVGText, flags = re.I) 212 213 return SVGText 214 215 def IsMolEmpty(Mol): 216 """Check for the presence of atoms in a molecule. 217 218 Arguments: 219 Mol (object): RDKit molecule object. 220 221 Returns: 222 bool : True - No atoms in molecule; Otherwise, false. 223 224 """ 225 226 Status = False if Mol.GetNumAtoms() else True 227 228 return Status 229 230 def IsAtomSymbolPresentInMol(Mol, AtomSymbol, IgnoreCase = True): 231 """ Check for the presence of an atom symbol in a molecule. 232 233 Arguments: 234 Mol (object): RDKit molecule object. 235 AtomSymbol (str): Atom symbol. 236 237 Returns: 238 bool : True - Atom symbol in molecule; Otherwise, false. 239 240 """ 241 242 for Atom in Mol.GetAtoms(): 243 Symbol = Atom.GetSymbol() 244 if IgnoreCase: 245 if re.match("^%s$" % AtomSymbol, Symbol, re.I): 246 return True 247 else: 248 if re.match("^%s$" % AtomSymbol, Symbol): 249 return True 250 251 return False 252 253 def ValidateElementSymbols(ElementSymbols): 254 """Validate element symbols. 255 256 Arguments: 257 ElementSymbols (list): List of element symbols to validate. 258 259 Returns: 260 bool : True - All element symbols are valid; Otherwise, false. 261 262 """ 263 for ElementSymbol in ElementSymbols: 264 if not IsValidElementSymbol(ElementSymbol): 265 return False 266 267 return True 268 269 def GetAtomPositions(Mol, ConfID = -1): 270 """Retrieve a list of lists containing coordinates of all atoms in a 271 molecule. 272 273 Arguments: 274 Mol (object): RDKit molecule object. 275 ConfID (int): Conformer number. 276 277 Returns: 278 list : List of lists containing atom positions. 279 280 Examples: 281 282 for AtomPosition in RDKitUtil.GetAtomPositions(Mol): 283 print("X: %s; Y: %s; Z: %s" % (AtomPosition[0], AtomPosition[1], AtomPosition[2])) 284 285 """ 286 287 return Mol.GetConformer(id = ConfID).GetPositions().tolist() 288 289 def SetAtomPositions(Mol, AtomPositions, ConfID = -1): 290 """Set atom positions of all atoms in a molecule. 291 292 Arguments: 293 Mol (object): RDKit molecule object. 294 AtomPositions (object): List of lists containing atom positions. 295 ConfID (int): Conformer number. 296 297 Returns: 298 object : RDKit molecule object. 299 300 """ 301 302 MolConf = Mol.GetConformer(ConfID) 303 304 for Index in range(len(AtomPositions)): 305 MolConf.SetAtomPosition(Index, tuple(AtomPositions[Index])) 306 307 return Mol 308 309 def GetAtomSymbols(Mol): 310 """Retrieve a list containing atom symbols of all atoms a molecule. 311 312 Arguments: 313 Mol (object): RDKit molecule object. 314 315 Returns: 316 list : List of atom symbols. 317 318 """ 319 320 return [Atom.GetSymbol() for Atom in Mol.GetAtoms()] 321 322 def GetAtomIndices(Mol): 323 """Retrieve a list containing atom indices of all atoms a molecule. 324 325 Arguments: 326 Mol (object): RDKit molecule object. 327 328 Returns: 329 list : List of atom indices. 330 331 """ 332 333 return [Atom.GetIdx() for Atom in Mol.GetAtoms()] 334 335 def GetFormalCharge(Mol, CheckMolProp = True): 336 """Get formal charge of a molecule. The formal charge is either retrieved 337 from 'FormalCharge' molecule property or calculated using RDKit function 338 Chem.GetFormalCharge(Mol). 339 340 The 'FormalCharge' molecule property may contain multiple space delimited 341 values. The total formal charge corresponds to the sum of the specified formal 342 charge values. 343 344 Arguments: 345 Mol (object): RDKit molecule object. 346 CheckMolProp (bool): Check 'FormalCharge' molecule property to 347 retrieve formal charge. 348 349 Returns: 350 int : Formal charge. 351 352 """ 353 354 Name = 'FormalCharge' 355 if (CheckMolProp and Mol.HasProp(Name)): 356 FormalCharge = Mol.GetProp(Name) 357 Values = FormalCharge.split() 358 if len(Values) > 1: 359 MiscUtil.PrintWarning("RDKitUtil.GetFormalCharge: Molecule property, %s, contains multiple values, %s. Formal charge corresponds to sum of the specified values..." % (Name, FormalCharge)) 360 FormalCharge = 0.0 361 for Value in Values: 362 FormalCharge += float(Value) 363 FormalCharge = int(FormalCharge) 364 else: 365 FormalCharge = int(float(FormalCharge)) 366 else: 367 FormalCharge = CalculateFormalCharge(Mol) 368 369 return int(FormalCharge) 370 371 def CalculateFormalCharge(Mol): 372 """Calculate formal charge of a molecule. The formal charge is calculated 373 using RDKit function Chem.GetFormalCharge(Mol). 374 375 Arguments: 376 Mol (object): RDKit molecule object. 377 retrieve formal charge. 378 379 Returns: 380 int : Formal charge. 381 382 """ 383 384 return int(Chem.GetFormalCharge(Mol)) 385 386 def GetSpinMultiplicity(Mol, CheckMolProp = True): 387 """Get spin multiplicity of a molecule. The spin multiplicity is either 388 retrieved from 'SpinMultiplicity' molecule property or calculated 389 from the number of free radical electrons using Hund's rule of maximum 390 multiplicity defined as 2S + 1 where S is the total electron spin. The 391 total spin is 1/2 the number of free radical electrons in a molecule. 392 393 The 'SpinMultiplicity' molecule property may contain multiple space delimited 394 values. The total spin multiplicity corresponds to the total number of free radical 395 electrons which are calculated for each specified value. 396 397 Arguments: 398 Mol (object): RDKit molecule object. 399 CheckMolProp (bool): Check 'SpinMultiplicity' molecule property to 400 retrieve spin multiplicity. 401 402 Returns: 403 int : Spin multiplicity. 404 405 """ 406 407 Name = 'SpinMultiplicity' 408 if (CheckMolProp and Mol.HasProp(Name)): 409 SpinMultiplicity = Mol.GetProp(Name) 410 Values = SpinMultiplicity.split() 411 if len(Values) > 1: 412 MiscUtil.PrintWarning("RDKitUtil.GetSpinMultiplicity: Molecule property, %s, contains multiple values, %s. Calculating spin multiplicity corresponding to total number of free radical electrons for each specified value..." % (Name, SpinMultiplicity)) 413 NumRadicalElectrons = 0 414 for Value in Values: 415 NumRadicalElectrons += int(float(Value)) - 1 416 417 TotalElectronicSpin = NumRadicalElectrons/2 418 SpinMultiplicity = 2 * TotalElectronicSpin + 1 419 else: 420 SpinMultiplicity = int(float(SpinMultiplicity)) 421 else: 422 SpinMultiplicity = CalculateSpinMultiplicity(Mol) 423 424 return int(SpinMultiplicity) 425 426 def CalculateSpinMultiplicity(Mol): 427 """Calculate spin multiplicity of a molecule. The spin multiplicity is calculated 428 from the number of free radical electrons using Hund's rule of maximum 429 multiplicity defined as 2S + 1 where S is the total electron spin. The 430 total spin is 1/2 the number of free radical electrons in a molecule. 431 432 Arguments: 433 Mol (object): RDKit molecule object. 434 435 Returns: 436 int : Spin multiplicity. 437 438 """ 439 440 # Calculate spin multiplicity using Hund's rule of maximum multiplicity... 441 NumRadicalElectrons = 0 442 for Atom in Mol.GetAtoms(): 443 NumRadicalElectrons += Atom.GetNumRadicalElectrons() 444 445 TotalElectronicSpin = NumRadicalElectrons/2 446 SpinMultiplicity = 2 * TotalElectronicSpin + 1 447 448 return int(SpinMultiplicity) 449 450 def GetPsi4XYZFormatString(Mol, ConfID = -1, FormalCharge = "auto", SpinMultiplicity = "auto", Symmetry = "auto", NoCom = False, NoReorient = False, CheckFragments = False): 451 """Retrieve geometry string of a molecule in Psi4ish XYZ format to perform 452 Psi4 quantum chemistry calculations. 453 454 You may explicit specify multiple space delimited values for formal charge 455 and spin multiplicity. Otherwise, these values are either automatically 456 retrieved from 'FormalCharge' and 'SpinMultiplicity' molecule properties or 457 calculated using RDKit. The number of specified values for these properties 458 must match the number of fragments in the molecule during the processing 459 of the fragments. 460 461 Arguments: 462 Mol (object): RDKit molecule object. 463 ConfID (int): Conformer number. 464 FormalCharge (str): Specified formal charge or 'auto' to calculate 465 its value by RDKit. 466 SpinMultiplicity (str): Specified spin multiplicity or 'auto' to calculate 467 its value by RDKit. 468 Symmetry (str): Specified symmetry or 'auto' to calculate its value by 469 Psi4. 470 NoCom (bool): Flag to disable recentering of a molecule by Psi4. 471 NoReorient (bool): Flag to disable reorientation of a molecule by Psi4. 472 CheckFragments (bool): Check for fragments and setup geometry string 473 using -- separator between fragments. 474 475 Returns: 476 str : Geometry string of a molecule in Psi4ish XYZ format. 477 478 """ 479 480 # Check for fragments... 481 Mols = [Mol] 482 if CheckFragments: 483 Fragments = list(Chem.rdmolops.GetMolFrags(Mol, asMols = True)) 484 if len(Fragments) > 1: 485 Mols = Fragments 486 487 FragMolFormalCharges = _SetFormalChargesForPsi4XYZFormatString(Mol, Mols, FormalCharge, CheckFragments) 488 FragMolSpinMultiplicities = _SetSpinMultiplicitiesForPsi4XYZFormatString(Mol, Mols, SpinMultiplicity, CheckFragments) 489 490 # Setup geometry string for Ps4... 491 GeometryList = [] 492 FragMolCount = 0 493 494 for FragMolIndex, FragMol in enumerate(Mols): 495 FragMolCount += 1 496 if FragMolCount > 1: 497 GeometryList.append("--") 498 499 FragMolFormalCharge = FragMolFormalCharges[FragMolIndex] 500 FragMolSpinMultiplicity = FragMolSpinMultiplicities[FragMolIndex] 501 if FragMolFormalCharge is None or FragMolSpinMultiplicity is None: 502 MiscUtil.PrintInfo("") 503 MiscUtil.PrintWarning("RDKitUtil.GetPsi4XYZFormatString: Failed to set formal charge and spin multiplicity values. Both formal charge, %s, and spin multiplicity, %s, must be valid values. These values are either specified explicitly or automatically calculated..." % (FragMolFormalCharge, FragMolSpinMultiplicity)) 504 else: 505 GeometryList.append("%s %s" % (FragMolFormalCharge, FragMolSpinMultiplicity)) 506 507 AtomSymbols = GetAtomSymbols(FragMol) 508 AtomPositions = GetAtomPositions(FragMol, ConfID) 509 510 for AtomSymbol, AtomPosition in zip(AtomSymbols, AtomPositions): 511 GeometryList.append("%s %s %s %s" % (AtomSymbol, AtomPosition[0], AtomPosition[1], AtomPosition[2])) 512 513 GeometryList.append("units angstrom") 514 515 if not re.match("^auto$", Symmetry, re.I): 516 Name = 'Symmetry' 517 if (Mol.HasProp(Name)): 518 Symmetry = Mol.GetProp(Name) 519 GeometryList.append("symmetry %s" % Symmetry) 520 521 if NoCom: 522 GeometryList.append("no_com") 523 524 if NoReorient: 525 GeometryList.append("no_reorient") 526 527 Geometry = "\n".join(GeometryList) 528 529 return Geometry 530 531 def _SetFormalChargesForPsi4XYZFormatString(Mol, FragMols, FormalCharge, CheckFragments): 532 """Setup formal charges for Psi4 XYZ format string. """ 533 534 if not CheckFragments: 535 if re.match("^auto$", FormalCharge, re.I): 536 MolFormalCharge = GetFormalCharge(Mol) 537 else: 538 MolFormalCharge = int(FormalCharge) 539 return [MolFormalCharge] 540 541 FragMolsCount = len(FragMols) 542 FormalCharges = [None] * FragMolsCount 543 544 if re.match("^auto$", FormalCharge, re.I): 545 PropName = "FormalCharge" 546 if Mol.HasProp(PropName): 547 FormalCharge = Mol.GetProp(PropName) 548 FormalChargeWords = FormalCharge.split() 549 if len(FormalChargeWords) == FragMolsCount: 550 FormalCharges = [int(float(FormalCharge)) for FormalCharge in FormalChargeWords] 551 else: 552 MiscUtil.PrintWarning("RDKitUtil.GetPsi4XYZFormatString: Ignoring specified value, %s, for FormalCharge molecule property. The number of space delimted specified values, %s, must match number of fragments, %s, in the molecule..." % (FormalCharge, len(FormalChargeWords), FragMolsCount)) 553 else: 554 FormalCharges = [CalculateFormalCharge(FragMol) for FragMol in FragMols] 555 else: 556 FormalChargeWords = FormalCharge.split() 557 if len(FormalChargeWords) != FragMolsCount: 558 MiscUtil.PrintWarning("RDKitUtil.GetPsi4XYZFormatString: Ignoring specified value, %s, for FormalCharge paramater. The number of space delimted specified values, %s, must match number of fragments, %s, in the molecule..." % (FormalCharge, len(FormalChargeWords), FragMolsCount)) 559 else: 560 FormalCharges = [int(FormalCharge) for FormalCharge in FormalChargeWords] 561 562 return FormalCharges 563 564 def _SetSpinMultiplicitiesForPsi4XYZFormatString(Mol, FragMols, SpinMultiplicity, CheckFragments): 565 """Setup spin multiplicites for Psi4 XYZ format string. """ 566 567 if not CheckFragments: 568 if re.match("^auto$", SpinMultiplicity, re.I): 569 MolSpinMultiplicity = GetSpinMultiplicity(Mol) 570 else: 571 MolSpinMultiplicity = int(SpinMultiplicity) 572 return [MolSpinMultiplicity] 573 574 FragMolsCount = len(FragMols) 575 SpinMultiplicities = [None] * FragMolsCount 576 577 if re.match("^auto$", SpinMultiplicity, re.I): 578 PropName = "SpinMultiplicity" 579 if Mol.HasProp(PropName): 580 SpinMultiplicity = Mol.GetProp(PropName) 581 SpinMultiplicityWords = SpinMultiplicity.split() 582 if len(SpinMultiplicityWords) == FragMolsCount: 583 SpinMultiplicities = [int(float(SpinMultiplicity)) for SpinMultiplicity in SpinMultiplicityWords] 584 else: 585 MiscUtil.PrintWarning("RDKitUtil.GetPsi4XYZFormatString: Ignoring specified value, %s, for SpinMultiplicity molecule property. The number of space delimted specified values, %s, must match number of fragments, %s, in the molecule..." % (SpinMultiplicity, len(SpinMultiplicityWords), FragMolsCount)) 586 else: 587 SpinMultiplicities = [CalculateSpinMultiplicity(FragMol) for FragMol in FragMols] 588 else: 589 SpinMultiplicityWords = SpinMultiplicity.split() 590 if len(SpinMultiplicityWords) != FragMolsCount: 591 MiscUtil.PrintWarning("RDKitUtil.GetPsi4XYZFormatString: Ignoring specified value, %s, for SpinMultiplicity paramater. The number of space delimted specified values, %s, must match number of fragments, %s, in the molecule..." % (SpinMultiplicity, len(SpinMultiplicityWords), FragMolsCount)) 592 else: 593 SpinMultiplicities = [int(SpinMultiplicity) for SpinMultiplicity in SpinMultiplicityWords] 594 595 return SpinMultiplicities 596 597 def GetNumFragments(Mol): 598 """Get number of fragment in a molecule. 599 600 Arguments: 601 Atom (object): RDKit molecule object. 602 603 Returns: 604 int : Number of fragments. 605 606 """ 607 608 Fragments = Chem.rdmolops.GetMolFrags(Mol, asMols = False) 609 610 return len(Fragments) if Fragments is not None else 0 611 612 def GetNumHeavyAtomNeighbors(Atom): 613 """Get number of heavy atom neighbors. 614 615 Arguments: 616 Atom (object): RDKit atom object. 617 618 Returns: 619 int : Number of neighbors. 620 621 """ 622 623 NbrCount = 0 624 for AtomNbr in Atom.GetNeighbors(): 625 if AtomNbr.GetAtomicNum() > 1: 626 NbrCount += 1 627 628 return NbrCount 629 630 def GetHeavyAtomNeighbors(Atom): 631 """Get a list of heavy atom neighbors. 632 633 Arguments: 634 Atom (object): RDKit atom object. 635 636 Returns: 637 list : List of heavy atom neighbors. 638 639 """ 640 641 AtomNeighbors = [] 642 for AtomNbr in Atom.GetNeighbors(): 643 if AtomNbr.GetAtomicNum() > 1: 644 AtomNeighbors.append(AtomNbr) 645 646 return AtomNeighbors 647 648 def IsValidElementSymbol(ElementSymbol): 649 """Validate element symbol. 650 651 Arguments: 652 ElementSymbol (str): Element symbol 653 654 Returns: 655 bool : True - Valid element symbol; Otherwise, false. 656 657 """ 658 659 try: 660 AtomicNumber = Chem.GetPeriodicTable().GetAtomicNumber(ElementSymbol) 661 Status = True if AtomicNumber > 0 else False 662 except Exception as ErrMsg: 663 Status = False 664 665 return Status 666 667 def IsValidAtomIndex(Mol, AtomIndex): 668 """Validate presence atom index in a molecule. 669 670 Arguments: 671 Mol (object): RDKit molecule object. 672 AtomIndex (int): Atom index. 673 674 Returns: 675 bool : True - Valid atom index; Otherwise, false. 676 677 """ 678 for Atom in Mol.GetAtoms(): 679 if AtomIndex == Atom.GetIdx(): 680 return True 681 682 return False 683 684 def AreHydrogensMissingInMolecule(Mol): 685 """Check for any missing hydrogens in in a molecue. 686 687 Arguments: 688 Mol (object): RDKit molecule object. 689 690 Returns: 691 bool : True - Missing hydrogens; Otherwise, false. 692 693 """ 694 695 for Atom in Mol.GetAtoms(): 696 NumExplicitAndImplicitHs = Atom.GetNumExplicitHs() + Atom.GetNumImplicitHs() 697 if NumExplicitAndImplicitHs > 0: 698 return True 699 700 return False 701 702 def AreAtomIndicesSequentiallyConnected(Mol, AtomIndices): 703 """Check for the presence bonds between sequential pairs of atoms in a 704 molecule. 705 706 Arguments: 707 Mol (object): RDKit molecule object. 708 AtomIndices (list): List of atom indices. 709 710 Returns: 711 bool : True - Sequentially connected; Otherwise, false. 712 713 """ 714 715 for Index in range(0, (len(AtomIndices) -1)): 716 Bond = Mol.GetBondBetweenAtoms(AtomIndices[Index], AtomIndices[Index + 1]) 717 if Bond is None: 718 return False 719 720 if Bond.GetIdx() is None: 721 return False 722 723 return True 724 725 def ReorderAtomIndicesInSequentiallyConnectedManner(Mol, AtomIndices): 726 """Check for the presence of sequentially connected list of atoms in an 727 arbitray list of atoms in molecule. 728 729 Arguments: 730 Mol (object): RDKit molecule object. 731 AtomIndices (list): List of atom indices. 732 733 Returns: 734 bool : True - Sequentially connected list found; Otherwise, false. 735 list : List of seqeuntially connected atoms or None. 736 737 """ 738 739 # Count the number of neighbors for specified atom indices ensuring 740 # that the neighbors are also part of atom indices... 741 AtomNbrsCount = {} 742 for AtomIndex in AtomIndices: 743 Atom = Mol.GetAtomWithIdx(AtomIndex) 744 745 AtomNbrsCount[AtomIndex] = 0 746 for AtomNbr in Atom.GetNeighbors(): 747 AtomNbrIndex = AtomNbr.GetIdx() 748 if AtomNbrIndex not in AtomIndices: 749 continue 750 AtomNbrsCount[AtomIndex] += 1 751 752 # Number of neighbors for each specified atom indices must be 1 or 2 753 # for sequentially connected list of atom indices... 754 AtomsWithOneNbr = [] 755 for AtomIndex, NbrsCount in AtomNbrsCount.items(): 756 if not (NbrsCount == 1 or NbrsCount ==2): 757 return (False, None) 758 759 if NbrsCount == 1: 760 AtomsWithOneNbr.append(AtomIndex) 761 762 # A sequentially connected list of indices must have two atom indices with 763 # exactly # one neighbor... 764 if len(AtomsWithOneNbr) != 2: 765 return (False, None) 766 767 # Setup a reordered list of sequentially connected atoms... 768 ReorderedAtomIndices = [] 769 770 AtomIndex1, AtomIndex2 = AtomsWithOneNbr 771 AtomIndex = AtomIndex1 if AtomIndex1 < AtomIndex2 else AtomIndex2 772 ReorderedAtomIndices.append(AtomIndex) 773 774 while (len(ReorderedAtomIndices) < len(AtomIndices)): 775 Atom = Mol.GetAtomWithIdx(AtomIndex) 776 777 for AtomNbr in Atom.GetNeighbors(): 778 AtomNbrIndex = AtomNbr.GetIdx() 779 if AtomNbrIndex not in AtomIndices: 780 continue 781 782 if AtomNbrIndex in ReorderedAtomIndices: 783 continue 784 785 # Treat neighbor as next connected atom... 786 AtomIndex = AtomNbrIndex 787 ReorderedAtomIndices.append(AtomIndex) 788 break 789 790 # Check reorderd list size... 791 if (len(ReorderedAtomIndices) != len(AtomIndices)): 792 return (False, None) 793 794 # A final check to validate reorderd list... 795 if not AreAtomIndicesSequentiallyConnected(Mol, ReorderedAtomIndices): 796 return (False, None) 797 798 return (True, ReorderedAtomIndices) 799 800 def MolToBase64EncodedMolString(Mol, PropertyPickleFlags = Chem.PropertyPickleOptions.AllProps): 801 """Encode RDkit molecule object into a base64 encoded string. The properties 802 can be optionally excluded. 803 804 The molecule is pickled using RDKit Mol.ToBinary() function before 805 their encoding. 806 807 Arguments: 808 Mol (object): RDKit molecule object. 809 PropertyPickleFlags: RDKit property pickle options. 810 811 Returns: 812 str : Base64 encode molecule string or None. 813 814 Notes: 815 The following property pickle flags are currently available in RDKit: 816 817 Chem.PropertyPickleOptions.NoProps 818 Chem.PropertyPickleOptions.MolProps 819 Chem.PropertyPickleOptions.AtomProps 820 Chem.PropertyPickleOptions.BondProps 821 Chem.PropertyPickleOptions.PrivateProps 822 Chem.PropertyPickleOptions.AllProps 823 824 """ 825 826 return None if Mol is None else base64.b64encode(Mol.ToBinary(PropertyPickleFlags)).decode() 827 828 def MolFromBase64EncodedMolString(EncodedMol): 829 """Generate a RDKit molecule object from a base64 encoded string. 830 831 Arguments: 832 str: Base64 encoded molecule string. 833 834 Returns: 835 object : RDKit molecule object or None. 836 837 """ 838 839 return None if EncodedMol is None else Chem.Mol(base64.b64decode(EncodedMol)) 840 841 def GenerateBase64EncodedMolStrings(Mols, PropertyPickleFlags = Chem.PropertyPickleOptions.AllProps): 842 """Setup an iterator for generating base64 encoded molecule string 843 from a RDKit molecule iterator. The iterator returns a list containing 844 a molecule index and encoded molecule string or None. 845 846 The molecules are pickled using RDKit Mol.ToBinary() function 847 before their encoding. 848 849 Arguments: 850 iterator: RDKit molecules iterator. 851 PropertyFlags: RDKit property pickle options. 852 853 Returns: 854 object : Base64 endcoded molecules iterator. The iterator returns a 855 list containing a molecule index and an encoded molecule string 856 or None. 857 858 Notes: 859 The following property pickle flags are currently available in RDKit: 860 861 Chem.PropertyPickleOptions.NoProps 862 Chem.PropertyPickleOptions.MolProps 863 Chem.PropertyPickleOptions.AtomProps 864 Chem.PropertyPickleOptions.BondProps 865 Chem.PropertyPickleOptions.PrivateProps 866 Chem.PropertyPickleOptions.AllProps 867 868 Examples: 869 870 EncodedMolsInfo = GenerateBase64EncodedMolStrings(Mols) 871 for MolIndex, EncodedMol in EncodedMolsInfo: 872 if EncodeMol is not None: 873 Mol = MolFromBase64EncodedMolString(EncodedMol) 874 875 """ 876 for MolIndex, Mol in enumerate(Mols): 877 yield [MolIndex, None] if Mol is None else [MolIndex, MolToBase64EncodedMolString(Mol, PropertyPickleFlags)] 878 879 def GenerateBase64EncodedMolStringsWithIDs(Mols, MolIDs, PropertyPickleFlags = Chem.PropertyPickleOptions.AllProps): 880 """Setup an iterator for generating base64 encoded molecule string 881 from a RDKit molecule iterator. The iterator returns a list containing 882 a molecule ID and encoded molecule string or None. 883 884 The molecules are pickled using RDKit Mol.ToBinary() function 885 before their encoding. 886 887 Arguments: 888 iterator: RDKit molecules iterator. 889 MolIDs (list): Molecule IDs. 890 PropertyFlags: RDKit property pickle options. 891 892 Returns: 893 object : Base64 endcoded molecules iterator. The iterator returns a 894 list containing a molecule ID and an encoded molecule string 895 or None. 896 897 Notes: 898 The following property pickle flags are currently available in RDKit: 899 900 Chem.PropertyPickleOptions.NoProps 901 Chem.PropertyPickleOptions.MolProps 902 Chem.PropertyPickleOptions.AtomProps 903 Chem.PropertyPickleOptions.BondProps 904 Chem.PropertyPickleOptions.PrivateProps 905 Chem.PropertyPickleOptions.AllProps 906 907 Examples: 908 909 EncodedMolsInfo = GenerateBase64EncodedMolStringsWithIDs(Mols) 910 for MolID, EncodedMol in EncodedMolsInfo: 911 if EncodeMol is not None: 912 Mol = MolFromBase64EncodedMolString(EncodedMol) 913 914 """ 915 for MolIndex, Mol in enumerate(Mols): 916 yield [MolIDs[MolIndex], None] if Mol is None else [MolIDs[MolIndex], MolToBase64EncodedMolString(Mol, PropertyPickleFlags)] 917 918 def GenerateBase64EncodedMolStringWithConfIDs(Mol, MolIndex, ConfIDs, PropertyPickleFlags = Chem.PropertyPickleOptions.AllProps): 919 """Setup an iterator generating base64 encoded molecule string for a 920 molecule. The iterator returns a list containing a molecule index, an encoded 921 molecule string, and conf ID. 922 923 The molecules are pickled using RDKit Mol.ToBinary() function 924 before their encoding. 925 926 Arguments: 927 Mol (object): RDKit molecule object. 928 MolIndex (int): Molecule index. 929 ConfIDs (list): Conformer IDs. 930 PropertyFlags: RDKit property pickle options. 931 932 Returns: 933 object : Base64 endcoded molecules iterator. The iterator returns a 934 list containing a molecule index, an encoded molecule string, and 935 conf ID. 936 937 Notes: 938 The following property pickle flags are currently available in RDKit: 939 940 Chem.PropertyPickleOptions.NoProps 941 Chem.PropertyPickleOptions.MolProps 942 Chem.PropertyPickleOptions.AtomProps 943 Chem.PropertyPickleOptions.BondProps 944 Chem.PropertyPickleOptions.PrivateProps 945 Chem.PropertyPickleOptions.AllProps 946 947 Examples: 948 949 EncodedMolsInfo = GenerateBase64EncodedMolStringWithConfIDs(Mol, MolIndex, ConfIDs) 950 for MolIndex, EncodedMol, ConfID in EncodedMolsInfo: 951 if EncodeMol is not None: 952 Mol = MolFromBase64EncodedMolString(EncodedMol) 953 954 """ 955 for ConfID in ConfIDs: 956 yield [MolIndex, None, ConfID] if Mol is None else [MolIndex, MolToBase64EncodedMolString(Mol, PropertyPickleFlags), ConfID] 957 958 def AreAtomMapNumbersPresentInMol(Mol): 959 """Check for the presence of atom map numbers in a molecue. 960 961 Arguments: 962 Mol (object): RDKit molecule object. 963 964 Returns: 965 bool : True - Atom map numbers present; Otherwise, false. 966 967 """ 968 969 return False if _GetAtomMapIndices(Mol) is None else True 970 971 def ClearAtomMapNumbers(Mol, AllowImplicitValence = True, ClearRadicalElectrons = True): 972 """Check and clear atom map numbers in a molecule. In addition, allow implicit 973 valence and clear radical electrons for atoms with associated map numbers. 974 975 For example, the following atomic properties are assigned by RDKit to atom 976 map number 1 in a molecule corresponding to SMILES C[C:1](C)C: 977 978 NoImplicit: True; ImplicitValence: 0; ExplicitValence: 3; NumExplicitHs: 0; 979 NumImplicitHs: 0; NumRadicalElectrons: 1 980 981 This function clears atoms map numbers in the molecule leading to SMILES 982 CC(C)C, along with optionally updating atomic properties as shown below: 983 984 NoImplicit: False; ImplicitValence: 1; ExplicitValence: 3; NumExplicitHs: 0; 985 NumImplicitHs: 1; NumRadicalElectrons: 0 986 987 Arguments: 988 Mol (object): RDKit molecule object. 989 990 Returns: 991 Mol (object): RDKit molecule object. 992 993 """ 994 995 AtomMapIndices = GetAtomMapIndices(Mol) 996 997 if AtomMapIndices is None: 998 return Mol 999 1000 for AtomMapIndex in AtomMapIndices: 1001 Atom = Mol.GetAtomWithIdx(AtomMapIndex) 1002 1003 # Clear map number property 'molAtomMapNumber'... 1004 Atom.SetAtomMapNum(0) 1005 1006 # Allow implit valence... 1007 if AllowImplicitValence: 1008 Atom.SetNoImplicit(False) 1009 1010 # Set number of electrons to 0... 1011 if ClearRadicalElectrons: 1012 Atom.SetNumRadicalElectrons(0) 1013 1014 Atom.UpdatePropertyCache() 1015 1016 Mol.UpdatePropertyCache() 1017 1018 return Mol 1019 1020 def GetAtomMapIndices(Mol): 1021 """Get a list of available atom indices corresponding to atom map numbers 1022 present in a SMILES/SMARTS pattern used for creating a molecule. The list of 1023 atom indices is sorted in ascending order by atom map numbers. 1024 1025 Arguments: 1026 Mol (object): RDKit molecule object. 1027 1028 Returns: 1029 list : List of atom indices sorted in the ascending order of atom map 1030 numbers or None. 1031 1032 """ 1033 1034 return _GetAtomMapIndices(Mol) 1035 1036 def GetAtomMapIndicesAndMapNumbers(Mol): 1037 """Get lists of available atom indices and atom map numbers present in a 1038 SMILES/SMARTS pattern used for creating a molecule. Both lists are sorted 1039 in ascending order by atom map numbers. 1040 1041 Arguments: 1042 Mol (object): RDKit molecule object. 1043 1044 Returns: 1045 list : List of atom indices sorted in the ascending order of atom map 1046 numbers or None. 1047 list : List of atom map numbers sorted in the ascending order or None. 1048 1049 """ 1050 1051 return (_GetAtomMapIndicesAndMapNumbers(Mol)) 1052 1053 def MolFromSubstructureMatch(Mol, PatternMol, AtomIndices, FilterByAtomMapNums = False): 1054 """Generate a RDKit molecule object for a list of matched atom indices 1055 present in a pattern molecule. The list of atom indices correspond to a 1056 list retrieved by RDKit function GetSubstructureMatches using SMILES/SMARTS 1057 pattern. The atom indices are optionally filtered by mapping atom numbers 1058 to appropriate atom indices during the generation of the molecule. 1059 For example: [O:1]=[S:2](=[O])[C:3][C:4]. 1060 1061 Arguments: 1062 Mol (object): RDKit molecule object. 1063 PatternMol (object): RDKit molecule object for a SMILES/SMARTS pattern. 1064 AtomIndices (list): Atom indices. 1065 FilterByAtomMapNums (bool): Filter matches by atom map numbers. 1066 1067 Returns: 1068 object : RDKit molecule object or None. 1069 1070 """ 1071 1072 AtomMapIndices = _GetAtomMapIndices(PatternMol) if FilterByAtomMapNums else None 1073 1074 return (_MolFromSubstructureMatch(Mol, PatternMol, AtomIndices, AtomMapIndices)) 1075 1076 def MolsFromSubstructureMatches(Mol, PatternMol, AtomIndicesList, FilterByAtomMapNums = False): 1077 """Generate a list of RDKit molecule objects for a list containing lists of 1078 matched atom indices present in a pattern molecule. The list of atom indices 1079 correspond to a list retrieved by RDKit function GetSubstructureMatches using 1080 SMILES/SMARTS pattern. The atom indices are optionally filtered by mapping 1081 atom numbers to appropriate atom indices during the generation of the 1082 molecule. For example: [O:1]=[S:2](=[O])[C:3][C:4]. 1083 1084 Arguments: 1085 Mol (object): RDKit molecule object. 1086 PatternMol (object): RDKit molecule object for a SMILES/SMARTS pattern. 1087 AtomIndicesList (list): A list of lists containing atom indices. 1088 FilterByAtomMapNums (bool): Filter matches by atom map numbers. 1089 1090 Returns: 1091 list : A list of lists containg RDKit molecule objects or None. 1092 1093 """ 1094 1095 AtomMapIndices = _GetAtomMapIndices(PatternMol) if FilterByAtomMapNums else None 1096 1097 Mols = [] 1098 for AtomIndices in AtomIndicesList: 1099 Mols.append(_MolFromSubstructureMatch(Mol, PatternMol, AtomIndices, AtomMapIndices)) 1100 1101 return Mols if len(Mols) else None 1102 1103 def FilterSubstructureMatchByAtomMapNumbers(Mol, PatternMol, AtomIndices): 1104 """Filter a list of matched atom indices by map atom numbers present in a 1105 pattern molecule. The list of atom indices correspond to a list retrieved by 1106 RDKit function GetSubstructureMatches using SMILES/SMARTS pattern. The 1107 atom map numbers are mapped to appropriate atom indices during the generation 1108 of molecules. For example: [O:1]=[S:2](=[O])[C:3][C:4]. 1109 1110 Arguments: 1111 Mol (object): RDKit molecule object. 1112 PatternMol (object): RDKit molecule object for a SMILES/SMARTS pattern. 1113 AtomIndices (list): Atom indices. 1114 1115 Returns: 1116 list : A list of filtered atom indices. 1117 1118 """ 1119 AtomMapIndices = _GetAtomMapIndices(PatternMol) 1120 1121 return _FilterSubstructureMatchByAtomMapNumbers(Mol, PatternMol, AtomIndices, AtomMapIndices) 1122 1123 def FilterSubstructureMatchesByAtomMapNumbers(Mol, PatternMol, AtomIndicesList): 1124 """Filter a list of lists containing matched atom indices by map atom numbers 1125 present in a pattern molecule. The list of atom indices correspond to a list retrieved by 1126 RDKit function GetSubstructureMatches using SMILES/SMARTS pattern. The 1127 atom map numbers are mapped to appropriate atom indices during the generation 1128 of molecules. For example: [O:1]=[S:2](=[O])[C:3][C:4]. 1129 1130 Arguments: 1131 Mol (object): RDKit molecule object. 1132 PatternMol (object): RDKit molecule object for a SMILES/SMARTS pattern. 1133 AtomIndicesList (list): A list of lists containing atom indices. 1134 1135 Returns: 1136 list : A list of lists containing filtered atom indices. 1137 1138 """ 1139 AtomMapIndices = _GetAtomMapIndices(PatternMol) 1140 1141 MatchedAtomIndicesList = [] 1142 for AtomIndices in AtomIndicesList: 1143 MatchedAtomIndicesList.append(_FilterSubstructureMatchByAtomMapNumbers(Mol, PatternMol, AtomIndices, AtomMapIndices)) 1144 1145 return MatchedAtomIndicesList 1146 1147 def _MolFromSubstructureMatch(Mol, PatternMol, AtomIndices, AtomMapIndices): 1148 """Generate a RDKit molecule object for a list of matched atom indices and available 1149 atom map indices. 1150 """ 1151 1152 if AtomMapIndices is not None: 1153 MatchedAtomIndices = [AtomIndices[Index] for Index in AtomMapIndices] 1154 else: 1155 MatchedAtomIndices = list(AtomIndices) 1156 1157 return _GetMolFromAtomIndices(Mol, MatchedAtomIndices) 1158 1159 def _GetAtomMapIndices(Mol): 1160 """Get a list of available atom indices corresponding to sorted atom map 1161 numbers present in a SMILES/SMARTS pattern used for creating a molecule. 1162 """ 1163 1164 AtomMapIndices, AtomMapNumbers = _GetAtomMapIndicesAndMapNumbers(Mol) 1165 1166 return AtomMapIndices 1167 1168 def _GetAtomMapIndicesAndMapNumbers(Mol): 1169 """Get a list of available atom indices and atom map numbers present 1170 in a SMILES/SMARTS pattern used for creating a molecule. Both lists 1171 are sorted in ascending order by atom map numbers. 1172 """ 1173 1174 # Setup a atom map number to atom indices map.. 1175 AtomMapNumToIndices = {} 1176 for Atom in Mol.GetAtoms(): 1177 AtomMapNum = Atom.GetAtomMapNum() 1178 1179 if AtomMapNum: 1180 AtomMapNumToIndices[AtomMapNum] = Atom.GetIdx() 1181 1182 # Setup atom indices corresponding to sorted atom map numbers... 1183 AtomMapIndices = None 1184 AtomMapNumbers = None 1185 if len(AtomMapNumToIndices): 1186 AtomMapNumbers = sorted(AtomMapNumToIndices) 1187 AtomMapIndices = [AtomMapNumToIndices[AtomMapNum] for AtomMapNum in AtomMapNumbers] 1188 1189 return (AtomMapIndices, AtomMapNumbers) 1190 1191 def _FilterSubstructureMatchByAtomMapNumbers(Mol, PatternMol, AtomIndices, AtomMapIndices): 1192 """Filter substructure match atom indices by atom map indices corresponding to 1193 atom map numbers. 1194 """ 1195 1196 if AtomMapIndices is None: 1197 return list(AtomIndices) 1198 1199 return [AtomIndices[Index] for Index in AtomMapIndices] 1200 1201 def _GetMolFromAtomIndices(Mol, AtomIndices): 1202 """Generate a RDKit molecule object from atom indices returned by 1203 substructure search. 1204 """ 1205 1206 BondIndices = [] 1207 for AtomIndex in AtomIndices: 1208 Atom = Mol.GetAtomWithIdx(AtomIndex) 1209 1210 for AtomNbr in Atom.GetNeighbors(): 1211 AtomNbrIndex = AtomNbr.GetIdx() 1212 if AtomNbrIndex not in AtomIndices: 1213 continue 1214 1215 BondIndex = Mol.GetBondBetweenAtoms(AtomIndex, AtomNbrIndex).GetIdx() 1216 if BondIndex in BondIndices: 1217 continue 1218 1219 BondIndices.append(BondIndex) 1220 1221 MatchedMol = Chem.PathToSubmol(Mol, BondIndices) if len(BondIndices) else None 1222 1223 return MatchedMol 1224 1225 def ConstrainAndEmbed(mol, core, coreMatchesMol=None, useTethers=True, coreConfId=-1, randomseed=2342, getForceField=AllChem.UFFGetMoleculeForceField, **kwargs): 1226 """ 1227 The function is a local copy of RDKit fucntion AllChem.ConstrainedEmbed(). 1228 It has been enhanced to support an explicit list of core matches corresponding 1229 to the matched atom indices in the molecule. The number of matched atom indices 1230 must be equal to the number of atoms in core molecule. 1231 1232 Arguments: 1233 mol (object): RDKit molecule object to embed. 1234 core (object): RDKit molecule to use as a source of constraints. 1235 coreMatchesMol (list): A list matches atom indices in mol. 1236 useTethers: (bool) if True, the final conformation will be optimized 1237 subject to a series of extra forces that pull the matching atoms to 1238 the positions of the core atoms. Otherwise simple distance 1239 constraints based on the core atoms will be used in the 1240 optimization. 1241 coreConfId (int): ID of the core conformation to use. 1242 randomSeed (int): Seed for the random number generator 1243 1244 Returns: 1245 mol (object): RDKit molecule object. 1246 1247 """ 1248 if coreMatchesMol is None: 1249 match = mol.GetSubstructMatch(core) 1250 if not match: 1251 raise ValueError("Molecule doesn't match the core.") 1252 else: 1253 if core.GetNumAtoms() != len(coreMatchesMol): 1254 raise ValueError("Number of atoms, %s, in core molecule must match number of atom indices, %s, specified in the list coreMatchesMol." % (core.GetNumAtoms(), len(coreMatchesMol))) 1255 # Check specified matched atom indices in coreMatchesMol and use the match atom 1256 # indices returned by GetSubstructMatches() for embedding... 1257 coreMatch = None 1258 matches = mol.GetSubstructMatches(core) 1259 for match in matches: 1260 if len(match) != len(coreMatchesMol): 1261 continue 1262 matchFound = True 1263 for atomIndex in match: 1264 if atomIndex not in coreMatchesMol: 1265 matchFound = False 1266 break 1267 if matchFound: 1268 coreMatch = match 1269 break 1270 if coreMatch is None: 1271 raise ValueError("Molecule doesn't match the atom indices specified in the list coreMatchesMol.") 1272 match = coreMatch 1273 1274 coordMap = {} 1275 coreConf = core.GetConformer(coreConfId) 1276 for i, idxI in enumerate(match): 1277 corePtI = coreConf.GetAtomPosition(i) 1278 coordMap[idxI] = corePtI 1279 1280 ci = AllChem.EmbedMolecule(mol, coordMap=coordMap, randomSeed=randomseed, **kwargs) 1281 if ci < 0: 1282 raise ValueError('Could not embed molecule.') 1283 1284 algMap = [(j, i) for i, j in enumerate(match)] 1285 1286 if not useTethers: 1287 # clean up the conformation 1288 ff = getForceField(mol, confId=0) 1289 for i, idxI in enumerate(match): 1290 for j in range(i + 1, len(match)): 1291 idxJ = match[j] 1292 d = coordMap[idxI].Distance(coordMap[idxJ]) 1293 ff.AddDistanceConstraint(idxI, idxJ, d, d, 100.) 1294 ff.Initialize() 1295 n = 4 1296 more = ff.Minimize() 1297 while more and n: 1298 more = ff.Minimize() 1299 n -= 1 1300 # rotate the embedded conformation onto the core: 1301 rms = AllChem.AlignMol(mol, core, atomMap=algMap) 1302 else: 1303 # rotate the embedded conformation onto the core: 1304 rms = AllChem.AlignMol(mol, core, atomMap=algMap) 1305 ff = getForceField(mol, confId=0) 1306 conf = core.GetConformer() 1307 for i in range(core.GetNumAtoms()): 1308 p = conf.GetAtomPosition(i) 1309 pIdx = ff.AddExtraPoint(p.x, p.y, p.z, fixed=True) - 1 1310 ff.AddDistanceConstraint(pIdx, match[i], 0, 0, 100.) 1311 ff.Initialize() 1312 n = 4 1313 more = ff.Minimize(energyTol=1e-4, forceTol=1e-3) 1314 while more and n: 1315 more = ff.Minimize(energyTol=1e-4, forceTol=1e-3) 1316 n -= 1 1317 # realign 1318 rms = AllChem.AlignMol(mol, core, atomMap=algMap) 1319 1320 mol.SetProp('EmbedRMS', str(rms)) 1321 return mol 1322 1323 def ReadAndValidateMolecules(FileName, **KeyWordArgs): 1324 """Read molecules from an input file, validate all molecule objects, and return 1325 a list of valid and non-valid molecule objects along with their counts. 1326 1327 Arguments: 1328 FileName (str): Name of a file with complete path. 1329 **KeyWordArgs (dictionary) : Parameter name and value pairs for reading and 1330 processing molecules. 1331 1332 Returns: 1333 list : List of valid RDKit molecule objects. 1334 int : Number of total molecules in input file. 1335 int : Number of valid molecules in input file. 1336 1337 Notes: 1338 The file extension is used to determine type of the file and set up an appropriate 1339 file reader. 1340 1341 """ 1342 1343 AllowEmptyMols = True 1344 if "AllowEmptyMols" in KeyWordArgs: 1345 AllowEmptyMols = KeyWordArgs["AllowEmptyMols"] 1346 1347 Mols = ReadMolecules(FileName, **KeyWordArgs) 1348 1349 if AllowEmptyMols: 1350 ValidMols = [Mol for Mol in Mols if Mol is not None] 1351 else: 1352 ValidMols = [] 1353 MolCount = 0 1354 for Mol in Mols: 1355 MolCount += 1 1356 if Mol is None: 1357 continue 1358 1359 if IsMolEmpty(Mol): 1360 MolName = GetMolName(Mol, MolCount) 1361 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 1362 continue 1363 1364 ValidMols.append(Mol) 1365 1366 MolCount = len(Mols) 1367 ValidMolCount = len(ValidMols) 1368 1369 return (ValidMols, MolCount, ValidMolCount) 1370 1371 def ReadMolecules(FileName, **KeyWordArgs): 1372 """Read molecules from an input file without performing any validation 1373 and creation of molecule objects. 1374 1375 Arguments: 1376 FileName (str): Name of a file with complete path. 1377 **KeyWordArgs (dictionary) : Parameter name and value pairs for reading and 1378 processing molecules. 1379 1380 Returns: 1381 list : List of RDKit molecule objects. 1382 1383 Notes: 1384 The file extension is used to determine type of the file and set up an appropriate 1385 file reader. 1386 1387 """ 1388 1389 # Set default values for possible arguments... 1390 ReaderArgs = {"Sanitize": True, "RemoveHydrogens": True, "StrictParsing": True, "SMILESDelimiter" : ' ', "SMILESColumn": 1, "SMILESNameColumn": 2, "SMILESTitleLine": True } 1391 1392 # Set specified values for possible arguments... 1393 for Arg in ReaderArgs: 1394 if Arg in KeyWordArgs: 1395 ReaderArgs[Arg] = KeyWordArgs[Arg] 1396 1397 # Modify specific valeus for SMILES... 1398 if MiscUtil.CheckFileExt(FileName, "smi csv tsv txt"): 1399 Args = ["Sanitize", "SMILESTitleLine"] 1400 for Arg in Args: 1401 if ReaderArgs[Arg] is True: 1402 ReaderArgs[Arg] = 1 1403 else: 1404 ReaderArgs[Arg] = 0 1405 1406 Mols = [] 1407 if MiscUtil.CheckFileExt(FileName, "sdf sd"): 1408 return ReadMoleculesFromSDFile(FileName, ReaderArgs["Sanitize"], ReaderArgs["RemoveHydrogens"], ReaderArgs['StrictParsing']) 1409 elif MiscUtil.CheckFileExt(FileName, "mol"): 1410 return ReadMoleculesFromMolFile(FileName, ReaderArgs["Sanitize"], ReaderArgs["RemoveHydrogens"], ReaderArgs['StrictParsing']) 1411 elif MiscUtil.CheckFileExt(FileName, "mol2"): 1412 return ReadMoleculesFromMol2File(FileName, ReaderArgs["Sanitize"], ReaderArgs["RemoveHydrogens"]) 1413 elif MiscUtil.CheckFileExt(FileName, "pdb"): 1414 return ReadMoleculesFromPDBFile(FileName, ReaderArgs["Sanitize"], ReaderArgs["RemoveHydrogens"]) 1415 elif MiscUtil.CheckFileExt(FileName, "smi txt csv tsv"): 1416 SMILESColumnIndex = ReaderArgs["SMILESColumn"] - 1 1417 SMILESNameColumnIndex = ReaderArgs["SMILESNameColumn"] - 1 1418 return ReadMoleculesFromSMILESFile(FileName, ReaderArgs["SMILESDelimiter"], SMILESColumnIndex, SMILESNameColumnIndex, ReaderArgs["SMILESTitleLine"], ReaderArgs["Sanitize"]) 1419 else: 1420 MiscUtil.PrintWarning("RDKitUtil.ReadMolecules: Non supported file type: %s" % FileName) 1421 1422 return Mols 1423 1424 def ReadMoleculesFromSDFile(FileName, Sanitize = True, RemoveHydrogens = True, StrictParsing = True): 1425 """Read molecules from a SD file. 1426 1427 Arguments: 1428 FileName (str): Name of a file with complete path. 1429 Sanitize (bool): Sanitize molecules. 1430 RemoveHydrogens (bool): Remove hydrogens from molecules. 1431 StrictParsing (bool): Perform strict parsing. 1432 1433 Returns: 1434 list : List of RDKit molecule objects. 1435 1436 """ 1437 return Chem.SDMolSupplier(FileName, sanitize = Sanitize, removeHs = RemoveHydrogens, strictParsing = StrictParsing) 1438 1439 def ReadMoleculesFromMolFile(FileName, Sanitize = True, RemoveHydrogens = True, StrictParsing = True): 1440 """Read molecule from a MDL Mol file. 1441 1442 Arguments: 1443 FileName (str): Name of a file with complete path. 1444 Sanitize (bool): Sanitize molecules. 1445 RemoveHydrogens (bool): Remove hydrogens from molecules. 1446 StrictParsing (bool): Perform strict parsing. 1447 1448 Returns: 1449 list : List of RDKit molecule objects. 1450 1451 """ 1452 1453 Mols = [] 1454 Mols.append(Chem.MolFromMolFile(FileName, sanitize = Sanitize, removeHs = RemoveHydrogens, strictParsing = StrictParsing)) 1455 return Mols 1456 1457 1458 def ReadMoleculesFromMol2File(FileName, Sanitize = True, RemoveHydrogens = True): 1459 """Read molecules from a Tripos Mol2 file. The first call to the function 1460 creates and returns a generator object using Python yield statement. The 1461 molecules are created during the subsequent iteration by the generator object. 1462 1463 Arguments: 1464 FileName (str): Name of a file with complete path. 1465 Sanitize (bool): Sanitize molecules. 1466 RemoveHydrogens (bool): Remove hydrogens from molecules. 1467 1468 Returns: 1469 list : A Python generator object for iterating over the molecules. 1470 1471 """ 1472 1473 return _Mol2MolSupplier(FileName, Sanitize, RemoveHydrogens) 1474 1475 def _Mol2MolSupplier(FileName, Sanitize = True, RemoveHydrogens = True): 1476 """Read molecules from a Tripos Mol2 file.""" 1477 1478 fh = open(FileName, 'r') 1479 1480 FirstMol = True 1481 ProcessingMol = False 1482 1483 for Line in fh: 1484 if re.match("^#", Line, re.I): 1485 continue 1486 1487 if re.match("^@<TRIPOS>MOLECULE", Line, re.I): 1488 ProcessingMol = True 1489 1490 if FirstMol: 1491 FirstMol = False 1492 1493 MolLines = [] 1494 MolLines.append(Line) 1495 continue 1496 1497 # Process lines for existing molecule... 1498 MolBlock = "".join(MolLines) 1499 1500 Mol = Chem.MolFromMol2Block(MolBlock, sanitize = Sanitize, removeHs = RemoveHydrogens) 1501 yield Mol 1502 1503 # Track lines for next molecule... 1504 MolLines = [] 1505 MolLines.append(Line) 1506 continue 1507 1508 if not ProcessingMol: 1509 continue 1510 1511 MolLines.append(Line) 1512 1513 fh.close 1514 1515 # Process last molecule... 1516 if len(MolLines): 1517 MolBlock = "".join(MolLines) 1518 Mol = Chem.MolFromMol2Block(MolBlock, sanitize = Sanitize, removeHs = RemoveHydrogens) 1519 yield Mol 1520 1521 def ReadMoleculesFromPDBFile(FileName, Sanitize = True, RemoveHydrogens = True): 1522 """Read molecule from a PDB file. 1523 1524 Arguments: 1525 FileName (str): Name of a file with complete path. 1526 Sanitize (bool): Sanitize molecules. 1527 RemoveHydrogens (bool): Remove hydrogens from molecules. 1528 1529 Returns: 1530 list : List of RDKit molecule objects. 1531 1532 """ 1533 1534 Mols = [] 1535 Mols.append(Chem.MolFromPDBFile(FileName, sanitize = Sanitize, removeHs = RemoveHydrogens)) 1536 return Mols 1537 1538 def ReadMoleculesFromSMILESFile(FileName, SMILESDelimiter = ' ', SMILESColIndex = 0, SMILESNameColIndex = 1, SMILESTitleLine = 1, Sanitize = 1): 1539 """Read molecules from a SMILES file. 1540 1541 Arguments: 1542 SMILESDelimiter (str): Delimiter for parsing SMILES line 1543 SMILESColIndex (int): Column index containing SMILES string. 1544 SMILESNameColIndex (int): Column index containing molecule name. 1545 SMILESTitleLine (int): Flag to indicate presence of title line. 1546 Sanitize (int): Sanitize molecules. 1547 1548 Returns: 1549 list : List of RDKit molecule objects. 1550 1551 """ 1552 1553 return Chem.SmilesMolSupplier(FileName, delimiter = SMILESDelimiter, smilesColumn = SMILESColIndex, nameColumn = SMILESNameColIndex, titleLine = SMILESTitleLine, sanitize = Sanitize) 1554 1555 def MoleculesWriter(FileName, **KeyWordArgs): 1556 """Set up a molecule writer. 1557 1558 Arguments: 1559 FileName (str): Name of a file with complete path. 1560 **KeyWordArgs (dictionary) : Parameter name and value pairs for writing and 1561 processing molecules. 1562 1563 Returns: 1564 RDKit object : Molecule writer. 1565 1566 Notes: 1567 The file extension is used to determine type of the file and set up an appropriate 1568 file writer. 1569 1570 """ 1571 1572 # Set default values for possible arguments... 1573 WriterArgs = {"Compute2DCoords" : False, "Kekulize": True, "ForceV3000": False, "SMILESKekulize": False, "SMILESDelimiter" : ' ', "SMILESIsomeric": True, "SMILESTitleLine": True, "SMILESMolName": True} 1574 1575 # Set specified values for possible arguments... 1576 for Arg in WriterArgs: 1577 if Arg in KeyWordArgs: 1578 WriterArgs[Arg] = KeyWordArgs[Arg] 1579 1580 Writer = None 1581 if MiscUtil.CheckFileExt(FileName, "sdf sd"): 1582 Writer = Chem.SDWriter(FileName) 1583 Writer.SetKekulize(WriterArgs["Kekulize"]) 1584 Writer.SetForceV3000(WriterArgs["ForceV3000"]) 1585 elif MiscUtil.CheckFileExt(FileName, "pdb"): 1586 Writer = Chem.PDBWriter(FileName) 1587 elif MiscUtil.CheckFileExt(FileName, "smi"): 1588 # Text for the name column in the title line. Blank indicates not to include name column 1589 # in the output file... 1590 NameHeader = 'Name' if WriterArgs["SMILESMolName"] else '' 1591 Writer = Chem.SmilesWriter(FileName, delimiter = WriterArgs["SMILESDelimiter"], nameHeader = NameHeader, includeHeader = WriterArgs["SMILESTitleLine"], isomericSmiles = WriterArgs["SMILESIsomeric"], kekuleSmiles = WriterArgs["SMILESKekulize"]) 1592 else: 1593 MiscUtil.PrintWarning("RDKitUtil.WriteMolecules: Non supported file type: %s" % FileName) 1594 1595 return Writer 1596 1597 def WriteMolecules(FileName, Mols, **KeyWordArgs): 1598 """Write molecules to an output file. 1599 1600 Arguments: 1601 FileName (str): Name of a file with complete path. 1602 Mols (list): List of RDKit molecule objects. 1603 **KeyWordArgs (dictionary) : Parameter name and value pairs for writing and 1604 processing molecules. 1605 1606 Returns: 1607 int : Number of total molecules. 1608 int : Number of processed molecules written to output file. 1609 1610 Notes: 1611 The file extension is used to determine type of the file and set up an appropriate 1612 file writer. 1613 1614 """ 1615 1616 Compute2DCoords = False 1617 if "Compute2DCoords" in KeyWordArgs: 1618 Compute2DCoords = KeyWordArgs["Compute2DCoords"] 1619 1620 SetSMILESMolProps = KeyWordArgs["SetSMILESMolProps"] if "SetSMILESMolProps" in KeyWordArgs else False 1621 1622 MolCount = 0 1623 ProcessedMolCount = 0 1624 1625 Writer = MoleculesWriter(FileName, **KeyWordArgs) 1626 1627 if Writer is None: 1628 return (MolCount, ProcessedMolCount) 1629 1630 FirstMol = True 1631 for Mol in Mols: 1632 MolCount += 1 1633 if Mol is None: 1634 continue 1635 1636 if FirstMol: 1637 FirstMol = False 1638 if SetSMILESMolProps: 1639 SetWriterMolProps(Writer, Mol) 1640 1641 ProcessedMolCount += 1 1642 if Compute2DCoords: 1643 AllChem.Compute2DCoords(Mol) 1644 1645 Writer.write(Mol) 1646 1647 Writer.close() 1648 1649 return (MolCount, ProcessedMolCount) 1650 1651 def SetWriterMolProps(Writer, Mol): 1652 """Setup molecule properties for a writer to output. 1653 1654 Arguments: 1655 Writer (object): RDKit writer object. 1656 Mol (object): RDKit molecule object. 1657 1658 Returns: 1659 object : Writer object. 1660 1661 """ 1662 PropNames = list(Mol.GetPropNames()) 1663 if len(PropNames): 1664 Writer.SetProps(PropNames) 1665 1666 return Writer 1667