#!/bin/env python
#
# File: RDKitFilterTorsionLibraryAlerts.py
# Author: Manish Sud <msud@san.rr.com>
#
# Collaborator: Pat Walters
#
# Acknowledgments: Wolfgang Guba, Patrick Penner, and Levi Pierce
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This script uses the Torsion Library jointly developed by the University
# of Hamburg, Center for Bioinformatics, Hamburg, Germany and
# F. Hoffmann-La-Roche Ltd., Basel, Switzerland.
#
# The functionality available in this script is implemented using RDKit, an
# open source toolkit for cheminformatics developed by Greg Landrum.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

from __future__ import print_function

# Add local python path to the global path and import standard library modules...
import os
import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
import time
import re
import glob
import multiprocessing as mp

# RDKit imports...
try:
    from rdkit import rdBase
    from rdkit import Chem
    from rdkit.Chem import rdMolTransforms
except ImportError as ErrMsg:
    sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
    sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
    sys.exit(1)

# MayaChemTools imports...
try:
    from docopt import docopt
    import MiscUtil
    import RDKitUtil
    from TorsionAlerts.TorsionLibraryAlerts import TorsionLibraryAlerts
except ImportError as ErrMsg:
    sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
    sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
    sys.exit(1)

ScriptName = os.path.basename(sys.argv[0])

# Raw command line options and the processed/validated options derived from
# them. Both are module level so that worker processes can restore them in
# InitializeWorkerProcess().
Options = {}
OptionsInfo = {}

def main():
    """Start execution of the script.

    Retrieves the command line options and then either lists torsion library
    information ("--list") or validates the options and filters molecules.
    The elapsed time is reported at the end.
    """

    MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))

    (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()

    # Retrieve command line arguments and options...
    RetrieveOptions()

    if Options["--list"]:
        # Handle listing of torsion library information...
        ProcessListTorsionLibraryOption()
    else:
        # Process and validate command line arguments and options...
        ProcessOptions()

        # Perform actions required by the script...
        PerformFiltering()

    MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
    MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))

def PerformFiltering():
    """Filter molecules using SMARTS torsion rules in the torsion library file.

    Reads molecules from OptionsInfo["Infile"], processes them and prints the
    molecule counts returned by ProcessMolecules().
    """

    # Setup a molecule reader...
    MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
    Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])

    MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount = ProcessMolecules(Mols)

    MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
    MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
    MiscUtil.PrintInfo("Number of molecules failed during writing: %d" % WriteFailedCount)
    MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount + WriteFailedCount))

    MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount)
    MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount))

def ProcessMolecules(Mols):
    """Process and filter molecules.

    Dispatches to the multiprocessing or the single process implementation
    depending on OptionsInfo["MPMode"] and returns its result, a tuple
    (MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount).
    """

    if OptionsInfo["MPMode"]:
        return ProcessMoleculesUsingMultipleProcesses(Mols)
    else:
        return ProcessMoleculesUsingSingleProcess(Mols)

def ProcessMoleculesUsingSingleProcess(Mols):
    """Process and filter molecules using a single process.

    Returns a tuple (MolCount, ValidMolCount, RemainingMolCount,
    WriteFailedCount). Molecules that are None, empty or not flagged as 3D
    are counted in MolCount only.
    """

    # Instantiate torsion library alerts class...
    TorsionLibraryAlertsHandle = InstantiateTorsionLibraryAlertsClass()

    MiscUtil.PrintInfo("\nFiltering molecules...")

    OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]

    # Set up writers...
    OutfilesWriters = SetupOutfilesWriters()

    WriterRemaining = OutfilesWriters["WriterRemaining"]
    WriterFiltered = OutfilesWriters["WriterFiltered"]
    WriterAlertSummary = OutfilesWriters["WriterAlertSummary"]

    # Initialize alerts summary info...
    TorsionAlertsSummaryInfo = InitializeTorsionAlertsSummaryInfo()

    (MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount, FilteredMolWriteCount) = [0] * 5
    for Mol in Mols:
        MolCount += 1

        if Mol is None:
            continue

        if RDKitUtil.IsMolEmpty(Mol):
            MiscUtil.PrintWarning("Ignoring empty molecule: %s" % RDKitUtil.GetMolName(Mol, MolCount))
            continue

        # Check for 3D flag...
        if not Mol.GetConformer().Is3D():
            MiscUtil.PrintWarning("3D tag is not set. Ignoring molecule: %s\n" % RDKitUtil.GetMolName(Mol, MolCount))
            continue

        ValidMolCount += 1

        # Identify torsion library alerts for rotatable bonds..
        RotBondsAlertsStatus, RotBondsAlertsInfo = TorsionLibraryAlertsHandle.IdentifyTorsionLibraryAlertsForRotatableBonds(Mol)

        TrackTorsionAlertsSummaryInfo(TorsionAlertsSummaryInfo, RotBondsAlertsInfo)

        # Write out filtered and remaining molecules...
        WriteStatus = True
        if RotBondsAlertsStatus:
            if OutfileFilteredMode:
                WriteStatus = WriteMolecule(WriterFiltered, Mol, RotBondsAlertsInfo)
                if WriteStatus:
                    FilteredMolWriteCount += 1
        else:
            RemainingMolCount += 1
            WriteStatus = WriteMolecule(WriterRemaining, Mol, RotBondsAlertsInfo)

        if not WriteStatus:
            WriteFailedCount += 1

    WriteTorsionAlertsSummaryInfo(WriterAlertSummary, TorsionAlertsSummaryInfo)
    CloseOutfilesWriters(OutfilesWriters)

    # The by-rule files are generated by reading the filtered file back, so
    # this runs only after the writers are closed and only when at least one
    # filtered molecule was written...
    if FilteredMolWriteCount:
        WriteTorsionAlertsFilteredByRulesInfo(TorsionAlertsSummaryInfo)

    return (MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount)

def ProcessMoleculesUsingMultipleProcesses(Mols):
    """Process and filter molecules using multiprocessing.

    Molecules are encoded, analyzed by WorkerProcess() in a process pool and
    written out by this (parent) process. Returns a tuple (MolCount,
    ValidMolCount, RemainingMolCount, WriteFailedCount).
    """

    MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...")

    MPParams = OptionsInfo["MPParams"]
    OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]

    # Instantiate torsion strain energy alerts class to list torsion library information.
    # The handle itself is not used in this function: each worker process creates its
    # own instance in InitializeWorkerProcess()...
    TorsionLibraryAlertsHandle = InstantiateTorsionLibraryAlertsClass()

    # Set up writers...
    OutfilesWriters = SetupOutfilesWriters()

    WriterRemaining = OutfilesWriters["WriterRemaining"]
    WriterFiltered = OutfilesWriters["WriterFiltered"]
    WriterAlertSummary = OutfilesWriters["WriterAlertSummary"]

    # Initialize alerts summary info...
    TorsionAlertsSummaryInfo = InitializeTorsionAlertsSummaryInfo()

    # Setup data for initializing a worker process...
    MiscUtil.PrintInfo("Encoding options info and rotatable bond pattern molecule...")
    InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))

    # Setup a encoded mols data iterable for a worker process...
    WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)

    # Setup process pool along with data initialization for each process...
    MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
    MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))

    ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)

    # Start processing...
    if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
        Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
    elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
        Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
    else:
        # NOTE(review): Results stays unbound if PrintError() returns - presumably
        # it terminates the script; confirm against MiscUtil.
        MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))

    (MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount, FilteredMolWriteCount) = [0] * 5
    for Result in Results:
        MolCount += 1
        MolIndex, EncodedMol, RotBondsAlertsStatus, RotBondsAlertsInfo = Result

        # Workers return None instead of the encoded molecule for molecules
        # they ignored...
        if EncodedMol is None:
            continue
        ValidMolCount += 1

        Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)

        TrackTorsionAlertsSummaryInfo(TorsionAlertsSummaryInfo, RotBondsAlertsInfo)

        # Write out filtered and remaining molecules...
        WriteStatus = True
        if RotBondsAlertsStatus:
            if OutfileFilteredMode:
                WriteStatus = WriteMolecule(WriterFiltered, Mol, RotBondsAlertsInfo)
                if WriteStatus:
                    FilteredMolWriteCount += 1
        else:
            RemainingMolCount += 1
            WriteStatus = WriteMolecule(WriterRemaining, Mol, RotBondsAlertsInfo)

        if not WriteStatus:
            WriteFailedCount += 1

    # All results have been consumed: shut down the worker processes instead of
    # leaving the pool open until interpreter exit...
    ProcessPool.close()
    ProcessPool.join()

    WriteTorsionAlertsSummaryInfo(WriterAlertSummary, TorsionAlertsSummaryInfo)
    CloseOutfilesWriters(OutfilesWriters)

    if FilteredMolWriteCount:
        WriteTorsionAlertsFilteredByRulesInfo(TorsionAlertsSummaryInfo)

    return (MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount)

def InitializeWorkerProcess(*EncodedArgs):
    """Initialize data for a worker process.

    EncodedArgs holds the Base64 encoded Options and OptionsInfo objects, in
    that order. They are decoded into the module level globals and a torsion
    library alerts instance is stored in
    OptionsInfo["TorsionLibraryAlertsHandle"] for use by WorkerProcess().
    """

    global Options, OptionsInfo

    MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())

    # Decode Options and OptionInfo...
    Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
    OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])

    # Instantiate torsion library alerts class...
    OptionsInfo["TorsionLibraryAlertsHandle"] = InstantiateTorsionLibraryAlertsClass(Quiet = True)

def WorkerProcess(EncodedMolInfo):
    """Process data for a worker process.

    EncodedMolInfo is a (MolIndex, EncodedMol) pair. Returns a list
    [MolIndex, EncodedMol, RotBondsAlertsStatus, RotBondsAlertsInfo]; for a
    missing, empty or non-3D molecule the list is [MolIndex, None, False, None].
    """

    MolIndex, EncodedMol = EncodedMolInfo

    if EncodedMol is None:
        return [MolIndex, None, False, None]

    Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
    if RDKitUtil.IsMolEmpty(Mol):
        MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
        MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
        return [MolIndex, None, False, None]

    # Check for 3D flag...
    if not Mol.GetConformer().Is3D():
        MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
        MiscUtil.PrintWarning("3D tag is not set. Ignoring molecule: %s\n" % MolName)
        return [MolIndex, None, False, None]

    # Identify torsion library alerts for rotatable bonds..
    TorsionTorsionLibraryAlertsHandle = OptionsInfo["TorsionLibraryAlertsHandle"]
    RotBondsAlertsStatus, RotBondsAlertsInfo = TorsionTorsionLibraryAlertsHandle.IdentifyTorsionLibraryAlertsForRotatableBonds(Mol)

    return [MolIndex, EncodedMol, RotBondsAlertsStatus, RotBondsAlertsInfo]

def InitializeTorsionAlertsSummaryInfo():
    """Initialize torsion alerts summary.

    Returns None in count mode or when OptionsInfo["TrackAlertsSummaryInfo"]
    is not set. Otherwise returns a dictionary containing an empty "RuleIDs"
    list and an empty dictionary for each tracked data label.
    """

    if OptionsInfo["CountMode"]:
        return None

    if not OptionsInfo["TrackAlertsSummaryInfo"]:
        return None

    TorsionAlertsSummaryInfo = {}
    TorsionAlertsSummaryInfo["RuleIDs"] = []

    for DataLabel in ["SMARTSToRuleIDs", "RuleSMARTS", "HierarchyClassName", "HierarchySubClassName", "TorsionRulePeaks", "TorsionRuleTolerances1", "TorsionRuleTolerances2", "AlertTypes", "AlertTypesMolCount"]:
        TorsionAlertsSummaryInfo[DataLabel] = {}

    return TorsionAlertsSummaryInfo

def TrackTorsionAlertsSummaryInfo(TorsionAlertsSummaryInfo, RotBondsAlertsInfo):
    """Track torsion alerts summary information for matched torsion rules in a
    molecule.

    Updates TorsionAlertsSummaryInfo in place. For each torsion rule it keeps,
    per alert type, the number of alerts across all molecules ("AlertTypes")
    and the number of molecules with at least one such alert
    ("AlertTypesMolCount"). Does nothing in count mode, when tracking is
    turned off or when RotBondsAlertsInfo is None.
    """

    if OptionsInfo["CountMode"]:
        return

    if not OptionsInfo["TrackAlertsSummaryInfo"]:
        return

    if RotBondsAlertsInfo is None:
        return

    # Alert counts for the current molecule only. These are used at the end to
    # bump the per-rule molecule counts once per molecule...
    MolAlertsInfo = {}
    MolAlertsInfo["RuleIDs"] = []
    MolAlertsInfo["AlertTypes"] = {}

    for ID in RotBondsAlertsInfo["IDs"]:
        if not RotBondsAlertsInfo["MatchStatus"][ID]:
            continue

        if OptionsInfo["OutfileAlertsOnly"]:
            if RotBondsAlertsInfo["AlertTypes"][ID] not in OptionsInfo["SpecifiedAlertsModeList"]:
                continue

        AlertType = RotBondsAlertsInfo["AlertTypes"][ID]
        TorsionRuleNodeID = RotBondsAlertsInfo["TorsionRuleNodeID"][ID]
        TorsionRuleSMARTS = RotBondsAlertsInfo["TorsionRuleSMARTS"][ID]

        # Track data for torsion alert summary information across molecules...
        if TorsionRuleNodeID not in TorsionAlertsSummaryInfo["RuleSMARTS"]:
            TorsionAlertsSummaryInfo["RuleIDs"].append(TorsionRuleNodeID)
            TorsionAlertsSummaryInfo["SMARTSToRuleIDs"][TorsionRuleSMARTS] = TorsionRuleNodeID

            TorsionAlertsSummaryInfo["RuleSMARTS"][TorsionRuleNodeID] = TorsionRuleSMARTS
            TorsionAlertsSummaryInfo["HierarchyClassName"][TorsionRuleNodeID] = RotBondsAlertsInfo["HierarchyClassNames"][ID]
            TorsionAlertsSummaryInfo["HierarchySubClassName"][TorsionRuleNodeID] = RotBondsAlertsInfo["HierarchySubClassNames"][ID]

            TorsionAlertsSummaryInfo["TorsionRulePeaks"][TorsionRuleNodeID] = RotBondsAlertsInfo["TorsionRulePeaks"][ID]
            TorsionAlertsSummaryInfo["TorsionRuleTolerances1"][TorsionRuleNodeID] = RotBondsAlertsInfo["TorsionRuleTolerances1"][ID]
            TorsionAlertsSummaryInfo["TorsionRuleTolerances2"][TorsionRuleNodeID] = RotBondsAlertsInfo["TorsionRuleTolerances2"][ID]

            # Initialize number of alert types across all molecules...
            TorsionAlertsSummaryInfo["AlertTypes"][TorsionRuleNodeID] = {}

            # Initialize number of molecules flagged by each alert type...
            TorsionAlertsSummaryInfo["AlertTypesMolCount"][TorsionRuleNodeID] = {}

        if AlertType not in TorsionAlertsSummaryInfo["AlertTypes"][TorsionRuleNodeID]:
            TorsionAlertsSummaryInfo["AlertTypes"][TorsionRuleNodeID][AlertType] = 0
            TorsionAlertsSummaryInfo["AlertTypesMolCount"][TorsionRuleNodeID][AlertType] = 0

        TorsionAlertsSummaryInfo["AlertTypes"][TorsionRuleNodeID][AlertType] += 1

        # Track data for torsion alert information in a molecule...
        if TorsionRuleNodeID not in MolAlertsInfo["AlertTypes"]:
            MolAlertsInfo["RuleIDs"].append(TorsionRuleNodeID)
            MolAlertsInfo["AlertTypes"][TorsionRuleNodeID] = {}

        if AlertType not in MolAlertsInfo["AlertTypes"][TorsionRuleNodeID]:
            MolAlertsInfo["AlertTypes"][TorsionRuleNodeID][AlertType] = 0
        MolAlertsInfo["AlertTypes"][TorsionRuleNodeID][AlertType] += 1

    # Track number of molecules flagged by a specific torsion alert...
    for TorsionRuleNodeID in MolAlertsInfo["RuleIDs"]:
        for AlertType in MolAlertsInfo["AlertTypes"][TorsionRuleNodeID]:
            if MolAlertsInfo["AlertTypes"][TorsionRuleNodeID][AlertType]:
                TorsionAlertsSummaryInfo["AlertTypesMolCount"][TorsionRuleNodeID][AlertType] += 1

def WriteTorsionAlertsSummaryInfo(Writer, TorsionAlertsSummaryInfo):
    """Write out torsion alerts summary information to a CSV file.

    Writes a header line followed by one line per torsion rule, in the order
    returned by GetSortedTorsionAlertsSummaryInfoRuleIDs(). Nothing is written
    in count mode, when the summary file is not requested or when no rule
    triggered an alert.
    """

    if OptionsInfo["CountMode"]:
        return

    if not OptionsInfo["OutfileSummaryMode"]:
        return

    # InitializeTorsionAlertsSummaryInfo() returns None when tracking is off...
    if TorsionAlertsSummaryInfo is None or len(TorsionAlertsSummaryInfo["RuleIDs"]) == 0:
        return

    # Write headers...
    QuoteValues = True
    Values = ["TorsionRule", "TorsionPeaks", "Tolerances1", "Tolerances2", "HierarchyClass", "HierarchySubClass", "TorsionAlertTypes", "TorsionAlertCount", "TorsionAlertMolCount"]
    Writer.write("%s\n" % MiscUtil.JoinWords(Values, ",", QuoteValues))

    SortedRuleIDs = GetSortedTorsionAlertsSummaryInfoRuleIDs(TorsionAlertsSummaryInfo)

    # Write alerts information...
    for ID in SortedRuleIDs:
        # Remove any double quotes in SMARTS. A plain string replace is used to
        # remove all of them: re.sub() with re.I as its fourth positional argument
        # treats the flag as the maximum replacement count...
        RuleSMARTS = TorsionAlertsSummaryInfo["RuleSMARTS"][ID]
        RuleSMARTS = RuleSMARTS.replace("\"", "")

        HierarchyClassName = TorsionAlertsSummaryInfo["HierarchyClassName"][ID]
        HierarchySubClassName = TorsionAlertsSummaryInfo["HierarchySubClassName"][ID]

        TorsionPeaks = MiscUtil.JoinWords(["%s" % Value for Value in TorsionAlertsSummaryInfo["TorsionRulePeaks"][ID]], ",")
        TorsionRuleTolerances1 = MiscUtil.JoinWords(["%s" % Value for Value in TorsionAlertsSummaryInfo["TorsionRuleTolerances1"][ID]], ",")
        TorsionRuleTolerances2 = MiscUtil.JoinWords(["%s" % Value for Value in TorsionAlertsSummaryInfo["TorsionRuleTolerances2"][ID]], ",")

        AlertTypes = []
        AlertTypeCount = []
        AlertTypeMolCount = []
        for AlertType in sorted(TorsionAlertsSummaryInfo["AlertTypes"][ID]):
            AlertTypes.append(AlertType)
            AlertTypeCount.append("%s" % TorsionAlertsSummaryInfo["AlertTypes"][ID][AlertType])
            AlertTypeMolCount.append("%s" % TorsionAlertsSummaryInfo["AlertTypesMolCount"][ID][AlertType])

        Values = [RuleSMARTS, TorsionPeaks, TorsionRuleTolerances1, TorsionRuleTolerances2, HierarchyClassName, HierarchySubClassName, "%s" % MiscUtil.JoinWords(AlertTypes, ","), "%s" % (MiscUtil.JoinWords(AlertTypeCount, ",")), "%s" % (MiscUtil.JoinWords(AlertTypeMolCount, ","))]
        Writer.write("%s\n" % MiscUtil.JoinWords(Values, ",", QuoteValues))

def GetSortedTorsionAlertsSummaryInfoRuleIDs(TorsionAlertsSummaryInfo):
    """Sort torsion rule IDs by alert types molecule count in descending order.

    The sort key for a rule is the sum of its "AlertTypesMolCount" values over
    all of its alert types. Returns a new list; an empty list is returned when
    no rule IDs are tracked.
    """

    RuleIDs = TorsionAlertsSummaryInfo["RuleIDs"]
    if len(RuleIDs) == 0:
        return []

    # Setup a map from AlertTypesMolCount to IDs for sorting alerts...
    MolCountMap = {}
    for ID in RuleIDs:
        MolCountMap[ID] = sum(TorsionAlertsSummaryInfo["AlertTypesMolCount"][ID][AlertType] for AlertType in TorsionAlertsSummaryInfo["AlertTypes"][ID])

    return sorted(RuleIDs, key = lambda ID: MolCountMap[ID], reverse = True)

def WriteTorsionAlertsFilteredByRulesInfo(TorsionAlertsSummaryInfo):
    """Write out torsion alerts SD files for individual torsion rules.

    Reads the molecules back from OptionsInfo["OutfileFiltered"] and writes
    each one to the output file of every selected torsion rule listed in its
    torsion alerts data field.
    """

    if OptionsInfo["CountMode"]:
        return

    if not OptionsInfo["OutfilesFilteredByRulesMode"]:
        return

    # InitializeTorsionAlertsSummaryInfo() returns None when tracking is off...
    if TorsionAlertsSummaryInfo is None or len(TorsionAlertsSummaryInfo["RuleIDs"]) == 0:
        return

    # Setup a molecule reader for filtered molecules...
    FilteredMols = RDKitUtil.ReadMolecules(OptionsInfo["OutfileFiltered"], **OptionsInfo["InfileParams"])

    # Get torsion rule IDs for writing out filtered SD files for individual torsion alert rules...
    TorsionRuleIDs = GetTorsionAlertsFilteredByRuleFilesRuleIDs(TorsionAlertsSummaryInfo)

    # Setup writers...
    ByRuleOutfilesWriters = SetupByRuleOutfilesWriters(TorsionRuleIDs)

    for Mol in FilteredMols:
        # Skip any molecule the reader failed to parse...
        if Mol is None:
            continue

        # Retrieve torsion alerts info...
        TorsionAlertsInfo = RetrieveTorsionAlertsInfo(Mol, TorsionAlertsSummaryInfo)
        if TorsionAlertsInfo is None:
            continue

        for TorsionRuleID in TorsionRuleIDs:
            if TorsionRuleID not in TorsionAlertsInfo["RuleSMARTS"]:
                continue

            WriteMoleculeFilteredByRuleID(ByRuleOutfilesWriters[TorsionRuleID], Mol, TorsionRuleID, TorsionAlertsSummaryInfo, TorsionAlertsInfo)

    CloseByRuleOutfilesWriters(ByRuleOutfilesWriters)

def GetTorsionAlertsFilteredByRuleFilesRuleIDs(TorsionAlertsSummaryInfo):
    """Get torsion rule IDs for writing out individual SD files filtered by torsion alert rules.

    Returns the sorted rule IDs, truncated to
    OptionsInfo["OutfilesFilteredByRulesMaxCount"] entries unless
    OptionsInfo["OutfilesFilteredByRulesAllMode"] is set.
    """

    # Get torsion rule IDs triggering torsion alerts sorted in the order from the most to
    # the least number of unique molecules...
    RuleIDs = GetSortedTorsionAlertsSummaryInfoRuleIDs(TorsionAlertsSummaryInfo)

    # Select torsion rule IDs for writing out SD files...
    if not OptionsInfo["OutfilesFilteredByRulesAllMode"]:
        MaxRuleIDs = OptionsInfo["OutfilesFilteredByRulesMaxCount"]
        if MaxRuleIDs < len(RuleIDs):
            RuleIDs = RuleIDs[0:MaxRuleIDs]

    return RuleIDs

def RetrieveTorsionAlertsInfo(Mol, TorsionAlertsSummaryInfo):
    """Parse torsion alerts data field value to retrieve alerts information for rotatable bonds.

    The data field is split on whitespace into sets of 11 values, one set per
    rotatable bond, laid out as written by
    SetupMolPropertiesForAlertsInformation(). Returns None when the molecule
    has no or an empty torsion alerts data field; otherwise returns a
    dictionary keyed by data label and then by torsion rule ID.
    """

    TorsionAlertsLabel = OptionsInfo["SDFieldIDsToLabels"]["TorsionAlertsLabel"]
    TorsionAlerts = Mol.GetProp(TorsionAlertsLabel) if Mol.HasProp(TorsionAlertsLabel) else None

    if TorsionAlerts is None or len(TorsionAlerts) == 0:
        return None

    # Initialize for tracking by rule IDs...
    TorsionAlertsInfo = {}
    TorsionAlertsInfo["RuleIDs"] = []

    for DataLabel in ["RuleSMARTS", "HierarchyClassName", "HierarchySubClassName", "TorsionRulePeaks", "TorsionRuleTolerances1", "TorsionRuleTolerances2", "AlertTypes", "AtomIndices", "TorsionAtomIndices", "TorsionAngles", "TorsionAngleViolations", "AlertTypesCount"]:
        TorsionAlertsInfo[DataLabel] = {}

    ValuesDelimiter = OptionsInfo["IntraSetValuesDelim"]
    TorsionAlertsSetSize = 11

    TorsionAlertsWords = TorsionAlerts.split()
    if len(TorsionAlertsWords) % TorsionAlertsSetSize:
        MiscUtil.PrintError("The number of space delimited values, %s, for TorsionAlerts data field in filtered SD file must be a multiple of %s." % (len(TorsionAlertsWords), TorsionAlertsSetSize))

    for Index in range(0, len(TorsionAlertsWords), TorsionAlertsSetSize):
        RotBondIndices, TorsionAlertType, TorsionIndices, TorsionAngle, TorsionAngleViolation, HierarchyClass, HierarchySubClass, TorsionPeaks, Tolerances1, Tolerances2, TorsionRule = TorsionAlertsWords[Index: Index + TorsionAlertsSetSize]
        RotBondIndices = RotBondIndices.split(ValuesDelimiter)
        TorsionIndices = TorsionIndices.split(ValuesDelimiter)
        TorsionPeaks = TorsionPeaks.split(ValuesDelimiter)
        Tolerances1 = Tolerances1.split(ValuesDelimiter)
        Tolerances2 = Tolerances2.split(ValuesDelimiter)

        if TorsionRule not in TorsionAlertsSummaryInfo["SMARTSToRuleIDs"]:
            MiscUtil.PrintWarning("The SMARTS pattern, %s, for TorsionAlerts data field in filtered SD file doesn't map to any torsion rule..." % TorsionRule)
            continue
        TorsionRuleNodeID = TorsionAlertsSummaryInfo["SMARTSToRuleIDs"][TorsionRule]

        # Track data for torsion alerts in a molecule...
        if TorsionRuleNodeID not in TorsionAlertsInfo["RuleSMARTS"]:
            TorsionAlertsInfo["RuleIDs"].append(TorsionRuleNodeID)

            TorsionAlertsInfo["RuleSMARTS"][TorsionRuleNodeID] = TorsionRule
            TorsionAlertsInfo["HierarchyClassName"][TorsionRuleNodeID] = HierarchyClass
            TorsionAlertsInfo["HierarchySubClassName"][TorsionRuleNodeID] = HierarchySubClass
            TorsionAlertsInfo["TorsionRulePeaks"][TorsionRuleNodeID] = TorsionPeaks
            TorsionAlertsInfo["TorsionRuleTolerances1"][TorsionRuleNodeID] = Tolerances1
            TorsionAlertsInfo["TorsionRuleTolerances2"][TorsionRuleNodeID] = Tolerances2

            TorsionAlertsInfo["AlertTypes"][TorsionRuleNodeID] = []
            TorsionAlertsInfo["AtomIndices"][TorsionRuleNodeID] = []
            TorsionAlertsInfo["TorsionAtomIndices"][TorsionRuleNodeID] = []
            TorsionAlertsInfo["TorsionAngles"][TorsionRuleNodeID] = []
            TorsionAlertsInfo["TorsionAngleViolations"][TorsionRuleNodeID] = []

            TorsionAlertsInfo["AlertTypesCount"][TorsionRuleNodeID] = {}

        # Track multiple values for a rule ID...
        TorsionAlertsInfo["AlertTypes"][TorsionRuleNodeID].append(TorsionAlertType)
        TorsionAlertsInfo["AtomIndices"][TorsionRuleNodeID].append(RotBondIndices)
        TorsionAlertsInfo["TorsionAtomIndices"][TorsionRuleNodeID].append(TorsionIndices)
        TorsionAlertsInfo["TorsionAngles"][TorsionRuleNodeID].append(TorsionAngle)
        TorsionAlertsInfo["TorsionAngleViolations"][TorsionRuleNodeID].append(TorsionAngleViolation)

        # Count alert type for a rule ID...
        if TorsionAlertType not in TorsionAlertsInfo["AlertTypesCount"][TorsionRuleNodeID]:
            TorsionAlertsInfo["AlertTypesCount"][TorsionRuleNodeID][TorsionAlertType] = 0
        TorsionAlertsInfo["AlertTypesCount"][TorsionRuleNodeID][TorsionAlertType] += 1

    return TorsionAlertsInfo

def WriteMolecule(Writer, Mol, RotBondsAlertsInfo):
    """Write out molecule.

    Returns True when the molecule was written, or when nothing needs to be
    written in count mode, and False when the writer raised an exception.
    """

    if OptionsInfo["CountMode"]:
        return True

    SetupMolPropertiesForAlertsInformation(Mol, RotBondsAlertsInfo)

    # A failed write is reported and counted by the caller rather than
    # stopping the run...
    try:
        Writer.write(Mol)
    except Exception as ErrMsg:
        MiscUtil.PrintWarning("Failed to write molecule %s:\n%s\n" % (RDKitUtil.GetMolName(Mol), ErrMsg))
        return False

    return True

def SetupMolPropertiesForAlertsInformation(Mol, RotBondsAlertsInfo):
    """Setup molecule properties containing alerts information for rotatable bonds.

    Sets the rotatable bonds count, the Green/Orange/Red alert counts and the
    per rotatable bond torsion alerts on Mol. Does nothing unless
    OptionsInfo["OutfileAlerts"] is set.
    """

    if not OptionsInfo["OutfileAlerts"]:
        return

    # Setup rotatable bonds count...
    RotBondsCount = 0
    if RotBondsAlertsInfo is not None:
        RotBondsCount = len(RotBondsAlertsInfo["IDs"])
    Mol.SetProp(OptionsInfo["SDFieldIDsToLabels"]["RotBondsCountLabel"], "%s" % RotBondsCount)

    # Setup alert counts for rotatable bonds...
    AlertsCount = []
    if RotBondsAlertsInfo is not None:
        for AlertType in ["Green", "Orange", "Red"]:
            if AlertType in RotBondsAlertsInfo["Count"]:
                AlertsCount.append("%s" % RotBondsAlertsInfo["Count"][AlertType])
            else:
                AlertsCount.append("0")

    if len(AlertsCount):
        Mol.SetProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionAlertsCountLabel"], "%s" % MiscUtil.JoinWords(AlertsCount, " "))

    # Setup alert information for rotatable bonds...
    AlertsInfoValues = []

    # Delimiter for multiple values corresponding to specific set of information for
    # a rotatable bond. For example: TorsionAtomIndices
    ValuesDelim = OptionsInfo["IntraSetValuesDelim"]

    # Delimiter for various values for a rotatable bond...
    RotBondValuesDelim = OptionsInfo["InterSetValuesDelim"]

    # Delimiter for values corresponding to multiple rotatable bonds...
    AlertsInfoValuesDelim = OptionsInfo["InterSetValuesDelim"]

    if RotBondsAlertsInfo is not None:
        for ID in RotBondsAlertsInfo["IDs"]:
            if not RotBondsAlertsInfo["MatchStatus"][ID]:
                continue

            if OptionsInfo["OutfileAlertsOnly"]:
                if RotBondsAlertsInfo["AlertTypes"][ID] not in OptionsInfo["SpecifiedAlertsModeList"]:
                    continue

            RotBondValues = []

            # Bond atom indices...
            Values = ["%s" % Value for Value in RotBondsAlertsInfo["AtomIndices"][ID]]
            RotBondValues.append(ValuesDelim.join(Values))

            # Alert type...
            RotBondValues.append(RotBondsAlertsInfo["AlertTypes"][ID])

            # Torsion atom indices...
            TorsionAtomIndices = SetupTorsionAtomIndicesValues(RotBondsAlertsInfo["TorsionAtomIndices"][ID], ValuesDelim)
            RotBondValues.append(TorsionAtomIndices)

            # Torsion angle...
            RotBondValues.append("%.2f" % RotBondsAlertsInfo["TorsionAngles"][ID])

            # Torsion angle violation...
            RotBondValues.append("%.2f" % RotBondsAlertsInfo["TorsionAngleViolations"][ID])

            # Hierarchy class and subclass names...
            RotBondValues.append("%s" % RotBondsAlertsInfo["HierarchyClassNames"][ID])
            RotBondValues.append("%s" % RotBondsAlertsInfo["HierarchySubClassNames"][ID])

            # Torsion rule peaks...
            Values = ["%s" % Value for Value in RotBondsAlertsInfo["TorsionRulePeaks"][ID]]
            RotBondValues.append(ValuesDelim.join(Values))

            # Torsion rule tolerances...
            Values = ["%s" % Value for Value in RotBondsAlertsInfo["TorsionRuleTolerances1"][ID]]
            RotBondValues.append(ValuesDelim.join(Values))
            Values = ["%s" % Value for Value in RotBondsAlertsInfo["TorsionRuleTolerances2"][ID]]
            RotBondValues.append(ValuesDelim.join(Values))

            # Torsion rule SMARTS...
            RotBondValues.append("%s" % RotBondsAlertsInfo["TorsionRuleSMARTS"][ID])

            # Track joined values for a rotatable bond...
            AlertsInfoValues.append("%s" % RotBondValuesDelim.join(RotBondValues))

    if len(AlertsInfoValues):
        Mol.SetProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionAlertsLabel"], "%s" % ("%s" % AlertsInfoValuesDelim.join(AlertsInfoValues)))

def WriteMoleculeFilteredByRuleID(Writer, Mol, TorsionRuleID, TorsionAlertsSummaryInfo, TorsionAlertsInfo):
    """Write out molecule to the output file for a specific torsion rule.

    The molecule properties are first replaced by the alerts information for
    TorsionRuleID. Nothing is written in count mode.
    """

    if OptionsInfo["CountMode"]:
        return

    SetupMolPropertiesForFilteredByRuleIDAlertsInformation(Mol, TorsionRuleID, TorsionAlertsSummaryInfo, TorsionAlertsInfo)

    Writer.write(Mol)

def SetupMolPropertiesForFilteredByRuleIDAlertsInformation(Mol, TorsionRuleID, TorsionAlertsSummaryInfo, TorsionAlertsInfo):
    """Setup molecule properties containing alerts information for torsion alerts
    filtered by Rule IDs.

    Clears the torsion alerts property covering all rotatable bonds and sets
    the torsion rule, its alert counts, its alerts and its maximum angle
    violation for TorsionRuleID.
    """

    # Delete torsion alerts information for rotatable bonds...
    if Mol.HasProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionAlertsLabel"]):
        Mol.ClearProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionAlertsLabel"])

    # Delimiter for values...
    IntraSetValuesDelim = OptionsInfo["IntraSetValuesDelim"]
    InterSetValuesDelim = OptionsInfo["InterSetValuesDelim"]

    # Setup alert rule information...
    AlertRuleInfoValues = []

    AlertRuleInfoValues.append("%s" % TorsionAlertsInfo["HierarchyClassName"][TorsionRuleID])
    AlertRuleInfoValues.append("%s" % TorsionAlertsInfo["HierarchySubClassName"][TorsionRuleID])

    Values = ["%s" % Value for Value in TorsionAlertsInfo["TorsionRulePeaks"][TorsionRuleID]]
    AlertRuleInfoValues.append(IntraSetValuesDelim.join(Values))

    Values = ["%s" % Value for Value in TorsionAlertsInfo["TorsionRuleTolerances1"][TorsionRuleID]]
    AlertRuleInfoValues.append(IntraSetValuesDelim.join(Values))
    Values = ["%s" % Value for Value in TorsionAlertsInfo["TorsionRuleTolerances2"][TorsionRuleID]]
    AlertRuleInfoValues.append(IntraSetValuesDelim.join(Values))

    AlertRuleInfoValues.append("%s" % TorsionAlertsInfo["RuleSMARTS"][TorsionRuleID])

    Mol.SetProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionRuleLabel"], "%s" % ("%s" % InterSetValuesDelim.join(AlertRuleInfoValues)))

    # Setup alerts count for torsion rule...
    AlertsCount = []
    for AlertType in ["Green", "Orange", "Red"]:
        if AlertType in TorsionAlertsInfo["AlertTypesCount"][TorsionRuleID]:
            AlertsCount.append("%s" % TorsionAlertsInfo["AlertTypesCount"][TorsionRuleID][AlertType])
        else:
            AlertsCount.append("0")

    Mol.SetProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionRuleAlertsCountLabel"], "%s" % (InterSetValuesDelim.join(AlertsCount)))

    # Setup torsion rule alerts...
    AlertsInfoValues = []
    for Index in range(0, len(TorsionAlertsInfo["AlertTypes"][TorsionRuleID])):
        RotBondInfoValues = []

        # Bond atom indices...
        Values = ["%s" % Value for Value in TorsionAlertsInfo["AtomIndices"][TorsionRuleID][Index]]
        RotBondInfoValues.append(IntraSetValuesDelim.join(Values))

        # Alert type...
        RotBondInfoValues.append(TorsionAlertsInfo["AlertTypes"][TorsionRuleID][Index])

        # Torsion atom indices retrieved from the filtered SD file and stored as strings...
        Values = ["%s" % Value for Value in TorsionAlertsInfo["TorsionAtomIndices"][TorsionRuleID][Index]]
        RotBondInfoValues.append(IntraSetValuesDelim.join(Values))

        # Torsion angle...
        RotBondInfoValues.append(TorsionAlertsInfo["TorsionAngles"][TorsionRuleID][Index])

        # Torsion angle violation...
        RotBondInfoValues.append(TorsionAlertsInfo["TorsionAngleViolations"][TorsionRuleID][Index])

        # Track alerts information...
        AlertsInfoValues.append("%s" % InterSetValuesDelim.join(RotBondInfoValues))

    Mol.SetProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionRuleAlertsLabel"], "%s" % (InterSetValuesDelim.join(AlertsInfoValues)))

    # Setup torsion rule alert max angle violation...
    TorsionAngleViolations = [float(Angle) for Angle in TorsionAlertsInfo["TorsionAngleViolations"][TorsionRuleID]]
    Mol.SetProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionRuleMaxAngleViolationLabel"], "%.2f" % (max(TorsionAngleViolations)))

def SetupTorsionAtomIndicesValues(TorsionAtomIndicesList, ValuesDelim):
    """Setup torsion atom indices value for output files.

    Returns the values in TorsionAtomIndicesList formatted as strings and
    joined by ValuesDelim. Any value which is itself a list is written out as
    "N_lp".
    """

    # Check for any list values in the list of torsion atom indices used as placeholders
    # for positions of lone pairs in torsion rules containing N_lp...
    TorsionAtomsInfo = []
    for Value in TorsionAtomIndicesList:
        if isinstance(Value, list):
            TorsionAtomsInfo.append("N_lp")
        else:
            TorsionAtomsInfo.append(Value)

    Values = ["%s" % Value for Value in TorsionAtomsInfo]

    return ValuesDelim.join(Values)

def SetupOutfilesWriters():
    """Setup molecule and summary writers.

    Returns a dictionary with the keys "WriterRemaining", "WriterFiltered" and
    "WriterAlertSummary". The value for a writer which is not needed is None.
    """

    OutfilesWriters = {"WriterRemaining": None, "WriterFiltered": None, "WriterAlertSummary": None}

    # Writers for SD files...
    WriterRemaining, WriterFiltered = SetupMoleculeWriters()
    OutfilesWriters["WriterRemaining"] = WriterRemaining
    OutfilesWriters["WriterFiltered"] = WriterFiltered

    # Writer for alert summary CSV file...
    WriterAlertSummary = SetupAlertSummaryWriter()
    OutfilesWriters["WriterAlertSummary"] = WriterAlertSummary

    return OutfilesWriters

def SetupMoleculeWriters():
    """Setup molecule writers.

    Returns a tuple (Writer, WriterFiltered) for the remaining and the
    filtered molecules. Both are None in count mode; WriterFiltered is also
    None when OptionsInfo["OutfileFilteredMode"] is not set.
    """

    Writer = None
    WriterFiltered = None

    if OptionsInfo["CountMode"]:
        return (Writer, WriterFiltered)

    Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
    if Writer is None:
        MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
    MiscUtil.PrintInfo("\nGenerating file %s..." % OptionsInfo["Outfile"])

    if OptionsInfo["OutfileFilteredMode"]:
        WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"])
        if WriterFiltered is None:
            MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["OutfileFiltered"])
        MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["OutfileFiltered"])

    return (Writer, WriterFiltered)

def SetupAlertSummaryWriter():
    """Setup a alert summary writer.

    Returns a file object opened for writing OptionsInfo["OutfileSummary"], or
    None in count mode or when the summary file is not requested.
    """

    Writer = None

    if OptionsInfo["CountMode"]:
        return Writer

    if not OptionsInfo["OutfileSummaryMode"]:
        return Writer

    Outfile = OptionsInfo["OutfileSummary"]

    # open() raises on failure and never returns None, so the failure is
    # reported from the exception handler...
    try:
        Writer = open(Outfile, "w")
    except (IOError, OSError):
        MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)

    MiscUtil.PrintInfo("Generating file %s..." % Outfile)

    return Writer

def CloseOutfilesWriters(OutfilesWriters):
    """Close outfile writers.

    Writers set to None in the OutfilesWriters dictionary are skipped.
    """

    for Writer in OutfilesWriters.values():
        if Writer is not None:
            Writer.close()

def SetupByRuleOutfilesWriters(RuleIDs):
    """Setup by rule outfiles writers."""

    # Initialize...
845 OutfilesWriters = {} 846 for RuleID in RuleIDs: 847 OutfilesWriters[RuleID] = None 848 849 if OptionsInfo["CountMode"]: 850 return OutfilesWriters 851 852 if not OptionsInfo["OutfilesFilteredByRulesMode"]: 853 return OutfilesWriters 854 855 FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"]) 856 OutfilesRoot = "%s_Filtered_TopRule" % FileName 857 OutfilesExt = "sdf" 858 859 MsgTxt = "all" if OptionsInfo["OutfilesFilteredByRulesAllMode"] else "top %s" % OptionsInfo["OutfilesFilteredByRulesMaxCount"] 860 MiscUtil.PrintInfo("\nGenerating output files %s*.%s for %s torsion rules triggering alerts..." % (OutfilesRoot, OutfilesExt, MsgTxt)) 861 862 # Delete any existing output files... 863 Outfiles = glob.glob("%s*.%s" % (OutfilesRoot, OutfilesExt)) 864 if len(Outfiles): 865 MiscUtil.PrintInfo("Deleting existing output files %s*.%s..." % (OutfilesRoot, OutfilesExt)) 866 for Outfile in Outfiles: 867 try: 868 os.remove(Outfile) 869 except Exception as ErrMsg: 870 MiscUtil.PrintWarning("Failed to delete file: %s" % ErrMsg) 871 872 RuleIndex = 0 873 for RuleID in RuleIDs: 874 RuleIndex += 1 875 Outfile = "%s%s.%s" % (OutfilesRoot, RuleIndex, OutfilesExt) 876 Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"]) 877 if Writer is None: 878 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile) 879 880 OutfilesWriters[RuleID] = Writer 881 882 return OutfilesWriters 883 884 def CloseByRuleOutfilesWriters(OutfilesWriters): 885 """Close by rule outfile writers.""" 886 887 for RuleID, Writer in OutfilesWriters.items(): 888 if Writer is not None: 889 Writer.close() 890 891 def InstantiateTorsionLibraryAlertsClass(Quiet = False): 892 """Initialize torsion library alerts class.""" 893 894 try: 895 TorsionLibraryAlertsHandle = TorsionLibraryAlerts(AlertsMode = OptionsInfo["AlertsMode"], MinAlertsCount = OptionsInfo["MinAlertsCount"], NitrogenLonePairAllowHydrogenNbrs = 
OptionsInfo["NitrogenLonePairParams"]["AllowHydrogenNbrs"], NitrogenLonePairPlanarityTolerance = OptionsInfo["NitrogenLonePairParams"]["PlanarityTolerance"], RotBondsSMARTSMode = OptionsInfo["RotBondsSMARTSMode"], RotBondsSMARTSPattern = OptionsInfo["RotBondsSMARTSPattern"], TorsionLibraryFilePath = OptionsInfo["TorsionLibraryFile"]) 896 except Exception as ErrMsg: 897 MiscUtil.PrintError("Failed to instantiate TorsionLibraryAlerts:\n%s\n" % (ErrMsg)) 898 899 if not Quiet: 900 MiscUtil.PrintInfo("\nRetrieving data from library file %s..." % TorsionLibraryAlertsHandle.GetTorsionLibraryFilePath()) 901 TorsionLibraryAlertsHandle.ListTorsionLibraryInfo() 902 903 return TorsionLibraryAlertsHandle 904 905 906 def ProcessRotatableBondsSMARTSMode(): 907 """"Process SMARTS pattern for rotatable bonds.""" 908 909 RotBondsMode = OptionsInfo["RotBondsSMARTSMode"] 910 911 RotBondsSMARTSPattern = None 912 RotBondsSMARTSPatternSpecified = OptionsInfo["RotBondsSMARTSPatternSpecified"] 913 914 if re.match("^(NonStrict|SemiStrict|Strict)$", RotBondsMode, re.I): 915 RotBondsSMARTSPattern = None 916 elif re.match("Specify", RotBondsMode, re.I): 917 RotBondsSMARTSPatternSpecified = RotBondsSMARTSPatternSpecified.strip() 918 if not len(RotBondsSMARTSPatternSpecified): 919 MiscUtil.PrintError("Empty value specified for SMILES/SMARTS pattern in \"--rotBondsSMARTSPattern\" option, %s." % RotBondsMode) 920 921 RotBondsPatternMol = Chem.MolFromSmarts(RotBondsSMARTSPatternSpecified) 922 if RotBondsPatternMol is None: 923 MiscUtil.PrintError("Failed to create rotatable bonds pattern molecule. The rotatable bonds SMARTS pattern, \"%s\", specified using \"--rotBondsSMARTSPattern\" option is not valid." % (RotBondsSMARTSPatternSpecified)) 924 else: 925 MiscUtil.PrintError("The value, %s, specified for option \"-r, --rotBondsSMARTSMode\" is not valid. 
" % RotBondsMode) 926 927 OptionsInfo["RotBondsSMARTSPattern"] = RotBondsSMARTSPattern 928 929 930 931 def ProcessSDFieldLabelsOption(): 932 """Process SD data field label option.""" 933 934 ParamsOptionName = "--outfileSDFieldLabels" 935 ParamsOptionValue = Options["--outfileSDFieldLabels"] 936 937 ParamsIDsToLabels = {"RotBondsCountLabel": "RotBondsCount", "TorsionAlertsCountLabel": "TorsionAlertsCount (Green Orange Red)", "TorsionAlertsLabel": "TorsionAlerts (RotBondIndices TorsionAlert TorsionIndices TorsionAngle TorsionAngleViolation HierarchyClass HierarchySubClass TorsionPeaks Tolerances1 Tolerances2 TorsionRule)", "TorsionRuleLabel": "TorsionRule (HierarchyClass HierarchySubClass TorsionPeaks Tolerances1 Tolerances2 TorsionRule)", "TorsionRuleAlertsCountLabel": "TorsionRuleAlertsCount (Green Orange Red)", "TorsionRuleAlertsLabel": "TorsionRuleAlerts (RotBondIndices TorsionAlert TorsionIndices TorsionAngle TorsionAngleViolation)", "TorsionRuleMaxAngleViolationLabel": "TorsionRuleMaxAngleViolation"} 938 939 if re.match("^auto$", ParamsOptionValue, re.I): 940 OptionsInfo["SDFieldIDsToLabels"] = ParamsIDsToLabels 941 return 942 943 # Setup a canonical paramater names... 944 ValidParamNames = [] 945 CanonicalParamNamesMap = {} 946 for ParamName in sorted(ParamsIDsToLabels): 947 ValidParamNames.append(ParamName) 948 CanonicalParamNamesMap[ParamName.lower()] = ParamName 949 950 ParamsOptionValue = ParamsOptionValue.strip() 951 if not ParamsOptionValue: 952 PrintError("No valid parameter name and value pairs specified using \"%s\" option" % ParamsOptionName) 953 954 ParamsOptionValueWords = ParamsOptionValue.split(",") 955 if len(ParamsOptionValueWords) % 2: 956 MiscUtil.PrintError("The number of comma delimited paramater names and values, %d, specified using \"%s\" option must be an even number." % (len(ParamsOptionValueWords), ParamsOptionName)) 957 958 # Validate paramater name and value pairs... 
959 for Index in range(0, len(ParamsOptionValueWords), 2): 960 Name = ParamsOptionValueWords[Index].strip() 961 Value = ParamsOptionValueWords[Index + 1].strip() 962 963 CanonicalName = Name.lower() 964 if not CanonicalName in CanonicalParamNamesMap: 965 MiscUtil.PrintError("The parameter name, %s, specified using \"%s\" is not a valid name. Supported parameter names: %s" % (Name, ParamsOptionName, " ".join(ValidParamNames))) 966 967 ParamName = CanonicalParamNamesMap[CanonicalName] 968 ParamValue = Value 969 970 # Set value... 971 ParamsIDsToLabels[ParamName] = ParamValue 972 973 OptionsInfo["SDFieldIDsToLabels"] = ParamsIDsToLabels 974 975 def ProcessOptionNitrogenLonePairParameters(): 976 """Process nitrogen lone pair parameters option.""" 977 978 ParamsOptionName = "--nitrogenLonePairParams" 979 ParamsOptionValue = Options["--nitrogenLonePairParams"] 980 981 ParamsInfo = {"AllowHydrogenNbrs": True, "PlanarityTolerance": 1.0,} 982 983 if re.match("^auto$", ParamsOptionValue, re.I): 984 OptionsInfo["NitrogenLonePairParams"] = ParamsInfo 985 return 986 987 # Setup a canonical paramater names... 988 ValidParamNames = [] 989 CanonicalParamNamesMap = {} 990 for ParamName in sorted(ParamsInfo): 991 ValidParamNames.append(ParamName) 992 CanonicalParamNamesMap[ParamName.lower()] = ParamName 993 994 ParamsOptionValue = ParamsOptionValue.strip() 995 if not ParamsOptionValue: 996 PrintError("No valid parameter name and value pairs specified using \"%s\" option" % ParamsOptionName) 997 998 ParamsOptionValueWords = ParamsOptionValue.split(",") 999 if len(ParamsOptionValueWords) % 2: 1000 MiscUtil.PrintError("The number of comma delimited paramater names and values, %d, specified using \"%s\" option must be an even number." % (len(ParamsOptionValueWords), ParamsOptionName)) 1001 1002 # Validate paramater name and value pairs... 
1003 for Index in range(0, len(ParamsOptionValueWords), 2): 1004 Name = ParamsOptionValueWords[Index].strip() 1005 Value = ParamsOptionValueWords[Index + 1].strip() 1006 1007 CanonicalName = Name.lower() 1008 if not CanonicalName in CanonicalParamNamesMap: 1009 MiscUtil.PrintError("The parameter name, %s, specified using \"%s\" is not a valid name. Supported parameter names: %s" % (Name, ParamsOptionName, " ".join(ValidParamNames))) 1010 1011 ParamName = CanonicalParamNamesMap[CanonicalName] 1012 ParamValue = Value 1013 1014 if re.match("^PlanarityTolerance$", ParamName, re.I): 1015 Value = float(Value) 1016 if Value < 0: 1017 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: >= 0" % (Value, Name, ParamsOptionName)) 1018 ParamValue = Value 1019 elif re.match("^AllowHydrogenNbrs$", ParamName, re.I): 1020 if not re.match("^(yes|no)$", Value, re.I): 1021 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: yes or no" % (Value, Name, ParamsOptionName)) 1022 ParamValue = True if re.match("^yes$", Value, re.I) else False 1023 1024 # Set value... 1025 ParamsInfo[ParamName] = ParamValue 1026 1027 OptionsInfo["NitrogenLonePairParams"] = ParamsInfo 1028 1029 def ProcessOptions(): 1030 """Process and validate command line arguments and options.""" 1031 1032 MiscUtil.PrintInfo("Processing options...") 1033 1034 # Validate options... 
1035 ValidateOptions() 1036 1037 OptionsInfo["Infile"] = Options["--infile"] 1038 ParamsDefaultInfoOverride = {"RemoveHydrogens": False} 1039 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], InfileName = Options["--infile"], ParamsDefaultInfo = ParamsDefaultInfoOverride) 1040 1041 OptionsInfo["Outfile"] = Options["--outfile"] 1042 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]) 1043 1044 FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"]) 1045 OutfileFiltered = "%s_Filtered.%s" % (FileName, FileExt) 1046 OptionsInfo["OutfileFiltered"] = OutfileFiltered 1047 OptionsInfo["OutfileFilteredMode"] = True if re.match("^yes$", Options["--outfileFiltered"], re.I) else False 1048 1049 OutfileSummary = "%s_AlertsSummary.csv" % (FileName) 1050 OptionsInfo["OutfileSummary"] = OutfileSummary 1051 OptionsInfo["OutfileSummaryMode"] = True if re.match("^yes$", Options["--outfileSummary"], re.I) else False 1052 1053 OptionsInfo["OutfilesFilteredByRulesMode"] = True if re.match("^yes$", Options["--outfilesFilteredByRules"], re.I) else False 1054 OptionsInfo["TrackAlertsSummaryInfo"] = True if (OptionsInfo["OutfileSummaryMode"] or OptionsInfo["OutfilesFilteredByRulesMode"]) else False 1055 1056 OutfilesFilteredByRulesMaxCount = Options["--outfilesFilteredByRulesMaxCount"] 1057 if not re.match("^All$", OutfilesFilteredByRulesMaxCount, re.I): 1058 OutfilesFilteredByRulesMaxCount = int(OutfilesFilteredByRulesMaxCount) 1059 OptionsInfo["OutfilesFilteredByRulesMaxCount"] = OutfilesFilteredByRulesMaxCount 1060 OptionsInfo["OutfilesFilteredByRulesAllMode"] = True if re.match("^All$", Options["--outfilesFilteredByRulesMaxCount"], re.I) else False 1061 1062 OptionsInfo["OutfileAlerts"] = True if re.match("^yes$", Options["--outfileAlerts"], re.I) else False 1063 1064 if re.match("^yes$", 
Options["--outfilesFilteredByRules"], re.I): 1065 if not re.match("^yes$", Options["--outfileAlerts"], re.I): 1066 MiscUtil.PrintError("The value \"%s\" specified for \"--outfilesFilteredByRules\" option is not valid. The specified value is only allowed during \"yes\" value of \"--outfileAlerts\" option." % (Options["--outfilesFilteredByRules"])) 1067 1068 OptionsInfo["OutfileAlertsMode"] = Options["--outfileAlertsMode"] 1069 OptionsInfo["OutfileAlertsOnly"] = True if re.match("^AlertsOnly$", Options["--outfileAlertsMode"], re.I) else False 1070 1071 ProcessSDFieldLabelsOption() 1072 1073 OptionsInfo["Overwrite"] = Options["--overwrite"] 1074 OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False 1075 1076 OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False 1077 OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"]) 1078 1079 ProcessOptionNitrogenLonePairParameters() 1080 1081 OptionsInfo["AlertsMode"] = Options["--alertsMode"] 1082 OptionsInfo["SpecifiedAlertsModeList"] = [] 1083 if re.match("^Red$", Options["--alertsMode"], re.I): 1084 OptionsInfo["SpecifiedAlertsModeList"].append("Red") 1085 elif re.match("^RedAndOrange$", Options["--alertsMode"], re.I): 1086 OptionsInfo["SpecifiedAlertsModeList"].append("Red") 1087 OptionsInfo["SpecifiedAlertsModeList"].append("Orange") 1088 1089 OptionsInfo["MinAlertsCount"] = int(Options["--alertsMinCount"]) 1090 1091 OptionsInfo["RotBondsSMARTSMode"] = Options["--rotBondsSMARTSMode"] 1092 OptionsInfo["RotBondsSMARTSPatternSpecified"] = Options["--rotBondsSMARTSPattern"] 1093 ProcessRotatableBondsSMARTSMode() 1094 1095 OptionsInfo["TorsionLibraryFile"] = Options["--torsionLibraryFile"] 1096 1097 # Setup delimiter for writing out torsion alert information to output files... 
1098 OptionsInfo["IntraSetValuesDelim"] = "," 1099 OptionsInfo["InterSetValuesDelim"] = " " 1100 1101 def RetrieveOptions(): 1102 """Retrieve command line arguments and options.""" 1103 1104 # Get options... 1105 global Options 1106 Options = docopt(_docoptUsage_) 1107 1108 # Set current working directory to the specified directory... 1109 WorkingDir = Options["--workingdir"] 1110 if WorkingDir: 1111 os.chdir(WorkingDir) 1112 1113 # Handle examples option... 1114 if "--examples" in Options and Options["--examples"]: 1115 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 1116 sys.exit(0) 1117 1118 def ProcessListTorsionLibraryOption(): 1119 """Process list torsion library information.""" 1120 1121 # Validate and process dataFile option for listing torsion library information... 1122 OptionsInfo["TorsionLibraryFile"] = Options["--torsionLibraryFile"] 1123 if not re.match("^auto$", Options["--torsionLibraryFile"], re.I): 1124 MiscUtil.ValidateOptionFilePath("-t, --torsionLibraryFile", Options["--torsionLibraryFile"]) 1125 1126 # Instantiate TorsionLibraryAlerts using defaults... 1127 TorsionLibraryAlertsHandle = TorsionLibraryAlerts(TorsionLibraryFilePath = OptionsInfo["TorsionLibraryFile"]) 1128 MiscUtil.PrintInfo("\nRetrieving data from torsion library file %s..." 
% TorsionLibraryAlertsHandle.GetTorsionLibraryFilePath()) 1129 TorsionLibraryAlertsHandle.ListTorsionLibraryInfo() 1130 1131 def ValidateOptions(): 1132 """Validate option values.""" 1133 1134 MiscUtil.ValidateOptionTextValue("-a, --alertsMode", Options["--alertsMode"], "Red RedAndOrange") 1135 MiscUtil.ValidateOptionIntegerValue("--alertsMinCount", Options["--alertsMinCount"], {">=": 1}) 1136 1137 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 1138 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol") 1139 1140 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd") 1141 if re.match("^filter$", Options["--mode"], re.I): 1142 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 1143 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 1144 1145 MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no") 1146 1147 MiscUtil.ValidateOptionTextValue("--outfilesFilteredByRules", Options["--outfilesFilteredByRules"], "yes no") 1148 if not re.match("^All$", Options["--outfilesFilteredByRulesMaxCount"], re.I): 1149 MiscUtil.ValidateOptionIntegerValue("--outfilesFilteredByRulesMaxCount", Options["--outfilesFilteredByRulesMaxCount"], {">": 0}) 1150 1151 MiscUtil.ValidateOptionTextValue("--outfileSummary", Options["--outfileSummary"], "yes no") 1152 MiscUtil.ValidateOptionTextValue("--outfileAlerts", Options["--outfileAlerts"], "yes no") 1153 MiscUtil.ValidateOptionTextValue("--outfileAlertsMode", Options["--outfileAlertsMode"], "All AlertsOnly") 1154 1155 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "filter count") 1156 if re.match("^filter$", Options["--mode"], re.I): 1157 if not Options["--outfile"]: 1158 MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"filter\" value of \"-m, 
--mode\" option") 1159 1160 MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no") 1161 1162 MiscUtil.ValidateOptionTextValue("-r, --rotBondsSMARTSMode", Options["--rotBondsSMARTSMode"], "NonStrict SemiStrict Strict Specify") 1163 if re.match("^Specify$", Options["--rotBondsSMARTSMode"], re.I): 1164 if not Options["--rotBondsSMARTSPattern"]: 1165 MiscUtil.PrintError("The SMARTS pattern must be specified using \"--rotBondsSMARTSPattern\" during \"Specify\" value of \"-r, --rotBondsSMARTS\" option") 1166 1167 if not re.match("^auto$", Options["--torsionLibraryFile"], re.I): 1168 MiscUtil.ValidateOptionFilePath("-t, --torsionLibraryFile", Options["--torsionLibraryFile"]) 1169 1170 # Setup a usage string for docopt... 1171 _docoptUsage_ = """ 1172 RDKitFilterTorsionLibraryAlerts.py - Filter torsion library alerts 1173 1174 Usage: 1175 RDKitFilterTorsionLibraryAlerts.py [--alertsMode <Red, RedAndOrange>] [--alertsMinCount <Number>] 1176 [--infileParams <Name,Value,...>] [--mode <filter or count>] [--mp <yes or no>] [--mpParams <Name,Value,...>] 1177 [--nitrogenLonePairParams <Name,Value,...>] [--outfileAlerts <yes or no>] 1178 [--outfileAlertsMode <All or AlertsOnly>] [--outfileFiltered <yes or no>] 1179 [--outfilesFilteredByRules <yes or no>] [--outfilesFilteredByRulesMaxCount <All or number>] 1180 [--outfileSummary <yes or no>] [--outfileSDFieldLabels <Type,Label,...>] 1181 [--outfileParams <Name,Value,...>] [--overwrite] [ --rotBondsSMARTSMode <NonStrict, SemiStrict,...>] 1182 [--rotBondsSMARTSPattern <SMARTS>] [--torsionLibraryFile <FileName or auto>] [-w <dir>] -i <infile> -o <outfile> 1183 RDKitFilterTorsionLibraryAlerts.py [--torsionLibraryFile <FileName or auto>] -l | --list 1184 RDKitFilterTorsionLibraryAlerts.py -h | --help | -e | --examples 1185 1186 Description: 1187 Filter strained molecules from an input file for torsion library [ Ref 146, 152, 159 ] 1188 alerts by matching rotatable bonds against SMARTS patterns specified for torsion 1189 
rules in a torsion library file and write out appropriate molecules to output 1190 files. The molecules must have 3D coordinates in input file. 1191 1192 The default torsion library file, TorsionLibrary.xml, is available under 1193 MAYACHEMTOOLS/lib/python/TorsionAlerts directory. 1194 1195 The data in torsion library file is organized in a hierarchical manner. It consists 1196 of one generic class and six specific classes at the highest level. Each class 1197 contains multiple subclasses corresponding to named functional groups or 1198 substructure patterns. The subclasses consist of torsion rules sorted from 1199 specific to generic torsion patterns. The torsion rule, in turn, contains a list 1200 of peak values for torsion angles and two tolerance values. A pair of tolerance 1201 values define torsion bins around a torsion peak value. For example: 1202 1203 <library> 1204 <hierarchyClass name="GG" id1="G" id2="G"> 1205 ... 1206 </hierarchyClass> 1207 <hierarchyClass name="CO" id1="C" id2="O"> 1208 <hierarchySubClass name="Ester bond I" smarts="O=[C:2][O:3]"> 1209 <torsionRule smarts="[O:1]=[C:2]!@[O:3]~[CH0:4]"> 1210 <angleList> 1211 <angle value="0.0" tolerance1="20.00" 1212 tolerance2="25.00" score="56.52"/> 1213 </angleList> 1214 </torsionRule> 1215 ... 1216 ... 1217 ... 1218 </hierarchyClass> 1219 <hierarchyClass name="NC" id1="N" id2="C"> 1220 ... 1221 </hierarchyClass> 1222 <hierarchyClass name="SN" id1="S" id2="N"> 1223 ... 1224 </hierarchyClass> 1225 <hierarchyClass name="CS" id1="C" id2="S"> 1226 ... 1227 </hierarchyClass> 1228 <hierarchyClass name="CC" id1="C" id2="C"> 1229 ... 1230 </hierarchyClass> 1231 <hierarchyClass name="SS" id1="S" id2="S"> 1232 ... 1233 </hierarchyClass> 1234 </library> 1235 1236 The rotatable bonds in a 3D molecule are identified using a default SMARTS pattern. 1237 A custom SMARTS pattern may be optionally specified to detect rotatable bonds. 
1238 Each rotatable bond is matched to a torsion rule in the torsion library and 1239 assigned one of the following three alert categories: Green, Orange or Red. The 1240 rotatable bond is marked Green or Orange for the measured angle of the torsion 1241 pattern within the first or second tolerance bins around a torsion peak. 1242 Otherwise, it's marked Red implying that the measured angle is not observed in 1243 the structure databases employed to generate the torsion library. 1244 1245 The following output files are generated after the filtering: 1246 1247 <OutfileRoot>.sdf 1248 <OutfileRoot>_Filtered.sdf 1249 <OutfileRoot>_AlertsSummary.csv 1250 <OutfileRoot>_Filtered_TopRule*.sdf 1251 1252 The supported input file formats are: Mol (.mol), SD (.sdf, .sd) 1253 1254 The supported output file formats are: SD (.sdf, .sd) 1255 1256 Options: 1257 -a, --alertsMode <Red, RedAndOrange> [default: Red] 1258 Torsion library alert types to use for filtering molecules containing 1259 rotatable bonds marked with Green, Orange, or Red alerts. Possible 1260 values: Red or RedAndOrange. 1261 --alertsMinCount <Number> [default: 1] 1262 Minimum number of rotatable bond alerts in a molecule for filtering the 1263 molecule. 1264 -e, --examples 1265 Print examples. 1266 -h, --help 1267 Print this help message. 1268 -i, --infile <infile> 1269 Input file name. 1270 --infileParams <Name,Value,...> [default: auto] 1271 A comma delimited list of parameter name and value pairs for reading 1272 molecules from files. The supported parameter names for different file 1273 formats, along with their default values, are shown below: 1274 1275 SD, MOL: removeHydrogens,no,sanitize,yes,strictParsing,yes 1276 1277 -l, --list 1278 List torsion library information without performing any filtering. 
1279 -m, --mode <filter or count> [default: filter] 1280 Specify whether to filter molecules for torsion library [ Ref 146, 152, 159 ] alerts 1281 by matching rotatable bonds against SMARTS patterns specified for torsion 1282 rules and write out the rest of the molecules to an outfile or simply count 1283 the number of matched molecules marked for filtering. 1284 --mp <yes or no> [default: no] 1285 Use multiprocessing. 1286 1287 By default, input data is retrieved in a lazy manner via mp.Pool.imap() 1288 function employing lazy RDKit data iterable. This allows processing of 1289 arbitrary large data sets without any additional requirements memory. 1290 1291 All input data may be optionally loaded into memory by mp.Pool.map() 1292 before starting worker processes in a process pool by setting the value 1293 of 'inputDataMode' to 'InMemory' in '--mpParams' option. 1294 1295 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input 1296 data mode may adversely impact the performance. The '--mpParams' section 1297 provides additional information to tune the value of 'chunkSize'. 1298 --mpParams <Name,Value,...> [default: auto] 1299 A comma delimited list of parameter name and value pairs to configure 1300 multiprocessing. 1301 1302 The supported parameter names along with their default and possible 1303 values are shown below: 1304 1305 chunkSize, auto 1306 inputDataMode, Lazy [ Possible values: InMemory or Lazy ] 1307 numProcesses, auto [ Default: mp.cpu_count() ] 1308 1309 These parameters are used by the following functions to configure and 1310 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and 1311 mp.Pool.imap(). 1312 1313 The chunkSize determines chunks of input data passed to each worker 1314 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions. 1315 The default value of chunkSize is dependent on the value of 'inputDataMode'. 
1316 1317 The mp.Pool.map() function, invoked during 'InMemory' input data mode, 1318 automatically converts RDKit data iterable into a list, loads all data into 1319 memory, and calculates the default chunkSize using the following method 1320 as shown in its code: 1321 1322 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4) 1323 if extra: chunkSize += 1 1324 1325 For example, the default chunkSize will be 7 for a pool of 4 worker processes 1326 and 100 data items. 1327 1328 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs 1329 'lazy' RDKit data iterable to retrieve data as needed, without loading all the 1330 data into memory. Consequently, the size of input data is not known a priori. 1331 It's not possible to estimate an optimal value for the chunkSize. The default 1332 chunkSize is set to 1. 1333 1334 The default value for the chunkSize during 'Lazy' data mode may adversely 1335 impact the performance due to the overhead associated with exchanging 1336 small chunks of data. It is generally a good idea to explicitly set chunkSize to 1337 a larger value during 'Lazy' input data mode, based on the size of your input 1338 data and number of processes in the process pool. 1339 1340 The mp.Pool.map() function waits for all worker processes to process all 1341 the data and return the results. The mp.Pool.imap() function, however, 1342 returns the the results obtained from worker processes as soon as the 1343 results become available for specified chunks of data. 1344 1345 The order of data in the results returned by both mp.Pool.map() and 1346 mp.Pool.imap() functions always corresponds to the input data. 1347 -n, --nitrogenLonePairParams <Name,Value,...> [default: auto] 1348 A comma delimited list of parameter name and value pairs to match 1349 torsion SMARTS patterns containing non-standard construct 'N_lp' 1350 corresponding to nitrogen lone pair. 
1351 1352 The supported parameter names along with their default and possible 1353 values are shown below: 1354 1355 allowHydrogenNbrs, yes [ Possible values: yes or no ] 1356 planarityTolerance, 1 [Possible values: >=0] 1357 1358 These parameters are used during the matching of torsion rules containing 1359 'N_lp' in their SMARTS patterns. The 'allowHydrogensNbrs' allows the use 1360 hydrogen neighbors attached to nitrogen during the determination of its 1361 planarity. The 'planarityTolerance' in degrees represents the tolerance 1362 allowed for nitrogen to be considered coplanar with its three neighbors. 1363 1364 The torsion rules containing 'N_lp' in their SMARTS patterns are categorized 1365 into the following two types of rules: 1366 1367 TypeOne: 1368 1369 [CX4:1][CX4H2:2]!@[NX3;"N_lp":3][CX4:4] 1370 [C:1][CX4H2:2]!@[NX3;"N_lp":3][C:4] 1371 ... ... ... 1372 1373 TypeTwo: 1374 1375 [!#1:1][CX4:2]!@[NX3;"N_lp":3] 1376 [C:1][$(S(=O)=O):2]!@["N_lp":3] 1377 ... ... ... 1378 1379 The torsions are matched to torsion rules containing 'N_lp' using specified 1380 SMARTS patterns without the 'N_lp' along with additional constraints using 1381 the following methodology: 1382 1383 TypeOne: 1384 1385 . SMARTS pattern must contain four mapped atoms and the third 1386 mapped atom must be a nitrogen matched with 'NX3:3' 1387 . Nitrogen atom must have 3 neighbors. The 'allowHydrogens' 1388 parameter controls inclusion of hydrogens as its neighbors. 1389 . Nitrogen atom and its 3 neighbors must be coplanar. 1390 'planarityTolerance' parameter provides tolerance in degrees 1391 for nitrogen to be considered coplanar with its 3 neighbors. 1392 1393 TypeTwo: 1394 1395 . SMARTS pattern must contain three mapped atoms and the third 1396 mapped atom must be a nitrogen matched with 'NX3:3'. The 1397 third mapped atom may contain only 'N_lp:3' The missing 'NX3' 1398 is automatically detected. 1399 . Nitrogen atom must have 3 neighbors. 
'allowHydrogens' 1400 parameter controls inclusion of hydrogens as neighbors. 1401 . Nitrogen atom and its 3 neighbors must not be coplanar. 1402 'planarityTolerance' parameter provides tolerance in degrees 1403 for nitrogen to be considered coplanar with its 3 neighbors. 1404 . Nitrogen lone pair position equivalent to VSEPR theory is 1405 determined based on the position of nitrogen and its neighbors. 1406 A vector normal to 3 nitrogen neighbors is calculated and added 1407 to the coordinates of nitrogen atom to determine the approximate 1408 position of the lone pair. It is used as the fourth position to 1409 calculate the torsion angle. 1410 1411 -o, --outfile <outfile> 1412 Output file name. 1413 --outfileAlerts <yes or no> [default: yes] 1414 Write out alerts information to SD output files. 1415 --outfileAlertsMode <All or AlertsOnly> [default: AlertsOnly] 1416 Write alerts information to SD output files for all alerts or only for alerts 1417 specified by '--AlertsMode' option. Possible values: All or AlertsOnly 1418 This option is only valid for 'Yes' value of '--outfileAlerts' option. 1419 1420 The following alerts information is added to SD output files using 1421 'TorsionAlerts' data field: 1422 1423 RotBondIndices TorsionAlert TorsionIndices TorsionAngle 1424 TorsionAngleViolation HierarchyClass HierarchySubClass 1425 TorsionRule TorsionPeaks Tolerances1 Tolerances2 1426 1427 The 'RotBondsCount' and 'TorsionAlertsCount' data fields are always added 1428 to SD output files containing both remaining and filtered molecules. 
1429 1430 Format: 1431 1432 > <RotBondsCount> 1433 Number 1434 1435 > <TorsionAlertsCount (Green Orange Red)> 1436 Number Number Number 1437 1438 > <TorsionAlerts (RotBondIndices TorsionAlert TorsionIndices 1439 TorsionAngle TorsionAngleViolation HierarchyClass 1440 HierarchySubClass TorsionPeaks Tolerances1 Tolerances2 1441 TorsionRule)> 1442 AtomIndex2,AtomIndex3 AlertType AtomIndex1,AtomIndex2,AtomIndex3, 1443 AtomIndex4 Angle AngleViolation ClassName SubClassName 1444 CommaDelimPeakValues CommaDelimTol1Values CommDelimTol2Values 1445 SMARTS ... ... ... 1446 ... ... ... 1447 1448 A set of 11 values is written out as value of 'TorsionAlerts' data field for 1449 each torsion in a molecule. The space character is used as a delimiter 1450 to separate values with in a set and across set. The comma character 1451 is used to delimit multiple values for each value in a set. 1452 1453 The 'RotBondIndices' and 'TorsionIndices' contain 2 and 4 comma delimited 1454 values representing atom indices for a rotatable bond and matched torsion. 1455 The 'TorsionPeaks', 'Tolerances1', and 'Tolerances2' contain same number 1456 of comma delimited values corresponding to torsion angle peaks and 1457 tolerance intervals specified in torsion library. For example: 1458 1459 ... ... ... 1460 > <RotBondsCount> (1) 1461 7 1462 1463 > <TorsionAlertsCount (Green Orange Red)> (1) 1464 3 2 2 1465 1466 > <TorsionAlerts (RotBondIndices TorsionAlert TorsionIndices 1467 TorsionAngle TorsionAngleViolation HierarchyClass 1468 HierarchySubClass TorsionPeaks Tolerances1 Tolerances2 1469 TorsionRule)> 1470 1,2 Red 32,2,1,0 0.13 149.87 NC Anilines 180.0 10.0 30.0 [cH0:1][c:2] 1471 ([cH,nX2H0])!@[NX3H1:3][CX4:4] 8,9 Red 10,9,8,28 -0.85 GG 1472 None -90.0,90.0 30.0,30.0 60.0,60.0 [cH1:1][a:2]([cH1])!@[a:3] 1473 ([cH0])[cH0:4] 1474 ... ... ... 1475 1476 --outfileFiltered <yes or no> [default: yes] 1477 Write out a file containing filtered molecules. 
Its name is automatically 1478 generated from the specified output file. Default: <OutfileRoot>_ 1479 Filtered.<OutfileExt>. 1480 --outfilesFilteredByRules <yes or no> [default: yes] 1481 Write out SD files containing filtered molecules for individual torsion 1482 rules triggering alerts in molecules. The names of SD files are automatically 1483 generated from the specified output file. Default file names: <OutfileRoot>_ 1484 Filtered_TopRule*.sdf 1485 1486 The following alerts information is added to SD output files: 1487 1488 > <RotBondsCount> 1489 Number 1490 1491 > <TorsionAlertsCount (Green Orange Red)> 1492 Number Number Number 1493 1494 > <TorsionRule (HierarchyClass HierarchySubClass TorsionPeaks 1495 Tolerances1 Tolerances2 TorsionRule)> 1496 ClassName SubClassName CommaDelimPeakValues CommaDelimTol1Values 1497 CommaDelimTol2Values SMARTS ... ... ... 1498 ... ... ... 1499 1500 > <TorsionRuleAlertsCount (Green Orange Red)> 1501 Number Number Number 1502 1503 > <TorsionRuleAlerts (RotBondIndices TorsionAlert TorsionIndices 1504 TorsionAngle TorsionAngleViolation)> 1505 AtomIndex2,AtomIndex3 AlertType AtomIndex1,AtomIndex2,AtomIndex3, 1506 AtomIndex4 Angle AngleViolation ... ... ... 1507 1508 > <TorsionRuleMaxAngleViolation> 1509 Number 1510 ... ... ... 1511 1512 For example: 1513 1514 ... ... ... 1515 > <RotBondsCount> (1) 1516 7 1517 1518 > <TorsionAlertsCount (Green Orange Red)> (1) 1519 3 2 2 1520 1521 > <TorsionRule (HierarchyClass HierarchySubClass TorsionPeaks 1522 Tolerances1 Tolerances2 TorsionRule)> (1) 1523 NC Anilines 180.0 10.0 30.0 [cH0:1][c:2]([cH,nX2H0])!@[NX3H1:3][CX4:4] 1524 1525 > <TorsionRuleAlertsCount (Green Orange Red)> (1) 1526 0 0 1 1527 1528 > <TorsionRuleAlerts (RotBondIndices TorsionAlert TorsionIndices 1529 TorsionAngle TorsionAngleViolation)> (1) 1530 1,2 Red 32,2,1,0 0.13 149.87 1531 1532 > <TorsionRuleMaxAngleViolation> (1) 1533 149.87 1534 ... ... ...
1535 1536 --outfilesFilteredByRulesMaxCount <All or number> [default: 10] 1537 Write out SD files containing filtered molecules for specified number of 1538 top N torsion rules triggering alerts for the largest number of molecules 1539 or for all torsion rules triggering alerts across all molecules. 1540 --outfileSummary <yes or no> [default: yes] 1541 Write out a CSV text file containing summary of torsion rules responsible 1542 for triggering torsion alerts. Its name is automatically generated from the 1543 specified output file. Default: <OutfileRoot>_AlertsSummary.csv. 1544 1545 The following alerts information is written to summary text file: 1546 1547 TorsionRule, TorsionPeaks, Tolerances1, Tolerances2, 1548 HierarchyClass, HierarchySubClass, TorsionAlertType, 1549 TorsionAlertCount, TorsionAlertMolCount 1550 1551 The double quote characters are removed from SMARTS patterns 1552 before writing them to a CSV file. In addition, the torsion rules are sorted by 1553 TorsionAlertMolCount. For example: 1554 1555 "TorsionRule","TorsionPeaks","Tolerances1","Tolerances2", 1556 "HierarchyClass","HierarchySubClass","TorsionAlertTypes", 1557 "TorsionAlertCount","TorsionAlertMolCount" 1558 "[!#1:1][CX4H2:2]!@[CX4H2:3][!#1:4]","-60.0,60.0,180.0", 1559 "20.0,20.0,20.0","30.0,30.0,30.0","CC","None/[CX4:2][CX4:3]", 1560 "Red","16","11" 1561 ... ... ... 1562 1563 --outfileSDFieldLabels <Type,Label,...> [default: auto] 1564 A comma delimited list of SD data field type and label value pairs for writing 1565 torsion alerts information along with molecules to SD files.
1566 1567 The supported SD data field label type along with their default values are 1568 shown below: 1569 1570 For all SD files: 1571 1572 RotBondsCountLabel, RotBondsCount 1573 TorsionAlertsCountLabel, TorsionAlertsCount (Green Orange Red) 1574 TorsionAlertsLabel, TorsionAlerts (RotBondIndices TorsionAlert 1575 TorsionIndices TorsionAngle TorsionAngleViolation 1576 HierarchyClass HierarchySubClass TorsionPeaks Tolerances1 1577 Tolerances2 TorsionRule) 1578 1579 For individual SD files filtered by torsion rules: 1580 1581 TorsionRuleLabel, TorsionRule (HierarchyClass HierarchySubClass 1582 TorsionPeaks Tolerances1 Tolerances2 TorsionRule) 1583 TorsionRuleAlertsCountLabel, TorsionRuleAlertsCount (Green Orange 1584 Red) 1585 TorsionRuleAlertsLabel, TorsionRuleAlerts (RotBondIndices 1586 TorsionAlert TorsionIndices TorsionAngle TorsionAngleViolation) 1587 TorsionRuleMaxAngleViolationLabel, TorsionRuleMaxAngleViolation 1588 1589 --outfileParams <Name,Value,...> [default: auto] 1590 A comma delimited list of parameter name and value pairs for writing 1591 molecules to files. The supported parameter names for different file 1592 formats, along with their default values, are shown below: 1593 1594 SD: kekulize,yes,forceV3000,no 1595 1596 --overwrite 1597 Overwrite existing files. 1598 -r, --rotBondsSMARTSMode <NonStrict, SemiStrict,...> [default: SemiStrict] 1599 SMARTS pattern to use for identifying rotatable bonds in a molecule 1600 for matching against torsion rules in the torsion library. Possible values: 1601 NonStrict, SemiStrict, Strict or Specify. The rotatable bond SMARTS matches 1602 are filtered to ensure that each atom in the rotatable bond is attached to 1603 at least two heavy atoms. 
1604 1605 The following SMARTS patterns are used to identify rotatable bonds for 1606 different modes: 1607 1608 NonStrict: [!$(*#*)&!D1]-&!@[!$(*#*)&!D1] 1609 1610 SemiStrict: 1611 [!$(*#*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br) 1612 &!$(C([CH3])([CH3])[CH3])]-!@[!$(*#*)&!D1&!$(C(F)(F)F) 1613 &!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])([CH3])[CH3])] 1614 1615 Strict: 1616 [!$(*#*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br) 1617 &!$(C([CH3])([CH3])[CH3])&!$([CD3](=[N,O,S])-!@[#7,O,S!D1]) 1618 &!$([#7,O,S!D1]-!@[CD3]=[N,O,S])&!$([CD3](=[N+])-!@[#7!D1]) 1619 &!$([#7!D1]-!@[CD3]=[N+])]-!@[!$(*#*)&!D1&!$(C(F)(F)F) 1620 &!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])([CH3])[CH3])] 1621 1622 The 'NonStrict' and 'Strict' SMARTS patterns are available in RDKit. The 1623 'NonStrict' SMARTS pattern corresponds to original Daylight SMARTS 1624 specification for rotatable bonds. The 'SemiStrict' SMARTS pattern is 1625 derived from 'Strict' SMARTS patterns for its usage in this script. 1626 1627 You may use any arbitrary SMARTS pattern to identify rotatable bonds by 1628 choosing 'Specify' value for '-r, --rotBondsSMARTSMode' option and providing its 1629 value via '--rotBondsSMARTSPattern' option. 1630 --rotBondsSMARTSPattern <SMARTS> 1631 SMARTS pattern for identifying rotatable bonds. This option is only valid 1632 for 'Specify' value of '-r, --rotBondsSMARTSMode' option. 1633 -t, --torsionLibraryFile <FileName or auto> [default: auto] 1634 Specify a XML file name containing data for torsion library hierarchy 1635 or use default file, TorsionLibrary.xml, available in 1636 MAYACHEMTOOLS/lib/Python/TorsionAlerts directory. 1637 1638 The format of data in local XML file must match format of the data in Torsion 1639 Library [ Ref 146, 152, 159 ] file available in MAYACHEMTOOLS directory. 1640 -w, --workingdir <dir> 1641 Location of working directory which defaults to the current directory. 
1642 1643 Examples: 1644 To filter molecules containing any rotatable bonds marked with Red alerts 1645 based on torsion rules in the torsion library and write out SD files containing 1646 remaining and filtered molecules, and individual SD files for torsion rules 1647 triggering alerts along with appropriate torsion information for red alerts, 1648 type: 1649 1650 % RDKitFilterTorsionLibraryAlerts.py -i Sample3D.sdf -o Sample3DOut.sdf 1651 1652 To run the first example for only counting number of alerts without writing 1653 out any SD files, type: 1654 1655 % RDKitFilterTorsionLibraryAlerts.py -m count -i Sample3D.sdf -o 1656 Sample3DOut.sdf 1657 1658 To run the first example for filtering molecules marked with Orange or 1659 Red alerts and write out SD files, type: 1660 1661 % RDKitFilterTorsionLibraryAlerts.py -m Filter --alertsMode RedAndOrange 1662 -i Sample3D.sdf -o Sample3DOut.sdf 1663 1664 To run the first example for filtering molecules and writing out torsion 1665 information for all alert types to SD files, type: 1666 1667 % RDKitFilterTorsionLibraryAlerts.py --outfileAlertsMode All 1668 -i Sample3D.sdf -o Sample3DOut.sdf 1669 1670 To run the first example for filtering molecules in multiprocessing mode on 1671 all available CPUs without loading all data into memory and write out SD files, 1672 type: 1673 1674 % RDKitFilterTorsionLibraryAlerts.py --mp yes -i Sample3D.sdf 1675 -o Sample3DOut.sdf 1676 1677 To run the first example for filtering molecules in multiprocessing mode on 1678 all available CPUs by loading all data into memory and write out SD files, 1679 type: 1680 1681 % RDKitFilterTorsionLibraryAlerts.py --mp yes --mpParams 1682 "inputDataMode, InMemory" -i Sample3D.sdf -o Sample3DOut.sdf 1683 1684 To run the first example for filtering molecules in multiprocessing mode on 1685 specific number of CPUs and chunksize without loading all data into memory 1686 and write out SD files, type: 1687 1688 % RDKitFilterTorsionLibraryAlerts.py --mp yes
--mpParams 1689 "inputDataMode,lazy,numProcesses,4,chunkSize,8" -i Sample3D.sdf 1690 -o Sample3DOut.sdf 1691 1692 To list information about default torsion library file without performing any 1693 filtering, type: 1694 1695 % RDKitFilterTorsionLibraryAlerts.py -l 1696 1697 To list information about a local torsion library XML file without performing 1698 any filtering, type: 1699 1700 % RDKitFilterTorsionLibraryAlerts.py --torsionLibraryFile 1701 TorsionLibrary.xml -l 1702 1703 Author: 1704 Manish Sud (msud@san.rr.com) 1705 1706 Collaborator: 1707 Pat Walters 1708 1709 Acknowledgments: 1710 Wolfgang Guba, Patrick Penner, and Levi Pierce 1711 1712 See also: 1713 RDKitFilterChEMBLAlerts.py, RDKitFilterPAINS.py, RDKitFilterTorsionStrainEnergyAlerts.py, 1714 RDKitConvertFileFormat.py, RDKitSearchSMARTS.py 1715 1716 Copyright: 1717 Copyright (C) 2024 Manish Sud. All rights reserved. 1718 1719 This script uses the Torsion Library jointly developed by the University 1720 of Hamburg, Center for Bioinformatics, Hamburg, Germany and 1721 F. Hoffmann-La-Roche Ltd., Basel, Switzerland. 1722 1723 The functionality available in this script is implemented using RDKit, an 1724 open source toolkit for cheminformatics developed by Greg Landrum. 1725 1726 This file is part of MayaChemTools. 1727 1728 MayaChemTools is free software; you can redistribute it and/or modify it under 1729 the terms of the GNU Lesser General Public License as published by the Free 1730 Software Foundation; either version 3 of the License, or (at your option) any 1731 later version. 1732 1733 """ 1734 1735 if __name__ == "__main__": 1736 main()