#!/usr/bin/perl -w
#
# $RCSfile: CalculatePhysicochemicalProperties.pl,v $
# $Date: 2011/12/27 20:27:02 $
# $Revision: 1.13 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2012 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use MoleculeFileIO;
use Molecule;
use AtomTypes::AtomicInvariantsAtomTypes;
use AtomTypes::FunctionalClassAtomTypes;
use MolecularDescriptors::MolecularDescriptorsGenerator;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Process input files..
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  # Only files flagged as okay by RetrieveSDFilesInfo are processed...
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    CalculatePhysicochemicalProperties($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Calculate physicochemical properties for a SD file...
#
# Parameters:
#   $FileIndex - index into @SDFilesList (and the parallel %SDFilesInfo arrays)
#                of the SD file to process.
#
# Reads the file one molecule at a time, optionally filters each compound,
# calculates the requested properties, writes the results to the SD and/or
# text output files and finally prints summary statistics. Compounds that are
# filtered out or whose calculation fails are counted as ignored.
#
sub CalculatePhysicochemicalProperties {
  my($FileIndex) = @_;
  my($CmpdCount, $IgnoredCmpdCount, $RuleOf5ViolationsCount, $RuleOf3ViolationsCount, $SDFile, $MoleculeFileIO, $Molecule, $MolecularDescriptorsGenerator, $PhysicochemicalPropertiesDataRef, $NewSDFileRef, $NewTextFileRef);

  $SDFile = $SDFilesList[$FileIndex];

  # Setup output files; a handle that is not requested comes back as ''...
  ($NewSDFileRef, $NewTextFileRef) = SetupAndOpenOutputFiles($FileIndex);

  # Setup molecular descriptor generator to calculate property values for specified
  # property names...
  $MolecularDescriptorsGenerator = SetupMolecularDescriptorsGenerator();

  ($CmpdCount, $IgnoredCmpdCount, $RuleOf5ViolationsCount, $RuleOf3ViolationsCount) = (0) x 4;

  $MoleculeFileIO = MoleculeFileIO->new('Name' => $SDFile);
  $MoleculeFileIO->Open();

  COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
    $CmpdCount++;

    # Filter compound data before calculating physicochemical properties...
    if ($OptionsInfo{Filter}) {
      if (CheckAndFilterCompound($CmpdCount, $Molecule)) {
        $IgnoredCmpdCount++;
        next COMPOUND;
      }
    }

    # Calculate properties...
    $PhysicochemicalPropertiesDataRef = CalculateMoleculeProperties($MolecularDescriptorsGenerator, $Molecule);

    if (!defined($PhysicochemicalPropertiesDataRef)) {
      $IgnoredCmpdCount++;
      ProcessIgnoredCompound('PropertiesCalculationFailed', $CmpdCount, $Molecule);
      next COMPOUND;
    }

    # Count compounds having one or more rule violations...
    if ($OptionsInfo{RuleOf5Violations} && $PhysicochemicalPropertiesDataRef->{RuleOf5Violations}) {
      $RuleOf5ViolationsCount++;
    }

    if ($OptionsInfo{RuleOf3Violations} && $PhysicochemicalPropertiesDataRef->{RuleOf3Violations}) {
      $RuleOf3ViolationsCount++;
    }

    # Write out calculated properties...
    WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $PhysicochemicalPropertiesDataRef, $NewSDFileRef, $NewTextFileRef);
  }
  $MoleculeFileIO->Close();

  # Buffered write errors only surface at close time, so report a failed close
  # instead of silently leaving a truncated output file behind...
  if ($OptionsInfo{SDOutput} && $NewSDFileRef) {
    close $NewSDFileRef or warn "Warning: Couldn't close $SDFilesInfo{SDOutFileNames}[$FileIndex]: $! \n";
  }
  if ($OptionsInfo{TextOutput} && $NewTextFileRef) {
    close $NewTextFileRef or warn "Warning: Couldn't close $SDFilesInfo{TextOutFileNames}[$FileIndex]: $! \n";
  }

  WriteCalculationSummaryStatistics($CmpdCount, $IgnoredCmpdCount, $RuleOf5ViolationsCount, $RuleOf3ViolationsCount);
}

# Process compound being ignored due to problems in physicochemical properties calculation...
#
# Parameters:
#   $Mode      - reason code: ContainsNonElementalData, ContainsNoElementalData
#                or PropertiesCalculationFailed (matched case-insensitively).
#   $CmpdCount - record number of the compound in the input file.
#   $Molecule  - molecule object for the compound being skipped.
#
# Emits a single warning naming the record number, the compound ID and the
# reason. Any unrecognized reason code is reported with the same text as a
# failed properties calculation.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;

  my $FieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
  my $CmpdID = SetupCmpdIDForTextFiles($CmpdCount, $Molecule, $FieldLabelAndValuesRef);

  # Pick the explanation; the failed calculation text doubles as the fallback...
  my $Reason;
  if ($Mode =~ /^ContainsNonElementalData$/i) {
    $Reason = "Compound contains atom data corresponding to non-elemental atom symbol(s)...";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    $Reason = "Compound contains no atom data...";
  }
  else {
    $Reason = "Physicochemical properties calculation didn't succeed...";
  }

  warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: $Reason\n\n";
}

# Check and filter compounds....
#
# Parameters:
#   $CmpdCount - record number of the compound in the input file.
#   $Molecule  - molecule object to check.
#
# Returns 1 (after issuing a warning) when the molecule has any non-element
# atoms or has no element atoms at all; returns 0 otherwise.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;

  my($NumOfElements, $NumOfNonElements) = $Molecule->GetNumOfElementsAndNonElements();

  # Non-elemental data takes precedence over missing elemental data...
  my $Problem = $NumOfNonElements ? 'ContainsNonElementalData'
              : !$NumOfElements   ? 'ContainsNoElementalData'
              :                     '';

  return 0 if !$Problem;

  ProcessIgnoredCompound($Problem, $CmpdCount, $Molecule);
  return 1;
}

# Write out compounds physicochemical properties calculation summary statistics...
#
# Parameters:
#   $CmpdCount              - total number of compound records read.
#   $IgnoredCmpdCount       - number of records that were skipped.
#   $RuleOf5ViolationsCount - compounds with one or more RuleOf5 violations.
#   $RuleOf3ViolationsCount - compounds with one or more RuleOf3 violations.
#
# Prints the counts to STDOUT; the rule violation counts are printed only when
# the corresponding violations option is turned on.
#
sub WriteCalculationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount, $RuleOf5ViolationsCount, $RuleOf3ViolationsCount) = @_;
  my($ProcessedCmpdCount);

  $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";
  print "Number of compounds processed successfully during physicochemical properties calculation: $ProcessedCmpdCount\n";
  print "Number of compounds ignored during physicochemical properties calculation: $IgnoredCmpdCount\n";

  if ($OptionsInfo{RuleOf5Violations}) {
    print "Number of compounds with one or more RuleOf5 violations: $RuleOf5ViolationsCount\n";
  }

  if ($OptionsInfo{RuleOf3Violations}) {
    print "Number of compounds with one or more RuleOf3 violations: $RuleOf3ViolationsCount\n";
  }

}

# Open output files...
#
# Parameters:
#   $FileIndex - index of the input SD file whose output files are to be opened.
#
# Returns a two element list: a reference to the SD output file handle and a
# reference to the text output file handle. An output type that is not
# requested is returned as an empty string. The column labels line is written
# to the text file as soon as it is opened. Dies when a file can't be opened.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($NewSDFile, $NewTextFile, $NewSDFileRef, $NewTextFileRef);

  $NewSDFileRef = '';
  $NewTextFileRef = '';

  # Three-argument open keeps the write mode separate from the file name, so
  # unusual characters or whitespace in the name can't alter how it is opened...
  if ($OptionsInfo{SDOutput}) {
    $NewSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $NewSDFile...\n";
    open NEWSDFILE, '>', $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
    $NewSDFileRef = \*NEWSDFILE;
  }
  if ($OptionsInfo{TextOutput}) {
    $NewTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    print "Generating text file $NewTextFile...\n";
    open NEWTEXTFILE, '>', $NewTextFile or die "Error: Couldn't open $NewTextFile: $! \n";
    WriteTextFileCoulmnLabels($FileIndex, \*NEWTEXTFILE);
    $NewTextFileRef = \*NEWTEXTFILE;
  }
  return ($NewSDFileRef, $NewTextFileRef);
}

# Write calculated physicochemical properties and other data to appropriate output files...
#
# Parameters:
#   $FileIndex                        - index of the input SD file.
#   $CmpdCount                        - record number of the compound.
#   $Molecule                         - molecule object for the compound.
#   $PhysicochemicalPropertiesDataRef - hash reference mapping calculated
#                                       value names to their values.
#   $NewSDFileRef, $NewTextFileRef    - output file handle references.
#
# For SD output, the input compound string is written back followed by one
# data field per calculated value. For text output, a single delimited line is
# written with the selected data fields followed by the calculated values.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $PhysicochemicalPropertiesDataRef, $NewSDFileRef, $NewTextFileRef) = @_;

  # Names of all values to write, in output order: specified properties first,
  # then RuleOf5 and RuleOf3 violation counts when requested...
  my @ValueNames = ();
  if ($OptionsInfo{SDOutput} || $OptionsInfo{TextOutput}) {
    @ValueNames = @{$OptionsInfo{SpecifiedPropertyNames}};
    push @ValueNames, 'RuleOf5Violations' if $OptionsInfo{RuleOf5Violations};
    push @ValueNames, 'RuleOf3Violations' if $OptionsInfo{RuleOf3Violations};
  }

  if ($OptionsInfo{SDOutput}) {
    # Retrieve input compound string used to create molecule and write it out
    # without last line containing a delimiter...
    my $CmpdString = $Molecule->GetInputMoleculeString();
    $CmpdString =~ s/\$\$\$\$$//;
    print $NewSDFileRef "$CmpdString";

    # One data field per calculated value...
    for my $ValueName (@ValueNames) {
      my $Value = $PhysicochemicalPropertiesDataRef->{$ValueName};
      print $NewSDFileRef "> <$ValueName>\n$Value\n\n";
    }

    # Write out delimiter...
    print $NewSDFileRef "\$\$\$\$\n";
  }

  if ($OptionsInfo{TextOutput}) {
    my $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
    my $FieldsMode = $OptionsInfo{DataFieldsMode};
    my @LineWords = ();

    if ($FieldsMode =~ /^CompoundID$/i) {
      push @LineWords, SetupCmpdIDForTextFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
    }
    else {
      # Work out which data field labels go into the line...
      my @FieldLabels = ();
      if ($FieldsMode =~ /^All$/i) {
        @FieldLabels = @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
      }
      elsif ($FieldsMode =~ /^Common$/i) {
        @FieldLabels = @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
      }
      elsif ($FieldsMode =~ /^Specify$/i) {
        @FieldLabels = @{$OptionsInfo{SpecifiedDataFields}};
      }

      # A label missing from this compound contributes an empty value...
      @LineWords = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @FieldLabels;
    }

    # Append calculated values...
    push @LineWords, map { $PhysicochemicalPropertiesDataRef->{$_} } @ValueNames;

    my $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }
}

# Write out appropriate column labels to text file...
#
# Parameters:
#   $FileIndex      - index of the input SD file.
#   $NewTextFileRef - reference to the text output file handle.
#
# The line holds the data field labels selected by the data fields mode,
# followed by the specified property names and any rule violation labels.
#
sub WriteTextFileCoulmnLabels {
  my($FileIndex, $NewTextFileRef) = @_;

  my $FieldsMode = $OptionsInfo{DataFieldsMode};
  my @Labels = ();

  if ($FieldsMode =~ /^All$/i) {
    @Labels = @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Common$/i) {
    @Labels = @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Specify$/i) {
    @Labels = @{$OptionsInfo{SpecifiedDataFields}};
  }
  elsif ($FieldsMode =~ /^CompoundID$/i) {
    @Labels = ($OptionsInfo{CompoundIDLabel});
  }

  # Append physicochemical properties column labels...
  push @Labels, @{$OptionsInfo{SpecifiedPropertyNames}};

  # Append rule violations labels...
  push @Labels, 'RuleOf5Violations' if $OptionsInfo{RuleOf5Violations};
  push @Labels, 'RuleOf3Violations' if $OptionsInfo{RuleOf3Violations};

  my $Line = JoinWords(\@Labels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileRef "$Line\n";
}

# Generate compound ID for text files..
#
# Parameters:
#   $CmpdCount                  - record number of the compound.
#   $Molecule                   - molecule object for the compound.
#   $DataFieldLabelAndValuesRef - hash reference of the compound's data fields.
#
# Returns the compound ID chosen according to the compound ID mode; an empty
# string is returned when the mode is not recognized or, in DataField mode,
# when the specified data field is absent.
#
sub SetupCmpdIDForTextFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

  my $IDMode = $OptionsInfo{CompoundIDMode};

  if ($IDMode =~ /^MolNameOrLabelPrefix$/i) {
    # Fall back to prefix plus record number for an empty molecule name...
    my $MolName = $Molecule->GetName();
    return $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($IDMode =~ /^LabelPrefix$/i) {
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($IDMode =~ /^DataField$/i) {
    my $FieldLabel = $OptionsInfo{CompoundID};
    return exists $DataFieldLabelAndValuesRef->{$FieldLabel} ? $DataFieldLabelAndValuesRef->{$FieldLabel} : '';
  }

  if ($IDMode =~ /^MolName$/i) {
    my $MolName = $Molecule->GetName();
    return $MolName;
  }

  return '';
}

# Calculate physicochemical properties for molecule...
#
# Parameters:
#   $MolecularDescriptorsGenerator - descriptors generator set up for the
#                                    specified property names.
#   $Molecule                      - molecule object to process.
#
# Returns a reference to a hash of descriptor names and values, extended with
# rule violation counts when requested. Returns undef when ring detection or
# descriptors generation doesn't succeed.
#
sub CalculateMoleculeProperties {
  my($MolecularDescriptorsGenerator, $Molecule) = @_;

  # Prepare the molecule...
  $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};

  $Molecule->DetectRings() or return undef;
  $Molecule->DetectAromaticity();

  $Molecule->AddHydrogens() if $OptionsInfo{AddHydrogens};

  # Calculate physicochemical properties...
  $MolecularDescriptorsGenerator->SetMolecule($Molecule);
  $MolecularDescriptorsGenerator->GenerateDescriptors();

  $MolecularDescriptorsGenerator->IsDescriptorsGenerationSuccessful() or return undef;

  my %PropertiesMap = $MolecularDescriptorsGenerator->GetDescriptorNamesAndValues();

  # Count rule violations: RuleOf3 first, then RuleOf5...
  for my $RuleViolationsType (qw(RuleOf3Violations RuleOf5Violations)) {
    if ($OptionsInfo{$RuleViolationsType}) {
      CalculateRuleViolationsCount($RuleViolationsType, \%PropertiesMap);
    }
  }

  return \%PropertiesMap;
}

# Setup molecular descriptor generator to calculate property values for specified
# property names...
#
# Returns a descriptors generator object created in 'Specify' mode for the
# specified property names. Descriptor class parameters are passed on only
# for those descriptor classes whose properties have been specified.
#
sub SetupMolecularDescriptorsGenerator {
  my $Generator = MolecularDescriptorsGenerator->new('Mode' => 'Specify', 'DescriptorNames' => \@{$OptionsInfo{SpecifiedPropertyNames}});

  # True when at least one of the given property names has been specified...
  my $IsSpecified = sub {
    my(@PropertyNames) = @_;
    for my $PropertyName (@PropertyNames) {
      return 1 if exists($OptionsInfo{SpecifiedPropertyNamesMap}{lc($PropertyName)});
    }
    return 0;
  };

  # Setup molecular descriptor calculation parameters...
  if ($IsSpecified->('MolecularWeight', 'ExactMass')) {
    $Generator->SetDescriptorClassParameters('DescriptorClassName' => 'WeightAndMassDescriptors', %{$OptionsInfo{PrecisionParametersMap}});
  }

  if ($IsSpecified->('RotatableBonds')) {
    $Generator->SetDescriptorClassParameters('DescriptorClassName' => 'RotatableBondsDescriptors', %{$OptionsInfo{RotatableBondsParametersMap}});
  }

  if ($IsSpecified->('HydrogenBondDonors', 'HydrogenBondAcceptors')) {
    $Generator->SetDescriptorClassParameters('DescriptorClassName' => 'HydrogenBondsDescriptors', 'HydrogenBondsType' => $OptionsInfo{HydrogenBonds});
  }

  if ($IsSpecified->('TPSA')) {
    $Generator->SetDescriptorClassParameters('DescriptorClassName' => 'TPSADescriptors', %{$OptionsInfo{TPSAParametersMap}});
  }

  if ($IsSpecified->('MolecularComplexity')) {
    $Generator->SetDescriptorClassParameters('DescriptorClassName' => 'MolecularComplexityDescriptors', %{$OptionsInfo{MolecularComplexityParametersMap}});
  }

  return $Generator;
}

# Calculate RuleOf3 or RuleOf5 violations count...
#
# Parameters:
#   $RuleViolationsType         - RuleOf3Violations or RuleOf5Violations
#                                 (matched case-insensitively).
#   $CalculatedPropertiesMapRef - hash reference of calculated property values.
#
# Counts the rule's properties whose calculated value is greater than the
# rule's maximum and stores the count in the hash under $RuleViolationsType.
# Dies for any other rule violations type.
#
sub CalculateRuleViolationsCount {
  my($RuleViolationsType, $CalculatedPropertiesMapRef) = @_;
  my($RuleViolationsCount, $PropertyName, $RuleName);

  if ($RuleViolationsType =~ /^RuleOf3Violations$/i) {
    $RuleName = 'RuleOf3';
  }
  elsif ($RuleViolationsType =~ /^RuleOf5Violations$/i) {
    $RuleName = 'RuleOf5';
  }
  else {
    die "Warning: Unknown rule violation type: $RuleViolationsType...";
  }

  # Both rules share the same layout in %OptionsInfo: a list of property names
  # and a map of maximum allowed values keyed by property name...
  $RuleViolationsCount = 0;
  for $PropertyName (@{$OptionsInfo{"${RuleName}PropertyNames"}}) {
    if ($CalculatedPropertiesMapRef->{$PropertyName} > $OptionsInfo{"${RuleName}MaxPropertyValuesMap"}{$PropertyName}) {
      $RuleViolationsCount++;
    }
  }

  # Set rule violation count...
  $CalculatedPropertiesMapRef->{$RuleViolationsType} = $RuleViolationsCount;

}

# Retrieve information about SD files...
#
# Fills %SDFilesInfo with arrays indexed like @SDFilesList: whether each file
# is okay to process, its output file root and output file names, and
# references to its all/common data field labels when those are collected.
# A file is skipped, with a warning, when it doesn't exist, isn't a SD file,
# lacks the compound ID data field, would be overwritten by its own output or
# has output files that already exist and overwriting isn't allowed.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $NewSDFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Assume the worst until all the checks have passed...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in SD file; only the first compound is checked..
      my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

      @CmpdLines = ();
      open SDFILE, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString(\*SDFILE);
      close SDFILE;
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($CmpdCount);
      open SDFILE, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
      close SDFILE;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $TextOutFileExt = "tsv";
    }
    $SDOutFileExt = $FileExt;

    # An explicit output root is honored only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}PhysicochemicalProperties";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # Quote the file name so that characters such as '.', '+' or '(' in it
      # are compared literally instead of being treated as regex syntax...
      if ($SDFile =~ /\Q$NewSDFileName\E/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}

# Process option values...
627 sub ProcessOptions { 628 %OptionsInfo = (); 629 630 # Process property name related options... 631 ProcessPropertyNamesOption(); 632 633 # Setup RuleOf3 and RuleOf5 violation calculations... 634 $OptionsInfo{RuleOf3Violations} = ($Options{ruleof3violations} =~ /^Yes$/i) ? 1 : 0; 635 $OptionsInfo{RuleOf5Violations} = ($Options{ruleof5violations} =~ /^Yes$/i) ? 1 : 0; 636 637 $OptionsInfo{CompoundIDMode} = $Options{compoundidmode}; 638 $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel}; 639 $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode}; 640 641 my(@SpecifiedDataFields); 642 @SpecifiedDataFields = (); 643 644 @{$OptionsInfo{SpecifiedDataFields}} = (); 645 $OptionsInfo{CompoundID} = ''; 646 647 if ($Options{datafieldsmode} =~ /^CompoundID$/i) { 648 if ($Options{compoundidmode} =~ /^DataField$/i) { 649 if (!$Options{compoundid}) { 650 die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n"; 651 } 652 $OptionsInfo{CompoundID} = $Options{compoundid}; 653 } 654 elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) { 655 $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd'; 656 } 657 } 658 elsif ($Options{datafieldsmode} =~ /^Specify$/i) { 659 if (!$Options{datafields}) { 660 die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n"; 661 } 662 @SpecifiedDataFields = split /\,/, $Options{datafields}; 663 push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields; 664 } 665 666 # Types of hydrogen bonds... 667 $OptionsInfo{HydrogenBonds} = $Options{hydrogenbonds}; 668 669 # Process precision value parameters... 670 ProcessPrecisionOption(); 671 672 # Process rotatable bonds parameters... 673 ProcessRotatableBondsOption(); 674 675 # Process TPSA parameters... 676 ProcessTPSAOption(); 677 678 # Process molecular complexity parameters... 
679 ProcessMolecularComplexityOption(); 680 681 $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0; 682 683 $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0; 684 685 $OptionsInfo{Output} = $Options{output}; 686 $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|Both)$/i) ? 1 : 0; 687 $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|Both)$/i) ? 1 : 0; 688 689 $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,"); 690 $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0; 691 692 $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0; 693 $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0; 694 } 695 696 # Process property name related options... 697 # 698 sub ProcessPropertyNamesOption { 699 700 # Setup supported physicochemical properties... 701 my($SupportedProperty); 702 703 @{$OptionsInfo{SupportedPropertyNames}} = (); 704 %{$OptionsInfo{SupportedPropertyNamesMap}} = (); 705 706 @{$OptionsInfo{RuleOf5PropertyNames}} = (); 707 %{$OptionsInfo{RuleOf5MaxPropertyValuesMap}} = (); 708 709 @{$OptionsInfo{RuleOf3PropertyNames}} = (); 710 %{$OptionsInfo{RuleOf3MaxPropertyValuesMap}} = (); 711 712 @{$OptionsInfo{DefaultPropertyNames}} = (); 713 714 @{$OptionsInfo{SupportedPropertyNames}} = qw(MolecularWeight ExactMass HeavyAtoms Rings AromaticRings MolecularVolume RotatableBonds HydrogenBondDonors HydrogenBondAcceptors SLogP SMR TPSA Fsp3Carbons Sp3Carbons MolecularComplexity); 715 716 @{$OptionsInfo{RuleOf5PropertyNames}} = qw(MolecularWeight HydrogenBondDonors HydrogenBondAcceptors SLogP); 717 %{$OptionsInfo{RuleOf5MaxPropertyValuesMap}} = ('MolecularWeight' => 500, 'HydrogenBondDonors' => 5, 'HydrogenBondAcceptors' => 10, 'SLogP' => 5); 718 719 @{$OptionsInfo{RuleOf3PropertyNames}} = qw(MolecularWeight RotatableBonds HydrogenBondDonors HydrogenBondAcceptors SLogP TPSA); 720 %{$OptionsInfo{RuleOf3MaxPropertyValuesMap}} = 
('MolecularWeight' => 300, 'RotatableBonds' => 3, 'HydrogenBondDonors' => 3, 'HydrogenBondAcceptors' => 3, 'SLogP' => 3, 'TPSA' => 60); 721 722 @{$OptionsInfo{DefaultPropertyNames}} = qw(MolecularWeight HeavyAtoms MolecularVolume RotatableBonds HydrogenBondDonors HydrogenBondAcceptors SLogP TPSA); 723 724 for $SupportedProperty (@{$OptionsInfo{SupportedPropertyNames}}) { 725 $OptionsInfo{SupportedPropertyNamesMap}{lc($SupportedProperty)} = $SupportedProperty; 726 } 727 728 # Process specified properties.... 729 my($SpecifiedPropertyName, @SpecifiedPropertyNames, %SpecifiedPropertyNamesMap); 730 731 @SpecifiedPropertyNames = (); 732 %SpecifiedPropertyNamesMap = (); 733 734 @{$OptionsInfo{SpecifiedPropertyNames}} = (); 735 %{$OptionsInfo{SpecifiedPropertyNamesMap}} = (); 736 737 if ($Options{mode} =~ /^All$/i) { 738 @SpecifiedPropertyNames = @{$OptionsInfo{SupportedPropertyNames}}; 739 } 740 elsif ($Options{mode} =~ /^RuleOf5$/i) { 741 @SpecifiedPropertyNames = @{$OptionsInfo{RuleOf5PropertyNames}}; 742 } 743 elsif ($Options{mode} =~ /^RuleOf3$/i) { 744 @SpecifiedPropertyNames = @{$OptionsInfo{RuleOf3PropertyNames}}; 745 } 746 elsif (IsEmpty($Options{mode})) { 747 @SpecifiedPropertyNames = @{$OptionsInfo{DefaultPropertyNames}}; 748 } 749 else { 750 # Comma delimited lisr of specified property names... 
751 my($Mode, $PropertyName, @PropertyNames, @UnsupportedPropertyNames); 752 753 $Mode = $Options{mode}; 754 $Mode =~ s/ //g; 755 756 @PropertyNames = split ",", $Mode; 757 @UnsupportedPropertyNames = (); 758 759 for $PropertyName (@PropertyNames) { 760 if (exists($OptionsInfo{SupportedPropertyNamesMap}{lc($PropertyName)})) { 761 push @SpecifiedPropertyNames, $PropertyName; 762 } 763 else { 764 push @UnsupportedPropertyNames, $PropertyName; 765 } 766 } 767 if (@UnsupportedPropertyNames) { 768 if (@UnsupportedPropertyNames > 1) { 769 warn "Error: The physicochemical property names specified - ", JoinWords(\@UnsupportedPropertyNames, ", ", 0)," - for option \"-m --mode\" are not valid.\n"; 770 } 771 else { 772 warn "Error: The physicochemical property name specified, @UnsupportedPropertyNames , for option \"-m --mode\" is not valid.\n"; 773 } 774 die "Allowed values:", JoinWords(\@{$OptionsInfo{SupportedPropertyNames}}, ", ", 0), "\n"; 775 } 776 if (!@SpecifiedPropertyNames) { 777 die "Error: No valid physicochemical property names specified for option \"-m --mode\".\n"; 778 } 779 } 780 781 # Set up specified property names map... 782 PROPERTY: for $SpecifiedPropertyName (@SpecifiedPropertyNames) { 783 if (exists $SpecifiedPropertyNamesMap{lc($SpecifiedPropertyName)}) { 784 warn "Warning: The physicochemical property name, $SpecifiedPropertyName, is specified multiple times as value of option \"-m --mode\" .\n"; 785 next PROPERTY; 786 } 787 # Canonical specified property name... 788 $SpecifiedPropertyNamesMap{lc($SpecifiedPropertyName)} = $OptionsInfo{SupportedPropertyNamesMap}{lc($SpecifiedPropertyName)}; 789 } 790 791 # Make sure for calculation of RuleOf3Violations, all appropriate property names are specified... 792 if ($Options{ruleof3violations} =~ /^Yes$/i && $Options{mode} =~ /^RuleOf5$/i) { 793 die "Error: The value specified, $Options{ruleof3violations}, for \"--RuleOf3Violations\" option in \"RuleOf5\" \"-m --Mode\" is not valid. 
You must specify RuleOf3 value for \"-m --Mode\" to calculate RuleOf3 violations.\n"; 794 } 795 796 if ($Options{ruleof3violations} =~ /^Yes$/i) { 797 my($RuleOf3PropertyName, @MissingRuleOf3Names); 798 799 @MissingRuleOf3Names = (); 800 PROPERTY: for $RuleOf3PropertyName (@{$OptionsInfo{RuleOf3PropertyNames}}) { 801 if (exists $SpecifiedPropertyNamesMap{lc($RuleOf3PropertyName)}) { 802 next PROPERTY; 803 } 804 push @MissingRuleOf3Names, $RuleOf3PropertyName; 805 806 # Add property name to specified properties names list and map... 807 push @SpecifiedPropertyNames, $RuleOf3PropertyName; 808 $SpecifiedPropertyNamesMap{lc($RuleOf3PropertyName)} = $OptionsInfo{SupportedPropertyNamesMap}{lc($RuleOf3PropertyName)}; 809 } 810 if (@MissingRuleOf3Names) { 811 warn "Warning: The following physicochemical property names not specified in \"-m --Mode\" option are required for calculating RuleOf3Violations and have been added to the list of property names: @MissingRuleOf3Names\n"; 812 } 813 } 814 815 # Make sure for calculation of RuleOf5Violations, all appropriate property names are specified... 816 if ($Options{ruleof5violations} =~ /^Yes$/i && $Options{mode} =~ /^RuleOf3$/i) { 817 die "Error: The value specified, $Options{ruleof5violations}, for \"--RuleOf5Violations\" option in \"RuleOf3\" \"-m --Mode\" is not valid. You must specify RuleOf5 value for \"-m --Mode\" to calculate RuleOf5 violations.\n"; 818 } 819 820 if ($Options{ruleof5violations} =~ /^Yes$/i) { 821 my($RuleOf5PropertyName, @MissingRuleOf5Names); 822 823 @MissingRuleOf5Names = (); 824 PROPERTY: for $RuleOf5PropertyName (@{$OptionsInfo{RuleOf5PropertyNames}}) { 825 if (exists $SpecifiedPropertyNamesMap{lc($RuleOf5PropertyName)}) { 826 next PROPERTY; 827 } 828 push @MissingRuleOf5Names, $RuleOf5PropertyName; 829 830 # Add property name to specified properties names list and map... 
831 push @SpecifiedPropertyNames, $RuleOf5PropertyName; 832 $SpecifiedPropertyNamesMap{lc($RuleOf5PropertyName)} = $OptionsInfo{SupportedPropertyNamesMap}{lc($RuleOf5PropertyName)}; 833 } 834 if (@MissingRuleOf5Names) { 835 warn "Warning: The following physicochemical property names not specified in \"-m --Mode\" option are required for calculating RuleOf5Violations and have been added to the list of property names: @MissingRuleOf5Names\n"; 836 } 837 } 838 $OptionsInfo{Mode} = $Options{mode}; 839 840 # Setup canonical specified property names corresponding to supported names in mixed case... 841 my(@SpecifiedCanonicalPropertyNames); 842 843 @SpecifiedCanonicalPropertyNames = (); 844 for $SpecifiedPropertyName (@SpecifiedPropertyNames) { 845 push @SpecifiedCanonicalPropertyNames, $SpecifiedPropertyNamesMap{lc($SpecifiedPropertyName)}; 846 } 847 @{$OptionsInfo{SpecifiedPropertyNames}} = @SpecifiedCanonicalPropertyNames; 848 %{$OptionsInfo{SpecifiedPropertyNamesMap}} = %SpecifiedPropertyNamesMap; 849 850 # Based on specified property names, figure out whether hydrogens need to be added before 851 # calculation of properties... 852 # 853 $OptionsInfo{AddHydrogens} = 0; 854 if (exists($SpecifiedPropertyNamesMap{lc('MolecularVolume')}) || exists($SpecifiedPropertyNamesMap{lc('SLogP')}) || exists($SpecifiedPropertyNamesMap{lc('SMR')})) { 855 $OptionsInfo{AddHydrogens} = 1; 856 } 857 }

# Process precision option...
#
# Reads the comma delimited name/value pairs specified using "--Precision",
# validates each pair, and stores the result in $OptionsInfo{Precision} (the
# option value as specified) and $OptionsInfo{PrecisionParametersMap} (precision
# values keyed by WeightPrecision and MassPrecision). Unspecified parameters
# keep their default values. An invalid name or value terminates the script.
#
sub ProcessPrecisionOption {
  my(%PrecisionValues, %CanonicalParameterNames);

  # Start from an empty map; it's filled in only after all values check out...
  %{$OptionsInfo{PrecisionParametersMap}} = ();

  # Default precision values keyed by canonical parameter names...
  %PrecisionValues = ('WeightPrecision' => 2, 'MassPrecision' => 4);

  # Lowercase names accepted on the command line and the precision parameter
  # each one corresponds to...
  %CanonicalParameterNames = ('molecularweight' => 'WeightPrecision', 'exactmass' => 'MassPrecision');

  if ($Options{precision}) {
    my($ParameterName, $ParameterValue, $Specification, @Words);

    # Spaces are not significant anywhere in the specified value...
    ($Specification = $Options{precision}) =~ s/ //g;

    @Words = split ",", $Specification;
    if (@Words % 2) {
      die "Error: Invalid number of values specified using \"--Precision\" option: It must contain even number of values.\n";
    }

    # Take the words off two at a time: a parameter name followed by its value...
    while (@Words) {
      ($ParameterName, $ParameterValue) = splice(@Words, 0, 2);

      unless (exists $CanonicalParameterNames{lc($ParameterName)}) {
        die "Error: The precision parameter name specified, $ParameterName, for option \"--Precision\" is not valid.\n";
      }
      unless (IsPositiveInteger($ParameterValue)) {
        die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--Precision\" is not valid. Allowed values: positive integer. \n";
      }
      $PrecisionValues{$CanonicalParameterNames{lc($ParameterName)}} = $ParameterValue;
    }
  }

  $OptionsInfo{Precision} = $Options{precision};
  %{$OptionsInfo{PrecisionParametersMap}} = %PrecisionValues;
}

# Process rotatable bonds option...
#
# Reads the comma delimited name/value pairs specified using "--RotatableBonds",
# validates each pair, and stores the result in $OptionsInfo{RotatableBonds} (the
# option value as specified) and $OptionsInfo{RotatableBondsParametersMap} (1 or 0
# keyed by canonical parameter name). Unspecified parameters default to 1. An
# invalid name or value terminates the script.
#
sub ProcessRotatableBondsOption {
  my(%ParameterValues, %CanonicalParameterNames);

  %{$OptionsInfo{RotatableBondsParametersMap}} = ();

  # Default values keyed by canonical parameter names...
  %ParameterValues = ('IgnoreTerminalBonds' => 1, 'IgnoreBondsToTripleBonds' => 1, 'IgnoreAmideBonds' => 1, 'IgnoreThioamideBonds' => 1, 'IgnoreSulfonamideBonds' => 1);

  # Lowercase to canonical names for case insensitive lookup of specified names...
  %CanonicalParameterNames = map { (lc($_) => $_) } keys %ParameterValues;

  if ($Options{rotatablebonds}) {
    my($ParameterName, $ParameterValue, $Specification, @Words);

    # Spaces are not significant anywhere in the specified value...
    ($Specification = $Options{rotatablebonds}) =~ s/ //g;

    @Words = split ",", $Specification;
    if (@Words % 2) {
      die "Error: Invalid number of values specified using \"--RotatableBonds\" option: It must contain even number of values.\n";
    }

    # Take the words off two at a time: a parameter name followed by Yes or No...
    while (@Words) {
      ($ParameterName, $ParameterValue) = splice(@Words, 0, 2);

      unless (exists $CanonicalParameterNames{lc($ParameterName)}) {
        die "Error: The rotatable bonds parameter name specified, $ParameterName, for option \"--RotatableBonds\" is not valid.\n";
      }
      unless ($ParameterValue =~ /^(Yes|No)$/i) {
        die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--RotatableBonds\" is not valid. Allowed values: Yes or No. \n";
      }

      # Yes/No is stored as 1/0...
      if ($ParameterValue =~ /^Yes$/i) {
        $ParameterValues{$CanonicalParameterNames{lc($ParameterName)}} = 1;
      }
      else {
        $ParameterValues{$CanonicalParameterNames{lc($ParameterName)}} = 0;
      }
    }
  }

  $OptionsInfo{RotatableBonds} = $Options{rotatablebonds};
  %{$OptionsInfo{RotatableBondsParametersMap}} = %ParameterValues;
}

# Process TPSA option...
#
# Reads the comma delimited name/value pairs specified using "--TPSA", validates
# each pair, and stores the result in $OptionsInfo{TPSA} (the option value as
# specified) and $OptionsInfo{TPSAParametersMap} (1 or 0 keyed by IgnorePhosphorus
# and IgnoreSulfur). Unspecified parameters default to 1. An invalid name or
# value terminates the script.
#
sub ProcessTPSAOption {
  my(%ParameterValues, %CanonicalParameterNames);

  %{$OptionsInfo{TPSAParametersMap}} = ();

  # Default values keyed by canonical parameter names...
  %ParameterValues = ('IgnorePhosphorus' => 1, 'IgnoreSulfur' => 1);

  # Lowercase to canonical names for case insensitive lookup of specified names...
  %CanonicalParameterNames = map { (lc($_) => $_) } keys %ParameterValues;

  if ($Options{tpsa}) {
    my($ParameterName, $ParameterValue, $Specification, @Words);

    # Spaces are not significant anywhere in the specified value...
    ($Specification = $Options{tpsa}) =~ s/ //g;

    @Words = split ",", $Specification;
    if (@Words % 2) {
      die "Error: Invalid number of values specified using \"--TPSA\" option: It must contain even number of values.\n";
    }

    # Take the words off two at a time: a parameter name followed by Yes or No...
    while (@Words) {
      ($ParameterName, $ParameterValue) = splice(@Words, 0, 2);

      unless (exists $CanonicalParameterNames{lc($ParameterName)}) {
        die "Error: The TPSA parameter name specified, $ParameterName, for option \"--TPSA\" is not valid.\n";
      }
      unless ($ParameterValue =~ /^(Yes|No)$/i) {
        die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--TPSA\" is not valid. Allowed values: Yes or No. \n";
      }

      # Yes/No is stored as 1/0...
      if ($ParameterValue =~ /^Yes$/i) {
        $ParameterValues{$CanonicalParameterNames{lc($ParameterName)}} = 1;
      }
      else {
        $ParameterValues{$CanonicalParameterNames{lc($ParameterName)}} = 0;
      }
    }
  }

  $OptionsInfo{TPSA} = $Options{tpsa};
  %{$OptionsInfo{TPSAParametersMap}} = %ParameterValues;
}

# Process molecular complexity parameters...
#
# Parse and validate the comma delimited name/value pairs specified using the
# "--MolecularComplexity" option, fill in defaults for unspecified values based
# on the molecular complexity type, and store the parameters relevant to that
# type in $OptionsInfo{MolecularComplexityParametersMap}. The option value as
# specified is stored in $OptionsInfo{MolecularComplexity}.
#
# When stored, AtomicInvariantsToUse and FunctionalClassesToUse are array
# references and UseBondSymbols and UseTriangleInequality are 1 or 0. Any invalid
# parameter name or value terminates the script with an error message.
#
sub ProcessMolecularComplexityOption {
  my($MolecularComplexityType, $ParameterName, $ParameterValue, @ParameterNames, @ParameterValues, @AtomIdentifierTypeParameters, %ComplexityParametersMap, %ComplexityParameterNamesMap);

  %{$OptionsInfo{MolecularComplexityParametersMap}} = ();

  # Default values. Empty strings mark values whose defaults depend on the
  # molecular complexity type; those are set after processing specified values...
  %ComplexityParametersMap = ('MolecularComplexityType' => '', 'AtomIdentifierType' => '',
			      'AtomicInvariantsToUse' => '', 'FunctionalClassesToUse' => '',
			      'MACCSKeysSize' => '166', 'NeighborhoodRadius' => '2',
			      'MinPathLength' => '1', 'MaxPathLength' => '8', 'UseBondSymbols' => '1',
			      'MinDistance' => '1', 'MaxDistance' => '10', 'UseTriangleInequality' => '',
			      'DistanceBinSize' => '2', 'NormalizationMethodology' => 'None');

  # Lowercase to canonical names for case insensitive lookup of specified names...
  %ComplexityParameterNamesMap = ();
  for $ParameterName (keys %ComplexityParametersMap) {
    $ComplexityParameterNamesMap{lc($ParameterName)} = $ParameterName;
  }

  if ($Options{molecularcomplexity}) {
    # Process specified values...
    my($Index, $SpecifiedComplexity, @SpecifiedComplexityValuePairs);

    $SpecifiedComplexity = $Options{molecularcomplexity};

    # Spaces are kept at this point: list values such as AtomicInvariantsToUse
    # are space delimited...
    @SpecifiedComplexityValuePairs = split ",", $SpecifiedComplexity;
    if (@SpecifiedComplexityValuePairs % 2) {
      die "Error: Invalid number of values specified using \"--MolecularComplexity\" option: It must contain even number of values.\n";
    }

    for ($Index = 0; (($Index + 1) < @SpecifiedComplexityValuePairs); $Index += 2 ) {
      $ParameterName = $SpecifiedComplexityValuePairs[$Index];
      $ParameterValue = $SpecifiedComplexityValuePairs[$Index + 1];

      $ParameterName = RemoveLeadingAndTrailingWhiteSpaces($ParameterName);
      $ParameterValue = RemoveLeadingAndTrailingWhiteSpaces($ParameterValue);

      if (!exists $ComplexityParameterNamesMap{lc($ParameterName)}) {
	die "Error: The molecular complexity parameter name specified, $ParameterName, for option \"--MolecularComplexity\" is not valid.\n";
      }
      $ParameterName = $ComplexityParameterNamesMap{lc($ParameterName)};

      if ($ParameterName =~ /^AtomicInvariantsToUse$/i) {
	my($AtomicInvariant, $AtomSymbolFound);

	$AtomSymbolFound = 0;
	@ParameterValues = split(' ', $ParameterValue);
	for $AtomicInvariant (@ParameterValues) {
	  if (!AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($AtomicInvariant)) {
	    die "Error: The atomic invariant specified, $AtomicInvariant, for AtomicInvariantsToUse in option \"--MolecularComplexity\" is not valid.\n";
	  }
	  if ($AtomicInvariant =~ /^(AS|AtomSymbol)$/i) {
	    $AtomSymbolFound = 1;
	  }
	}
	if (!$AtomSymbolFound) {
	  die "Error: The atomic invariants specified using AtomicInvariantsToUse in option \"--MolecularComplexity\" is not valid: AtomicInvariant atom symbol, AS or AtomSymbol, must be specified.\n";
	}
	# Keep as a comma delimited string for now; converted to a list later...
	$ParameterValue = JoinWords(\@ParameterValues, ",", 0);
      }
      elsif ($ParameterName =~ /^FunctionalClassesToUse$/i) {
	my($FunctionalClass);

	@ParameterValues = split(' ', $ParameterValue);
	for $FunctionalClass (@ParameterValues) {
	  if (!FunctionalClassAtomTypes::IsFunctionalClassAvailable($FunctionalClass)) {
	    die "Error: The functional class specified, $FunctionalClass, for FunctionalClassesToUse in option \"--MolecularComplexity\" is not valid.\n";
	  }
	}
	# Keep as a comma delimited string for now; converted to a list later...
	$ParameterValue = JoinWords(\@ParameterValues, ",", 0);
      }
      else {
	# Spaces are not significant in any other parameter value...
	$ParameterValue =~ s/ //g;
      }

      if ($ParameterName =~ /^MolecularComplexityType$/i) {
	if ($ParameterValue !~ /^(AtomTypesFingerprints|ExtendedConnectivityFingerprints|MACCSKeys|PathLengthFingerprints|TopologicalAtomPairsFingerprints|TopologicalAtomTripletsFingerprints|TopologicalAtomTorsionsFingerprints|TopologicalPharmacophoreAtomPairsFingerprints|TopologicalPharmacophoreAtomTripletsFingerprints)$/i) {
	  die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: AtomTypesFingerprints, ExtendedConnectivityFingerprints, MACCSKeys, PathLengthFingerprints, TopologicalAtomPairsFingerprints, TopologicalAtomTripletsFingerprints, TopologicalAtomTorsionsFingerprints, TopologicalPharmacophoreAtomPairsFingerprints, or TopologicalPharmacophoreAtomTripletsFingerprints..\n";
	}
      }
      elsif ($ParameterName =~ /^AtomIdentifierType$/i) {
	if ($ParameterValue !~ /^(AtomicInvariantsAtomTypes|FunctionalClassAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
	  die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, FunctionalClassAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes and UFFAtomTypes.\n";
	}
      }
      elsif ($ParameterName =~ /^(MACCSKeysSize|MinPathLength|MaxPathLength|MinDistance|MaxDistance|DistanceBinSize)$/i) {
	if (!IsPositiveInteger($ParameterValue)) {
	  die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: positive integer. \n";
	}
      }
      elsif ($ParameterName =~ /^NeighborhoodRadius$/i) {
	if (!(IsInteger($ParameterValue) && $ParameterValue >=0)) {
	  die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: 0 or positive integer. \n";
	}
      }
      elsif ($ParameterName =~ /^NormalizationMethodology$/i) {
	if ($ParameterValue !~ /^(None|ByHeavyAtomsCount|ByPossibleKeysCount)$/i) {
	  die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: None, ByHeavyAtomsCount, or ByPossibleKeysCount\n";
	}
      }
      elsif ($ParameterName =~ /^(UseBondSymbols|UseTriangleInequality)$/i) {
	# Only these two parameters are Yes/No flags. Converting Yes/No to 1/0
	# here, instead of for every parameter, keeps values such as
	# "MinPathLength,Yes" from slipping through integer validation as 1.
	# A literal 0 or 1 is still accepted for backward compatibility...
	if ($ParameterValue !~ /^(Yes|No|0|1)$/i) {
	  die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: Yes or No. \n";
	}
	if ($ParameterValue =~ /^(Yes|No)$/i) {
	  $ParameterValue = ($ParameterValue =~ /^Yes$/i) ? 1 : 0;
	}
      }
      $ComplexityParametersMap{$ParameterName} = $ParameterValue;
    }

    # Checks involving more than one value, or a fixed set of numerical values...
    if ($ComplexityParametersMap{MACCSKeysSize} !~ /^(166|322)$/i) {
      die "Error: The parameter value specified, $ComplexityParametersMap{MACCSKeysSize}, for parameter name, MACCSKeysSize in option \"--MolecularComplexity\" is not valid. Allowed values: 166 or 322\n";
    }
    if ($ComplexityParametersMap{MinPathLength} > $ComplexityParametersMap{MaxPathLength}) {
      die "Error: The parameter value specified for MinPathLength, $ComplexityParametersMap{MinPathLength}, must be <= MaxPathLength, $ComplexityParametersMap{MaxPathLength} ...\n";
    }
    if ($ComplexityParametersMap{MinDistance} > $ComplexityParametersMap{MaxDistance}) {
      die "Error: The parameter value specified for MinDistance, $ComplexityParametersMap{MinDistance}, must be <= MaxDistance, $ComplexityParametersMap{MaxDistance} ...\n";
    }
  }

  # Set default parameter values...

  if (IsEmpty($ComplexityParametersMap{MolecularComplexityType})) {
    $ComplexityParametersMap{MolecularComplexityType} = 'MACCSKeys';
  }
  $MolecularComplexityType = $ComplexityParametersMap{MolecularComplexityType};


  if (IsEmpty($ComplexityParametersMap{AtomIdentifierType})) {
    $ComplexityParametersMap{AtomIdentifierType} = ($MolecularComplexityType =~ /^(TopologicalPharmacophoreAtomPairsFingerprints|TopologicalPharmacophoreAtomTripletsFingerprints)$/i) ? "FunctionalClassAtomTypes" : "AtomicInvariantsAtomTypes";
  }

  if (IsEmpty($ComplexityParametersMap{AtomicInvariantsToUse})) {
    my($AtomicInvariantsToUse);

    if ($MolecularComplexityType =~ /^(AtomTypesFingerprints|TopologicalAtomPairsFingerprints|TopologicalAtomTripletsFingerprints|TopologicalAtomTorsionsFingerprints)$/i) {
      $AtomicInvariantsToUse = "AS,X,BO,H,FC";
    }
    elsif ($MolecularComplexityType =~ /^ExtendedConnectivityFingerprints$/i) {
      $AtomicInvariantsToUse = "AS,X,BO,H,FC,MN";
    }
    else {
      $AtomicInvariantsToUse = "AS";
    }
    $ComplexityParametersMap{AtomicInvariantsToUse} = $AtomicInvariantsToUse;
  }

  if (IsEmpty($ComplexityParametersMap{FunctionalClassesToUse})) {
    my($FunctionalClassesToUse);

    if ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomPairsFingerprints$/i) {
      $FunctionalClassesToUse = "HBD,HBA,PI,NI,H";
    }
    elsif ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomTripletsFingerprints$/i) {
      $FunctionalClassesToUse = "HBD,HBA,PI,NI,H,Ar";
    }
    else {
      $FunctionalClassesToUse = "HBD,HBA,PI,NI,H,Ar,Hal";
    }
    $ComplexityParametersMap{FunctionalClassesToUse} = $FunctionalClassesToUse;
  }

  # Replace comma delimited strings by references to lists of values...
  my(@AtomicInvariantsToUse);
  @AtomicInvariantsToUse = split ',', $ComplexityParametersMap{AtomicInvariantsToUse};
  $ComplexityParametersMap{AtomicInvariantsToUse} = \@AtomicInvariantsToUse;

  my(@FunctionalClassesToUse);
  @FunctionalClassesToUse = split ',', $ComplexityParametersMap{FunctionalClassesToUse};
  $ComplexityParametersMap{FunctionalClassesToUse} = \@FunctionalClassesToUse;

  if (IsEmpty($ComplexityParametersMap{UseTriangleInequality})) {
    $ComplexityParametersMap{UseTriangleInequality} = 0;
    if ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomTripletsFingerprints$/i) {
      $ComplexityParametersMap{UseTriangleInequality} = 1;
    }
  }

  if ($MolecularComplexityType =~ /^(TopologicalPharmacophoreAtomPairsFingerprints|TopologicalPharmacophoreAtomTripletsFingerprints)$/i) {
    if ($ComplexityParametersMap{AtomIdentifierType} !~ /^FunctionalClassAtomTypes$/i) {
      die "Error: The parameter value specified for AtomIdentifierType, $ComplexityParametersMap{AtomIdentifierType}, in option \"--MolecularComplexity\" is not valid for MolecularComplexityType, $MolecularComplexityType: Allowed value: FunctionalClassAtomTypes...\n";
    }
  }

  # Set up appropriate parameter names for specified molecular complexity...

  @ParameterNames = ();
  push @ParameterNames, 'MolecularComplexityType';

  @AtomIdentifierTypeParameters = ();
  push @AtomIdentifierTypeParameters, 'AtomIdentifierType';
  if ($ComplexityParametersMap{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) {
    push @AtomIdentifierTypeParameters, 'AtomicInvariantsToUse';
  }
  elsif ($ComplexityParametersMap{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
    push @AtomIdentifierTypeParameters, 'FunctionalClassesToUse';
  }

  COMPLEXITYTYPE: {
    if ($MolecularComplexityType =~ /^AtomTypesFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^ExtendedConnectivityFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      push @ParameterNames, ('NeighborhoodRadius', 'NormalizationMethodology');
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^MACCSKeys$/i) {
      push @ParameterNames, 'MACCSKeysSize';
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^PathLengthFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      push @ParameterNames, ('MinPathLength', 'MaxPathLength', 'UseBondSymbols');
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^TopologicalAtomPairsFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      push @ParameterNames, ('MinDistance', 'MaxDistance');
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^TopologicalAtomTripletsFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      push @ParameterNames, ('MinDistance', 'MaxDistance', 'UseTriangleInequality');
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^TopologicalAtomTorsionsFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomPairsFingerprints$/i) {
      push @ParameterNames, ('AtomIdentifierType', 'FunctionalClassesToUse', 'MinDistance', 'MaxDistance', 'NormalizationMethodology');
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomTripletsFingerprints$/i) {
      push @ParameterNames, ('AtomIdentifierType', 'FunctionalClassesToUse', 'MinDistance', 'MaxDistance', 'UseTriangleInequality', 'NormalizationMethodology', 'DistanceBinSize');
      last COMPLEXITYTYPE;
    }
    # Report the complexity type itself: $ParameterValue holds whatever value
    # was parsed last and is undefined when the option was not specified...
    die "Error: The parameter value specified, $MolecularComplexityType, for parameter name MolecularComplexityType using \"--MolecularComplexity\" is not valid.\n";
  }

  $OptionsInfo{MolecularComplexity} = $Options{molecularcomplexity};

  # Store only those parameters which apply to specified molecular complexity type...
  %{$OptionsInfo{MolecularComplexityParametersMap}} = ();
  for $ParameterName (@ParameterNames) {
    $ParameterValue = $ComplexityParametersMap{$ParameterName};
    $OptionsInfo{MolecularComplexityParametersMap}{$ParameterName} = $ParameterValue;
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Initializes %Options with default values, parses the command line using
# GetOptions, changes to the working directory when "-w --workingdir" is
# specified, and validates options restricted to a fixed set of values. The
# mode, molecular complexity, precision, rotatable bonds and TPSA options
# start out empty; the comments below note that their defaults are set later.
# Any invalid value terminates the script with an error message.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{compoundidmode} = 'LabelPrefix';
  $Options{compoundidlabel} = 'CompoundID';
  $Options{datafieldsmode} = 'CompoundID';

  $Options{filter} = 'Yes';

  $Options{hydrogenbonds} = 'HBondsType2';

  $Options{keeplargestcomponent} = 'Yes';

  # Default mode values are set later...
  $Options{mode} = '';

  # Default molecular complexity values are set later...
  $Options{molecularcomplexity} = '';

  # Default precision values are set later...
  $Options{precision} = '';

  $Options{output} = 'text';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  # Default rotatable bond parameter values are set later...
  $Options{rotatablebonds} = '';

  $Options{ruleof3violations} = 'No';
  $Options{ruleof5violations} = 'No';

  # Default TPSA parameter values are set later...
  $Options{tpsa} = '';

  if (!GetOptions(\%Options, "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "help|h", "hydrogenbonds=s", "keeplargestcomponent|k=s", "mode|m=s", "molecularcomplexity=s", "outdelim=s", "output=s", "overwrite|o", "precision=s", "rotatablebonds=s", "ruleof3violations=s", "ruleof5violations=s", "quote|q=s", "root|r=s", "tpsa=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
  }
  if ($Options{filter} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{hydrogenbonds} !~ /^(HBondsType1|HydrogenBondsType1|HBondsType2|HydrogenBondsType2)$/i) {
    die "Error: The value specified, $Options{hydrogenbonds}, for option \"--HydrogenBonds\" is not valid. Allowed values: HBondsType1, HydrogenBondsType1, HBondsType2, HydrogenBondsType2\n";
  }
  if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{output} !~ /^(SD|text|both)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{ruleof3violations} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{ruleof3violations}, for option \"--RuleOf3Violations\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{ruleof5violations} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{ruleof5violations}, for option \"--RuleOf5Violations\" is not valid. Allowed values: Yes or No\n";
  }
}
