#!/usr/bin/perl -w
#
# $RCSfile: TopologicalAtomPairsFingerprints.pl,v $
# $Date: 2010/07/03 20:32:14 $
# $Revision: 1.18 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2010 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use MoleculeFileIO;
use AtomTypes::AtomicInvariantsAtomTypes;
use AtomTypes::FunctionalClassAtomTypes;
use Fingerprints::TopologicalAtomPairsFingerprints;
use Fingerprints::FingerprintsStringUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
my(%SDFilesInfo);
print "Checking input SD file(s)...\n";
RetrieveSDFilesInfo();

# Process input files..
my($FileIndex, $SDFile, $FileProcessingMsg);
$FileProcessingMsg = "Processing file";
if (@SDFilesList > 1) {
  print "Processing SD files...\n";
  $FileProcessingMsg = "\n$FileProcessingMsg";
}

# Only files flagged as okay by RetrieveSDFilesInfo are processed...
for $FileIndex (0 .. $#SDFilesList) {
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    $SDFile = $SDFilesList[$FileIndex];
    print "$FileProcessingMsg $SDFile...\n";
    GenerateTopologicalAtomPairsFingerprints($FileIndex);
  }
}
print "$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate fingerprints for a SD file...
#
# Arguments:
#   $FileIndex - index of the input file in @SDFilesList; the same index is
#                used for the per-file arrays stored in %SDFilesInfo.
#
# Reads each compound, optionally filters it, generates its fingerprints and
# writes them to the output file(s) opened by SetupAndOpenOutputFiles. A
# summary of processed and ignored compounds is printed at the end.
#
sub GenerateTopologicalAtomPairsFingerprints {
  my($FileIndex) = @_;
  my($CmpdCount, $IgnoredCmpdCount, $SDFile, $MoleculeFileIO, $Molecule, $TopologicalAtomPairsFingerprints, $NewSDFileRef, $NewTextFileRef);

  $SDFile = $SDFilesList[$FileIndex];

  # Setup output files...
  ($NewSDFileRef, $NewTextFileRef) = SetupAndOpenOutputFiles($FileIndex);

  $MoleculeFileIO = MoleculeFileIO->new('Name' => $SDFile);
  $MoleculeFileIO->Open();

  $CmpdCount = 0;
  $IgnoredCmpdCount = 0;

  COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
    $CmpdCount++;

    # Filter compound data before calculating fingerprints...
    if ($OptionsInfo{Filter}) {
      if (CheckAndFilterCompound($CmpdCount, $Molecule)) {
        $IgnoredCmpdCount++;
        next COMPOUND;
      }
    }

    $TopologicalAtomPairsFingerprints = GenerateMoleculeFingerprints($Molecule);
    if (!$TopologicalAtomPairsFingerprints) {
      $IgnoredCmpdCount++;
      ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
      next COMPOUND;
    }

    WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $TopologicalAtomPairsFingerprints, $NewSDFileRef, $NewTextFileRef);
  }
  $MoleculeFileIO->Close();

  # Buffered write errors only surface at close, so check it explicitly...
  if ($OptionsInfo{SDOutput}) {
    close $NewSDFileRef or die "Error: Couldn't close $SDFilesInfo{SDOutFileNames}[$FileIndex]: $! \n";
  }
  if ($OptionsInfo{TextOutput}) {
    close $NewTextFileRef or die "Error: Couldn't close $SDFilesInfo{TextOutFileNames}[$FileIndex]: $! \n";
  }

  WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
}

# Process compound being ignored due to problems in fingerprints generation...
#
# Arguments:
#   $Mode      - reason for ignoring the compound: ContainsNonElementalData,
#                ContainsNoElementalData or FingerprintsGenerationFailed
#                (matched case insensitively). Any other value is reported
#                as a fingerprints generation failure.
#   $CmpdCount - record number of the compound in the input file.
#   $Molecule  - molecule object for the compound.
#
# Emits a warning; doesn't return anything useful.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;
  my($CmpdID, $DataFieldLabelAndValuesRef, $ModeKey, $Reason, %ReasonForMode);

  $DataFieldLabelAndValuesRef = $Molecule->GetMDLDataFieldLabelAndValues();
  $CmpdID = SetupCmpdIDForTextFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);

  %ReasonForMode = (
    'containsnonelementaldata' => "Compound contains atom data corresponding to non-elemental atom symbol(s)...",
    'containsnoelementaldata' => "Compound contains no atom data...",
    'fingerprintsgenerationfailed' => "Fingerprints generation didn't succeed...",
  );

  $ModeKey = defined $Mode ? lc $Mode : '';
  $Reason = exists $ReasonForMode{$ModeKey} ? $ReasonForMode{$ModeKey} : $ReasonForMode{fingerprintsgenerationfailed};

  warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: $Reason\n\n";
}

#
# Check and filter compounds....
#
# Arguments:
#   $CmpdCount - record number of the compound in the input file.
#   $Molecule  - molecule object for the compound.
#
# Returns 1 after reporting the compound through ProcessIgnoredCompound when
# it contains non-elemental atom data or no elemental atom data at all;
# otherwise returns 0.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;
  my($ElementCount, $NonElementCount);

  ($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();

  if ($NonElementCount) {
    ProcessIgnoredCompound('ContainsNonElementalData', $CmpdCount, $Molecule);
    return 1;
  }

  if (!$ElementCount) {
    ProcessIgnoredCompound('ContainsNoElementalData', $CmpdCount, $Molecule);
    return 1;
  }

  return 0;
}

# Write out compounds fingerprints generation summary statistics...
#
# Arguments:
#   $CmpdCount        - total number of compounds read.
#   $IgnoredCmpdCount - number of compounds skipped.
#
sub WriteFingerprintsGenerationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount) = @_;
  my($ProcessedCmpdCount);

  $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";
  print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
  print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}

# Open output files...
#
# Arguments:
#   $FileIndex - index of the input file whose output files are to be opened.
#
# Returns a list of two values: file handle for the new SD file and file
# handle for the new text file. An empty string is returned in place of a
# handle whose output type is not requested. The column labels line is
# written to the text file before returning.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($NewSDFile, $NewTextFile, $NewSDFileRef, $NewTextFileRef);

  $NewSDFileRef = '';
  $NewTextFileRef = '';

  if ($OptionsInfo{SDOutput}) {
    $NewSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $NewSDFile...\n";
    # Three-argument open keeps the file name from being interpreted as part of the mode...
    open my $SDFileHandle, '>', $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
    $NewSDFileRef = $SDFileHandle;
  }
  if ($OptionsInfo{TextOutput}) {
    $NewTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    print "Generating text file $NewTextFile...\n";
    open my $TextFileHandle, '>', $NewTextFile or die "Error: Couldn't open $NewTextFile: $! \n";
    WriteTextFileCoulmnLabels($FileIndex, $TextFileHandle);
    $NewTextFileRef = $TextFileHandle;
  }
  return ($NewSDFileRef, $NewTextFileRef);
}

# Write fingerprints and other data to appropriate output files...
#
# Arguments:
#   $FileIndex                        - index of the input file being processed.
#   $CmpdCount                        - record number of the compound.
#   $Molecule                         - molecule object for the compound.
#   $TopologicalAtomPairsFingerprints - fingerprints object for the compound.
#   $NewSDFileRef, $NewTextFileRef    - output file handles returned by
#                                       SetupAndOpenOutputFiles.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $TopologicalAtomPairsFingerprints, $NewSDFileRef, $NewTextFileRef) = @_;
  my($FingerprintsString);

  $FingerprintsString = GetFingerprintsString($TopologicalAtomPairsFingerprints);

  if ($OptionsInfo{SDOutput}) {
    # Retrieve input compound string used to create molecule and write it out
    # without last line containing a delimiter...
    my($CmpdString);
    $CmpdString = $Molecule->GetMDLCmpdString();
    $CmpdString =~ s/\$\$\$\$$//;
    print $NewSDFileRef "$CmpdString";

    # Write out fingerprints data...
    print $NewSDFileRef "> <$OptionsInfo{FingerprintsLabel}>\n$FingerprintsString\n\n";

    # Write out delimiter...
    print $NewSDFileRef "\$\$\$\$\n";
  }

  if ($OptionsInfo{TextOutput}) {
    my($Line, $DataFieldLabelAndValuesRef, @LineWords);

    $DataFieldLabelAndValuesRef = $Molecule->GetMDLDataFieldLabelAndValues();
    @LineWords = ();
    if ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
      push @LineWords, SetupCmpdIDForTextFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
    }
    else {
      # Pick the list of data field labels for the mode; missing values are
      # written out as empty strings...
      my($DataFieldLabelsRef);
      $DataFieldLabelsRef = ($OptionsInfo{DataFieldsMode} =~ /^All$/i) ? $SDFilesInfo{AllDataFieldsRef}[$FileIndex]
        : ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) ? $SDFilesInfo{CommonDataFieldsRef}[$FileIndex]
        : ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) ? $OptionsInfo{SpecifiedDataFields}
        : [];
      @LineWords = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$DataFieldLabelsRef};
    }

    # Add fingerprints string...
    push @LineWords, $FingerprintsString;

    $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }
}

# Write out appropriate column labels to text file...
#
# Arguments:
#   $FileIndex      - index of the input file being processed.
#   $NewTextFileRef - file handle for the text file.
#
sub WriteTextFileCoulmnLabels {
  my($FileIndex, $NewTextFileRef) = @_;
  my($Line, @LineWords);

  @LineWords = ();
  if ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
    push @LineWords, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
    push @LineWords, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
    push @LineWords, @{$OptionsInfo{SpecifiedDataFields}};
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
    push @LineWords, $OptionsInfo{CompoundIDLabel};
  }
  # Add fingerprints label...
  push @LineWords, $OptionsInfo{FingerprintsLabel};

  $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileRef "$Line\n";
}

# Generate compound ID for text files..
#
# Arguments:
#   $CmpdCount                  - record number of the compound.
#   $Molecule                   - molecule object for the compound.
#   $DataFieldLabelAndValuesRef - reference to a hash of data field labels
#                                 and values for the compound.
#
# Returns the compound ID built according to $OptionsInfo{CompoundIDMode};
# an empty string is returned for an unrecognized mode or when the data
# field named by $OptionsInfo{CompoundID} is missing in DataField mode.
#
sub SetupCmpdIDForTextFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my($CmpdID);

  $CmpdID = '';
  if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
    my($MolName);
    $MolName = $Molecule->GetName();
    $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
    $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
    my($SpecifiedDataField);
    $SpecifiedDataField = $OptionsInfo{CompoundID};
    $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
    $CmpdID = $Molecule->GetName();
  }
  return $CmpdID;
}

# Get fingerprints as a string...
#
# The string is formatted according to $OptionsInfo{VectorStringFormat}.
#
sub GetFingerprintsString {
  my($TopologicalAtomPairsFingerprints) = @_;

  return FingerprintsStringUtil::GenerateFingerprintsString($TopologicalAtomPairsFingerprints, $OptionsInfo{VectorStringFormat});
}

# Generate fingerprints for molecule...
#
# Arguments:
#   $Molecule - molecule object; it's modified in place when the largest
#               component is kept and by ring and aromaticity detection.
#
# Returns the fingerprints object on success. Returns nothing (undef in
# scalar context) when ring detection fails or fingerprints generation is
# not successful.
#
sub GenerateMoleculeFingerprints {
  my($Molecule) = @_;
  my($TopologicalAtomPairsFingerprints);

  if ($OptionsInfo{KeepLargestComponent}) {
    $Molecule->KeepLargestComponent();
  }
  if (!$Molecule->DetectRings()) {
    return;
  }
  $Molecule->DetectAromaticity();

  $TopologicalAtomPairsFingerprints = TopologicalAtomPairsFingerprints->new('Molecule' => $Molecule, 'MinDistance' => $OptionsInfo{MinDistance}, 'MaxDistance' => $OptionsInfo{MaxDistance}, 'AtomIdentifierType' => $OptionsInfo{AtomIdentifierType});
  SetAtomIdentifierTypeValuesToUse($TopologicalAtomPairsFingerprints);

  # Generate fingerprints...
  $TopologicalAtomPairsFingerprints->GenerateFingerprints();

  # Make sure fingerprints generation is successful...
  if (!$TopologicalAtomPairsFingerprints->IsFingerprintsGenerationSuccessful()) {
    return;
  }

  return $TopologicalAtomPairsFingerprints;
}

# Set atom identifier type to use for generating fingerprints...
#
# Arguments:
#   $TopologicalAtomPairsFingerprints - fingerprints object to configure.
#
# Passes the atomic invariants or functional classes collected in
# %OptionsInfo to the fingerprints object; nothing needs to be set for the
# other supported atom identifier types. Dies for an unsupported type.
#
sub SetAtomIdentifierTypeValuesToUse {
  my($TopologicalAtomPairsFingerprints) = @_;

  if ($OptionsInfo{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) {
    $TopologicalAtomPairsFingerprints->SetAtomicInvariantsToUse(\@{$OptionsInfo{AtomicInvariantsToUse}});
  }
  elsif ($OptionsInfo{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
    $TopologicalAtomPairsFingerprints->SetFunctionalClassesToUse(\@{$OptionsInfo{FunctionalClassesToUse}});
  }
  elsif ($OptionsInfo{AtomIdentifierType} =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    # Nothing to do for now...
  }
  else {
    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
  }
}

# Retrieve information about SD files...
#
# Fills in %SDFilesInfo with one entry per input file in @SDFilesList:
#   FileOkay            - 1 when the file can be processed; otherwise 0.
#   OutFileRoot         - root name used for the output files.
#   SDOutFileNames      - name of the new SD file.
#   TextOutFileNames    - name of the new text file.
#   AllDataFieldsRef    - all data field labels (collected only for text
#                         output in All or Common data fields mode).
#   CommonDataFieldsRef - common data field labels (same condition).
#
# A file is skipped with a warning when it doesn't exist, isn't a SD file,
# is missing the data field needed for compound IDs, would be overwritten by
# its own output or when an output file already exists and overwriting is
# not requested.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $NewSDFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Assume the file is bad until all the checks have passed...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in SD file; only the first compound is checked...
      my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

      @CmpdLines = ();
      open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($CmpdCount);
      open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
      close $SDFileHandle;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $TextOutFileExt = "tsv";
    }
    $SDOutFileExt = $FileExt;

    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}TopologicalAtomPairsFP";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # Quote the file name so that "." and any other regex metacharacters in
      # it are matched literally...
      if ($SDFile =~ /\Q$NewSDFileName\E/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}

# Process option values...
#
# Converts the raw command line values in %Options into the %OptionsInfo
# hash used by the rest of the script. Dies when a value required by the
# selected mode is missing.
#
sub ProcessOptions {
  %OptionsInfo = ();

  ProcessAtomIdentifierTypeOptions();

  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
  $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
  $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

  my(@SpecifiedDataFields);
  @SpecifiedDataFields = ();

  @{$OptionsInfo{SpecifiedDataFields}} = ();
  $OptionsInfo{CompoundID} = '';

  # Compound IDs are also used in the warnings written out for ignored
  # compounds, so set up the ID data field or label prefix irrespective of the
  # data fields mode. A data field name is mandatory only when compound IDs
  # are written to the text file.
  if ($Options{compoundidmode} =~ /^DataField$/i) {
    if (($Options{datafieldsmode} =~ /^CompoundID$/i) && !$Options{compoundid}) {
      die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
    }
    $OptionsInfo{CompoundID} = defined $Options{compoundid} ? $Options{compoundid} : '';
  }
  elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
    $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
  }

  if ($Options{datafieldsmode} =~ /^Specify$/i) {
    if (!$Options{datafields}) {
      die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
    }
    @SpecifiedDataFields = split /\,/, $Options{datafields};
    push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
  }

  $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'TopologicalAtomPairsFingerprints';

  $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{MinDistance} = $Options{mindistance};
  $OptionsInfo{MaxDistance} = $Options{maxdistance};

  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|Both)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|Both)$/i) ? 1 : 0;

  $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? ";" : ",");
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat};
}

# Process atom identifier type and related options...
#
# Stores the atom identifier type in %OptionsInfo and processes the list of
# atomic invariants or functional classes that goes along with it. Dies for
# an unsupported atom identifier type.
#
sub ProcessAtomIdentifierTypeOptions {

  $OptionsInfo{AtomIdentifierType} = $Options{atomidentifiertype};

  if ($Options{atomidentifiertype} =~ /^AtomicInvariantsAtomTypes$/i) {
    ProcessAtomicInvariantsToUseOption();
  }
  elsif ($Options{atomidentifiertype} =~ /^FunctionalClassAtomTypes$/i) {
    ProcessFunctionalClassesToUse();
  }
  elsif ($OptionsInfo{AtomIdentifierType} =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    # Nothing to do for now...
  }
  else {
    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
  }
}

# Process specified atomic invariants to use...
#
# Splits the comma delimited value of "--AtomicInvariantsToUse" option,
# validates each atomic invariant and collects them in
# $OptionsInfo{AtomicInvariantsToUse}. Dies when the value is empty, contains
# an unavailable atomic invariant or doesn't include AS or AtomSymbol.
#
sub ProcessAtomicInvariantsToUseOption {
  @{$OptionsInfo{AtomicInvariantsToUse}} = ();

  die "Error: Atomic invariants value specified using \"--AtomicInvariantsToUse\" option is empty\n" if IsEmpty($Options{atomicinvariantstouse});

  for my $Invariant (split /\,/, $Options{atomicinvariantstouse}) {
    unless (AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($Invariant)) {
      die "Error: Atomic invariant specified, $Invariant, using \"--AtomicInvariantsToUse\" option is not valid...\n ";
    }
    push @{$OptionsInfo{AtomicInvariantsToUse}}, $Invariant;
  }

  # Atom symbol must be one of the validated invariants...
  unless (grep { /^(AS|AtomSymbol)$/i } @{$OptionsInfo{AtomicInvariantsToUse}}) {
    die "Error: Atomic invariant, AS or AtomSymbol, must be specified as using \"--AtomicInvariantsToUse\" option...\n ";
  }
}

# Process specified functional classes invariants to use...
#
# Splits the comma delimited value of "--FunctionalClassesToUse" option,
# validates each functional class and collects them in
# $OptionsInfo{FunctionalClassesToUse}. Dies when the value is empty or
# contains an unavailable functional class.
#
sub ProcessFunctionalClassesToUse {
  @{$OptionsInfo{FunctionalClassesToUse}} = ();

  die "Error: Functional classes value specified using \"--FunctionalClassesToUse\" option is empty\n" if IsEmpty($Options{functionalclassestouse});

  for my $ClassName (split /\,/, $Options{functionalclassestouse}) {
    unless (FunctionalClassAtomTypes::IsFunctionalClassAvailable($ClassName)) {
      die "Error: Functional class specified, $ClassName, using \"--FunctionalClassesToUse\" option is not valid...\n ";
    }
    push @{$OptionsInfo{FunctionalClassesToUse}}, $ClassName;
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Initializes %Options with default values, overrides them with the values
# specified on the command line, changes to the working directory when one
# is specified and validates the option values. Dies with an error message
# for the first invalid value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{atomidentifiertype} = 'AtomicInvariantsAtomTypes';
  $Options{atomicinvariantstouse} = 'AS,X,BO,H,FC';

  $Options{functionalclassestouse} = 'HBD,HBA,PI,NI,Ar,Hal';

  $Options{compoundidmode} = 'LabelPrefix';
  $Options{compoundidlabel} = 'CompoundID';
  $Options{datafieldsmode} = 'CompoundID';

  $Options{filter} = 'Yes';

  $Options{keeplargestcomponent} = 'Yes';

  $Options{mindistance} = 1;
  $Options{maxdistance} = 10;

  $Options{output} = 'text';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  $Options{vectorstringformat} = 'IDsAndValuesString';

  if (!GetOptions(\%Options, "atomidentifiertype|a=s", "atomicinvariantstouse=s", "functionalclassestouse=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabel=s", "help|h", "keeplargestcomponent|k=s", "mindistance=s", "maxdistance=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "vectorstringformat|v=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{atomidentifiertype} !~ /^(AtomicInvariantsAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|FunctionalClassAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
  }
  if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
  }
  if ($Options{filter} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
  }
  if (!IsPositiveInteger($Options{mindistance})) {
    die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: > 0 \n";
  }
  if (!IsPositiveInteger($Options{maxdistance})) {
    die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
  }
  # Equal minimum and maximum distances are allowed...
  if ($Options{mindistance} > $Options{maxdistance}) {
    die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" must be less than or equal to the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
  }
  if ($Options{output} !~ /^(SD|text|both)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  # Fingerprints strings contain semicolons, so they must be quoted when the
  # output delimiter is also a semicolon...
  if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
  }
  if ($Options{vectorstringformat} !~ /^(IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
    die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
  }
}
