#!/usr/bin/perl -w
#
# $RCSfile: MACCSKeysFingerprints.pl,v $
# $Date: 2011/12/16 00:03:31 $
# $Revision: 1.24 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2012 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use MoleculeFileIO;
use FileIO::FingerprintsSDFileIO;
use FileIO::FingerprintsTextFileIO;
use FileIO::FingerprintsFPFileIO;
use Fingerprints::MACCSKeys;

# File-scoped state shared by the subroutines defined later in this script...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Announce the start of the run and record the starting time...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Retrieve command line options; show usage and stop when help is requested
# or no command line arguments are left...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

# Build the list of input SD files from the remaining command line arguments...
my(@SDFilesList) = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect status and output file names for each input file...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate fingerprints for each input file marked as okay...
my($FileIndex);
print "\nProcessing SD files...\n" if (@SDFilesList > 1);

FILE: for $FileIndex (0 .. $#SDFilesList) {
  next FILE unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  GenerateMACCSKeysFingerprints($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate fingerprints for a SD file...
#
# Reads molecules one at a time from the input file at $FileIndex, skips the
# ones rejected by filtering or for which no fingerprints could be generated,
# writes the rest to the open output files and prints summary counts.
#
sub GenerateMACCSKeysFingerprints {
  my($FileIndex) = @_;

  my $InputSDFile = $SDFilesList[$FileIndex];

  # Output writers are set up before reading starts; a writer stays undefined
  # when its output type was not requested...
  my($SDWriter, $TextWriter, $FPWriter) = SetupAndOpenOutputFiles($FileIndex);

  my $Reader = MoleculeFileIO->new('Name' => $InputSDFile);
  $Reader->Open();

  my $TotalCount = 0;
  my $SkippedCount = 0;

  COMPOUND: while (my $Molecule = $Reader->ReadMolecule()) {
    $TotalCount++;

    # Filter compound data before calculating fingerprints...
    if ($OptionsInfo{Filter} && CheckAndFilterCompound($TotalCount, $Molecule)) {
      $SkippedCount++;
      next COMPOUND;
    }

    my $Fingerprints = GenerateMoleculeFingerprints($Molecule);
    unless ($Fingerprints) {
      $SkippedCount++;
      ProcessIgnoredCompound('FingerprintsGenerationFailed', $TotalCount, $Molecule);
      next COMPOUND;
    }

    WriteDataToOutputFiles($FileIndex, $TotalCount, $Molecule, $Fingerprints, $SDWriter, $TextWriter, $FPWriter);
  }
  $Reader->Close();

  # Close whichever writers were opened...
  for my $Writer ($SDWriter, $TextWriter, $FPWriter) {
    $Writer->Close() if $Writer;
  }

  WriteFingerprintsGenerationSummaryStatistics($TotalCount, $SkippedCount);
}

# Process compound being ignored due to problems in fingerprints generation...
#
# Emits a warning naming the compound record number and its ID. $Mode selects
# the explanation; any unrecognized mode gets the same message as
# FingerprintsGenerationFailed.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;

  my $DataFieldsRef = $Molecule->GetDataFieldLabelAndValues();
  my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldsRef);

  if ($Mode =~ /^ContainsNonElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
  }
  else {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
  }
}

# Check and filter compounds....
#
# Returns 1, after warning about the compound, when it contains any
# non-element atoms or no element atoms at all; returns 0 otherwise.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;

  my($NumOfElements, $NumOfNonElements) = $Molecule->GetNumOfElementsAndNonElements();

  my $Problem = '';
  if ($NumOfNonElements) {
    $Problem = 'ContainsNonElementalData';
  }
  elsif (!$NumOfElements) {
    $Problem = 'ContainsNoElementalData';
  }
  return 0 unless $Problem;

  ProcessIgnoredCompound($Problem, $CmpdCount, $Molecule);
  return 1;
}

# Write out compounds fingerprints generation summary statistics...
#
sub WriteFingerprintsGenerationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount) = @_;

  my $SucceededCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n"
    . "Number of compounds processed successfully during fingerprints generation: $SucceededCount\n"
    . "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}

# Open output files...
#
# Returns the list (SD writer, text writer, FP writer); an entry is undefined
# when the corresponding output type is not turned on in %OptionsInfo.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($SDWriter, $TextWriter, $FPWriter);

  # Parameters shared by all fingerprints file IO objects; these depend on
  # whether bit or count fingerprints are being generated...
  my %CommonParams = ();
  if ($OptionsInfo{Mode} =~ /^MACCSKeyBits$/i) {
    %CommonParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsBitVectorString', 'BitStringFormat' => $OptionsInfo{BitStringFormat}, 'BitsOrder' => $OptionsInfo{BitsOrder});
  }
  elsif ($OptionsInfo{Mode} =~ /^MACCSKeyCount$/i) {
    %CommonParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});
  }

  if ($OptionsInfo{SDOutput}) {
    my $OutSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $OutSDFile...\n";
    $SDWriter = FingerprintsSDFileIO->new('Name' => $OutSDFile, %CommonParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
    $SDWriter->Open();
  }

  if ($OptionsInfo{FPOutput}) {
    my $OutFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
    print "Generating FP file $OutFPFile...\n";
    $FPWriter = FingerprintsFPFileIO->new('Name' => $OutFPFile, %CommonParams);
    $FPWriter->Open();
  }

  if ($OptionsInfo{TextOutput}) {
    my $OutTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    my $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

    print "Generating text file $OutTextFile...\n";
    $TextWriter = FingerprintsTextFileIO->new('Name' => $OutTextFile, %CommonParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
    $TextWriter->Open();
  }

  return ($SDWriter, $TextWriter, $FPWriter);
}

# Write fingerprints and other data to appropriate output files...
#
# Each of the three writer arguments may be undefined; only the defined ones
# receive the fingerprints for the current molecule.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;

  # Data field labels and values are retrieved only when the text or FP
  # writer is in use...
  my $DataFieldsRef = undef;
  if ($NewFPTextFileIO || $NewFPFileIO) {
    $DataFieldsRef = $Molecule->GetDataFieldLabelAndValues();
  }

  if ($NewFPSDFileIO) {
    my $InputCmpdString = $Molecule->GetInputMoleculeString();
    $NewFPSDFileIO->WriteFingerprints($MACCSKeysFingerprints, $InputCmpdString);
  }

  if ($NewFPTextFileIO) {
    my $RowValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldsRef);
    $NewFPTextFileIO->WriteFingerprints($MACCSKeysFingerprints, $RowValuesRef);
  }

  if ($NewFPFileIO) {
    my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldsRef);
    $NewFPFileIO->WriteFingerprints($MACCSKeysFingerprints, $CmpdID);
  }
}

# Generate appropriate column labels for FPText output file...
#
# Returns a reference to the list of data column labels selected by the
# DataFieldsMode option, followed by the fingerprints label.
#
sub SetupFPTextFileCoulmnLabels {
  my($FileIndex) = @_;
  my $FieldsMode = $OptionsInfo{DataFieldsMode};
  my @Labels = ();

  if ($FieldsMode =~ /^All$/i) {
    @Labels = @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Common$/i) {
    @Labels = @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Specify$/i) {
    @Labels = @{$OptionsInfo{SpecifiedDataFields}};
  }
  elsif ($FieldsMode =~ /^CompoundID$/i) {
    @Labels = ($OptionsInfo{CompoundIDLabel});
  }

  # Fingerprints label is always the last column...
  return [@Labels, $OptionsInfo{FingerprintsLabel}];
}

# Generate column values FPText output file..
#
# Returns a reference to the list of data column values for one compound,
# selected by the DataFieldsMode option. A data field missing from the
# compound contributes an empty string.
#
sub SetupFPTextFileCoulmnValues {
  my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my $FieldsMode = $OptionsInfo{DataFieldsMode};
  my @Values = ();

  # Value of a data field for this compound, or '' when it's not present...
  my $ValueOf = sub {
    my($Label) = @_;
    return exists $DataFieldLabelAndValuesRef->{$Label} ? $DataFieldLabelAndValuesRef->{$Label} : '';
  };

  if ($FieldsMode =~ /^CompoundID$/i) {
    push @Values, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
  }
  elsif ($FieldsMode =~ /^All$/i) {
    @Values = map { $ValueOf->($_) } @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Common$/i) {
    @Values = map { $ValueOf->($_) } @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Specify$/i) {
    @Values = map { $ValueOf->($_) } @{$OptionsInfo{SpecifiedDataFields}};
  }

  return \@Values;
}

# Generate compound ID for FP and FPText output files..
#
# The ID source is selected by the CompoundIDMode option: molecule name,
# a label prefix followed by the compound record number, or the value of a
# data field. Returns '' when the mode is not recognized or the data field
# is not present.
#
sub SetupCmpdIDForOutputFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my $IDMode = $OptionsInfo{CompoundIDMode};

  if ($IDMode =~ /^MolNameOrLabelPrefix$/i) {
    # Fall back to the prefixed record number when the name is empty...
    my $MolName = $Molecule->GetName();
    return $MolName if $MolName;
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($IDMode =~ /^LabelPrefix$/i) {
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($IDMode =~ /^DataField$/i) {
    my $FieldLabel = $OptionsInfo{CompoundID};
    return '' unless exists $DataFieldLabelAndValuesRef->{$FieldLabel};
    return $DataFieldLabelAndValuesRef->{$FieldLabel};
  }

  if ($IDMode =~ /^MolName$/i) {
    my $MolName = $Molecule->GetName();
    return $MolName;
  }

  return '';
}

# Generate fingerprints for molecule...
#
# Returns the generated MACCS keys fingerprints object, or undef when ring
# detection on the molecule reports failure. Dies when the Mode option is
# neither MACCSKeyBits nor MACCSKeyCount.
#
sub GenerateMoleculeFingerprints {
  my($Molecule) = @_;

  $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};

  unless ($Molecule->DetectRings()) {
    return undef;
  }
  $Molecule->DetectAromaticity();

  # Map the Mode option onto the fingerprints type name...
  my $KeysType;
  if ($OptionsInfo{Mode} =~ /^MACCSKeyBits$/i) {
    $KeysType = 'MACCSKeyBits';
  }
  elsif ($OptionsInfo{Mode} =~ /^MACCSKeyCount$/i) {
    $KeysType = 'MACCSKeyCount';
  }
  else {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: MACCSKeyBits or MACCSKeyCount\n";
  }

  my $MACCSKeysFingerprints = MACCSKeys->new('Molecule' => $Molecule, 'Type' => $KeysType, 'Size' => $OptionsInfo{Size});
  $MACCSKeysFingerprints->GenerateMACCSKeys();

  return $MACCSKeysFingerprints;
}

# Retrieve information about SD files...
#
# Fills %SDFilesInfo with per-file arrays indexed like @SDFilesList. For each
# input file, FileOkay is set to 1 only after all checks pass; the output file
# names and data field references are recorded at the same time. A file that
# fails a check is reported with a warning and left with FileOkay set to 0.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{FPOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  # The compound ID data field is verified, and data field labels are collected,
  # only for the text output modes that need them...
  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for my $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{FPOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in SD file; only the first compound
      # string read from the file is examined...
      my($SDFileHandle, $CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

      # Three-argument open: the file name can't be interpreted as an open mode...
      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;

      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($SDFileHandle, $CmpdCount);

      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
      close $SDFileHandle;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = ($OptionsInfo{OutDelim} =~ /^tab$/i) ? "tsv" : "csv";
    $SDOutFileExt = $FileExt;
    $FPOutFileExt = "fpf";

    # A user-specified root name is honored only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}MACCSKeysFP";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # \Q...\E: match the output name literally so that '.', '(', '[', '+'
      # and the like in a file name are not treated as regex syntax...
      if ($SDFile =~ /\Q$NewSDFileName\E/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD, FP and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{FPOutput}) {
        if (-e $NewFPFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}

# Process option values...
#
# Copies the validated command line values from %Options into %OptionsInfo,
# converting Yes/No style values into 1/0 flags along the way. Dies when a
# value required by the selected data fields mode is missing.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{BitsOrder} = $Options{bitsorder};
  $OptionsInfo{BitStringFormat} = $Options{bitstringformat};

  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
  $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
  $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

  $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

  @{$OptionsInfo{SpecifiedDataFields}} = ();
  $OptionsInfo{CompoundID} = '';

  # NOTE(review): CompoundID is resolved only in CompoundID data fields mode,
  # yet SetupCmpdIDForOutputFiles reads it for FP output and warning messages
  # in every mode; in the other modes it stays '' - confirm this is intended.
  if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
    if ($Options{compoundidmode} =~ /^DataField$/i) {
      if (!$Options{compoundid}) {
        die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
      }
      $OptionsInfo{CompoundID} = $Options{compoundid};
    }
    elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
      $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
    }
  }
  elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
    if (!$Options{datafields}) {
      die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
    }
    # Data field labels are given as a comma delimited list...
    push @{$OptionsInfo{SpecifiedDataFields}}, split(/,/, $Options{datafields});
  }

  $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'MACCSKeysFingerprints';

  $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
  $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

  $OptionsInfo{OutDelim} = $Options{outdelim};
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  $OptionsInfo{Size} = $Options{size};

  $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat};
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Seeds %Options with default values, lets GetOptions override them from the
# command line, changes to the working directory when one is specified, and
# dies with an explanatory message on the first invalid option value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{bitsorder} = 'Ascending';
  $Options{bitstringformat} = 'BinaryString';

  $Options{compoundidmode} = 'LabelPrefix';
  $Options{compoundidlabel} = 'CompoundID';
  $Options{datafieldsmode} = 'CompoundID';

  $Options{filter} = 'Yes';

  # NOTE(review): no "detectaromaticity" option is registered with GetOptions
  # below and nothing in this part of the file reads this value - confirm
  # whether it is still needed.
  $Options{detectaromaticity} = 'Yes';
  $Options{keeplargestcomponent} = 'Yes';

  $Options{mode} = 'MACCSKeyBits';

  $Options{output} = 'text';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  $Options{size} = 166;

  $Options{vectorstringformat} = 'ValuesString';

  if (!GetOptions(\%Options, "bitsorder=s", "bitstringformat|b=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabel=s", "help|h", "keeplargestcomponent|k=s", "mode|m=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "size|s=i", "vectorstringformat|v=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{bitsorder} !~ /^(Ascending|Descending)$/i) {
    die "Error: The value specified, $Options{bitsorder}, for option \"--BitsOrder\" is not valid. Allowed values: Ascending or Descending\n";
  }
  if ($Options{bitstringformat} !~ /^(BinaryString|HexadecimalString)$/i) {
    die "Error: The value specified, $Options{bitstringformat}, for option \"-b, --bitstringformat\" is not valid. Allowed values: BinaryString or HexadecimalString\n";
  }
  if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
  }
  if ($Options{filter} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{mode} !~ /^(MACCSKeyBits|MACCSKeyCount)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: MACCSKeyBits or MACCSKeyCount\n";
  }
  if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  # Unquoted output can't be combined with the semicolon delimiter...
  if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
  }
  if (!(IsPositiveInteger($Options{size}) && ($Options{size} == 166 || $Options{size} == 322))) {
    die "Error: The value specified, $Options{size}, for option \"-s, --size\" is not valid. Allowed values: 166 or 322 \n";
  }
  if ($Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
    die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
  }
}