#!/usr/bin/perl -w
#
# $RCSfile: ExtractFromSDFiles.pl,v $
# $Date: 2011/12/16 00:03:30 $
# $Revision: 1.41 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2012 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin;
use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use SDFileUtil;
use FileUtil;
use TextUtil;

# File-level state shared by the main script and the subs defined below.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
# Direct method call instead of the indirect object form "new Benchmark".
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Expand the command line arguments into the list of SD files to process.
my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect information about SD files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files...
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  # Only files flagged as usable by RetrieveSDFilesInfo are processed.
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    ExtractFromSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

# Direct method call instead of the indirect object form "new Benchmark".
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Extract data from a SD file...
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList.
#
# Dispatches to the extraction sub matching $OptionsInfo{Mode} (compared
# case-insensitively). The mode is validated before any file is opened so
# that an invalid mode no longer creates or truncates output files on its
# way to the error.
sub ExtractFromSDFile {
  my($FileIndex) = @_;

  my %ModeHandlers = (
    'alldatafields'         => \&ExtractAllDataFields,
    'commondatafields'      => \&ExtractCommonDataFields,
    'datafields'            => \&ExtractDataFields,
    'datafieldbylist'       => \&ExtractDataFieldByList,
    'datafielduniquebylist' => \&ExtractDataFieldByList,
    'datafieldsbyvalue'     => \&ExtractDataFieldsByValue,
    'datafieldsbyregex'     => \&ExtractDataFieldsByRegex,
    'randomcmpds'           => \&ExtractRandomCompounds,
    'molnames'              => \&ExtractMolNames,
    'recordnum'             => \&ExtractRecordNum,
    'recordrange'           => \&ExtractRecordRange,
    '2dcmpdrecords'         => \&Extract2DCmpdRecords,
    '3dcmpdrecords'         => \&Extract3DCmpdRecords,
  );

  my $Handler = $ModeHandlers{lc($OptionsInfo{Mode})};
  if (!$Handler) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, molnames, randomcmpds, recordnum, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }

  OpenInputAndOutputFiles($FileIndex);
  $Handler->($FileIndex);
  CloseInputAndOutputFiles();
}

# Extract all data fields...
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList.
#
# Uses the labels stored in $SDFilesInfo{AllDataFieldLabels} for this file
# as the output columns and writes every compound to the text and SD outputs.
sub ExtractAllDataFields {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}

# Extract common data fields...
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList.
#
# Same as ExtractAllDataFields, but the output columns come from
# $SDFilesInfo{CommonDataFieldLabels} for this file.
sub ExtractCommonDataFields {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{CommonDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}

# Extract specified data fields...
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList (not used here; the
#                labels come from the options).
#
# Uses $OptionsInfo{SpecifiedDataFieldLabels} as the output columns and writes
# every compound to the text and SD outputs.
sub ExtractDataFields {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$OptionsInfo{SpecifiedDataFieldLabels}};
  WriteTextFileColLabels();

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}

# Extract data fields using a list...
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList.
#
# Writes compounds whose $OptionsInfo{SpecifiedDataFieldLabel} value is one of
# the keys of $OptionsInfo{SpecifiedDataFieldValues}. Serves both modes:
#   DataFieldUniqueByList - only the first compound seen for each listed value
#                           is written; reading stops once every listed value
#                           has been seen.
#   DataFieldByList       - every matching compound is written. The whole file
#                           is scanned so that matches located after the point
#                           where all listed values have been seen are not
#                           dropped.
sub ExtractDataFieldByList {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  # Reset the found status; it is tracked separately for each input file.
  for my $Value (keys %{$OptionsInfo{SpecifiedDataFieldValues}}) {
    $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound";
  }

  my $SpecifiedDataFieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};
  my $UniqueOnly = ($OptionsInfo{Mode} =~ /^DataFieldUniqueByList$/i) ? 1 : 0;

  # Count distinct listed values so that duplicates in the user supplied list
  # can't prevent the early exit in unique mode.
  my $DistinctValuesCount = scalar(keys %{$OptionsInfo{SpecifiedDataFieldValues}});
  my $ValuesFoundCount = 0;

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    # Compounds without the data field can never match.
    if (!exists $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel}) {
      next CMPDSTRING;
    }

    SetupDataValues();

    my $CurrentValue = $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel};
    if (!exists $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue}) {
      next CMPDSTRING;
    }

    my $FirstOccurrence = ($OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue} eq "NotFound") ? 1 : 0;
    if ($FirstOccurrence) {
      $ValuesFoundCount++;
      $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue} = "Found";
    }

    if ($FirstOccurrence || !$UniqueOnly) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }

    # Nothing further can be written in unique mode once all values are found.
    if ($UniqueOnly && $ValuesFoundCount >= $DistinctValuesCount) {
      last CMPDSTRING;
    }
  }
}

# Extract data fields by value...
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList.
#
# For each label in $OptionsInfo{SpecifiedDataFieldLabels} that is present in
# a compound, its value is compared against the specified value using the
# specified criterion (eq, le or ge); the comparison is numerical or
# alphanumerical depending on $OptionsInfo{NumericalComparison}. A compound is
# written when its number of failed criteria is at most
# $OptionsInfo{Violations}. Labels missing from a compound are not counted.
sub ExtractDataFieldsByValue {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  # Each entry returns true when (current value, specified value) violates
  # the criterion.
  my %IsViolation = $OptionsInfo{NumericalComparison}
    ? (
        'eq' => sub { $_[0] != $_[1] },
        'le' => sub { $_[0] > $_[1] },
        'ge' => sub { $_[0] < $_[1] },
      )
    : (
        'eq' => sub { $_[0] ne $_[1] },
        'le' => sub { $_[0] gt $_[1] },
        'ge' => sub { $_[0] lt $_[1] },
      );

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    SetupDataValues();

    my $ViolationCount = 0;
    for my $Label (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
      next if !exists $SDFilesInfo{DataFieldValues}{$Label};

      my $CurrentValue = $SDFilesInfo{DataFieldValues}{$Label};
      my $SpecifiedValue = $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label};
      my $Check = $IsViolation{lc($OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label})};

      if ($Check && $Check->($CurrentValue, $SpecifiedValue)) {
        $ViolationCount++;
      }
    }

    if ($ViolationCount <= $OptionsInfo{Violations}) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }
  }
}

# Extract data fields by value using regular expression match...
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList.
#
# For each label in $OptionsInfo{SpecifiedDataFieldLabels} that is present in
# a compound: criterion "eq" is violated when the value doesn't match the
# specified regular expression and "ne" when it does. Matching ignores case
# when $OptionsInfo{RegexIgnoreCase} is set. A compound is written when its
# violation count is at most $OptionsInfo{Violations}.
sub ExtractDataFieldsByRegex {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  # Compile each pattern once. Besides avoiding recompilation for every
  # compound, a qr// object keeps an empty pattern from being treated by Perl
  # as "reuse the last successfully matched pattern".
  my %CompiledRegexMap = ();
  for my $Label (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
    my $SpecifiedRegex = $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label};
    $CompiledRegexMap{$Label} = $OptionsInfo{RegexIgnoreCase} ? qr/$SpecifiedRegex/i : qr/$SpecifiedRegex/;
  }

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    SetupDataValues();

    my $ViolationCount = 0;
    for my $Label (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
      next if !exists $SDFilesInfo{DataFieldValues}{$Label};

      my $Criterion = $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label};
      my $Matched = ($SDFilesInfo{DataFieldValues}{$Label} =~ $CompiledRegexMap{$Label}) ? 1 : 0;

      if ($Criterion =~ /^eq$/i) {
        $ViolationCount++ if !$Matched;
      }
      elsif ($Criterion =~ /^ne$/i) {
        $ViolationCount++ if $Matched;
      }
    }

    if ($ViolationCount <= $OptionsInfo{Violations}) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }
  }
}

# Extract random compounds...
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList.
#
# Seeds the random number generator with $OptionsInfo{Seed}, selects
# min($OptionsInfo{NumOfCmpds}, compound count) distinct record numbers and
# writes those compounds in file order.
sub ExtractRandomCompounds {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my $CmpdCount = $SDFilesInfo{CmpdCount}[$FileIndex];
  srand($OptionsInfo{Seed});

  # Draw until the requested number of distinct record numbers is reached. A
  # fixed number of draws would return too few compounds whenever two draws
  # collide, and the previous "<=" loop bound made one draw too many.
  my $TargetCount = ($OptionsInfo{NumOfCmpds} < $CmpdCount) ? $OptionsInfo{NumOfCmpds} : $CmpdCount;
  my %RandomCmpdIndexMap = ();
  while (scalar(keys %RandomCmpdIndexMap) < $TargetCount) {
    my $RandomIndex = int(rand $CmpdCount) + 1;
    $RandomCmpdIndexMap{$RandomIndex} = $RandomIndex;
  }

  my $CmpdNum = 0;
  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $CmpdNum++;
    if (!exists $RandomCmpdIndexMap{$CmpdNum}) {
      next CMPDSTRING;
    }

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      my @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}

# Extract mol names...
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList (not used here).
#
# Writes a "MolName" column header followed by the molecule name parsed from
# the first line of each compound to the text file.
sub ExtractMolNames {
  my($FileIndex) = @_;

  # Assign instead of push: DataLabels persists across input files, so
  # pushing added one more "MolName" header column for every additional file.
  @{$SDFilesInfo{DataLabels}} = ("MolName");
  WriteTextFileColLabels();

  my $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
    my $MolName = QuoteAWord(ParseCmpdMolNameLine($CmpdLines[0]), $OptionsInfo{OutQuote});
    print $NewTextFileRef "$MolName\n";
  }
}

# Extract a specific compound record...
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList.
#
# Writes the compound whose 1-based position in the file equals
# $OptionsInfo{RecordNum} and stops reading right after it.
sub ExtractRecordNum {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my $CmpdNum = 0;
  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $CmpdNum++;
    next CMPDSTRING if $CmpdNum != $OptionsInfo{RecordNum};

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      my @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
    last CMPDSTRING;
  }
}

# Extract compounds in a specific record range...
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList.
#
# Writes the compounds whose 1-based position lies between
# $OptionsInfo{StartRecordNum} and $OptionsInfo{EndRecordNum} (both
# inclusive) and stops reading once the end of the range has been passed.
sub ExtractRecordRange {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my $CmpdNum = 0;
  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $CmpdNum++;

    last CMPDSTRING if $CmpdNum > $OptionsInfo{EndRecordNum};
    next CMPDSTRING if $CmpdNum < $OptionsInfo{StartRecordNum};

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      my @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      # WriteTextFileCmpdData reads its file handle from %SDFilesInfo and
      # takes no arguments, like every other call in this script.
      WriteTextFileCmpdData();
    }
  }
}

# Extract 2D compound records...
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList.
#
# Writes each compound for which IsCmpd2D returns true; all other compounds
# are skipped.
sub Extract2DCmpdRecords {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my @Lines = split "\n", $SDFilesInfo{CmpdString};
    next unless IsCmpd2D(\@Lines);

    WriteSDFileCmpdString();
    next unless $OptionsInfo{OutputTextFile};

    # Data field values are only needed for the text output.
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
    SetupDataValues();
    WriteTextFileCmpdData();
  }
}

# Extract 3D compound records...
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList.
#
# Writes each compound for which IsCmpd3D returns true; all other compounds
# are skipped.
sub Extract3DCmpdRecords {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my @Lines = split "\n", $SDFilesInfo{CmpdString};
    next unless IsCmpd3D(\@Lines);

    WriteSDFileCmpdString();
    next unless $OptionsInfo{OutputTextFile};

    # Data field values are only needed for the text output.
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
    SetupDataValues();
    WriteTextFileCmpdData();
  }
}


# Open input and output files...
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList.
#
# Opens the input SD file and, depending on $OptionsInfo{OutputSDFile} and
# $OptionsInfo{OutputTextFile}, the new SD and/or text file. References to the
# handles are stored in $SDFilesInfo{InputSDFileRef}, $SDFilesInfo{NewSDFileRef}
# and $SDFilesInfo{NewTextFileRef}. Dies when a file can't be opened.
#
# Files are opened with three-argument open so that a file name starting or
# ending with characters such as ">", "<" or "|" is treated as a plain name
# and never as an open mode or a command.
sub OpenInputAndOutputFiles {
  my($FileIndex) = @_;

  $SDFilesInfo{NewTextFileRef} = undef;
  $SDFilesInfo{NewSDFileRef} = undef;

  if ($OptionsInfo{OutputTextFile} && $OptionsInfo{OutputSDFile}) {
    print "Generating files $SDFilesInfo{NewSDFileName}[$FileIndex] and $SDFilesInfo{NewTextFileName}[$FileIndex]...\n";
  }
  elsif ($OptionsInfo{OutputSDFile}) {
    print "Generating file $SDFilesInfo{NewSDFileName}[$FileIndex]...\n";
  }
  else {
    print "Generating file $SDFilesInfo{NewTextFileName}[$FileIndex]...\n";
  }

  if ($OptionsInfo{OutputSDFile}) {
    open NEWSDFILE, ">", $SDFilesInfo{NewSDFileName}[$FileIndex] or die "Error: Couldn't open $SDFilesInfo{NewSDFileName}[$FileIndex]: $! \n";
    $SDFilesInfo{NewSDFileRef} = \*NEWSDFILE;
  }
  if ($OptionsInfo{OutputTextFile}) {
    open NEWTEXTFILE, ">", $SDFilesInfo{NewTextFileName}[$FileIndex] or die "Error: Couldn't open $SDFilesInfo{NewTextFileName}[$FileIndex]: $! \n";
    $SDFilesInfo{NewTextFileRef} = \*NEWTEXTFILE;
  }

  open SDFILE, "<", $SDFilesList[$FileIndex] or die "Error: Couldn't open $SDFilesList[$FileIndex]: $! \n";
  $SDFilesInfo{InputSDFileRef} = \*SDFILE;

}

# Close open input and output files...
#
# Closes whichever of the three handles recorded in %SDFilesInfo are set and
# resets the stored references to undef. A failed close on an output file is
# reported as a warning, since that is where buffered write errors surface.
sub CloseInputAndOutputFiles {
  if ($SDFilesInfo{NewSDFileRef}) {
    close $SDFilesInfo{NewSDFileRef} or warn "Warning: Couldn't close new SD file: $! \n";
  }
  if ($SDFilesInfo{NewTextFileRef}) {
    close $SDFilesInfo{NewTextFileRef} or warn "Warning: Couldn't close new text file: $! \n";
  }

  if ($SDFilesInfo{InputSDFileRef}) {
    close $SDFilesInfo{InputSDFileRef};
  }

  $SDFilesInfo{NewTextFileRef} = undef;
  $SDFilesInfo{NewSDFileRef} = undef;
  $SDFilesInfo{InputSDFileRef} = undef;
}

# Write out column labels for text file...
#
# Writes the labels in $SDFilesInfo{DataLabels} as one delimited line to the
# new text file. Does nothing unless $OptionsInfo{OutputTextFile} is set.
sub WriteTextFileColLabels {
  if (!$OptionsInfo{OutputTextFile}) {
    return;
  }

  my $LabelsRef = \@{$SDFilesInfo{DataLabels}};

  # The key is spelled "OutoutStrDataString" where the option is set, so the
  # same spelling has to be used here.
  if ($OptionsInfo{OutoutStrDataString}) {
    # Append structure data string label to a copy of the labels...
    $LabelsRef = [ @{$LabelsRef}, "StructureDataString" ];
  }

  my $ColLabelsLine = JoinWords($LabelsRef, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  my $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
  print $NewTextFileRef "$ColLabelsLine\n";
}

# Setup values for data fields...
#
# Fills $SDFilesInfo{DataValues} with the current compound's value for each
# label in $SDFilesInfo{DataLabels}, in label order; labels missing from the
# compound get an empty string.
sub SetupDataValues {
  @{$SDFilesInfo{DataValues}} = map { exists $SDFilesInfo{DataFieldValues}{$_} ? $SDFilesInfo{DataFieldValues}{$_} : "" } @{$SDFilesInfo{DataLabels}};
}

# Write out structure data and specific data fields to SD file...
#
# Writes the structure block of the current compound (everything up to and
# including the molfile terminator line), followed by only the data fields in
# $SDFilesInfo{DataLabels} with the values from $SDFilesInfo{DataValues}, and
# the "$$$$" record delimiter. Does nothing unless $OptionsInfo{OutputSDFile}
# is set.
sub WriteSDFileCmpdData {
  if (!$OptionsInfo{OutputSDFile}) {
    return;
  }

  my $NewSDFileRef = $SDFilesInfo{NewSDFileRef};

  # The molfile terminator line is "M  END" (two spaces). Splitting on a
  # single-space "M END" never matches it, which would copy the complete
  # record - data fields included - into the structure block. Match the
  # terminator at the start of a line and accept one or more spaces.
  my($MolString) = split /^M[ ]+END/m, $SDFilesInfo{CmpdString};
  $MolString = "" if !defined $MolString;
  $MolString .= "M  END";
  print $NewSDFileRef "$MolString\n";

  for my $Count (0 .. $#{$SDFilesInfo{DataLabels}}) {
    print $NewSDFileRef "> <$SDFilesInfo{DataLabels}[$Count]>\n$SDFilesInfo{DataValues}[$Count]\n\n";
  }
  print $NewSDFileRef "\$\$\$\$\n";
}

# Write out compound string...
#
# Writes the current compound string unchanged to the new SD file. Does
# nothing unless $OptionsInfo{OutputSDFile} is set.
sub WriteSDFileCmpdString {
  if (!$OptionsInfo{OutputSDFile}) {
    return;
  }

  my $NewSDFileRef = $SDFilesInfo{NewSDFileRef};
  print $NewSDFileRef "$SDFilesInfo{CmpdString}\n";
}

# Write out data for text file...
618 sub WriteTextFileCmpdData { 619 my($DataValuesLine, $NewTextFileRef); 620 621 if (!$OptionsInfo{OutputTextFile}) { 622 return; 623 } 624 625 $NewTextFileRef = $SDFilesInfo{NewTextFileRef}; 626 $DataValuesLine = JoinWords(\@{$SDFilesInfo{DataValues}}, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 627 628 # Handle multiple lines data values for data fields by joining 'em using semicolons... 629 if ($DataValuesLine =~ /\n/) { 630 $DataValuesLine =~ s/\n/;/g; 631 } 632 633 if ($OptionsInfo{OutoutStrDataString}) { 634 # Append structure data string... 635 my($StrDataString, $OutQuoteValue, $OutDelim, $StrDataStringDelimiter); 636 637 if ($OptionsInfo{StrDataStringWithFields}) { 638 $StrDataString = $SDFilesInfo{CmpdString}; 639 } 640 else { 641 ($StrDataString) = split "M END", $SDFilesInfo{CmpdString}; 642 $StrDataString .= "M END"; 643 } 644 $StrDataStringDelimiter = $OptionsInfo{StrDataStringDelimiter}; 645 $StrDataString =~ s/\n/$StrDataStringDelimiter/g; 646 647 $OutDelim = $OptionsInfo{OutDelim}; 648 $OutQuoteValue = $OptionsInfo{OutQuote} ? "\"" : ""; 649 650 print $NewTextFileRef "$DataValuesLine${OutDelim}${OutQuoteValue}${StrDataString}${OutQuoteValue}\n"; 651 } 652 else { 653 print $NewTextFileRef "$DataValuesLine\n"; 654 } 655 } 656 657 # Retrieve information about input SD files... 658 sub RetrieveSDFilesInfo { 659 my($SDFile, $Index, $FileDir, $FileExt, $FileName, $NewFileName, $NewSDFileName, $NewTextFileName, $CmpdCount); 660 661 %SDFilesInfo = (); 662 663 @{$SDFilesInfo{FileOkay}} = (); 664 @{$SDFilesInfo{CmpdCount}} = (); 665 @{$SDFilesInfo{NewTextFileName}} = (); 666 @{$SDFilesInfo{NewSDFileName}} = (); 667 668 @{$SDFilesInfo{AllDataFieldLabels}} = (); 669 @{$SDFilesInfo{CommonDataFieldLabels}} = (); 670 671 FILELIST: for $Index (0 .. 
$#SDFilesList) { 672 $SDFile = $SDFilesList[$Index]; 673 674 $SDFilesInfo{FileOkay}[$Index] = 0; 675 676 $SDFilesInfo{CmpdCount}[$Index] = 0; 677 $SDFilesInfo{NewTextFileName}[$Index] = ""; 678 $SDFilesInfo{NewSDFileName}[$Index] = ""; 679 680 @{$SDFilesInfo{AllDataFieldLabels}[$Index]} = (); 681 @{$SDFilesInfo{CommonDataFieldLabels}[$Index]} = (); 682 683 if (!(-e $SDFile)) { 684 warn "Warning: Ignoring file $SDFile: It doesn't exist\n"; 685 next FILELIST; 686 } 687 688 if (!CheckFileType($SDFile, "sd sdf")) { 689 warn "Warning: Ignoring file $SDFile: It's not a SD file\n"; 690 next FILELIST; 691 } 692 693 # Generate appropriate name for the new output file. 694 $FileDir = ""; $FileName = ""; $FileExt = ""; 695 ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile); 696 $NewFileName = $FileName; 697 $NewFileName = $FileName . $OptionsInfo{FileNameMode}; 698 if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) { 699 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot}); 700 if ($RootFileName && $RootFileExt) { 701 $NewFileName = $RootFileName; 702 } 703 else { 704 $NewFileName = $OptionsInfo{OutFileRoot}; 705 } 706 } 707 $NewSDFileName = $NewFileName . ".$OptionsInfo{SDFileExt}"; 708 $NewTextFileName = $NewFileName . 
".$OptionsInfo{TextFileExt}"; 709 710 if ($OptionsInfo{OutputSDFile}) { 711 if (lc($NewSDFileName) eq lc($SDFile)) { 712 warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n"; 713 print "Specify a different name using \"-r --root\" option or use default name.\n"; 714 next FILELIST; 715 } 716 } 717 718 if (!$OptionsInfo{Overwrite}) { 719 if ($OptionsInfo{OutputSDFile}) { 720 if (-e $NewSDFileName) { 721 warn "Warning: Ignoring file $SDFile: New file, $NewSDFileName, already exists\n"; 722 next FILELIST; 723 } 724 } 725 if ($OptionsInfo{OutputTextFile}) { 726 if (-e $NewTextFileName) { 727 warn "Warning: Ignoring file $SDFile: New file, $NewTextFileName, already exists\n"; 728 next FILELIST; 729 } 730 } 731 } 732 733 if (!open SDFILE, "$SDFile") { 734 warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n"; 735 next FILELIST; 736 } 737 738 my($CountCmpds, $CollectDataFields); 739 my($CmpdString, @CmpdLines, @DataFieldLabels, %DataFieldLabelsMap,@CommonDataFieldLabels); 740 741 $CountCmpds = ($OptionsInfo{Mode} =~ /^randomcmpds$/i) ? 1 : 0; 742 743 $CollectDataFields = (($OptionsInfo{Mode} =~ /^(alldatafields|commondatafields|randomcmpds)$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafieldsbyvalue$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafieldbylist$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafielduniquebylist$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^recordrange$/i && $OptionsInfo{OutputTextFile})) ? 
1 : 0; 744 745 $CmpdCount = 0; 746 if ($CountCmpds || $CollectDataFields) { 747 @DataFieldLabels = (); 748 @CommonDataFieldLabels = (); 749 %DataFieldLabelsMap = (); 750 CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) { 751 $CmpdCount++; 752 if ($OptionsInfo{Mode} =~ /^recordnum$/i) { 753 if ($CmpdCount == $OptionsInfo{RecordNum}) { 754 @CmpdLines = split "\n", $CmpdString; 755 @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines); 756 last CMPDSTRING; 757 } 758 } 759 if ($CollectDataFields) { 760 my($Label); 761 @CmpdLines = split "\n", $CmpdString; 762 # Process compound data header labels and figure out which ones are present for 763 # all the compounds... 764 if (@DataFieldLabels) { 765 my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines); 766 my(%CmpdDataFieldLabelsMap) = (); 767 # Setup a map for the current labels... 768 for $Label (@CmpdDataFieldLabels) { 769 $CmpdDataFieldLabelsMap{$Label} = "PresentInSome"; 770 } 771 # Check the presence old labels for this compound; otherwise, mark 'em new... 772 for $Label (@DataFieldLabels) { 773 if (!$CmpdDataFieldLabelsMap{$Label}) { 774 $DataFieldLabelsMap{$Label} = "PresentInSome"; 775 } 776 } 777 # Check the presence this compound in the old labels; otherwise, add 'em... 778 for $Label (@CmpdDataFieldLabels ) { 779 if (!$DataFieldLabelsMap{$Label}) { 780 # It's a new label... 781 push @DataFieldLabels, $Label; 782 $DataFieldLabelsMap{$Label} = "PresentInSome"; 783 } 784 } 785 } 786 else { 787 # Get the initial label set and set up a map... 788 @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines); 789 for $Label (@DataFieldLabels) { 790 $DataFieldLabelsMap{$Label} = "PresentInAll"; 791 } 792 } 793 # Identify the common data field labels... 
794 if ($Options{mode} =~ /^commondatafields$/i) { 795 @CommonDataFieldLabels = (); 796 for $Label (@DataFieldLabels) { 797 if ($DataFieldLabelsMap{$Label} eq "PresentInAll") { 798 push @CommonDataFieldLabels, $Label; 799 } 800 } 801 } 802 } 803 } 804 } 805 806 $SDFilesInfo{FileOkay}[$Index] = 1; 807 808 $SDFilesInfo{NewTextFileName}[$Index] = $NewTextFileName; 809 $SDFilesInfo{NewSDFileName}[$Index] = $NewSDFileName; 810 811 $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount; 812 813 push @{$SDFilesInfo{AllDataFieldLabels}[$Index]}, @DataFieldLabels; 814 push @{$SDFilesInfo{CommonDataFieldLabels}[$Index]}, @CommonDataFieldLabels; 815 816 close SDFILE; 817 } 818 } 819 820 # Process options... 821 sub ProcessOptions { 822 %OptionsInfo = (); 823 824 $OptionsInfo{Mode} = $Options{mode}; 825 826 $OptionsInfo{InDelim} = "\,"; 827 if ($Options{indelim} =~ /^semicolon$/i) { 828 $OptionsInfo{InDelim} = "\;"; 829 } 830 elsif ($Options{indelim} =~ /^tab$/i) { 831 $OptionsInfo{InDelim} = "\t"; 832 } 833 834 $OptionsInfo{OutDelim} = "\,"; 835 if ($Options{outdelim} =~ /^semicolon$/i) { 836 $OptionsInfo{OutDelim} = "\;"; 837 } 838 elsif ($Options{outdelim} =~ /^tab$/i) { 839 $OptionsInfo{OutDelim} = "\t"; 840 } 841 842 $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0; 843 844 $OptionsInfo{RegexIgnoreCase} = ($Options{regexignorecase} =~ /^yes$/i) ? 1 : 0; 845 846 $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef; 847 $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef; 848 849 $OptionsInfo{NumOfCmpds} = $Options{numofcmpds}; 850 851 $OptionsInfo{ValueComparisonMode} = $Options{valuecomparisonmode}; 852 $OptionsInfo{NumericalComparison} = ($Options{valuecomparisonmode} =~ /^Numeric$/i) ? 
1 : 0; 853 854 $OptionsInfo{Violations} = $Options{violations}; 855 $OptionsInfo{Seed} = $Options{seed}; 856 857 858 if ($Options{mode} =~ /^(datafields|datafieldsbyregex|datafieldsbyvalue|datafieldbylist|datafielduniquebylist)$/i) { 859 if ($Options{datafields} || $Options{datafieldsfile}) { 860 if ($Options{datafields} && $Options{datafieldsfile}) { 861 die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, or datafielduniquebylist, specify only one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n"; 862 } 863 } 864 else { 865 die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, or datafielduniquebylist, specify one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n"; 866 } 867 } 868 $OptionsInfo{DataFields} = $Options{datafields} ? $Options{datafields} : undef; 869 $OptionsInfo{DataFieldsFile} = $Options{datafieldsfile} ? $Options{datafieldsfile} : undef; 870 871 $OptionsInfo{RecordNum} = 0; $OptionsInfo{StartRecordNum} = 0; $OptionsInfo{EndRecordNum} = 0; 872 $OptionsInfo{Record} = $Options{record} ? $Options{record} : undef; 873 874 if ($Options{mode} =~ /^(recordnum|recordrange)$/i) { 875 if ($Options{record}) { 876 my(@RecordSplit) = split ",", $Options{record}; 877 if ($Options{mode} =~ /^recordnum$/i ) { 878 if (@RecordSplit == 1) { 879 $OptionsInfo{RecordNum} = $RecordSplit[0]; 880 if ($OptionsInfo{RecordNum} <= 0) { 881 die "Error: The value specified, $OptionsInfo{RecordNum}, for option \"--records\" is not valid. 
Allowed values: > 0 \n"; 882 } 883 } 884 else { 885 die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 1 value is allowed.\n"; 886 } 887 } 888 else { 889 if (@RecordSplit == 2) { 890 $OptionsInfo{StartRecordNum} = $RecordSplit[0]; 891 $OptionsInfo{EndRecordNum} = $RecordSplit[1]; 892 if ($OptionsInfo{StartRecordNum} <= 0 || $OptionsInfo{EndRecordNum} <= 0) { 893 die "Error: The value pair specified, $Options{record}, for option \"--records\" is not valid. Allowed values: > 0 \n"; 894 } 895 } 896 else { 897 die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 2 values is allowed.\n"; 898 } 899 if ($OptionsInfo{StartRecordNum} > $OptionsInfo{EndRecordNum}) { 900 die "Error: Start record number, $OptionsInfo{StartRecordNum}, must be smaller than end record number, $OptionsInfo{EndRecordNum}.\nSpecify different values using \"--record\" option.\n"; 901 } 902 } 903 } 904 else { 905 die "Error: For \"-m --mode\" option values recordnum, or recordrange, specify \"--record\" option value.\n"; 906 } 907 } 908 909 @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); 910 911 my(@Words, $Line, $Value); 912 if ($Options{mode} =~ /^datafields$/i) { 913 @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); 914 if ($Options{datafields}) { 915 @{$OptionsInfo{SpecifiedDataFieldLabels}} = split $OptionsInfo{InDelim}, $Options{datafields}; 916 } 917 elsif ($Options{datafieldsfile}) { 918 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! 
\n"; 919 while ($Line = GetTextLine(\*DATAFIELDSFILE)) { 920 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); 921 if (@Words) { 922 push @{$OptionsInfo{SpecifiedDataFieldLabels}}, @Words; 923 } 924 } 925 close DATAFIELDSFILE; 926 } 927 } 928 elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) { 929 my(@DataFieldsByValueTriplets); 930 @DataFieldsByValueTriplets = (); 931 if ($Options{datafields}) { 932 @DataFieldsByValueTriplets = split $OptionsInfo{InDelim}, $Options{datafields}; 933 } 934 elsif ($Options{datafieldsfile}) { 935 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n"; 936 while ($Line = GetTextLine(\*DATAFIELDSFILE)) { 937 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); 938 if (@Words) { 939 push @DataFieldsByValueTriplets, @Words; 940 } 941 } 942 close DATAFIELDSFILE; 943 } 944 if ((@DataFieldsByValueTriplets % 3)) { 945 if ($Options{datafields}) { 946 die "Error: Triplets not found in values specified by \"-d --datafields\" option\n"; 947 } 948 elsif ($Options{datafieldsfile}) { 949 die "Error: Triplets not found in values specified by \"--datafieldsfile\" option\n"; 950 } 951 } 952 my($Index, $Label, $Value, $Criterion); 953 954 @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); 955 %{$OptionsInfo{SpecifiedDataFieldValuesMap}} = (); 956 %{$OptionsInfo{SpecifiedDataFieldCriteriaMap}} = (); 957 958 for ($Index = 0; $Index < @DataFieldsByValueTriplets; $Index = $Index + 3) { 959 $Label = $DataFieldsByValueTriplets[$Index]; 960 $Value = $DataFieldsByValueTriplets[$Index + 1]; 961 $Criterion = $DataFieldsByValueTriplets[$Index + 2]; 962 963 if ($Criterion =~ /^(eq|le|ge)$/i) { 964 push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label; 965 $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label} = $Value; 966 $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label} = $Criterion; 967 } 968 else { 969 warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or 
\"--datafieldsfile\" option: Invalid criterion value: $Criterion\n"; 970 } 971 } 972 } 973 elsif ($Options{mode} =~ /^datafieldsbyregex$/i) { 974 my(@DataFieldsByRegexTriplets); 975 976 @DataFieldsByRegexTriplets = (); 977 if ($Options{datafields}) { 978 @DataFieldsByRegexTriplets = quotewords($OptionsInfo{InDelim}, 0, $Options{datafields}); 979 } 980 elsif ($Options{datafieldsfile}) { 981 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n"; 982 while ($Line = GetTextLine(\*DATAFIELDSFILE)) { 983 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); 984 if (@Words) { 985 push @DataFieldsByRegexTriplets, @Words; 986 } 987 } 988 close DATAFIELDSFILE; 989 } 990 if ((@DataFieldsByRegexTriplets % 3)) { 991 if ($Options{datafields}) { 992 die "Error: Triplet not found in values specified by \"-d --datafields\" option\n"; 993 } 994 elsif ($Options{datafieldsfile}) { 995 die "Error: Triplet not found in values specified by \"--datafieldsfile\" option\n"; 996 } 997 } 998 999 my($Index, $Label, $Value, $Criterion); 1000 1001 @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); 1002 %{$OptionsInfo{SpecifiedDataFieldRegexMap}} = (); 1003 %{$OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}} = (); 1004 1005 for ($Index = 0; $Index < @DataFieldsByRegexTriplets; $Index = $Index + 3) { 1006 $Label = $DataFieldsByRegexTriplets[$Index]; 1007 $Value = $DataFieldsByRegexTriplets[$Index + 1]; 1008 $Criterion = $DataFieldsByRegexTriplets[$Index + 2]; 1009 1010 if ($Criterion =~ /^(eq|ne)$/i) { 1011 push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label; 1012 $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label} = $Value; 1013 $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label} = $Criterion; 1014 } 1015 else { 1016 warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion; Supported values: eq or ne\n"; 1017 } 1018 } 1019 } 
1020 elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist)$/i) { 1021 my($Index, @DataFieldAndValuesList); 1022 if ($Options{datafields}) { 1023 @DataFieldAndValuesList = split $OptionsInfo{InDelim}, $Options{datafields}; 1024 } 1025 elsif ($Options{datafieldsfile}) { 1026 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n"; 1027 while ($Line = GetTextLine(\*DATAFIELDSFILE)) { 1028 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); 1029 if (@Words) { 1030 push @DataFieldAndValuesList, @Words; 1031 } 1032 } 1033 close DATAFIELDSFILE; 1034 } 1035 if (@DataFieldAndValuesList < 2) { 1036 if ($Options{datafields}) { 1037 die "Error: Invalid number of values specified by \"-d --datafields\" option\n"; 1038 } 1039 elsif ($Options{datafieldsfile}) { 1040 die "Error: Invalid number values specified by \"--datafieldsfile\" option\n"; 1041 } 1042 } 1043 1044 $OptionsInfo{SpecifiedDataFieldLabel} = $DataFieldAndValuesList[0]; 1045 $OptionsInfo{SpecifiedDataFieldValuesCount} = @DataFieldAndValuesList - 1; 1046 %{$OptionsInfo{SpecifiedDataFieldValues}} = (); 1047 1048 for ($Index = 1; $Index < @DataFieldAndValuesList; $Index++) { 1049 $Value = $DataFieldAndValuesList[$Index]; 1050 $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound"; 1051 } 1052 } 1053 1054 $OptionsInfo{SDFileExt} = "sdf"; 1055 $OptionsInfo{TextFileExt} = "csv"; 1056 1057 if ($Options{outdelim} =~ /^tab$/i) { 1058 $OptionsInfo{TextFileExt} = "tsv"; 1059 } 1060 1061 if ($Options{mode} =~ /^(alldatafields|molnames)$/i) { 1062 $OptionsInfo{OutputSDFile} = 0; 1063 $OptionsInfo{OutputTextFile} = 1; 1064 } 1065 else { 1066 $OptionsInfo{OutputSDFile} = ($Options{output} =~ /^(SD|both)$/i) ? 1 : 0; 1067 $OptionsInfo{OutputTextFile} = ($Options{output} =~ /^(text|both)$/i) ? 1 : 0; 1068 } 1069 1070 $OptionsInfo{StrDataString} = $Options{strdatastring}; 1071 $OptionsInfo{OutoutStrDataString} = ($Options{strdatastring} =~ /^Yes$/i) ? 
1 : 0; 1072 1073 $OptionsInfo{StrDataStringDelimiter} = $Options{strdatastringdelimiter}; 1074 1075 if (IsEmpty($Options{strdatastringdelimiter})) { 1076 die "Error: No value specified for \"--StrDataStringDelimiter\" option.\n"; 1077 } 1078 $OptionsInfo{StrDataStringMode} = $Options{strdatastringmode}; 1079 $OptionsInfo{StrDataStringWithFields} = $Options{strdatastringmode} =~ /^StrAndDataFields$/i ? 1 : 0; 1080 1081 MODE: { 1082 if ($Options{mode} =~ /^alldatafields$/i) { $OptionsInfo{FileNameMode} = "AllDataDields"; last MODE; } 1083 if ($Options{mode} =~ /^commondatafields$/i) { $OptionsInfo{FileNameMode} = "CommonDataDields"; last MODE; } 1084 if ($Options{mode} =~ /^datafields$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFields"; last MODE; } 1085 if ($Options{mode} =~ /^datafieldsbyvalue$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByValue"; last MODE; } 1086 if ($Options{mode} =~ /^datafieldsbyregex$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByRegex"; last MODE; } 1087 if ($Options{mode} =~ /^datafieldbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataField"; last MODE; } 1088 if ($Options{mode} =~ /^datafielduniquebylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedUniqueDataField"; last MODE; } 1089 if ($Options{mode} =~ /^molnames$/i) { $OptionsInfo{FileNameMode} = "MolName"; last MODE; } 1090 if ($Options{mode} =~ /^randomcmpds$/i) { $OptionsInfo{FileNameMode} = "RandomCmpds"; last MODE; } 1091 if ($Options{mode} =~ /^recordnum$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{RecordNum}"; last MODE; } 1092 if ($Options{mode} =~ /^recordrange$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{StartRecordNum}" . "To" . 
"$OptionsInfo{EndRecordNum}"; last MODE; } 1093 if ($Options{mode} =~ /^2dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "2DCmpdRecords"; last MODE; } 1094 if ($Options{mode} =~ /^3dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "3DCmpdRecords"; last MODE; } 1095 die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, molnames, randomcmpds, recordnum, recordrange, 2dcmpdrecords, 3dcmpdrecords\n"; 1096 } 1097 1098 } 1099 1100 # Setup script usage and retrieve command line arguments specified using various options... 1101 sub SetupScriptUsage { 1102 1103 # Retrieve all the options... 1104 %Options = (); 1105 $Options{numofcmpds} = 1; 1106 $Options{mode} = "alldatafields"; 1107 $Options{indelim} = "comma"; 1108 $Options{outdelim} = "comma"; 1109 $Options{output} = "SD"; 1110 $Options{quote} = "yes"; 1111 $Options{regexignorecase} = "yes"; 1112 $Options{valuecomparisonmode} = "numeric"; 1113 $Options{violations} = 0; 1114 $Options{seed} = 123456789; 1115 1116 $Options{strdatastring} = "no"; 1117 $Options{strdatastringdelimiter} = "|"; 1118 $Options{strdatastringmode} = "StrOnly"; 1119 1120 if (!GetOptions(\%Options, "help|h", "datafields|d=s", "datafieldsfile=s", "indelim=s", "mode|m=s", "numofcmpds|n=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "regexignorecase=s", "record=s", "root|r=s", "seed|s=i", "strdatastring=s", "strdatastringdelimiter=s", "strdatastringmode=s", "valuecomparisonmode=s", "violations|v=i", "workingdir|w=s")) { 1121 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 1122 } 1123 if ($Options{workingdir}) { 1124 if (! 
-d $Options{workingdir}) { 1125 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 1126 } 1127 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 1128 } 1129 if ($Options{numofcmpds} < 1) { 1130 die "Error: The value specified, $Options{numofcmpds}, for option \"-n --numofcmpds\" is not valid. Allowed values: >= 1 \n"; 1131 } 1132 if ($Options{valuecomparisonmode} !~ /^(Numeric|Alphanumeric)$/i) { 1133 die "Error: The value specified, $Options{valuecomparisonmode}, for option \"--ValueComparisonMode\" is not valid. Allowed values: Numeric or Alphanumeric\n"; 1134 } 1135 if ($Options{violations} < 0) { 1136 die "Error: The value specified, $Options{violations}, for option \"-v --violations\" is not valid. Allowed values: >= 0 \n"; 1137 } 1138 if ($Options{mode} !~ /^(alldatafields|commondatafields|datafields|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|molnames|randomcmpds|recordnum|recordrange|2dcmpdrecords|3dcmpdrecords)$/i) { 1139 die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, molnames, randomcmpds, recordnum, recordrange, 2dcmpdrecords, 3dcmpdrecords\n"; 1140 } 1141 if ($Options{output} !~ /^(SD|text|both)$/i) { 1142 die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n"; 1143 } 1144 if ($Options{indelim} !~ /^(comma|semicolon|tab)$/i) { 1145 die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma, tab, or semicolon\n"; 1146 } 1147 if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) { 1148 die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. 
Allowed values: comma, tab, or semicolon\n"; 1149 } 1150 if ($Options{quote} !~ /^(yes|no)$/i) { 1151 die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n"; 1152 } 1153 if ($Options{regexignorecase} !~ /^(yes|no)$/i) { 1154 die "Error: The value specified, $Options{regexignorecase}, for option \"--regexignorecase\" is not valid. Allowed values: yes or no\n"; 1155 } 1156 if ($Options{strdatastring} !~ /^(yes|no)$/i) { 1157 die "Error: The value specified, $Options{strdatastring}, for option \"--StrDataString\" is not valid. Allowed values: yes or no\n"; 1158 } 1159 if ($Options{strdatastringmode} !~ /^(StrOnly|StrAndDataFields)$/i) { 1160 die "Error: The value specified, $Options{strdatastringmode}, for option \"--StrDataStringMode\" is not valid. Allowed values: StrOnly or StrAndDataFields\n"; 1161 } 1162 } 1163