#!/usr/bin/perl -w
#
# $RCSfile: ExtractFromSDFiles.pl,v $
# $Date: 2010/07/18 17:14:05 $
# $Revision: 1.35 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2010 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use Scalar::Util qw(looks_like_number);
use SDFileUtil;
use FileUtil;
use TextUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# These file-scoped settings are filled in by ProcessOptions()...
print "Processing options...\n";
my($InDelim, $OutDelim, $OutQuote, $RecordNum, $StartRecordNum, $EndRecordNum, $FileNameMode, $SDFileExt, $TextFileExt, $OutputSDFileFlag, $OutputTextFileFlag, $OutoutStrDataStringFlag, $StrDataStringDelimiter, $StrDataStringWithFieldsFlag, @SpecifiedDataFieldLabels, %SpecifiedDataFieldValuesMap, %SpecifiedDataFieldCriteriaMap, $SpecifiedDataFieldLabel, $SpecifiedDataFieldValuesCount, %SpecifiedDataFieldValues);
ProcessOptions();

# Collect information about SD files; these per-file arrays are filled in by
# RetrieveSDFilesInfo() and indexed like @SDFilesList...
print "Checking input SD file(s)...\n";
my(@SDFilesOkay, @SDFilesCmpdCount, @SDFilesNewTextFileName, @SDFilesNewSDFileName, @SDFilesAllDataFieldLabels, @SDFilesCommonDataFieldLabels);
RetrieveSDFilesInfo();

# Working state for the current file/compound. It stays file-scoped because
# SetupDataValues() and the Write*() subs read and write these variables
# directly instead of taking arguments.
my($SDFile, $Index, $CmpdString, @CmpdLines, @DataLabels, @DataValues, %DataFieldValues, $DataValuesLine, $ColLabelsLine, $Label, $MolName, $CmpdNum, $CmpdCount, %RandomCmpdIndexMap, $SpecifiedDataFieldValuesFoundCount, @Words, $Line, $Value);

# The mode has already been validated by SetupScriptUsage(); normalize its
# case once instead of pattern matching it for every compound.
my $Mode = lc $Options{mode};

# Modes whose text output carries every data field label seen in the file...
my %UsesAllDataFieldLabels = map { $_ => 1 } qw(datafieldbylist datafielduniquebylist datafieldsbyvalue alldatafields randomcmpds recordnum recordrange);

if (@SDFilesList > 1) {
    print "Processing SD files...\n";
}

SDFILE: for my $FileIndex (0 .. $#SDFilesList) {
    next SDFILE if !$SDFilesOkay[$FileIndex];

    $SDFile = $SDFilesList[$FileIndex];
    if (@SDFilesList > 1) {
        print "\nProcessing file $SDFile...\n";
    }
    else {
        print "Processing file $SDFile...\n";
    }

    # Open output files...
    my $NewSDFileName = $SDFilesNewSDFileName[$FileIndex];
    my $NewTextFileName = $SDFilesNewTextFileName[$FileIndex];

    if ($OutputTextFileFlag && $OutputSDFileFlag) {
        print "Generating $NewSDFileName and $NewTextFileName...\n";
    }
    elsif ($OutputSDFileFlag) {
        print "Generating $NewSDFileName...\n";
    }
    else {
        print "Generating $NewTextFileName...\n";
    }

    # Three-argument open keeps odd characters in a file name from being
    # interpreted as an open mode. The bareword handle names are kept because
    # the Write*() subs print to NEWSDFILE and NEWTEXTFILE by name.
    if ($OutputSDFileFlag) {
        open NEWSDFILE, '>', $NewSDFileName or die "Error: Couldn't open $NewSDFileName: $! \n";
    }
    if ($OutputTextFileFlag) {
        open NEWTEXTFILE, '>', $NewTextFileName or die "Error: Couldn't open $NewTextFileName: $! \n";
    }

    # Prepare for mode specific processing: pick the column labels...
    @DataLabels = ();
    if ($UsesAllDataFieldLabels{$Mode}) {
        @DataLabels = @{$SDFilesAllDataFieldLabels[$FileIndex]};
    }
    elsif ($Mode eq 'commondatafields') {
        @DataLabels = @{$SDFilesCommonDataFieldLabels[$FileIndex]};
    }
    elsif ($Mode eq 'datafields') {
        @DataLabels = @SpecifiedDataFieldLabels;
    }
    elsif ($Mode eq 'molnames') {
        push @DataLabels, "MolName";
    }

    # ...and reset any per-file selection state.
    if ($Mode eq 'datafieldbylist' || $Mode eq 'datafielduniquebylist') {
        for my $ListValue (keys %SpecifiedDataFieldValues) {
            $SpecifiedDataFieldValues{$ListValue} = "NotFound";
        }
        $SpecifiedDataFieldValuesFoundCount = 0;
    }
    elsif ($Mode eq 'randomcmpds') {
        # Pick exactly min(requested, available) distinct record numbers.
        # Counting loop cycles instead of distinct picks would select one
        # record too many when no random numbers collide and too few when
        # they do.
        $CmpdCount = $SDFilesCmpdCount[$FileIndex];
        %RandomCmpdIndexMap = ();
        srand($Options{seed});

        my $TargetCount = ($Options{numofcmpds} < $CmpdCount) ? $Options{numofcmpds} : $CmpdCount;
        while (scalar(keys %RandomCmpdIndexMap) < $TargetCount) {
            my $RandomIndex = int(rand $CmpdCount) + 1;
            $RandomCmpdIndexMap{$RandomIndex} = $RandomIndex;
        }
    }

    # Write out column labels...
    if ($OutputTextFileFlag) {
        my @ColLabels = @DataLabels;
        if ($OutoutStrDataStringFlag) {
            # Append structure data string label...
            push @ColLabels, "StructureDataString";
        }
        $ColLabelsLine = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
        print NEWTEXTFILE "$ColLabelsLine\n";
    }

    open SDFILE, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
    $CmpdNum = 0;
    CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
        $CmpdNum++;
        @DataValues = ();

        if ($Mode eq 'alldatafields' || $Mode eq 'commondatafields' || $Mode eq 'datafields') {
            # Every compound is written out with the selected data fields...
            @CmpdLines = split "\n", $CmpdString;
            %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

            SetupDataValues();
            WriteTextFileCmpdData();
            WriteSDFileCmpdData();
        }
        elsif ($Mode eq 'datafieldbylist' || $Mode eq 'datafielduniquebylist') {
            @CmpdLines = split "\n", $CmpdString;
            %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

            SetupDataValues();
            if (exists $DataFieldValues{$SpecifiedDataFieldLabel}) {
                my $CurrentValue = $DataFieldValues{$SpecifiedDataFieldLabel};
                if (exists $SpecifiedDataFieldValues{$CurrentValue}) {
                    my $FirstMatch = ($SpecifiedDataFieldValues{$CurrentValue} eq "NotFound") ? 1 : 0;
                    if ($FirstMatch) {
                        $SpecifiedDataFieldValuesFoundCount++;
                        $SpecifiedDataFieldValues{$CurrentValue} = "Found";
                    }

                    # The unique mode keeps only the first compound for each
                    # listed value; the plain list mode keeps every match.
                    if ($FirstMatch || $Mode eq 'datafieldbylist') {
                        WriteSDFileCmpdString();
                        WriteTextFileCmpdData();
                    }

                    # Only the unique mode may stop reading once every listed
                    # value has been seen; stopping early in the plain list
                    # mode would drop later compounds carrying a listed value.
                    if ($Mode eq 'datafielduniquebylist' && $SpecifiedDataFieldValuesFoundCount >= scalar(keys %SpecifiedDataFieldValues)) {
                        last CMPDSTRING;
                    }
                }
            }
        }
        elsif ($Mode eq 'datafieldsbyvalue') {
            @CmpdLines = split "\n", $CmpdString;
            %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

            SetupDataValues();

            # Count the criteria this compound fails; data fields missing from
            # the compound are not counted as violations.
            my $ViolationCount = 0;
            for my $FieldLabel (@SpecifiedDataFieldLabels) {
                next if !exists $DataFieldValues{$FieldLabel};

                my $CurrentValue = $DataFieldValues{$FieldLabel};
                my $SpecifiedValue = $SpecifiedDataFieldValuesMap{$FieldLabel};
                my $Criterion = lc $SpecifiedDataFieldCriteriaMap{$FieldLabel};

                # Compare numerically when both sides are numbers, so that 10
                # is greater than 9; a string comparison orders "10" before
                # "9". Fall back to string comparison for anything else,
                # including NaN where <=> returns undef.
                my $Comparison;
                if (looks_like_number($CurrentValue) && looks_like_number($SpecifiedValue)) {
                    $Comparison = $CurrentValue <=> $SpecifiedValue;
                }
                if (!defined $Comparison) {
                    $Comparison = $CurrentValue cmp $SpecifiedValue;
                }

                if (   ($Criterion eq 'eq' && $Comparison != 0)
                    || ($Criterion eq 'le' && $Comparison > 0)
                    || ($Criterion eq 'ge' && $Comparison < 0)) {
                    $ViolationCount++;
                }
            }
            if ($ViolationCount <= $Options{violations}) {
                WriteSDFileCmpdString();
                WriteTextFileCmpdData();
            }
        }
        elsif ($Mode eq 'randomcmpds') {
            if (exists $RandomCmpdIndexMap{$CmpdNum}) {
                WriteCmpdStringAndTextData();
            }
        }
        elsif ($Mode eq 'molnames') {
            @CmpdLines = split "\n", $CmpdString;
            $MolName = QuoteAWord(ParseCmpdMolNameLine($CmpdLines[0]), $OutQuote);
            print NEWTEXTFILE "$MolName\n";
        }
        elsif ($Mode eq 'recordnum') {
            if ($CmpdNum == $RecordNum) {
                WriteCmpdStringAndTextData();
                last CMPDSTRING;
            }
        }
        elsif ($Mode eq 'recordrange') {
            if ($CmpdNum >= $StartRecordNum && $CmpdNum <= $EndRecordNum) {
                WriteCmpdStringAndTextData();
            }
            elsif ($CmpdNum > $EndRecordNum) {
                last CMPDSTRING;
            }
        }
    }
    close SDFILE;

    # Buffered write errors only surface at close, so check it...
    if ($OutputSDFileFlag) {
        close NEWSDFILE or die "Error: Couldn't close $NewSDFileName: $! \n";
    }
    if ($OutputTextFileFlag) {
        close NEWTEXTFILE or die "Error: Couldn't close $NewTextFileName: $! \n";
    }
}
print "$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Write the current compound, as read into $CmpdString, to the SD output file
# and, when text output is enabled, its data field values to the text file...
sub WriteCmpdStringAndTextData {
    @CmpdLines = split "\n", $CmpdString;

    WriteSDFileCmpdString();
    if ($OutputTextFileFlag) {
        %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
        SetupDataValues();
        WriteTextFileCmpdData();
    }
}

# Process options...
#
# Translate the command line options in %Options into the file-scoped
# settings used by the rest of the script: delimiters and quoting, record
# numbers, specified data field labels/values/criteria, output file
# extensions and flags, and the suffix used to name output files. Dies with
# an error message when a mode specific option is missing or malformed.
sub ProcessOptions {
    my $Mode = lc $Options{mode};

    # Delimiters and quoting; comma is the default delimiter...
    my %DelimiterFor = ("semicolon" => "\;", "tab" => "\t");
    my $InDelimName = lc $Options{indelim};
    my $OutDelimName = lc $Options{outdelim};
    $InDelim = exists $DelimiterFor{$InDelimName} ? $DelimiterFor{$InDelimName} : "\,";
    $OutDelim = exists $DelimiterFor{$OutDelimName} ? $DelimiterFor{$OutDelimName} : "\,";
    $OutQuote = ($Options{quote} =~ /^no$/i) ? 0 : 1;

    # Data field modes need exactly one source of data field names...
    if ($Mode =~ /^(?:datafields|datafieldsbyvalue|datafieldbylist|datafielduniquebylist)$/) {
        if ($Options{datafields} && $Options{datafieldsfile}) {
            die "Error: For \"-m --mode\" option values datafields, datafieldsbyvalue, datafieldbylist, or datafielduniquebylist, specify only one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
        }
        if (!$Options{datafields} && !$Options{datafieldsfile}) {
            die "Error: For \"-m --mode\" option values datafields, datafieldsbyvalue, datafieldbylist, or datafielduniquebylist, specify one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
        }
    }

    # Record number modes...
    $RecordNum = 0; $StartRecordNum = 0; $EndRecordNum = 0;
    if ($Mode eq 'recordnum' || $Mode eq 'recordrange') {
        if (!$Options{record}) {
            die "Error: For \"-m --mode\" option values recordnum, or recordrange, specify \"--record\" option value.\n";
        }
        my @RecordSplit = split ",", $Options{record};
        s/^\s+|\s+$//g for @RecordSplit;

        if ($Mode eq 'recordnum') {
            if (@RecordSplit != 1) {
                die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 1 value is allowed.\n";
            }
            $RecordNum = $RecordSplit[0];
            # Record numbers are compared with == against a compound counter,
            # so anything but a positive integer could never match.
            if ($RecordNum !~ /^\d+$/ || $RecordNum <= 0) {
                die "Error: The value specified, $RecordNum, for option \"--record\" is not valid. Allowed values: > 0 \n";
            }
        }
        else {
            if (@RecordSplit != 2) {
                die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 2 values are allowed.\n";
            }
            ($StartRecordNum, $EndRecordNum) = @RecordSplit;
            if ($StartRecordNum !~ /^\d+$/ || $EndRecordNum !~ /^\d+$/ || $StartRecordNum <= 0 || $EndRecordNum <= 0) {
                die "Error: The value pair specified, $Options{record}, for option \"--record\" is not valid. Allowed values: > 0 \n";
            }
            if ($StartRecordNum > $EndRecordNum) {
                die "Error: Start record number, $StartRecordNum, must be smaller than end record number, $EndRecordNum.\nSpecify different values using \"--record\" option.\n";
            }
        }
    }

    # Data field labels, values and criteria...
    if ($Mode eq 'datafields') {
        @SpecifiedDataFieldLabels = _GetSpecifiedDataFieldWords();
    }
    elsif ($Mode eq 'datafieldsbyvalue') {
        # Words come in label, value, criterion triplets...
        my @DataFieldsByValueTriplets = _GetSpecifiedDataFieldWords();
        if (@DataFieldsByValueTriplets % 3) {
            if ($Options{datafields}) {
                die "Error: Triplets not found in values specified by \"-d --datafields\" option\n";
            }
            elsif ($Options{datafieldsfile}) {
                die "Error: Triplets not found in values specified by \"--datafieldsfile\" option\n";
            }
        }
        @SpecifiedDataFieldLabels = ();
        %SpecifiedDataFieldValuesMap = ();
        %SpecifiedDataFieldCriteriaMap = ();
        for (my $TripletStart = 0; $TripletStart < @DataFieldsByValueTriplets; $TripletStart += 3) {
            my ($Label, $Value, $Criterion) = @DataFieldsByValueTriplets[$TripletStart .. $TripletStart + 2];
            if ($Criterion =~ /^(eq|le|ge)$/i) {
                push @SpecifiedDataFieldLabels, $Label;
                $SpecifiedDataFieldValuesMap{$Label} = $Value;
                $SpecifiedDataFieldCriteriaMap{$Label} = $Criterion;
            }
            else {
                warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion\n";
            }
        }
    }
    elsif ($Mode eq 'datafieldbylist' || $Mode eq 'datafielduniquebylist') {
        # First word is the data field label; the rest are its values...
        my @DataFieldAndValuesList = _GetSpecifiedDataFieldWords();
        if (@DataFieldAndValuesList < 2) {
            if ($Options{datafields}) {
                die "Error: Invalid number of values specified by \"-d --datafields\" option\n";
            }
            elsif ($Options{datafieldsfile}) {
                die "Error: Invalid number of values specified by \"--datafieldsfile\" option\n";
            }
        }
        ($SpecifiedDataFieldLabel, my @ListValues) = @DataFieldAndValuesList;
        %SpecifiedDataFieldValues = map { $_ => "NotFound" } @ListValues;

        # Count distinct values: the main loop compares this against the
        # number of distinct values found, which could never catch up with a
        # count that included repeated values.
        $SpecifiedDataFieldValuesCount = scalar(keys %SpecifiedDataFieldValues);
    }

    # Output files...
    $SDFileExt = "sdf";
    $TextFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";

    if ($Mode eq 'alldatafields' || $Mode eq 'molnames') {
        # These modes only ever generate a text file...
        $OutputSDFileFlag = 0;
        $OutputTextFileFlag = 1;
    }
    else {
        $OutputSDFileFlag = ($Options{output} =~ /^(SD|both)$/i) ? 1 : 0;
        $OutputTextFileFlag = ($Options{output} =~ /^(text|both)$/i) ? 1 : 0;
    }

    # Structure data string column...
    $OutoutStrDataStringFlag = ($Options{strdatastring} =~ /^Yes$/i) ? 1 : 0;
    $StrDataStringDelimiter = $Options{strdatastringdelimiter};
    if (IsEmpty($StrDataStringDelimiter)) {
        die "Error: No value specified for \"--StrDataStringDelimiter\" option.\n";
    }
    $StrDataStringWithFieldsFlag = ($Options{strdatastringmode} =~ /^StrAndDataFields$/i) ? 1 : 0;

    # Suffix appended to the input file name to build output file names. The
    # "Dields" spelling is kept on purpose: changing it would rename the
    # files this script generates.
    my %FileNameModeFor = (
        alldatafields         => "AllDataDields",
        commondatafields      => "CommonDataDields",
        datafields            => "SpecifiedDataFields",
        datafieldsbyvalue     => "SpecifiedDataFieldsByValue",
        datafieldbylist       => "SpecifiedDataField",
        datafielduniquebylist => "SpecifiedUniqueDataField",
        molnames              => "MolName",
        randomcmpds           => "RandomCmpds",
        recordnum             => "RecordNum$RecordNum",
        recordrange           => "RecordNum$StartRecordNum" . "To" . "$EndRecordNum",
    );
    if (exists $FileNameModeFor{$Mode}) {
        $FileNameMode = $FileNameModeFor{$Mode};
    }
}

# Return the list of words given by the "-d --datafields" option, split on
# the input delimiter, or read from the file named by the "--datafieldsfile"
# option. Returns an empty list when neither option is set...
sub _GetSpecifiedDataFieldWords {
    if ($Options{datafields}) {
        return split "$InDelim", $Options{datafields};
    }

    my @SpecifiedWords = ();
    if ($Options{datafieldsfile}) {
        open DATAFIELDSFILE, '<', $Options{datafieldsfile} or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
        while (my $Line = GetTextLine(\*DATAFIELDSFILE)) {
            push @SpecifiedWords, quotewords($InDelim, 0, $Line);
        }
        close DATAFIELDSFILE;
    }
    return @SpecifiedWords;
}

# Retrieve information about input SD files...
#
# For each file in @SDFilesList, decide whether it can be processed, work out
# the output file names and, when the mode needs it, scan the file once to
# count compounds and collect data field labels. Results are stored in the
# file-scoped @SDFiles* arrays at the same index as the input file.
sub RetrieveSDFilesInfo {
    @SDFilesOkay = ();
    @SDFilesCmpdCount = ();
    @SDFilesNewTextFileName = ();
    @SDFilesNewSDFileName = ();

    @SDFilesAllDataFieldLabels = ();
    @SDFilesCommonDataFieldLabels = ();

    my $Mode = lc $Options{mode};

    # What the scan over each file has to gather. Compounds are counted
    # during any scan, but only random selection requires one just for that.
    my $CountCmpdsFlag = ($Mode eq 'randomcmpds') ? 1 : 0;

    # Common labels decide which data fields are written to either output
    # file, so they are collected regardless of the output type; the other
    # modes only need labels for the text file header.
    my $CollectDataFieldsFlag = 0;
    if ($Mode eq 'commondatafields') {
        $CollectDataFieldsFlag = 1;
    }
    elsif ($OutputTextFileFlag && $Mode =~ /^(?:alldatafields|randomcmpds|datafieldsbyvalue|datafieldbylist|datafielduniquebylist|recordrange)$/) {
        $CollectDataFieldsFlag = 1;
    }

    # A single record only needs the labels of that record...
    my $CollectRecordLabelsFlag = ($Mode eq 'recordnum' && $OutputTextFileFlag) ? 1 : 0;

    FILELIST: for my $Index (0 .. $#SDFilesList) {
        my $SDFile = $SDFilesList[$Index];

        $SDFilesOkay[$Index] = 0;
        $SDFilesCmpdCount[$Index] = 0;
        $SDFilesNewTextFileName[$Index] = "";
        $SDFilesNewSDFileName[$Index] = "";

        $SDFilesAllDataFieldLabels[$Index] = [];
        $SDFilesCommonDataFieldLabels[$Index] = [];

        if (!(-e $SDFile)) {
            warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
            next FILELIST;
        }
        if (!CheckFileType($SDFile, "sd sdf")) {
            warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
            next FILELIST;
        }

        # Generate appropriate name for the new output file.
        my ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
        my $NewFileName = $FileName . "$FileNameMode";
        if ($Options{root} && (@SDFilesList == 1)) {
            my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
            $NewFileName = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
        }
        my $NewSDFileName = $NewFileName . ".$SDFileExt";
        my $NewTextFileName = $NewFileName . ".$TextFileExt";

        if ($OutputSDFileFlag && lc($NewSDFileName) eq lc($SDFile)) {
            warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
            print "Specify a different name using \"-r --root\" option or use default name.\n";
            next FILELIST;
        }
        if (!$Options{overwrite}) {
            if ($OutputSDFileFlag && -e $NewSDFileName) {
                warn "Warning: Ignoring file $SDFile: New file, $NewSDFileName, already exists\n";
                next FILELIST;
            }
            if ($OutputTextFileFlag && -e $NewTextFileName) {
                warn "Warning: Ignoring file $SDFile: New file, $NewTextFileName, already exists\n";
                next FILELIST;
            }
        }
        if (!open SDFILE, '<', $SDFile) {
            warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
            next FILELIST;
        }

        my $CmpdCount = 0;
        my @DataFieldLabels = ();
        my @CommonDataFieldLabels = ();
        my %DataFieldLabelsMap = ();

        if ($CountCmpdsFlag || $CollectDataFieldsFlag || $CollectRecordLabelsFlag) {
            CMPDSTRING: while (my $CmpdString = ReadCmpdString(\*SDFILE)) {
                $CmpdCount++;

                if ($CollectRecordLabelsFlag) {
                    if ($CmpdCount == $RecordNum) {
                        my @CmpdLines = split "\n", $CmpdString;
                        @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
                        last CMPDSTRING;
                    }
                    next CMPDSTRING;
                }
                next CMPDSTRING if !$CollectDataFieldsFlag;

                # Process compound data header labels and figure out which
                # ones are present for all the compounds...
                my @CmpdLines = split "\n", $CmpdString;
                my @CmpdDataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);

                if ($CmpdCount == 1) {
                    # The first compound defines the initial label set. Test
                    # the compound number, not whether labels were found: a
                    # first compound without data fields must not let the
                    # next compound's labels pass as present in all.
                    @DataFieldLabels = @CmpdDataFieldLabels;
                    for my $Label (@DataFieldLabels) {
                        $DataFieldLabelsMap{$Label} = "PresentInAll";
                    }
                    next CMPDSTRING;
                }

                my %CmpdDataFieldLabelsMap = map { $_ => 1 } @CmpdDataFieldLabels;

                # Known labels missing from this compound are no longer common...
                for my $Label (@DataFieldLabels) {
                    if (!$CmpdDataFieldLabelsMap{$Label}) {
                        $DataFieldLabelsMap{$Label} = "PresentInSome";
                    }
                }
                # Labels first seen in this compound can't be common either...
                for my $Label (@CmpdDataFieldLabels) {
                    if (!$DataFieldLabelsMap{$Label}) {
                        push @DataFieldLabels, $Label;
                        $DataFieldLabelsMap{$Label} = "PresentInSome";
                    }
                }
            }

            # Identify the common data field labels once the whole file has
            # been seen...
            if ($Mode eq 'commondatafields') {
                @CommonDataFieldLabels = grep { $DataFieldLabelsMap{$_} eq "PresentInAll" } @DataFieldLabels;
            }
        }
        close SDFILE;

        $SDFilesOkay[$Index] = 1;
        $SDFilesNewTextFileName[$Index] = "$NewTextFileName";
        $SDFilesNewSDFileName[$Index] = "$NewSDFileName";

        $SDFilesCmpdCount[$Index] = $CmpdCount;
        push @{$SDFilesAllDataFieldLabels[$Index]}, @DataFieldLabels;
        push @{$SDFilesCommonDataFieldLabels[$Index]}, @CommonDataFieldLabels;
    }
}

# Setup values for data fields...
#
# Fills @DataValues with the current compound's value for each label in
# @DataLabels, using an empty string for labels the compound doesn't have.
sub SetupDataValues {
    @DataValues = map { exists $DataFieldValues{$_} ? $DataFieldValues{$_} : "" } @DataLabels;
}

# Return the structure part of a compound string: everything up to and
# including the "M  END" line. The terminator is appended even when the
# compound string doesn't contain one...
sub _GetCmpdMolString {
    my ($CmpdText) = @_;

    # The molfile terminator is "M", two spaces, "END"...
    my ($MolString) = split /M  END/, $CmpdText;
    return $MolString . "M  END";
}

# Write out structure data and specific data fields to SD file...
sub WriteSDFileCmpdData {
    return if !$OutputSDFileFlag;

    my $MolString = _GetCmpdMolString($CmpdString);
    print NEWSDFILE "$MolString\n";
    for my $Count (0 .. $#DataLabels) {
        print NEWSDFILE "> <$DataLabels[$Count]>\n$DataValues[$Count]\n\n";
    }
    print NEWSDFILE "\$\$\$\$\n";
}

# Write out compound string...
sub WriteSDFileCmpdString {
    if ($OutputSDFileFlag) {
        print NEWSDFILE "$CmpdString\n";
    }
}

# Write out data for text file...
#
# Writes one line holding @DataValues and, when requested, the structure
# data string of the current compound as the last column.
sub WriteTextFileCmpdData {
    return if !$OutputTextFileFlag;

    $DataValuesLine = JoinWords(\@DataValues, $OutDelim, $OutQuote);

    # Handle multiple lines data values for data fields by joining 'em using semicolons...
    $DataValuesLine =~ s/\n/;/g;

    if (!$OutoutStrDataStringFlag) {
        print NEWTEXTFILE "$DataValuesLine\n";
        return;
    }

    # Append structure data string...
    my $StrDataString = $StrDataStringWithFieldsFlag ? $CmpdString : _GetCmpdMolString($CmpdString);
    $StrDataString =~ s/\n/$StrDataStringDelimiter/g;

    # Quote the structure column only when quoting was requested; testing the
    # delimiter here would always quote it because a delimiter is always set.
    my $OutQuoteValue = $OutQuote ? "\"" : "";

    print NEWTEXTFILE "$DataValuesLine${OutDelim}${OutQuoteValue}${StrDataString}${OutQuoteValue}\n";
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Sets defaults in %Options, parses the command line, changes to the working
# directory when one is given, and dies on any invalid option value.
sub SetupScriptUsage {

    # Retrieve all the options...
    %Options = ();
    $Options{numofcmpds} = 1;
    $Options{mode} = "alldatafields";
    $Options{indelim} = "comma";
    $Options{outdelim} = "comma";
    $Options{output} = "SD";
    $Options{quote} = "yes";
    $Options{violations} = 0;
    $Options{seed} = 999999999;

    $Options{strdatastring} = "no";
    $Options{strdatastringdelimiter} = "|";
    $Options{strdatastringmode} = "StrOnly";

    my @OptionSpecs = (
        "help|h", "datafields|d=s", "datafieldsfile=s", "indelim=s", "mode|m=s",
        "numofcmpds|n=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s",
        "record=s", "root|r=s", "seed|s=i", "strdatastring=s",
        "strdatastringdelimiter=s", "strdatastringmode=s", "violations|v=i",
        "workingdir|w=s",
    );
    if (!GetOptions(\%Options, @OptionSpecs)) {
        die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
    }

    if ($Options{workingdir}) {
        if (! -d $Options{workingdir}) {
            die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
        }
        chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
    }

    # Numeric options...
    if ($Options{numofcmpds} < 1) {
        die "Error: The value specified, $Options{numofcmpds}, for option \"-n --numofcmpds\" is not valid. Allowed values: >= 1 \n";
    }
    if ($Options{violations} < 0) {
        die "Error: The value specified, $Options{violations}, for option \"-v --violations\" is not valid. Allowed values: >= 0 \n";
    }

    # Options limited to a fixed set of values...
    if ($Options{mode} !~ /^(alldatafields|commondatafields|datafields|datafieldsbyvalue|datafieldbylist|datafielduniquebylist|molnames|randomcmpds|recordnum|recordrange)$/i) {
        die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, molnames, randomcmpds, recordnum, recordrange\n";
    }
    if ($Options{output} !~ /^(SD|text|both)$/i) {
        die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
    }
    if ($Options{indelim} !~ /^(comma|semicolon|tab)$/i) {
        die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
    }
    if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
        die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
    }
    if ($Options{quote} !~ /^(yes|no)$/i) {
        die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
    }
    if ($Options{strdatastring} !~ /^(yes|no)$/i) {
        die "Error: The value specified, $Options{strdatastring}, for option \"--StrDataString\" is not valid. Allowed values: yes or no\n";
    }
    if ($Options{strdatastringmode} !~ /^(StrOnly|StrAndDataFields)$/i) {
        die "Error: The value specified, $Options{strdatastringmode}, for option \"--StrDataStringMode\" is not valid. Allowed values: StrOnly or StrAndDataFields\n";
    }
}