#!/usr/bin/perl -w
#
# $RCSfile: ExtractFromSDFiles.pl,v $
# $Date: 2008/02/28 00:29:34 $
# $Revision: 1.28 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2008 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use 5.006;
use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use SDFileUtil;
use FileUtil;
use TextUtil;

# Driver: parse the command line, validate every input SD file, and then, for
# each usable file, extract compounds and/or data fields according to the
# "-m --mode" option into a new SD file, a new text file, or both.
#
# The variables declared at file scope below are shared with the subroutines
# defined further down in this file, which read and write them directly.

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

print "Processing options...\n";
my($InDelim, $OutDelim, $OutQuote, $RecordNum, $StartRecordNum, $EndRecordNum, $FileNameMode, $SDFileExt, $TextFileExt, $OutputSDFileFlag, $OutputTextFileFlag, @SpecifiedDataFieldLabels, %SpecifiedDataFieldValuesMap, %SpecifiedDataFieldCriteriaMap, $SpecifiedDataFieldLabel, $SpecifiedDataFieldValuesCount, %SpecifiedDataFieldValues);
ProcessOptions();

# Collect information about SD files...
print "Checking input SD file(s)...\n";
my(@SDFilesOkay, @SDFilesCmpdCount, @SDFilesNewTextFileName, @SDFilesNewSDFileName, @SDFilesAllDataFieldLabels, @SDFilesCommonDataFieldLabels);
RetrieveSDFilesInfo();

my($SDFile, $Index, $CmpdString, @CmpdLines, @DataLabels, @DataValues, %DataFieldValues, $DataValuesLine, $ColLabelsLine, $Label, $MolName, $CmpdNum, $CmpdCount, %RandomCmpdIndexMap, $SpecifiedDataFieldValuesFoundCount, @Words, $Line, $Value);

if (@SDFilesList > 1) {
  print "Processing SD files...\n";
}

SDFILE: for $Index (0 .. $#SDFilesList) {
  if (!$SDFilesOkay[$Index]) {
    next SDFILE;
  }
  $SDFile = $SDFilesList[$Index];
  if (@SDFilesList > 1) {
    print "\nProcessing file $SDFile...\n";
  }
  else {
    print "Processing file $SDFile...\n";
  }

  # Open output files...
  if ($OutputTextFileFlag && $OutputSDFileFlag) {
    print "Generating $SDFilesNewSDFileName[$Index] and $SDFilesNewTextFileName[$Index]...\n";
  }
  elsif ($OutputSDFileFlag) {
    print "Generating $SDFilesNewSDFileName[$Index] ...\n";
  }
  else {
    print "Generating $SDFilesNewTextFileName[$Index]...\n";
  }
  # Three-argument open keeps characters in the file name from being
  # interpreted as an open mode. The bareword handles are kept because the
  # Write* subroutines in this file print to them by name.
  if ($OutputSDFileFlag) {
    open NEWSDFILE, ">", $SDFilesNewSDFileName[$Index] or die "Error: Couldn't open $SDFilesNewSDFileName[$Index]: $! \n";
  }
  if ($OutputTextFileFlag) {
    open NEWTEXTFILE, ">", $SDFilesNewTextFileName[$Index] or die "Error: Couldn't open $SDFilesNewTextFileName[$Index]: $! \n";
  }

  # Prepare for mode specific processing....
  @DataLabels = ();
  if ($Options{mode} =~ /^commondatafields$/i) {
    @DataLabels = @{$SDFilesCommonDataFieldLabels[$Index]};
  }
  elsif ($Options{mode} =~ /^datafields$/i) {
    @DataLabels = @SpecifiedDataFieldLabels;
  }
  elsif ($Options{mode} =~ /^molnames$/i) {
    push @DataLabels, "MolName";
  }
  else {
    # All remaining modes report every data field label found in the file.
    @DataLabels = @{$SDFilesAllDataFieldLabels[$Index]};

    if ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist)$/i) {
      # Start each file with a clean slate of not-yet-seen values.
      for $Value (keys %SpecifiedDataFieldValues) {
        $SpecifiedDataFieldValues{$Value} = "NotFound";
      }
      $SpecifiedDataFieldValuesFoundCount = 0;
    }
    elsif ($Options{mode} =~ /^randomcmpds$/i) {
      my($TargetCount, $RandomIndex);
      $CmpdCount = $SDFilesCmpdCount[$Index];
      %RandomCmpdIndexMap = ();
      srand($Options{seed});
      # Pick exactly the requested number of distinct record numbers, capped
      # at the number of compounds in the file. Counting loop cycles instead
      # would pick one record too many and would lose picks to duplicates.
      $TargetCount = ($Options{numofcmpds} < $CmpdCount) ? $Options{numofcmpds} : $CmpdCount;
      while (scalar(keys %RandomCmpdIndexMap) < $TargetCount) {
        $RandomIndex = int(rand $CmpdCount) + 1;
        $RandomCmpdIndexMap{$RandomIndex} = $RandomIndex;
      }
    }
  }

  if ($OutputTextFileFlag) {
    $ColLabelsLine = JoinWords(\@DataLabels, $OutDelim, $OutQuote);
    print NEWTEXTFILE "$ColLabelsLine\n";
  }

  open SDFILE, "<", $SDFile or die "Error: Couldn't open $SDFile: $! \n";
  $CmpdNum = 0;
  CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
    @CmpdLines = split "\n", $CmpdString;
    $CmpdNum++;
    @DataValues = ();

    if ($Options{mode} =~ /^(alldatafields|commondatafields|datafields)$/i) {
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      WriteTextFileCmpdData();
      WriteSDFileCmpdData();
    }
    elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist)$/i) {
      my($CurrentValue, $DistinctValuesCount);
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      if (!exists $DataFieldValues{$SpecifiedDataFieldLabel}) {
        next CMPDSTRING;
      }
      $CurrentValue = $DataFieldValues{$SpecifiedDataFieldLabel};
      if (!exists $SpecifiedDataFieldValues{$CurrentValue}) {
        next CMPDSTRING;
      }
      if ($Options{mode} =~ /^datafieldbylist$/i) {
        # Every matching compound is wanted, so the whole file must be
        # scanned; stopping once each value was seen would drop later matches.
        WriteSDFileCmpdString();
        WriteTextFileCmpdData();
      }
      elsif ($SpecifiedDataFieldValues{$CurrentValue} eq "NotFound") {
        # Unique mode: only the first compound for each value is written...
        $SpecifiedDataFieldValues{$CurrentValue} = "Found";
        $SpecifiedDataFieldValuesFoundCount++;
        WriteSDFileCmpdString();
        WriteTextFileCmpdData();
        # ...and nothing more can match once every distinct value was found.
        $DistinctValuesCount = scalar(keys %SpecifiedDataFieldValues);
        if ($SpecifiedDataFieldValuesFoundCount >= $DistinctValuesCount) {
          last CMPDSTRING;
        }
      }
    }
    elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) {
      my($CurrentValue, $SpecifiedCriterion, $SpecifiedValue, $ViolationCount, $Comparison);
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      $ViolationCount = 0;
      # A data field missing from the compound is not counted as a violation.
      for $Label (@SpecifiedDataFieldLabels) {
        if (!exists $DataFieldValues{$Label}) {
          next;
        }
        $CurrentValue = $DataFieldValues{$Label};
        $SpecifiedCriterion = $SpecifiedDataFieldCriteriaMap{$Label};
        $SpecifiedValue = $SpecifiedDataFieldValuesMap{$Label};
        if ($SpecifiedCriterion =~ /^eq$/i) {
          if ($CurrentValue ne $SpecifiedValue) {
            $ViolationCount++;
          }
        }
        else {
          $Comparison = _CompareDataFieldValues($CurrentValue, $SpecifiedValue);
          if ($SpecifiedCriterion =~ /^le$/i && $Comparison > 0) {
            $ViolationCount++;
          }
          elsif ($SpecifiedCriterion =~ /^ge$/i && $Comparison < 0) {
            $ViolationCount++;
          }
        }
      }
      if ($ViolationCount <= $Options{violations}) {
        WriteSDFileCmpdString();
        WriteTextFileCmpdData();
      }
    }
    elsif ($Options{mode} =~ /^randomcmpds$/i) {
      if (exists $RandomCmpdIndexMap{$CmpdNum}) {
        WriteSDFileCmpdString();
        if ($OutputTextFileFlag) {
          %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
          SetupDataValues();
          WriteTextFileCmpdData();
        }
      }
    }
    elsif ($Options{mode} =~ /^molnames$/i) {
      $MolName = QuoteAWord(ParseCmpdMolNameLine($CmpdLines[0]), $OutQuote);
      print NEWTEXTFILE "$MolName\n";
    }
    elsif ($Options{mode} =~ /^recordnum$/i) {
      if ($CmpdNum == $RecordNum) {
        WriteSDFileCmpdString();
        if ($OutputTextFileFlag) {
          %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
          SetupDataValues();
          WriteTextFileCmpdData();
        }
        last CMPDSTRING;
      }
    }
    elsif ($Options{mode} =~ /^recordrange$/i) {
      if ($CmpdNum > $EndRecordNum) {
        last CMPDSTRING;
      }
      if ($CmpdNum >= $StartRecordNum) {
        WriteSDFileCmpdString();
        if ($OutputTextFileFlag) {
          %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
          SetupDataValues();
          WriteTextFileCmpdData();
        }
      }
    }
  }
  close SDFILE;

  # Buffered write errors only surface at close time, so check them.
  if ($OutputSDFileFlag) {
    close NEWSDFILE or die "Error: Couldn't close $SDFilesNewSDFileName[$Index]: $! \n";
  }
  if ($OutputTextFileFlag) {
    close NEWTEXTFILE or die "Error: Couldn't close $SDFilesNewTextFileName[$Index]: $! \n";
  }
}
print "$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Compare two data field values for the "le" and "ge" criteria. When both
# values look like plain decimal numbers they are compared numerically, so
# that 99 sorts below 100; otherwise they are compared as strings.
#
# Returns -1, 0 or 1 with the same meaning as the <=> and cmp operators.
sub _CompareDataFieldValues {
  my($FirstValue, $SecondValue) = @_;
  my($NumberPattern, $FirstNumber, $SecondNumber);

  $NumberPattern = qr/^\s*([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)\s*$/;
  ($FirstNumber) = $FirstValue =~ $NumberPattern;
  ($SecondNumber) = $SecondValue =~ $NumberPattern;
  if (defined $FirstNumber && defined $SecondNumber) {
    return $FirstNumber <=> $SecondNumber;
  }
  return $FirstValue cmp $SecondValue;
}
# Process options...
sub ProcessOptions {
  # Turn the already validated values in %Options into the file-level
  # settings used by the rest of the script: delimiters and quoting, record
  # number(s), the specified data field labels/values/criteria, output file
  # extensions, output flags and the default output file name suffix.
  # Dies with an error message on inconsistent or missing option values.
  my($ListModes);

  $InDelim = "\,";
  if ($Options{indelim} =~ /^semicolon$/i) {
    $InDelim = "\;";
  }
  elsif ($Options{indelim} =~ /^tab$/i) {
    $InDelim = "\t";
  }

  $OutDelim = "\,";
  if ($Options{outdelim} =~ /^semicolon$/i) {
    $OutDelim = "\;";
  }
  elsif ($Options{outdelim} =~ /^tab$/i) {
    $OutDelim = "\t";
  }

  $OutQuote = ($Options{quote} =~ /^no$/i) ? 0 : 1;

  # Data field driven modes need exactly one source of data field names.
  $ListModes = "datafields, datafieldsbyvalue, datafieldbylist, or datafielduniquebylist";
  if ($Options{mode} =~ /^(datafields|datafieldsbyvalue|datafieldbylist|datafielduniquebylist)$/i) {
    if ($Options{datafields} && $Options{datafieldsfile}) {
      die "Error: For \"-m --mode\" option values $ListModes, specify only one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
    }
    if (!($Options{datafields} || $Options{datafieldsfile})) {
      die "Error: For \"-m --mode\" option values $ListModes, specify one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
    }
  }

  # Record number or record range...
  $RecordNum = 0; $StartRecordNum = 0; $EndRecordNum = 0;
  if ($Options{mode} =~ /^(recordnum|recordrange)$/i) {
    if (!$Options{record}) {
      die "Error: For \"-m --mode\" option values recordnum, or recordrange, specify \"--record\" option value.\n";
    }
    # Trim blanks so that values such as "1, 5" remain acceptable.
    my(@RecordSplit) = map { my $Item = $_; $Item =~ s/^\s+//; $Item =~ s/\s+$//; $Item } split ",", $Options{record};
    if ($Options{mode} =~ /^recordnum$/i) {
      if (@RecordSplit != 1) {
        die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 1 value is allowed.\n";
      }
      $RecordNum = $RecordSplit[0];
      # Require a positive integer; the digit test runs first so that no
      # non-numeric string is ever used in the numeric comparison.
      if ($RecordNum !~ /^\d+$/ || $RecordNum <= 0) {
        die "Error: The value specified, $RecordNum, for option \"--record\" is not valid. Allowed values: > 0 \n";
      }
    }
    else {
      if (@RecordSplit != 2) {
        die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 2 values are allowed.\n";
      }
      ($StartRecordNum, $EndRecordNum) = @RecordSplit;
      if ($StartRecordNum !~ /^\d+$/ || $EndRecordNum !~ /^\d+$/ || $StartRecordNum <= 0 || $EndRecordNum <= 0) {
        die "Error: The value pair specified, $Options{record}, for option \"--record\" is not valid. Allowed values: > 0 \n";
      }
      if ($StartRecordNum > $EndRecordNum) {
        die "Error: Start record number, $StartRecordNum, must be smaller than end record number, $EndRecordNum.\nSpecify different values using \"--record\" option.\n";
      }
    }
  }

  # Specified data fields...
  if ($Options{mode} =~ /^datafields$/i) {
    @SpecifiedDataFieldLabels = _GetSpecifiedDataFieldWords();
  }
  elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) {
    # Values come as "label,value,criterion" triplets.
    my(@DataFieldsByValueTriplets, $TripletIndex);
    @DataFieldsByValueTriplets = _GetSpecifiedDataFieldWords();
    if (@DataFieldsByValueTriplets % 3) {
      if ($Options{datafields}) {
        die "Error: Triplets not found in values specified by \"-d --datafields\" option\n";
      }
      elsif ($Options{datafieldsfile}) {
        die "Error: Triplets not found in values specified by \"--datafieldsfile\" option\n";
      }
    }
    @SpecifiedDataFieldLabels = ();
    %SpecifiedDataFieldValuesMap = ();
    %SpecifiedDataFieldCriteriaMap = ();
    for ($TripletIndex = 0; $TripletIndex < @DataFieldsByValueTriplets; $TripletIndex += 3) {
      my($TripletLabel, $TripletValue, $Criterion) = @DataFieldsByValueTriplets[$TripletIndex .. $TripletIndex + 2];
      if ($Criterion =~ /^(eq|le|ge)$/i) {
        push @SpecifiedDataFieldLabels, $TripletLabel;
        $SpecifiedDataFieldValuesMap{$TripletLabel} = $TripletValue;
        $SpecifiedDataFieldCriteriaMap{$TripletLabel} = $Criterion;
      }
      else {
        warn "Warning: Ignoring triplet value, $TripletLabel $TripletValue $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion\n";
      }
    }
  }
  elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist)$/i) {
    # First word is the data field label; the rest are the values to match.
    my(@DataFieldAndValuesList, $ListValue);
    @DataFieldAndValuesList = _GetSpecifiedDataFieldWords();
    if (@DataFieldAndValuesList < 2) {
      if ($Options{datafields}) {
        die "Error: Invalid number of values specified by \"-d --datafields\" option\n";
      }
      elsif ($Options{datafieldsfile}) {
        die "Error: Invalid number of values specified by \"--datafieldsfile\" option\n";
      }
    }
    $SpecifiedDataFieldLabel = shift @DataFieldAndValuesList;
    %SpecifiedDataFieldValues = ();
    for $ListValue (@DataFieldAndValuesList) {
      $SpecifiedDataFieldValues{$ListValue} = "NotFound";
    }
    # Count distinct values: with duplicates in the list, a raw word count
    # could never be reached by the number of values actually found.
    $SpecifiedDataFieldValuesCount = scalar(keys %SpecifiedDataFieldValues);
  }

  # Output file extensions and flags...
  $SDFileExt = "sdf";
  $TextFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";
  if ($Options{mode} =~ /^(alldatafields|molnames)$/i) {
    # These two modes only make sense as text output.
    $OutputSDFileFlag = 0;
    $OutputTextFileFlag = 1;
  }
  else {
    $OutputSDFileFlag = ($Options{output} =~ /^(SD|both)$/i) ? 1 : 0;
    $OutputTextFileFlag = ($Options{output} =~ /^(text|both)$/i) ? 1 : 0;
  }

  # Suffix appended to the input file name to build default output names.
  # NOTE(review): "DataDields" looks like a typo for "DataFields"; it is kept
  # as is because it determines the names of the files written to disk.
  my(%FileNameModeMap) = (
    "alldatafields" => "AllDataDields",
    "commondatafields" => "CommonDataDields",
    "datafields" => "SpecifiedDataFields",
    "datafieldsbyvalue" => "SpecifiedDataFieldsByValue",
    "datafieldbylist" => "SpecifiedDataField",
    "datafielduniquebylist" => "SpecifiedUniqueDataField",
    "molnames" => "MolName",
    "randomcmpds" => "RandomCmpds",
    "recordnum" => "RecordNum$RecordNum",
    "recordrange" => "RecordNum$StartRecordNum" . "To" . "$EndRecordNum",
  );
  $FileNameMode = $FileNameModeMap{lc $Options{mode}};
}

# Return the list of words specified through the "-d --datafields" option or,
# failing that, read from the file named by the "--datafieldsfile" option.
sub _GetSpecifiedDataFieldWords {
  my($Line, @Words, @AllWords);

  if ($Options{datafields}) {
    return split "$InDelim", $Options{datafields};
  }
  @AllWords = ();
  if ($Options{datafieldsfile}) {
    open DATAFIELDSFILE, "<", $Options{datafieldsfile} or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
    while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
      @Words = quotewords($InDelim, 0, $Line);
      if (@Words) {
        push @AllWords, @Words;
      }
    }
    close DATAFIELDSFILE;
  }
  return @AllWords;
}

# Retrieve information about input SD files...
sub RetrieveSDFilesInfo {
  # For every file in @SDFilesList decide whether it can be processed and
  # record, at the same index in the @SDFiles* arrays: the okay flag, the
  # compound count, the output file names, and the data field labels (all of
  # them, and those common to all compounds). Files that can't be processed
  # are reported with a warning and left flagged as not okay.
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $NewFileName, $NewSDFileName, $NewTextFileName, $CmpdCount);

  @SDFilesOkay = ();
  @SDFilesCmpdCount = ();
  @SDFilesNewTextFileName = ();
  @SDFilesNewSDFileName = ();

  @SDFilesAllDataFieldLabels = ();
  @SDFilesCommonDataFieldLabels = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];
    $SDFilesOkay[$Index] = 0;
    $SDFilesCmpdCount[$Index] = 0;
    $SDFilesNewTextFileName[$Index] = "";
    $SDFilesNewSDFileName[$Index] = "";

    @{$SDFilesAllDataFieldLabels[$Index]} = ();
    @{$SDFilesCommonDataFieldLabels[$Index]} = ();

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Generate appropriate name for the new output file.
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    $NewFileName = $FileName . "$FileNameMode";
    # A user supplied root name is only honored for a single input file.
    if ($Options{root} && (@SDFilesList == 1)) {
      my($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      if ($RootFileName && $RootFileExt) {
        $NewFileName = $RootFileName;
      }
      else {
        $NewFileName = $Options{root};
      }
    }
    $NewSDFileName = $NewFileName . ".$SDFileExt";
    $NewTextFileName = $NewFileName . ".$TextFileExt";

    if ($OutputSDFileFlag && lc($NewSDFileName) eq lc($SDFile)) {
      warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
      print "Specify a different name using \"-r --root\" option or use default name.\n";
      next FILELIST;
    }
    if (!$Options{overwrite}) {
      if ($OutputSDFileFlag && -e $NewSDFileName) {
        warn "Warning: Ignoring file $SDFile: New file, $NewSDFileName, already exists\n";
        next FILELIST;
      }
      if ($OutputTextFileFlag && -e $NewTextFileName) {
        warn "Warning: Ignoring file $SDFile: New file, $NewTextFileName, already exists\n";
        next FILELIST;
      }
    }
    if (!open SDFILE, "<", $SDFile) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    my($CountCmpdsFlag, $CollectDataFieldsFlag, $FirstCmpdFlag);
    my($CmpdString, @CmpdLines, @DataFieldLabels, %DataFieldLabelsMap, @CommonDataFieldLabels);

    $CountCmpdsFlag = ($Options{mode} =~ /^(randomcmpds|recordnum|recordrange)$/i) ? 1 : 0;
    # Labels are needed for the column header of a text file. The
    # commondatafields mode needs them for SD output as well, as they decide
    # which data fields get written to the new SD file.
    $CollectDataFieldsFlag = ($Options{mode} =~ /^commondatafields$/i || ($OutputTextFileFlag && $Options{mode} =~ /^(alldatafields|randomcmpds|datafieldsbyvalue|datafieldbylist|datafielduniquebylist|recordrange)$/i)) ? 1 : 0;

    $CmpdCount = 0;
    @DataFieldLabels = ();
    @CommonDataFieldLabels = ();
    %DataFieldLabelsMap = ();
    if ($CountCmpdsFlag || $CollectDataFieldsFlag) {
      $FirstCmpdFlag = 1;
      CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
        $CmpdCount++;
        if ($Options{mode} =~ /^recordnum$/i) {
          # Only the labels of the requested record matter in this mode.
          if ($CmpdCount == $RecordNum) {
            @CmpdLines = split "\n", $CmpdString;
            @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
            last CMPDSTRING;
          }
        }
        if (!$CollectDataFieldsFlag) {
          next CMPDSTRING;
        }
        # Process compound data header labels and figure out which ones are
        # present for all the compounds...
        my($Label, @CmpdDataFieldLabels, %CmpdDataFieldLabelsMap);
        @CmpdLines = split "\n", $CmpdString;
        @CmpdDataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
        if ($FirstCmpdFlag) {
          # The first compound supplies the initial label set. An explicit
          # flag is used because that compound may have no labels at all.
          $FirstCmpdFlag = 0;
          @DataFieldLabels = @CmpdDataFieldLabels;
          for $Label (@DataFieldLabels) {
            $DataFieldLabelsMap{$Label} = "PresentInAll";
          }
          next CMPDSTRING;
        }
        %CmpdDataFieldLabelsMap = ();
        for $Label (@CmpdDataFieldLabels) {
          $CmpdDataFieldLabelsMap{$Label} = 1;
        }
        # Known labels missing from this compound are no longer common...
        for $Label (@DataFieldLabels) {
          if (!exists $CmpdDataFieldLabelsMap{$Label}) {
            $DataFieldLabelsMap{$Label} = "PresentInSome";
          }
        }
        # ...and labels first seen here can't be common either.
        for $Label (@CmpdDataFieldLabels) {
          if (!exists $DataFieldLabelsMap{$Label}) {
            push @DataFieldLabels, $Label;
            $DataFieldLabelsMap{$Label} = "PresentInSome";
          }
        }
      }
      # Identify the common data field labels...
      if ($CollectDataFieldsFlag && $Options{mode} =~ /^commondatafields$/i) {
        @CommonDataFieldLabels = grep { $DataFieldLabelsMap{$_} eq "PresentInAll" } @DataFieldLabels;
      }
    }
    # Done reading; close now so no exit path below leaves the handle open.
    close SDFILE;

    if ($Options{mode} =~ /^recordnum$/i) {
      if ($RecordNum > $CmpdCount) {
        warn "Warning: Ignoring file $SDFile: The record specified, $RecordNum, using option \"--record\" doesn't exist \n";
        next FILELIST;
      }
    }
    elsif ($Options{mode} =~ /^recordrange$/i) {
      if ($StartRecordNum > $CmpdCount && $EndRecordNum > $CmpdCount) {
        warn "Warning: Ignoring file $SDFile: The record range specified, $StartRecordNum to $EndRecordNum, using option \"--record\" doesn't exist \n";
        next FILELIST;
      }
    }

    $SDFilesOkay[$Index] = 1;
    $SDFilesNewTextFileName[$Index] = "$NewTextFileName";
    $SDFilesNewSDFileName[$Index] = "$NewSDFileName";

    $SDFilesCmpdCount[$Index] = $CmpdCount;
    push @{$SDFilesAllDataFieldLabels[$Index]}, @DataFieldLabels;
    push @{$SDFilesCommonDataFieldLabels[$Index]}, @CommonDataFieldLabels;
  }
}

# Setup values for data fields...
sub SetupDataValues {
  # Fill @DataValues with the current compound's value for each label in
  # @DataLabels, in the same order; missing data fields get an empty string.
  @DataValues = map { exists $DataFieldValues{$_} ? $DataFieldValues{$_} : "" } @DataLabels;
}

# Write out structure data and specific data fields to SD file...
sub WriteSDFileCmpdData {
  # When SD output is enabled, write the structure part of the current
  # compound ($CmpdString up to and including its molfile terminator line)
  # followed by only the data fields listed in @DataLabels, using the values
  # in @DataValues, and the "$$$$" record delimiter.
  my($MolString, $Count);

  if (!$OutputSDFileFlag) {
    return;
  }
  # The molfile block ends with a "M  END" line (two spaces). Matching one or
  # more spaces also accepts files written with a single space, while the
  # terminator written out is always the standard one.
  ($MolString) = split /^M +END/m, $CmpdString;
  $MolString .= "M  END";
  print NEWSDFILE "$MolString\n";
  for $Count (0 .. $#DataLabels) {
    print NEWSDFILE ">  <$DataLabels[$Count]>\n$DataValues[$Count]\n\n";
  }
  print NEWSDFILE "\$\$\$\$\n";
}

# Write out compound string...
sub WriteSDFileCmpdString {
  # When SD output is enabled, copy the current compound record unchanged.
  if ($OutputSDFileFlag) {
    print NEWSDFILE "$CmpdString\n";
  }
}

# Write out data for text file...
sub WriteTextFileCmpdData {
  # When text output is enabled, write the values in @DataValues as one
  # delimited, optionally quoted, line.
  if (!$OutputTextFileFlag) {
    return;
  }
  $DataValuesLine = JoinWords(\@DataValues, $OutDelim, $OutQuote);
  # Keep data values spanning multiple lines on one text line by replacing
  # each embedded new line with a space...
  $DataValuesLine =~ s/\n/ /g;
  print NEWTEXTFILE "$DataValuesLine\n";
}

# Setup script usage and retrieve command line arguments specified using various options...
sub SetupScriptUsage {
  # Fill %Options with defaults, overlay the command line options, change to
  # the working directory when one is specified, and die with an error
  # message on any option value that is not allowed.

  # Retrieve all the options...
  %Options = (
    "numofcmpds" => 1,
    "mode" => "alldatafields",
    "indelim" => "comma",
    "outdelim" => "comma",
    "output" => "SD",
    "quote" => "yes",
    "violations" => 0,
    "seed" => 999999999,
  );
  if (!GetOptions(\%Options, "help|h", "datafields|d=s", "datafieldsfile=s", "indelim=s", "mode|m=s", "numofcmpds|n=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "record=s", "root|r=s", "seed|s=i", "violations|v=i", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{numofcmpds} < 1) {
    die "Error: The value specified, $Options{numofcmpds}, for option \"-n --numofcmpds\" is not valid. Allowed values: >= 1 \n";
  }
  if ($Options{violations} < 0) {
    die "Error: The value specified, $Options{violations}, for option \"-v --violations\" is not valid. Allowed values: >= 0 \n";
  }
  if ($Options{mode} !~ /(^(alldatafields|commondatafields|datafields|datafieldsbyvalue|datafieldbylist|datafielduniquebylist|molnames|randomcmpds|recordnum|recordrange)$)/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, molnames, randomcmpds, recordnum, recordrange\n";
  }
  if ($Options{output} !~ /(^(SD|text|both)$)/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
}