1 package SDFileUtil; 2 # 3 # $RCSfile: SDFileUtil.pm,v $ 4 # $Date: 2011/12/26 21:53:29 $ 5 # $Revision: 1.41 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2004-2012 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use Exporter; 31 use Carp; 32 use PeriodicTable qw(IsElement); 33 use TimeUtil (); 34 35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 36 37 @ISA = qw(Exporter); 38 @EXPORT = qw(GenerateCmpdAtomLine GenerateCmpdBondLine GenerateCmpdChargePropertyLines GenerateCmpdCommentsLine GenerateCmpdCountsLine GenerateCmpdAtomAliasPropertyLines GenerateCmpdIsotopePropertyLines GenerateCmpdDataHeaderLabelsAndValuesLines GenerateCmpdMiscInfoLine GenerateCmpdRadicalPropertyLines GenerateCmpdMolNameLine GenerateEmptyCtabBlockLines GenerateMiscLineDateStamp GetAllAndCommonCmpdDataHeaderLabels GetCmpdDataHeaderLabels GetCmpdDataHeaderLabelsAndValues GetCmpdFragments GetCtabLinesCount GetUnknownAtoms GetInvalidAtomNumbers MDLChargeToInternalCharge InternalChargeToMDLCharge MDLBondTypeToInternalBondOrder InternalBondOrderToMDLBondType InternalSpinMultiplicityToMDLRadical MDLRadicalToInternalSpinMultiplicity IsCmpd3D IsCmpd2D ParseCmpdAtomLine ParseCmpdBondLine ParseCmpdCommentsLine ParseCmpdCountsLine ParseCmpdMiscInfoLine ParseCmpdMolNameLine ParseCmpdAtomAliasPropertyLine ParseCmpdChargePropertyLine ParseCmpdIsotopePropertyLine ParseCmpdRadicalPropertyLine ReadCmpdString RemoveCmpdDataHeaderLabelAndValue WashCmpd); 39 @EXPORT_OK = qw(); 40 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 41 42 # Format data for compounds count line... 43 sub GenerateCmpdCountsLine { 44 my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version, $Line); 45 46 if (@_ == 5) { 47 ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = @_; 48 } 49 elsif (@_ == 3) { 50 ($AtomCount, $BondCount, $ChiralFlag) = @_; 51 $PropertyCount = 999; 52 $Version = "V2000"; 53 } 54 else { 55 ($AtomCount, $BondCount) = @_; 56 $ChiralFlag = 0; 57 $PropertyCount = 999; 58 $Version = "V2000"; 59 } 60 if ($AtomCount > 999) { 61 croak "Error: SDFileUtil::GenerateCmpdCountsLine: The atom count, $AtomCount, exceeds maximum of 999 allowed for CTAB version 2000. The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools..."; 62 } 63 $Line = sprintf "%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%6s", $AtomCount, $BondCount, 0, 0, $ChiralFlag, 0, 0, 0, 0, 0, $PropertyCount, $Version; 64 65 return ($Line); 66 } 67 68 # Generate comments line... 69 sub GenerateCmpdCommentsLine { 70 my($Comments) = @_; 71 my($Line); 72 73 $Line = (length($Comments) > 80) ? substr($Comments, 0, 80) : $Comments; 74 75 return $Line; 76 } 77 78 # Generate molname line... 79 sub GenerateCmpdMolNameLine { 80 my($MolName) = @_; 81 my($Line); 82 83 $Line = (length($MolName) > 80) ? substr($MolName, 0, 80) : $MolName; 84 85 return $Line; 86 } 87 88 # Generate data for compounds misc info line... 89 sub GenerateCmpdMiscInfoLine { 90 my($ProgramName, $UserInitial, $Code) = @_; 91 my($Date, $Line); 92 93 if (!(defined($ProgramName) && $ProgramName)) { 94 $ProgramName = "MayaChem"; 95 } 96 if (!(defined($UserInitial) && $UserInitial)) { 97 $UserInitial = " "; 98 } 99 if (!(defined($Code) && $Code)) { 100 $Code = "2D"; 101 } 102 103 if (length($ProgramName) > 8) { 104 $ProgramName = substr($ProgramName, 0, 8); 105 } 106 if (length($UserInitial) > 2) { 107 $UserInitial = substr($UserInitial, 0, 2); 108 } 109 if (length($Code) > 2) { 110 $Code = substr($Code, 0, 2); 111 } 112 $Date = GenerateMiscLineDateStamp(); 113 114 $Line = "${UserInitial}${ProgramName}${Date}${Code}"; 115 116 return $Line; 117 } 118 119 # Generate data for compounds misc info line... 120 sub GenerateEmptyCtabBlockLines { 121 my($Date, $Lines); 122 123 if (@_ == 1) { 124 ($Date) = @_; 125 } 126 else { 127 $Date = GenerateMiscLineDateStamp(); 128 } 129 # First line: Blank molname line... 130 # Second line: Misc info... 131 # Third line: Blank comments line... 132 # Fourth line: Counts line reflecting empty structure data block... 133 $Lines = "\n"; 134 $Lines .= " MayaChem${Date}2D\n"; 135 $Lines .= "\n"; 136 $Lines .= GenerateCmpdCountsLine(0, 0, 0) . "\n"; 137 $Lines .= "M END"; 138 139 return $Lines; 140 } 141 142 # Generate SD file data stamp... 143 sub GenerateMiscLineDateStamp { 144 return TimeUtil::SDFileTimeStamp(); 145 } 146 147 # Generate data for compound atom line... 148 # 149 sub GenerateCmpdAtomLine { 150 my($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity) = @_; 151 my($Line); 152 153 if (!defined $MassDifference) { 154 $MassDifference = 0; 155 } 156 if (!defined $Charge) { 157 $Charge = 0; 158 } 159 if (!defined $StereoParity) { 160 $StereoParity = 0; 161 } 162 $Line = sprintf "%10.4f%10.4f%10.4f %-3s%2i%3i%3i 0 0 0 0 0 0 0 0 0", $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity; 163 164 return $Line 165 } 166 167 # Generate data for compound bond line... 168 # 169 sub GenerateCmpdBondLine { 170 my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = @_; 171 my($Line); 172 173 if (!defined $BondStereo) { 174 $BondStereo = 0; 175 } 176 $Line = sprintf "%3i%3i%3i%3i 0 0 0", $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo; 177 178 return $Line 179 } 180 181 # Generate charge property lines for CTAB block... 182 # 183 sub GenerateCmpdChargePropertyLines { 184 my($ChargeValuePairsRef) = @_; 185 186 return _GenerateCmpdGenericPropertyLines('Charge', $ChargeValuePairsRef); 187 } 188 189 # Generate isotope property lines for CTAB block... 190 # 191 sub GenerateCmpdIsotopePropertyLines { 192 my($IsotopeValuePairsRef) = @_; 193 194 return _GenerateCmpdGenericPropertyLines('Isotope', $IsotopeValuePairsRef); 195 } 196 197 # Generate radical property line property lines for CTAB block... 198 # 199 sub GenerateCmpdRadicalPropertyLines { 200 my($RadicalValuePairsRef) = @_; 201 202 return _GenerateCmpdGenericPropertyLines('Radical', $RadicalValuePairsRef); 203 } 204 205 # Generate atom alias property line property lines for CTAB block... 206 # 207 # Atom alias property line format: 208 # 209 # A aaa 210 # x... 211 # 212 # aaa: Atom number 213 # x: Atom alias in next line 214 # 215 sub GenerateCmpdAtomAliasPropertyLines { 216 my($PropertyValuePairsRef) = @_; 217 my($Index, $AtomNum, $AtomAlias, $Line, @PropertyLines); 218 219 @PropertyLines = (); 220 221 for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) { 222 $AtomNum = $PropertyValuePairsRef->[$Index]; 223 $AtomAlias = $PropertyValuePairsRef->[$Index + 1]; 224 225 $Line = "A " . sprintf "%3i", $AtomNum; 226 227 push @PropertyLines, $Line; 228 push @PropertyLines, $AtomAlias; 229 } 230 231 return @PropertyLines; 232 } 233 234 # Generate data header labels and values lines... 235 # 236 sub GenerateCmpdDataHeaderLabelsAndValuesLines { 237 my($DataHeaderLabelsRef, $DataHeaderLabelsAndValuesRef, $SortDataLabels) = @_; 238 my($DataLabel, $DataValue, @DataLabels, @DataLines); 239 240 if (!defined $SortDataLabels) { 241 $SortDataLabels = 0; 242 } 243 244 @DataLines = (); 245 @DataLabels = (); 246 if ($SortDataLabels) { 247 push @DataLabels, sort @{$DataHeaderLabelsRef}; 248 } 249 else { 250 push @DataLabels, @{$DataHeaderLabelsRef}; 251 } 252 for $DataLabel (@DataLabels) { 253 $DataValue = ''; 254 if (exists $DataHeaderLabelsAndValuesRef->{$DataLabel}) { 255 $DataValue = $DataHeaderLabelsAndValuesRef->{$DataLabel}; 256 } 257 push @DataLines, ("> <${DataLabel}>", "$DataValue", ""); 258 } 259 return @DataLines; 260 } 261 262 # Parse data field header in SD file and return lists of all and common data field 263 # labels. 264 sub GetAllAndCommonCmpdDataHeaderLabels { 265 my($SDFileRef) = @_; 266 my($CmpdCount, $CmpdString, $Label, @CmpdLines, @DataFieldLabels, @CommonDataFieldLabels, %DataFieldLabelsMap); 267 268 $CmpdCount = 0; 269 @DataFieldLabels = (); 270 @CommonDataFieldLabels = (); 271 %DataFieldLabelsMap = (); 272 273 while ($CmpdString = ReadCmpdString($SDFileRef)) { 274 $CmpdCount++; 275 @CmpdLines = split "\n", $CmpdString; 276 # Process compound data header labels and figure out which ones are present for 277 # all the compounds... 278 if (@DataFieldLabels) { 279 my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines); 280 my(%CmpdDataFieldLabelsMap) = (); 281 # Setup a map for the current labels... 282 for $Label (@CmpdDataFieldLabels) { 283 $CmpdDataFieldLabelsMap{$Label} = "PresentInSome"; 284 } 285 # Check the presence old labels for this compound; otherwise, mark 'em new... 286 for $Label (@DataFieldLabels) { 287 if (!$CmpdDataFieldLabelsMap{$Label}) { 288 $DataFieldLabelsMap{$Label} = "PresentInSome"; 289 } 290 } 291 # Check the presence this compound in the old labels; otherwise, add 'em... 292 for $Label (@CmpdDataFieldLabels ) { 293 if (!$DataFieldLabelsMap{$Label}) { 294 # It's a new label... 295 push @DataFieldLabels, $Label; 296 $DataFieldLabelsMap{$Label} = "PresentInSome"; 297 } 298 } 299 } 300 else { 301 # Get the initial label set and set up a map... 302 @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines); 303 for $Label (@DataFieldLabels) { 304 $DataFieldLabelsMap{$Label} = "PresentInAll"; 305 } 306 } 307 } 308 # Identify the common data field labels... 309 @CommonDataFieldLabels = (); 310 for $Label (@DataFieldLabels) { 311 if ($DataFieldLabelsMap{$Label} eq "PresentInAll") { 312 push @CommonDataFieldLabels, $Label; 313 } 314 } 315 return ($CmpdCount, \@DataFieldLabels, \@CommonDataFieldLabels); 316 } 317 318 # Parse all the data header labels and return 'em as an list... 319 # 320 # Format: 321 # 322 #> Data header line 323 #Data line(s) 324 #Blank line 325 # 326 # [Data Header] (one line) precedes each item of data, starts with a greater than (>) sign, and 327 # contains at least one of the following: 328 # The field name enclosed in angle brackets. For example: <melting.point> 329 # The field number, DTn , where n represents the number assigned to the field in a MACCS-II database 330 # 331 #Optional information for the data header includes: 332 # The compound’s external and internal registry numbers. External registry numbers must be enclosed in parentheses. 333 # Any combination of information 334 # 335 #The following are examples of valid data headers: 336 #> <MELTING.POINT> 337 #> 55 (MD-08974) <BOILING.POINT> DT12 338 #> DT12 55 339 #> (MD-0894) <BOILING.POINT> FROM ARCHIVES 340 # 341 #Notes: Sometimes last blank line is missing and can be just followed by $$$$ 342 # 343 sub GetCmpdDataHeaderLabels { 344 my($CmpdLines) = @_; 345 my($CmpdLine, $Label, @Labels); 346 347 @Labels = (); 348 CMPDLINE: for $CmpdLine (@$CmpdLines) { 349 if ($CmpdLine !~ /^>/) { 350 next CMPDLINE; 351 } 352 # Does the line contains field name enclosed in angular brackets? 353 ($Label) = $CmpdLine =~ /<.*?>/g; 354 if (!defined($Label)) { 355 next CMPDLINE; 356 } 357 $Label =~ s/(<|>)//g; 358 push @Labels, $Label; 359 } 360 return (@Labels); 361 } 362 363 # Parse all the data header labels and values 364 sub GetCmpdDataHeaderLabelsAndValues { 365 my($CmpdLines) = @_; 366 my($CmpdLine, $CurrentLabel, $Label, $Value, $ValueCount, $ProcessingLabelData, @Values, %DataFields); 367 368 %DataFields = (); 369 $ProcessingLabelData = 0; 370 $ValueCount = 0; 371 CMPDLINE: for $CmpdLine (@$CmpdLines) { 372 if ($CmpdLine =~ /^\$\$\$\$/) { 373 last CMPDLINE; 374 } 375 if ($CmpdLine =~ /^>/) { 376 # Does the line contains field name enclosed in angular brackets? 377 ($Label) = $CmpdLine =~ /<.*?>/g; 378 if (defined $Label) { 379 $CurrentLabel = $Label; 380 $CurrentLabel =~ s/(<|>)//g; 381 $ProcessingLabelData = 0; 382 $ValueCount = 0; 383 384 if ($CurrentLabel) { 385 $ProcessingLabelData = 1; 386 $DataFields{$CurrentLabel} = ''; 387 next CMPDLINE; 388 } 389 } 390 else { 391 if (!$ProcessingLabelData) { 392 # Data line containing no <label> as allowed by SDF format. Just ignore it... 393 next CMPDLINE; 394 } 395 } 396 } 397 if (!$ProcessingLabelData) { 398 next CMPDLINE; 399 } 400 if (!(defined($CmpdLine) && length($CmpdLine))) { 401 # Blank line terminates value for a label... 402 $CurrentLabel = ''; 403 $ValueCount = 0; 404 $ProcessingLabelData = 0; 405 next CMPDLINE; 406 } 407 $ValueCount++; 408 $Value = $CmpdLine; 409 410 if ($ValueCount > 1) { 411 $DataFields{$CurrentLabel} .= "\n" . $Value; 412 } 413 else { 414 $DataFields{$CurrentLabel} = $Value; 415 } 416 } 417 return (%DataFields); 418 } 419 420 # Return an updated compoud string after removing data header label along with its 421 # value from the specified compound string... 422 # 423 sub RemoveCmpdDataHeaderLabelAndValue { 424 my($CmpdString, $DataHeaderLabel) = @_; 425 my($Line, $PorcessingDataHeaderLabel, @CmpdLines); 426 427 @CmpdLines = (); 428 $PorcessingDataHeaderLabel = 0; 429 430 CMPDLINE: for $Line (split "\n", $CmpdString) { 431 if ($Line =~ /^>/ && $Line =~ /<$DataHeaderLabel>/i) { 432 $PorcessingDataHeaderLabel = 1; 433 next CMPDLINE; 434 } 435 436 if ($PorcessingDataHeaderLabel) { 437 # Blank line indicates end of fingerprints data value... 438 if ($Line =~ /^\$\$\$\$/) { 439 push @CmpdLines, $Line; 440 $PorcessingDataHeaderLabel = 0; 441 } 442 elsif (!length($Line)) { 443 $PorcessingDataHeaderLabel = 0; 444 } 445 next CMPDLINE; 446 } 447 448 # Track compound lines without fingerprints data... 449 push @CmpdLines, $Line; 450 } 451 452 return join "\n", @CmpdLines; 453 } 454 455 # 456 # Using bond blocks, figure out the number of disconnected fragments and 457 # return their values along with the atom numbers in a string delimited by new 458 # line character. 459 # 460 sub GetCmpdFragments { 461 my($CmpdLines) = @_; 462 my($AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, @AtomConnections, $BondType, $FragmentString, $FragmentCount, $LineIndex, $Index, $AtomNum, $NbrAtomNum, @ProcessedAtoms, $ProcessedAtomCount, $ProcessAtomNum, @ProcessingAtoms, @ConnectedAtoms, %Fragments, $FragmentNum, $AFragmentString); 463 464 # Setup the connection table for each atom... 465 @AtomConnections = (); 466 ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]); 467 for $AtomNum (1 .. $AtomCount) { 468 %{$AtomConnections[$AtomNum]} = (); 469 } 470 for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) { 471 ($FirstAtomNum, $SecondAtomNum, $BondType) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]); 472 if (!$AtomConnections[$FirstAtomNum]{$SecondAtomNum}) { 473 $AtomConnections[$FirstAtomNum]{$SecondAtomNum} = $BondType; 474 } 475 if (!$AtomConnections[$SecondAtomNum]{$FirstAtomNum}) { 476 $AtomConnections[$SecondAtomNum]{$FirstAtomNum} = $BondType; 477 } 478 } 479 480 #Get set to count fragments... 481 $ProcessedAtomCount = 0; 482 $FragmentNum = 0; 483 %Fragments = (); 484 @ProcessedAtoms = (); 485 for $AtomNum (1 .. $AtomCount) { 486 $ProcessedAtoms[$AtomNum] = 0; 487 } 488 while ($ProcessedAtomCount < $AtomCount) { 489 @ProcessingAtoms = (); 490 @ConnectedAtoms = (); 491 ATOMNUM: for $AtomNum (1 .. $AtomCount) { 492 if (!$ProcessedAtoms[$AtomNum]) { 493 $ProcessedAtomCount++; 494 $ProcessedAtoms[$AtomNum] = 1; 495 push @ProcessingAtoms, $AtomNum; 496 $FragmentNum++; 497 @{$Fragments{$FragmentNum} } = (); 498 push @{$Fragments{$FragmentNum} }, $AtomNum; 499 last ATOMNUM; 500 } 501 } 502 503 # Go over the neighbors and follow the connection trail while collecting the 504 # atoms numbers present in the connected fragment... 505 while (@ProcessingAtoms) { 506 for ($Index = 0; $Index < @ProcessingAtoms; $Index++) { 507 $ProcessAtomNum = $ProcessingAtoms[$Index]; 508 for $NbrAtomNum (keys %{$AtomConnections[$ProcessAtomNum]}) { 509 if (!$ProcessedAtoms[$NbrAtomNum]) { 510 $ProcessedAtomCount++; 511 $ProcessedAtoms[$NbrAtomNum] = 1; 512 push @ConnectedAtoms, $NbrAtomNum; 513 push @{ $Fragments{$FragmentNum} }, $NbrAtomNum; 514 } 515 } 516 } 517 @ProcessingAtoms = (); 518 @ProcessingAtoms = @ConnectedAtoms; 519 @ConnectedAtoms = (); 520 } 521 } 522 $FragmentCount = $FragmentNum; 523 $FragmentString = ""; 524 525 # Sort out the fragments by size... 526 for $FragmentNum (sort { @{$Fragments{$b}} <=> @{$Fragments{$a}} } keys %Fragments ) { 527 # Sort the atoms in a fragment by their numbers... 528 $AFragmentString = join " ", sort { $a <=> $b } @{ $Fragments{$FragmentNum} }; 529 if ($FragmentString) { 530 $FragmentString .= "\n" . $AFragmentString; 531 } 532 else { 533 $FragmentString = $AFragmentString; 534 } 535 } 536 return ($FragmentCount, $FragmentString); 537 } 538 539 # Count number of lines present in between 4th and line containg "M END" 540 sub GetCtabLinesCount { 541 my($CmpdLines) = @_; 542 my($LineIndex, $CtabLinesCount); 543 544 $CtabLinesCount = 0; 545 LINE: for ($LineIndex = 4; $LineIndex < @$CmpdLines; $LineIndex++) { 546 # 547 # Any line after atom and bond data starting with anything other than space or 548 # a digit indicates end of Ctab atom/bond data block... 549 # 550 if (@$CmpdLines[$LineIndex] !~ /^[0-9 ]/) { 551 $CtabLinesCount = $LineIndex - 4; 552 last LINE; 553 } 554 } 555 return $CtabLinesCount; 556 } 557 558 # Using atom blocks, count the number of atoms which contain special element 559 # symbols not present in the periodic table. 560 sub GetUnknownAtoms { 561 my($CmpdLines) = @_; 562 my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines, $LineIndex, $AtomCount, $AtomSymbol); 563 564 $UnknownAtomCount = 0; 565 $UnknownAtoms = ""; 566 $UnknownAtomLines = ""; 567 ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]); 568 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) { 569 ($AtomSymbol) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]); 570 if (!IsElement($AtomSymbol)) { 571 $UnknownAtomCount++; 572 $UnknownAtoms .= " $AtomSymbol"; 573 if ($UnknownAtomLines) { 574 $UnknownAtomLines .= "\n" . @$CmpdLines[$LineIndex]; 575 } 576 else { 577 $UnknownAtomLines = @$CmpdLines[$LineIndex]; 578 } 579 } 580 } 581 return ($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines); 582 } 583 584 # Check z coordinates of all atoms to see whether any of them is non-zero 585 # which makes the compound geometry three dimensional... 586 # 587 sub IsCmpd3D { 588 my($CmpdLines) = @_; 589 my($LineIndex, $AtomCount, $AtomSymbol, $AtomX, $AtomY, $AtomZ); 590 591 ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]); 592 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) { 593 ($AtomSymbol, $AtomX, $AtomY, $AtomZ) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]); 594 if ($AtomZ != 0) { 595 return 1; 596 } 597 } 598 return 0; 599 } 600 601 # Check whether it's a 2D compound... 602 # 603 sub IsCmpd2D { 604 my($CmpdLines) = @_; 605 606 return IsCmpd3D($CmpdLines) ? 0 : 1; 607 } 608 609 # Using bond blocks, count the number of bond lines which contain atom numbers 610 # greater than atom count specified in compound count line... 611 # 612 sub GetInvalidAtomNumbers { 613 my($CmpdLines) = @_; 614 my($LineIndex, $AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, $InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines, $Line, $InvalidAtomPropertyLine, $ValuePairIndex, $AtomNum, $Value, @ValuePairs); 615 616 ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]); 617 618 $InvalidAtomNumbersCount = 0; 619 $InvalidAtomNumbers = ""; 620 $InvalidAtomNumberLines = ""; 621 622 # Go over bond block lines... 623 LINE: for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) { 624 ($FirstAtomNum, $SecondAtomNum) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]); 625 if ($FirstAtomNum <= $AtomCount && $SecondAtomNum <= $AtomCount) { 626 next LINE; 627 } 628 if ($FirstAtomNum > $AtomCount) { 629 $InvalidAtomNumbersCount++; 630 $InvalidAtomNumbers .= " $FirstAtomNum"; 631 } 632 if ($SecondAtomNum > $AtomCount) { 633 $InvalidAtomNumbersCount++; 634 $InvalidAtomNumbers .= " $SecondAtomNum"; 635 } 636 if ($InvalidAtomNumberLines) { 637 $InvalidAtomNumberLines .= "\n" . @$CmpdLines[$LineIndex]; 638 } 639 else { 640 $InvalidAtomNumberLines = @$CmpdLines[$LineIndex]; 641 } 642 } 643 # Go over property lines before M END... 644 # 645 LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) { 646 $Line = @$CmpdLines[$LineIndex]; 647 @ValuePairs = (); 648 if ($Line =~ /^M END/i) { 649 last LINE; 650 } 651 @ValuePairs = (); 652 if ($Line =~ /^M CHG/i) { 653 @ValuePairs = ParseCmpdChargePropertyLine($Line); 654 } 655 elsif ($Line =~ /^M RAD/i) { 656 @ValuePairs = ParseCmpdRadicalPropertyLine($Line); 657 } 658 elsif ($Line =~ /^M ISO/i) { 659 @ValuePairs = ParseCmpdIsotopePropertyLine($Line); 660 } 661 elsif ($Line =~ /^A /i) { 662 my($NextLine); 663 $LineIndex++; 664 $NextLine = @$CmpdLines[$LineIndex]; 665 @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine); 666 } 667 else { 668 next LINE; 669 } 670 671 $InvalidAtomPropertyLine = 0; 672 for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) { 673 $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1]; 674 if ($AtomNum > $AtomCount) { 675 $InvalidAtomPropertyLine = 1; 676 $InvalidAtomNumbersCount++; 677 $InvalidAtomNumbers .= " $AtomNum"; 678 } 679 } 680 if ($InvalidAtomPropertyLine) { 681 if ($InvalidAtomNumberLines) { 682 $InvalidAtomNumberLines .= "\n" . $Line; 683 } 684 else { 685 $InvalidAtomNumberLines = $Line; 686 } 687 } 688 } 689 690 return ($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines); 691 } 692 693 # Ctab lines: Atom block 694 # 695 # Format: xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee 696 # A10 A10 A10 xA3 A2A3 A3 A3 A3 A3 A3 A3 A3 A3 A3 A3 697 # x,y,z: Atom coordinates 698 # aaa: Atom symbol. Entry in periodic table or L for atom list, A, Q, * for unspecified 699 # atom, and LP for lone pair, or R# for Rgroup label 700 # dd: Mass difference. -3, -2, -1, 0, 1, 2, 3, 4 (0 for value beyond these limits) 701 # ccc: Charge. 0 = uncharged or value other than these, 1 = +3, 2 = +2, 3 = +1, 702 # 4 = doublet radical, 5 = -1, 6 = -2, 7 = -3 703 # sss: Atom stereo parity. 0 = not stereo, 1 = odd, 2 = even, 3 = either or unmarked stereo center 704 # hhh: Hydrogen count + 1. 1 = H0, 2 = H1, 3 = H2, 4 = H3, 5 = H4 705 # bbb: Stereo care box. 0 = ignore stereo configuration of this double bond atom, 1 = stereo 706 # configuration of double bond atom must match 707 # vvv: Valence. 0 = no marking (default)(1 to 14) = (1 to 14) 15 = zero valence 708 # HHH: H0 designator. 0 = not specified, 1 = no H atoms allowed (redundant due to hhh) 709 # rrr: Not used 710 # iii: Not used 711 # mmm: Atom-atom mapping number. 1 - number of atoms 712 # nnn: Inversion/retention flag. 0 = property not applied, 1 = configuration is inverted, 713 # 2 = configuration is retained. 714 # eee: Exact change flag. 0 = property not applied, 1 = change on atom must be 715 # exactly as shown 716 # 717 # Notes: 718 # . StereoParity: 1 - ClockwiseStereo, 2 - AntiClockwiseStereo; 3 - Either; 0 - none. These 719 # values determine chirailty around the chiral center; a non zero value indicates atom 720 # has been marked as chiral center. 721 # 722 sub ParseCmpdAtomLine { 723 my($Line) = @_; 724 my ($LineIndex, $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity); 725 726 ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = ('') x 7; 727 if (length($Line) > 31) { 728 ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = unpack("A10A10A10xA3A2A3A3", $Line); 729 } 730 else { 731 ($AtomX, $AtomY, $AtomZ, $AtomSymbol) = unpack("A10A10A10", $Line); 732 } 733 return ($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity); 734 } 735 736 # Map MDL charge value used in SD and MOL files to internal charge used by MayaChemTools. 737 # 738 sub MDLChargeToInternalCharge { 739 my($MDLCharge) = @_; 740 my($InternalCharge); 741 742 CHARGE: { 743 if ($MDLCharge == 0) { $InternalCharge = 0; last CHARGE;} 744 if ($MDLCharge == 1) { $InternalCharge = 3; last CHARGE;} 745 if ($MDLCharge == 2) { $InternalCharge = 2; last CHARGE;} 746 if ($MDLCharge == 3) { $InternalCharge = 1; last CHARGE;} 747 if ($MDLCharge == 5) { $InternalCharge = -1; last CHARGE;} 748 if ($MDLCharge == 6) { $InternalCharge = -2; last CHARGE;} 749 if ($MDLCharge == 7) { $InternalCharge = -3; last CHARGE;} 750 # All other MDL charge values, including 4 corresponding to "doublet radical", 751 # are assigned internal value of 0. 752 $InternalCharge = 0; 753 if ($MDLCharge != 4) { 754 carp "Warning: MDLChargeToInternalCharge: MDL charge value, $MDLCharge, is not supported: An internal charge value, 0, has been assigned..."; 755 } 756 } 757 return $InternalCharge; 758 } 759 760 # Map internal charge used by MayaChemTools to MDL charge value used in SD and MOL files. 761 # 762 sub InternalChargeToMDLCharge { 763 my($InternalCharge) = @_; 764 my($MDLCharge); 765 766 CHARGE: { 767 if ($InternalCharge == 3) { $MDLCharge = 1; last CHARGE;} 768 if ($InternalCharge == 2) { $MDLCharge = 2; last CHARGE;} 769 if ($InternalCharge == 1) { $MDLCharge = 3; last CHARGE;} 770 if ($InternalCharge == -1) { $MDLCharge = 5; last CHARGE;} 771 if ($InternalCharge == -2) { $MDLCharge = 6; last CHARGE;} 772 if ($InternalCharge == -3) { $MDLCharge = 7; last CHARGE;} 773 # All other MDL charge values, including 4 corresponding to "doublet radical", 774 # are assigned internal value of 0. 775 $MDLCharge = 0; 776 } 777 return $MDLCharge; 778 } 779 780 # Ctab lines: Bond block 781 # 782 # Format: 111222tttsssxxxrrrccc 783 # 784 # 111: First atom number. 785 # 222: Second atom number. 786 # ttt: Bond type. 1 = Single, 2 = Double, 3 = Triple, 4 = Aromatic, 5 = Single or Double, 787 # 6 = Single or Aromatic, 7 = Double or Aromatic, 8 = Any 788 # sss: Bond stereo. Single bonds: 0 = not stereo, 1 = Up, 4 = Either, 6 = Down, 789 # Double bonds: 0 = Use x-, y-, z-coords from atom block to determine cis or trans, 790 # 3 = Cis or trans (either) double bond 791 # xxx: Not used 792 # rrr: Bond topology. 0 = Either, 1 = Ring, 2 = Chain 793 # ccc: Reacting center status. 0 = unmarked, 1 = a center, -1 = not a center, 794 # Additional: 2 = no change,4 = bond made/broken, 8 = bond order changes 12 = 4+8 795 # (both made/broken and changes); 5 = (4 + 1), 9 = (8 + 1), and 13 = (12 + 1) are also possible 796 sub ParseCmpdBondLine { 797 my($Line) = @_; 798 my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo); 799 800 ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = map {s/ //g; $_} unpack("A3A3A3A3", $Line); 801 return ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo); 802 } 803 804 # Map MDL bond type value used in SD and MOL files to internal bond order values used by MayaChemTools... 805 # 806 sub MDLBondTypeToInternalBondOrder { 807 my($MDLBondType) = @_; 808 my($InternalBondOrder); 809 810 BONDTYPE: { 811 if ($MDLBondType == 1) { $InternalBondOrder = 1; last BONDTYPE;} 812 if ($MDLBondType == 2) { $InternalBondOrder = 2; last BONDTYPE;} 813 if ($MDLBondType == 3) { $InternalBondOrder = 3; last BONDTYPE;} 814 if ($MDLBondType == 4) { $InternalBondOrder = 1.5; last BONDTYPE;} # Aromatic 815 # 816 # Although MDL aromatic bond values are used for query only and explicit Kekule bond order 817 # values must be assigned, internal value of 1.5 is allowed to indicate aromatic bond orders. 818 # 819 # All other MDL bond type values - 5 = Single or Double, 6 = Single or Aromatic, 7 = Double or Aromatic, 820 # 8 = Any - are assigned internal value of 1: These are meant to be used for structure queries by MDL products. 821 # 822 $InternalBondOrder = 1; 823 carp "Warning: MDLBondTypeToInternalBondOrder: MDL bond type value, $MDLBondType, is not supported: An internal bond order value, 1, has been assigned..."; 824 } 825 return $InternalBondOrder; 826 } 827 828 # Map internal bond order values used by MayaChemTools to MDL bond type value used in SD and MOL files... 829 # 830 sub InternalBondOrderToMDLBondType { 831 my($InternalBondOrder) = @_; 832 my($MDLBondType); 833 834 BONDTYPE: { 835 if ($InternalBondOrder == 1) { $MDLBondType = 1; last BONDTYPE;} 836 if ($InternalBondOrder == 2) { $MDLBondType = 2; last BONDTYPE;} 837 if ($InternalBondOrder == 3) { $MDLBondType = 3; last BONDTYPE;} 838 if ($InternalBondOrder == 1.5) { $MDLBondType = 4; last BONDTYPE;} # Aromatic 839 $MDLBondType = 1; 840 } 841 return $MDLBondType; 842 } 843 844 # Third line: Comments - A blank line is also allowed. 845 sub ParseCmpdCommentsLine { 846 my($Line) = @_; 847 my($Comments); 848 849 $Comments = unpack("A80", $Line); 850 851 return ($Comments); 852 } 853 854 # Map MDL bond stereo value used in SD and MOL files to internal bond type values used by MayaChemTools... 855 # 856 sub MDLBondStereoToInternalBondType { 857 my($MDLBondStereo) = @_; 858 my($InternalBondType); 859 860 $InternalBondType = ''; 861 862 BONDSTEREO: { 863 if ($MDLBondStereo == 1) { $InternalBondType = 'SingleUp'; last BONDSTEREO;} 864 if ($MDLBondStereo == 4) { $InternalBondType = 'SingleWavy'; last BONDSTEREO;} 865 if ($MDLBondStereo == 6) { $InternalBondType = 'SingleDown'; last BONDSTEREO;} 866 if ($MDLBondStereo == 3) { $InternalBondType = 'DoubleCross'; last BONDSTEREO;} 867 $InternalBondType = ''; 868 carp "Warning: MDLBondStereoToInternalBondType: MDL bond stereo value, $MDLBondStereo, is not supported: It has been ignored and bond order would be used to determine bond type..."; 869 } 870 return $InternalBondType; 871 } 872 873 # Map internal bond type values used by MayaChemTools to MDL bond stereo value used in SD and MOL files... 874 # 875 sub InternalBondTypeToMDLBondStereo { 876 my($InternalBondType) = @_; 877 my($MDLBondStereo); 878 879 $MDLBondStereo = 0; 880 881 BONDSTEREO: { 882 if ($InternalBondType =~ /^SingleUp$/i) { $MDLBondStereo = 1; last BONDSTEREO;} 883 if ($InternalBondType =~ /^SingleWavy$/i) { $MDLBondStereo = 4; last BONDSTEREO;} 884 if ($InternalBondType =~ /^SingleDown$/) { $MDLBondStereo = 6; last BONDSTEREO;} 885 if ($InternalBondType =~ /^DoubleCross$/) { $MDLBondStereo = 3; last BONDSTEREO;} 886 $MDLBondStereo = 0; 887 } 888 return $MDLBondStereo; 889 } 890 891 # Fourth line: Counts 892 # 893 # Format: aaabbblllfffcccsssxxxrrrpppiiimmmvvvvvv 894 # 895 # aaa: number of atoms; bbb: number of bonds; lll: number of atom lists; fff: (obsolete) 896 # ccc: chiral flag: 0=not chiral, 1=chiral; sss: number of stext entries; xxx,rrr,ppp,iii: 897 # (obsolete); mmm: number of lines of additional properties, including the M END line, No 898 # longer supported, default is set to 999; vvvvvv: version 899 900 sub ParseCmpdCountsLine { 901 my($Line) = @_; 902 my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version); 903 904 if (length($Line) >= 39) { 905 ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = unpack("A3A3x3x3A3x3x3x3x3x3A3A6", $Line); 906 } 907 elsif (length($Line) >= 15) { 908 ($PropertyCount, $Version) = ("999", "v2000"); 909 ($AtomCount, $BondCount, $ChiralFlag) = unpack("A3A3x3x3A3", $Line); 910 } 911 else { 912 ($ChiralFlag, $PropertyCount, $Version) = ("0", "999", "v2000"); 913 ($AtomCount, $BondCount) = unpack("A3A3", $Line); 914 } 915 916 if ($Version =~ /V3000/i) { 917 # Current version of MayaChemTools modules and classes for processing MDL MOL and SD don't support 918 # V3000. So instead of relying on callers, just exit with an error to disable any processing of V3000 919 # format. 920 croak "Error: SDFileUtil::ParseCmpdCountsLine: The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools..."; 921 } 922 923 return ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version); 924 } 925 926 # Second line: Misc info 927 # 928 # Format: IIPPPPPPPPMMDDYYHHmmddSSssssssssssEEEEEEEEEEEERRRRRR 929 # A2A8 A10 A2I2A10 A12 A6 930 # User's first and last initials (I), program name (P), date/time (M/D/Y,H:m), 931 # dimensional codes - 2D or 3D (d),scaling factors (S, s), energy (E) if modeling program input, 932 # internal registry number (R) if input through MDL form. A blank line is also allowed. 933 sub ParseCmpdMiscInfoLine { 934 my($Line) = @_; 935 my($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum); 936 937 ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum) = unpack("A2A8A10A2A2A10A12A6", $Line); 938 return ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum); 939 } 940 941 # First line: Molecule name. This line is unformatted, but like all other lines in a 942 # molfile may not extend beyond column 80. A blank line is also allowed. 943 sub ParseCmpdMolNameLine { 944 my($Line) = @_; 945 my($MolName); 946 947 $MolName = unpack("A80", $Line); 948 949 return ($MolName); 950 } 951 952 # Parse atom alias property line in CTAB generic properties block. 953 # 954 # Atom alias property line format: 955 # 956 # A aaa 957 # x... 958 # 959 # aaa: Atom number 960 # x: Atom alias in next line 961 # 962 sub ParseCmpdAtomAliasPropertyLine { 963 my($Line, $NextLine) = @_; 964 my($Label, $AtomNumber, $AtomAlias); 965 966 ($Label, $AtomNumber) = split(' ', $Line); 967 $AtomAlias = $NextLine; 968 969 if (!$AtomAlias) { 970 carp "Warning: _ParseCmpdAtomAliasPropertyLine: No atom alias value specified on the line following atom alias property line..."; 971 } 972 973 return ($AtomNumber, $AtomAlias); 974 } 975 976 # Parse charge property line in CTAB generic properties block. 977 # 978 # Charge property line format: 979 # 980 # M CHGnn8 aaa vvv ... 981 # 982 # nn8: Number of value pairs. Maximum of 8 pairs allowed. 983 # aaa: Atom number 984 # vvv: -15 to +15. Default of 0 = uncharged atom. When present, this property supersedes 985 # all charge and radical values in the atom block, forcing a 0 charge on all atoms not 986 # listed in an M CHG or M RAD line. 987 # 988 sub ParseCmpdChargePropertyLine { 989 my($Line) = @_; 990 991 return _ParseCmpdGenericPropertyLine('Charge', $Line); 992 } 993 994 995 # Parse isotope property line in CTAB generic properties block. 996 # 997 # Isoptope property line format: 998 # 999 # M ISOnn8 aaa vvv ... 1000 # 1001 # nn8: Number of value paris. Maximum of 8 pairs allowed. 1002 # aaa: Atom number 1003 # vvv: Absolute mass of the atom isotope as a positive integer. When present, this property 1004 # supersedes all isotope values in the atom block. Default (no entry) means natural 1005 # abundance. The difference between this absolute mass value and the natural 1006 # abundance value specified in the PTABLE.DAT file must be within the range of -18 1007 # to +12 1008 # 1009 # Notes: 1010 # . Values correspond to mass numbers... 1011 # 1012 sub ParseCmpdIsotopePropertyLine { 1013 my($Line) = @_; 1014 1015 return _ParseCmpdGenericPropertyLine('Isotope', $Line); 1016 } 1017 1018 # Parse radical property line in CTAB generic properties block. 1019 # 1020 # Radical property line format: 1021 # 1022 # M RADnn8 aaa vvv ... 1023 # 1024 # nn8: Number of value paris. Maximum of 8 pairs allowed. 1025 # aaa: Atom number 1026 # vvv: Default of 0 = no radical, 1 = singlet (:), 2 = doublet ( . or ^), 3 = triplet (^^). When 1027 # present, this property supersedes all charge and radical values in the atom block, 1028 # forcing a 0 (zero) charge and radical on all atoms not listed in an M CHG or 1029 # M RAD line. 1030 # 1031 sub ParseCmpdRadicalPropertyLine { 1032 my($Line) = @_; 1033 1034 return _ParseCmpdGenericPropertyLine('Radical', $Line); 1035 } 1036 1037 # Map MDL radical stereo value used in SD and MOL files to internal spin multiplicity values used by MayaChemTools... 1038 # 1039 sub MDLRadicalToInternalSpinMultiplicity { 1040 my($MDLRadical) = @_; 1041 my($InternalSpinMultiplicity); 1042 1043 $InternalSpinMultiplicity = ''; 1044 1045 BONDSTEREO: { 1046 if ($MDLRadical == 0) { $InternalSpinMultiplicity = 0; last BONDSTEREO;} 1047 if ($MDLRadical == 1) { $InternalSpinMultiplicity = 1; last BONDSTEREO;} 1048 if ($MDLRadical == 2) { $InternalSpinMultiplicity = 2; last BONDSTEREO;} 1049 if ($MDLRadical == 3) { $InternalSpinMultiplicity = 3; last BONDSTEREO;} 1050 $InternalSpinMultiplicity = ''; 1051 carp "Warning: MDLRadicalToInternalSpinMultiplicity: MDL radical value, $MDLRadical, specifed on line M RAD is not supported..."; 1052 } 1053 return $InternalSpinMultiplicity; 1054 } 1055 1056 # Map internal spin multiplicity values used by MayaChemTools to MDL radical stereo value used in SD and MOL files... 1057 # 1058 sub InternalSpinMultiplicityToMDLRadical { 1059 my($InternalSpinMultiplicity) = @_; 1060 my($MDLRadical); 1061 1062 $MDLRadical = 0; 1063 1064 BONDSTEREO: { 1065 if ($InternalSpinMultiplicity == 1) { $MDLRadical = 1; last BONDSTEREO;} 1066 if ($InternalSpinMultiplicity == 2) { $MDLRadical = 2; last BONDSTEREO;} 1067 if ($InternalSpinMultiplicity == 3) { $MDLRadical = 3; last BONDSTEREO;} 1068 $MDLRadical = 0; 1069 } 1070 return $MDLRadical; 1071 } 1072 1073 # Process generic CTAB property line... 1074 sub _ParseCmpdGenericPropertyLine { 1075 my($PropertyName, $Line) = @_; 1076 1077 my($Label, $PropertyLabel, $ValuesCount, $ValuePairsCount, @ValuePairs); 1078 1079 @ValuePairs = (); 1080 ($Label, $PropertyLabel, $ValuesCount, @ValuePairs) = split(' ', $Line); 1081 $ValuePairsCount = (scalar @ValuePairs)/2; 1082 if ($ValuesCount != $ValuePairsCount) { 1083 carp "Warning: _ParseCmpdGenericPropertyLine: Number of atom number and $PropertyName value paris specified on $Label $PropertyLabel property line, $ValuePairsCount, does not match expected value of $ValuesCount..."; 1084 } 1085 1086 return (@ValuePairs); 1087 } 1088 1089 # Generic CTAB property lines for charge, istope and radical properties... 1090 # 1091 sub _GenerateCmpdGenericPropertyLines { 1092 my($PropertyName, $PropertyValuePairsRef) = @_; 1093 my($Index, $PropertyLabel, $Line, $PropertyCount, $AtomNum, $PropertyValue, @PropertyLines); 1094 1095 @PropertyLines = (); 1096 NAME: { 1097 if ($PropertyName =~ /^Charge$/i) { $PropertyLabel = "M CHG"; last NAME; } 1098 if ($PropertyName =~ /^Isotope$/i) { $PropertyLabel = "M ISO"; last NAME; } 1099 if ($PropertyName =~ /^Radical$/i) { $PropertyLabel = "M RAD"; last NAME; } 1100 carp "Warning: _GenerateCmpdGenericPropertyLines: Unknown property name, $PropertyName, specified..."; 1101 return @PropertyLines; 1102 } 1103 1104 # A maximum of 8 property pair values allowed per line... 1105 $PropertyCount = 0; 1106 $Line = ''; 1107 for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) { 1108 if ($PropertyCount > 8) { 1109 # Setup property line... 1110 $Line = "${PropertyLabel} 8${Line}"; 1111 push @PropertyLines, $Line; 1112 1113 $PropertyCount = 0; 1114 $Line = ''; 1115 } 1116 $PropertyCount++; 1117 $AtomNum = $PropertyValuePairsRef->[$Index]; 1118 $PropertyValue = $PropertyValuePairsRef->[$Index + 1]; 1119 $Line .= sprintf " %3i %3i", $AtomNum, $PropertyValue; 1120 } 1121 if ($Line) { 1122 $Line = "${PropertyLabel} ${PropertyCount}${Line}"; 1123 push @PropertyLines, $Line; 1124 } 1125 return @PropertyLines; 1126 } 1127 1128 # 1129 # Read compound data into a string and return its value 1130 sub ReadCmpdString { 1131 my($SDFileRef) = @_; 1132 my($CmpdString); 1133 1134 $CmpdString = ""; 1135 LINE: while (defined($_ = <$SDFileRef>)) { 1136 # Change Windows and Mac new line char to UNIX... 1137 s/(\r\n)|(\r)/\n/g; 1138 1139 if (/^\$\$\$\$/) { 1140 chomp; 1141 $CmpdString .= $_; 1142 last LINE; 1143 } 1144 else { 1145 $CmpdString .= $_; 1146 } 1147 } 1148 return $CmpdString; 1149 } 1150 1151 # Find out the number of fragements in the compounds. And for the compound with 1152 # more than one fragment, remove all the others besides the largest one. 1153 sub WashCmpd { 1154 my($CmpdLines) = @_; 1155 my($WashedCmpdString, $FragmentCount, $Fragments); 1156 1157 $WashedCmpdString = ""; 1158 ($FragmentCount, $Fragments) = GetCmpdFragments($CmpdLines); 1159 if ($FragmentCount > 1) { 1160 # Go over the compound data for the largest fragment including property 1161 # data... 1162 my (@AllFragments, @LargestFragment, %LargestFragmentAtoms, @WashedCmpdLines, $Index, $LineIndex, $AtomCount, $BondCount, $NewAtomCount, $NewBondCount, $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo, $FirstNewAtomNum, $SecondNewAtomNum, $AtomNum, $ChiralFlag, $BondLine, $MENDLineIndex, $Line, $Value, @ValuePairs, @NewValuePairs, $ValuePairIndex, $NewAtomNum, @NewPropertyLines); 1163 1164 @AllFragments = (); @LargestFragment = (); 1165 %LargestFragmentAtoms = (); 1166 @AllFragments = split "\n", $Fragments; 1167 @LargestFragment = split " ", $AllFragments[0]; 1168 for $Index (0 .. $#LargestFragment) { 1169 # Map old atom numbers to new atom numbers as the fragment atom numbers are sorted 1170 # from lowest to highest old atom numbers... 1171 $LargestFragmentAtoms{$LargestFragment[$Index]} = $Index + 1; 1172 } 1173 @WashedCmpdLines = (); 1174 push @WashedCmpdLines, @$CmpdLines[0], @$CmpdLines[1], @$CmpdLines[2], @$CmpdLines[3]; 1175 ($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine(@$CmpdLines[3]); 1176 $NewAtomCount = @LargestFragment; 1177 $NewBondCount = 0; 1178 $AtomNum = 0; 1179 # Retrieve the largest fragment atom lines... 1180 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) { 1181 $AtomNum++; 1182 if ($LargestFragmentAtoms{$AtomNum}) { 1183 push @WashedCmpdLines, @$CmpdLines[$LineIndex]; 1184 } 1185 } 1186 # Retrieve the largest fragment bond lines... 1187 for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) { 1188 ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]); 1189 if ($LargestFragmentAtoms{$FirstAtomNum} && $LargestFragmentAtoms{$SecondAtomNum}) { 1190 $NewBondCount++; 1191 # Set up bond line with new atom number mapping... 1192 $FirstNewAtomNum = $LargestFragmentAtoms{$FirstAtomNum}; 1193 $SecondNewAtomNum = $LargestFragmentAtoms{$SecondAtomNum}; 1194 $BondLine = GenerateCmpdBondLine($FirstNewAtomNum, $SecondNewAtomNum, $BondType, $BondStereo); 1195 push @WashedCmpdLines, $BondLine; 1196 } 1197 } 1198 # Get property lines for CHG, ISO and RAD label and map the old atom numbers to new 1199 # atom numners; Others, property lines before M END line are skipped as atom numbers for 1200 # other properties might not valid anymore... 1201 # 1202 $MENDLineIndex = $LineIndex; 1203 LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) { 1204 $Line = @$CmpdLines[$LineIndex]; 1205 if ($Line =~ /^M END/i) { 1206 push @WashedCmpdLines, "M END"; 1207 $MENDLineIndex = $LineIndex; 1208 last LINE; 1209 } 1210 1211 @ValuePairs = (); 1212 if ($Line =~ /^M CHG/i) { 1213 @ValuePairs = ParseCmpdChargePropertyLine($Line); 1214 } 1215 elsif ($Line =~ /^M RAD/i) { 1216 @ValuePairs = ParseCmpdRadicalPropertyLine($Line); 1217 } 1218 elsif ($Line =~ /^M ISO/i) { 1219 @ValuePairs = ParseCmpdIsotopePropertyLine($Line); 1220 } 1221 elsif ($Line =~ /^A /i) { 1222 my($NextLine); 1223 $LineIndex++; 1224 $NextLine = @$CmpdLines[$LineIndex]; 1225 @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine); 1226 } 1227 else { 1228 next LINE; 1229 } 1230 1231 if (!@ValuePairs) { 1232 next LINE; 1233 } 1234 1235 # Collect values for valid atom numbers with mapping to new atom numbers... 1236 @NewValuePairs = (); 1237 VALUEINDEX: for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) { 1238 $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1]; 1239 if (!exists $LargestFragmentAtoms{$AtomNum}) { 1240 next VALUEINDEX; 1241 } 1242 $NewAtomNum = $LargestFragmentAtoms{$AtomNum}; 1243 push @NewValuePairs, ($NewAtomNum, $Value) 1244 } 1245 if (!@NewValuePairs) { 1246 next LINE; 1247 } 1248 @NewPropertyLines = (); 1249 if ($Line =~ /^M CHG/i) { 1250 @NewPropertyLines = GenerateCmpdChargePropertyLines(\@NewValuePairs); 1251 } 1252 elsif ($Line =~ /^M RAD/i) { 1253 @NewPropertyLines = GenerateCmpdRadicalPropertyLines(\@NewValuePairs); 1254 } 1255 elsif ($Line =~ /^M ISO/i) { 1256 @NewPropertyLines = GenerateCmpdIsotopePropertyLines(\@NewValuePairs); 1257 } 1258 elsif ($Line =~ /^A /i) { 1259 @NewPropertyLines = GenerateCmpdAtomAliasPropertyLines(\@NewValuePairs); 1260 } 1261 push @WashedCmpdLines, @NewPropertyLines; 1262 } 1263 1264 # Retrieve rest of the data label and value property data... 1265 for ($LineIndex = (1 + $MENDLineIndex); $LineIndex < @$CmpdLines; $LineIndex++) { 1266 push @WashedCmpdLines, @$CmpdLines[$LineIndex]; 1267 } 1268 # Update atom and bond count line... 1269 $WashedCmpdLines[3] = GenerateCmpdCountsLine($NewAtomCount, $NewBondCount, $ChiralFlag); 1270 1271 $WashedCmpdString = join "\n", @WashedCmpdLines; 1272 } 1273 return ($FragmentCount, $Fragments, $WashedCmpdString); 1274 } 1275