1 package SDFileUtil; 2 # 3 # $RCSfile: SDFileUtil.pm,v $ 4 # $Date: 2008/04/25 00:00:46 $ 5 # $Revision: 1.26 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 use 5.006; 29 use strict; 30 use Exporter; 31 use Carp; 32 use PeriodicTable qw(IsElement); 33 34 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 35 36 $VERSION = '1.00'; 37 @ISA = qw(Exporter); 38 @EXPORT = qw(GenerateCmpdAtomLine GenerateCmpdBondLine GenerateCmpdChargePropertyLines GenerateCmpdCommentsLine GenerateCmpdCountsLine GenerateCmpdIsotopePropertyLines GenerateCmpdDataHeaderLabelsAndValuesLines GenerateCmpdMiscInfoLine GenerateCmpdRadicalPropertyLines GenerateCmpdMolNameLine GenerateEmptyCtabBlockLines GenerateMiscLineDateStamp GetAllAndCommonCmpdDataHeaderLabels GetCmpdDataHeaderLabels GetCmpdDataHeaderLabelsAndValues GetCmpdFragments GetCtabLinesCount GetUnknownAtoms MDLChargeToInternalCharge InternalChargeToMDLCharge MDLBondTypeToInternalBondOrder InternalBondOrderToMDLBondType InternalSpinMultiplicityToMDLRadical MDLRadicalToInternalSpinMultiplicity ParseCmpdAtomLine ParseCmpdBondLine ParseCmpdCommentsLine ParseCmpdCountsLine ParseCmpdMiscInfoLine ParseCmpdMolNameLine ParseCmpdChargePropertyLine ParseCmpdIsotopePropertyLine ParseCmpdRadicalPropertyLine ReadCmpdString WashCmpd); 39 @EXPORT_OK = qw(); 40 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 41 42 # Format data for compounds count line... 43 sub GenerateCmpdCountsLine { 44 my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version, $Line); 45 46 if (@_ == 5) { 47 ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = @_; 48 } 49 elsif (@_ == 3) { 50 ($AtomCount, $BondCount, $ChiralFlag) = @_; 51 $PropertyCount = 999; 52 $Version = "V2000"; 53 } 54 else { 55 ($AtomCount, $BondCount) = @_; 56 $ChiralFlag = 0; 57 $PropertyCount = 999; 58 $Version = "V2000"; 59 } 60 if ($AtomCount > 999) { 61 croak "Error: SDFileUtil::GenerateCmpdCountsLine: The atom count, $AtomCount, exceeds maximum of 999 allowed for CTAB version 2000. The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools..."; 62 } 63 $Line = sprintf "%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%6s", $AtomCount, $BondCount, 0, 0, $ChiralFlag, 0, 0, 0, 0, 0, $PropertyCount, $Version; 64 65 return ($Line); 66 } 67 68 # Generate comments line... 69 sub GenerateCmpdCommentsLine { 70 my($Comments) = @_; 71 my($Line); 72 73 $Line = (length($Comments) > 80) ? substr($Comments, 0, 80) : $Comments; 74 75 return $Line; 76 } 77 78 # Generate molname line... 79 sub GenerateCmpdMolNameLine { 80 my($MolName) = @_; 81 my($Line); 82 83 $Line = (length($MolName) > 80) ? substr($MolName, 0, 80) : $MolName; 84 85 return $Line; 86 } 87 88 # Generate data for compounds misc info line... 89 sub GenerateCmpdMiscInfoLine { 90 my($ProgramName, $UserInitial, $Code) = @_; 91 my($Date, $Line); 92 93 if (!(defined($ProgramName) && $ProgramName)) { 94 $ProgramName = "MayaChem"; 95 } 96 if (!(defined($UserInitial) && $UserInitial)) { 97 $UserInitial = " "; 98 } 99 if (!(defined($Code) && $Code)) { 100 $Code = "2D"; 101 } 102 103 if (length($ProgramName) > 8) { 104 $ProgramName = substr($ProgramName, 0, 8); 105 } 106 if (length($UserInitial) > 2) { 107 $UserInitial = substr($UserInitial, 0, 2); 108 } 109 if (length($Code) > 2) { 110 $Code = substr($Code, 0, 2); 111 } 112 $Date = GenerateMiscLineDateStamp(); 113 114 $Line = "${UserInitial}${ProgramName}${Date}${Code}"; 115 116 return $Line; 117 } 118 119 # Generate data for compounds misc info line... 120 sub GenerateEmptyCtabBlockLines { 121 my($Date, $Lines); 122 123 if (@_ == 1) { 124 ($Date) = @_; 125 } 126 else { 127 $Date = GenerateMiscLineDateStamp(); 128 } 129 # First line: Blank molname line... 130 # Second line: Misc info... 131 # Third line: Blank comments line... 132 # Fourth line: Counts line reflecting empty structure data block... 133 $Lines = "\n"; 134 $Lines .= " MayaChem${Date}2D\n"; 135 $Lines .= "\n"; 136 $Lines .= GenerateCmpdCountsLine(0, 0, 0) . "\n"; 137 $Lines .= "M END"; 138 139 return $Lines; 140 } 141 142 # Generate SD file data stamp... 143 sub GenerateMiscLineDateStamp { 144 my($Date, $Sec, $Min, $Hour, $MDay, $Mon, $Year, $WDay, $YDay, $IsDst); 145 146 ($Sec, $Min, $Hour, $MDay, $Mon, $Year, $WDay, $YDay, $IsDst) = localtime; 147 $Mon += 1; 148 $Year = substr($Year, -1, 2); 149 $Date = sprintf "%2i%2i%2i%2i%2i", $Mon, $MDay, $Year, $Hour, $Min; 150 $Date =~ s/ /0/g; 151 152 return $Date; 153 } 154 155 # Generate data for compound atom line... 156 # 157 sub GenerateCmpdAtomLine { 158 my($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity) = @_; 159 my($Line); 160 161 if (!defined $MassDifference) { 162 $MassDifference = 0; 163 } 164 if (!defined $Charge) { 165 $Charge = 0; 166 } 167 if (!defined $StereoParity) { 168 $StereoParity = 0; 169 } 170 $Line = sprintf "%10.4f%10.4f%10.4f %-3s%2i%3i%3i 0 0 0 0 0 0 0 0 0", $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity; 171 172 return $Line 173 } 174 175 # Generate data for compound bond line... 176 # 177 sub GenerateCmpdBondLine { 178 my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = @_; 179 my($Line); 180 181 if (!defined $BondStereo) { 182 $BondStereo = 0; 183 } 184 $Line = sprintf "%3i%3i%3i%3i 0 0 0", $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo; 185 186 return $Line 187 } 188 189 # Generate charge property lines for CTAB block... 190 # 191 sub GenerateCmpdChargePropertyLines { 192 my($ChargeValuePairsRef) = @_; 193 194 return _GenerateCmpdGenericPropertyLines('Charge', $ChargeValuePairsRef); 195 } 196 197 # Generate isotope property lines for CTAB block... 198 # 199 sub GenerateCmpdIsotopePropertyLines { 200 my($IsotopeValuePairsRef) = @_; 201 202 return _GenerateCmpdGenericPropertyLines('Isotope', $IsotopeValuePairsRef); 203 } 204 205 # Generate radical property line property lines for CTAB block... 206 # 207 sub GenerateCmpdRadicalPropertyLines { 208 my($RadicalValuePairsRef) = @_; 209 210 return _GenerateCmpdGenericPropertyLines('Radical', $RadicalValuePairsRef); 211 } 212 213 # Generate data header labels and values lines... 214 # 215 sub GenerateCmpdDataHeaderLabelsAndValuesLines { 216 my($DataHeaderLabelsRef, $DataHeaderLabelsAndValuesRef, $SortDataLabels) = @_; 217 my($DataLabel, $DataValue, @DataLabels, @DataLines); 218 219 if (!defined $SortDataLabels) { 220 $SortDataLabels = 0; 221 } 222 223 @DataLines = (); 224 @DataLabels = (); 225 if ($SortDataLabels) { 226 push @DataLabels, sort @{$DataHeaderLabelsRef}; 227 } 228 else { 229 push @DataLabels, @{$DataHeaderLabelsRef}; 230 } 231 for $DataLabel (@DataLabels) { 232 $DataValue = ''; 233 if (exists $DataHeaderLabelsAndValuesRef->{$DataLabel}) { 234 $DataValue = $DataHeaderLabelsAndValuesRef->{$DataLabel}; 235 } 236 push @DataLines, ("> <${DataLabel}>", "$DataValue", ""); 237 } 238 return @DataLines; 239 } 240 241 # Parse data field header in SD file and return lists of all and common data field 242 # labels. 243 sub GetAllAndCommonCmpdDataHeaderLabels { 244 my($SDFileRef) = @_; 245 my($CmpdCount, $CmpdString, $Label, @CmpdLines, @DataFieldLabels, @CommonDataFieldLabels, %DataFieldLabelsMap); 246 247 $CmpdCount = 0; 248 @DataFieldLabels = (); 249 @CommonDataFieldLabels = (); 250 %DataFieldLabelsMap = (); 251 252 while ($CmpdString = ReadCmpdString($SDFileRef)) { 253 $CmpdCount++; 254 @CmpdLines = split "\n", $CmpdString; 255 # Process compound data header labels and figure out which ones are present for 256 # all the compounds... 257 if (@DataFieldLabels) { 258 my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines); 259 my(%CmpdDataFieldLabelsMap) = (); 260 # Setup a map for the current labels... 261 for $Label (@CmpdDataFieldLabels) { 262 $CmpdDataFieldLabelsMap{$Label} = "PresentInSome"; 263 } 264 # Check the presence old labels for this compound; otherwise, mark 'em new... 265 for $Label (@DataFieldLabels) { 266 if (!$CmpdDataFieldLabelsMap{$Label}) { 267 $DataFieldLabelsMap{$Label} = "PresentInSome"; 268 } 269 } 270 # Check the presence this compound in the old labels; otherwise, add 'em... 271 for $Label (@CmpdDataFieldLabels ) { 272 if (!$DataFieldLabelsMap{$Label}) { 273 # It's a new label... 274 push @DataFieldLabels, $Label; 275 $DataFieldLabelsMap{$Label} = "PresentInSome"; 276 } 277 } 278 } 279 else { 280 # Get the initial label set and set up a map... 281 @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines); 282 for $Label (@DataFieldLabels) { 283 $DataFieldLabelsMap{$Label} = "PresentInAll"; 284 } 285 } 286 } 287 # Identify the common data field labels... 288 @CommonDataFieldLabels = (); 289 for $Label (@DataFieldLabels) { 290 if ($DataFieldLabelsMap{$Label} eq "PresentInAll") { 291 push @CommonDataFieldLabels, $Label; 292 } 293 } 294 return ($CmpdCount, \@DataFieldLabels, \@CommonDataFieldLabels); 295 } 296 297 # Parse all the data header labels and return 'em as an list... 298 # 299 # Format: 300 # 301 #> Data header line 302 #Data line(s) 303 #Blank line 304 # 305 # [Data Header] (one line) precedes each item of data, starts with a greater than (>) sign, and 306 # contains at least one of the following: 307 # The field name enclosed in angle brackets. For example: <melting.point> 308 # The field number, DTn , where n represents the number assigned to the field in a MACCS-II database 309 # 310 #Optional information for the data header includes: 311 # The compound’s external and internal registry numbers. External registry numbers must be enclosed in parentheses. 312 # Any combination of information 313 # 314 #The following are examples of valid data headers: 315 #> <MELTING.POINT> 316 #> 55 (MD-08974) <BOILING.POINT> DT12 317 #> DT12 55 318 #> (MD-0894) <BOILING.POINT> FROM ARCHIVES 319 # 320 #Notes: Sometimes last blank line is missing and can be just followed by $$$$ 321 # 322 sub GetCmpdDataHeaderLabels { 323 my($CmpdLines) = @_; 324 my($CmpdLine, $Label, @Labels); 325 326 @Labels = (); 327 CMPDLINE: for $CmpdLine (@$CmpdLines) { 328 if ($CmpdLine !~ /^>/) { 329 next CMPDLINE; 330 } 331 # Does the line contains field name enclosed in angular brackets? 332 ($Label) = $CmpdLine =~ /<.*?>/g; 333 if (!defined($Label)) { 334 next CMPDLINE; 335 } 336 $Label =~ s/(<|>)//g; 337 push @Labels, $Label; 338 } 339 return (@Labels); 340 } 341 342 # Parse all the data header labels and values 343 sub GetCmpdDataHeaderLabelsAndValues { 344 my($CmpdLines) = @_; 345 my($CmpdLine, $CurrentLabel, $Label, $Value, $ValueCount, $ProcessingLabelData, @Values, %DataFields); 346 347 %DataFields = (); 348 $ProcessingLabelData = 0; 349 $ValueCount = 0; 350 CMPDLINE: for $CmpdLine (@$CmpdLines) { 351 if ($CmpdLine =~ /\$\$\$\$/) { 352 last CMPDLINE; 353 } 354 if ($CmpdLine =~ /^>/) { 355 # Does the line contains field name enclosed in angular brackets? 356 ($Label) = $CmpdLine =~ /<.*?>/g; 357 if (defined $Label) { 358 $CurrentLabel = $Label; 359 $CurrentLabel =~ s/(<|>)//g; 360 $ProcessingLabelData = 0; 361 $ValueCount = 0; 362 363 if ($CurrentLabel) { 364 $ProcessingLabelData = 1; 365 $DataFields{$CurrentLabel} = ''; 366 next CMPDLINE; 367 } 368 } 369 else { 370 if (!$ProcessingLabelData) { 371 # Data line containing no <label> as allowed by SDF format. Just ignore it... 372 next CMPDLINE; 373 } 374 } 375 } 376 if (!$ProcessingLabelData) { 377 next CMPDLINE; 378 } 379 if (!(defined($CmpdLine) && length($CmpdLine))) { 380 # Blank line terminates value for a label... 381 $CurrentLabel = ''; 382 $ValueCount = 0; 383 $ProcessingLabelData = 0; 384 next CMPDLINE; 385 } 386 $ValueCount++; 387 $Value = $CmpdLine; 388 389 if ($ValueCount > 1) { 390 $DataFields{$CurrentLabel} .= "\n" . $Value; 391 } 392 else { 393 $DataFields{$CurrentLabel} = $Value; 394 } 395 } 396 return (%DataFields); 397 } 398 399 # 400 # Using bond blocks, figure out the number of disconnected fragments and 401 # return their values along with the atom numbers in a string delimited by new 402 # line character. 403 # 404 sub GetCmpdFragments { 405 my($CmpdLines) = @_; 406 my($AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, @AtomConnections, $BondType, $FragmentString, $FragmentCount, $LineIndex, $Index, $AtomNum, $NbrAtomNum, @ProcessedAtoms, $ProcessedAtomCount, $ProcessAtomNum, @ProcessingAtoms, @ConnectedAtoms, %Fragments, $FragmentNum, $AFragmentString); 407 408 # Setup the connection table for each atom... 409 @AtomConnections = (); 410 ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]); 411 for $AtomNum (1 .. $AtomCount) { 412 %{$AtomConnections[$AtomNum]} = (); 413 } 414 for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) { 415 ($FirstAtomNum, $SecondAtomNum, $BondType) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]); 416 if (!$AtomConnections[$FirstAtomNum]{$SecondAtomNum}) { 417 $AtomConnections[$FirstAtomNum]{$SecondAtomNum} = $BondType; 418 } 419 if (!$AtomConnections[$SecondAtomNum]{$FirstAtomNum}) { 420 $AtomConnections[$SecondAtomNum]{$FirstAtomNum} = $BondType; 421 } 422 } 423 424 #Get set to count fragments... 425 $ProcessedAtomCount = 0; 426 $FragmentNum = 0; 427 %Fragments = (); 428 @ProcessedAtoms = (); 429 for $AtomNum (1 .. $AtomCount) { 430 $ProcessedAtoms[$AtomNum] = 0; 431 } 432 while ($ProcessedAtomCount < $AtomCount) { 433 @ProcessingAtoms = (); 434 @ConnectedAtoms = (); 435 ATOMNUM: for $AtomNum (1 .. $AtomCount) { 436 if (!$ProcessedAtoms[$AtomNum]) { 437 $ProcessedAtomCount++; 438 $ProcessedAtoms[$AtomNum] = 1; 439 push @ProcessingAtoms, $AtomNum; 440 $FragmentNum++; 441 @{$Fragments{$FragmentNum} } = (); 442 push @{$Fragments{$FragmentNum} }, $AtomNum; 443 last ATOMNUM; 444 } 445 } 446 447 # Go over the neighbors and follow the connection trail while collecting the 448 # atoms numbers present in the connected fragment... 449 while (@ProcessingAtoms) { 450 for ($Index = 0; $Index < @ProcessingAtoms; $Index++) { 451 $ProcessAtomNum = $ProcessingAtoms[$Index]; 452 for $NbrAtomNum (keys %{$AtomConnections[$ProcessAtomNum]}) { 453 if (!$ProcessedAtoms[$NbrAtomNum]) { 454 $ProcessedAtomCount++; 455 $ProcessedAtoms[$NbrAtomNum] = 1; 456 push @ConnectedAtoms, $NbrAtomNum; 457 push @{ $Fragments{$FragmentNum} }, $NbrAtomNum; 458 } 459 } 460 } 461 @ProcessingAtoms = (); 462 @ProcessingAtoms = @ConnectedAtoms; 463 @ConnectedAtoms = (); 464 } 465 } 466 $FragmentCount = $FragmentNum; 467 $FragmentString = ""; 468 469 # Sort out the fragments by size... 470 for $FragmentNum (sort { @{$Fragments{$b}} <=> @{$Fragments{$a}} } keys %Fragments ) { 471 # Sort the atoms in a fragment by their numbers... 472 $AFragmentString = join " ", sort { $a <=> $b } @{ $Fragments{$FragmentNum} }; 473 if ($FragmentString) { 474 $FragmentString .= "\n" . $AFragmentString; 475 } 476 else { 477 $FragmentString = $AFragmentString; 478 } 479 } 480 return ($FragmentCount, $FragmentString); 481 } 482 483 # Count number of lines present in between 4th and line containg "M END" 484 sub GetCtabLinesCount { 485 my($CmpdLines) = @_; 486 my($LineIndex, $CtabLinesCount); 487 488 $CtabLinesCount = 0; 489 LINE: for ($LineIndex = 4; $LineIndex < @$CmpdLines; $LineIndex++) { 490 if ((@$CmpdLines[$LineIndex] =~ /^M /i) || (@$CmpdLines[$LineIndex] =~ /^M END/i)) { 491 $CtabLinesCount = $LineIndex - 4; 492 last LINE; 493 } 494 } 495 return $CtabLinesCount; 496 } 497 498 # Using atom blocks, count the number of atoms which contain special element 499 # symbols not present in the periodic table. 500 sub GetUnknownAtoms { 501 my($CmpdLines) = @_; 502 my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines, $LineIndex, $AtomCount, $AtomSymbol); 503 504 $UnknownAtomCount = 0; 505 $UnknownAtoms = ""; 506 $UnknownAtomLines = ""; 507 ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]); 508 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) { 509 ($AtomSymbol) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]); 510 if (!IsElement($AtomSymbol)) { 511 $UnknownAtomCount++; 512 $UnknownAtoms .= " $AtomSymbol"; 513 if ($UnknownAtomLines) { 514 $UnknownAtomLines .= "\n" . @$CmpdLines[$LineIndex]; 515 } 516 else { 517 $UnknownAtomLines = @$CmpdLines[$LineIndex]; 518 } 519 } 520 } 521 return ($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines); 522 } 523 524 # Ctab lines: Atom block 525 # 526 # Format: xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee 527 # A10 A10 A10 xA3 A2A3 A3 A3 A3 A3 A3 A3 A3 A3 A3 A3 528 # x,y,z: Atom coordinates 529 # aaa: Atom symbol. Entry in periodic table or L for atom list, A, Q, * for unspecified 530 # atom, and LP for lone pair, or R# for Rgroup label 531 # dd: Mass difference. -3, -2, -1, 0, 1, 2, 3, 4 (0 for value beyond these limits) 532 # ccc: Charge. 0 = uncharged or value other than these, 1 = +3, 2 = +2, 3 = +1, 533 # 4 = doublet radical, 5 = -1, 6 = -2, 7 = -3 534 # sss: Atom stereo parity. 0 = not stereo, 1 = odd, 2 = even, 3 = either or unmarked stereo center 535 # hhh: Hydrogen count + 1. 1 = H0, 2 = H1, 3 = H2, 4 = H3, 5 = H4 536 # bbb: Stereo care box. 0 = ignore stereo configuration of this double bond atom, 1 = stereo 537 # configuration of double bond atom must match 538 # vvv: Valence. 0 = no marking (default)(1 to 14) = (1 to 14) 15 = zero valence 539 # HHH: H0 designator. 0 = not specified, 1 = no H atoms allowed (redundant due to hhh) 540 # rrr: Not used 541 # iii: Not used 542 # mmm: Atom-atom mapping number. 1 - number of atoms 543 # nnn: Inversion/retention flag. 0 = property not applied, 1 = configuration is inverted, 544 # 2 = configuration is retained. 545 # eee: Exact change flag. 0 = property not applied, 1 = change on atom must be 546 # exactly as shown 547 # 548 # Notes: 549 # . StereoParity: 1 - ClockwiseStereo, 2 - AntiClockwiseStereo; 3 - Either; 0 - none. These 550 # values determine chirailty around the chiral center; a non zero value indicates atom 551 # has been marked as chiral center. 552 # 553 sub ParseCmpdAtomLine { 554 my($Line) = @_; 555 my ($LineIndex, $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity); 556 557 ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = ('') x 7; 558 if (length($Line) > 31) { 559 ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = unpack("A10A10A10xA3A2A3A3", $Line); 560 } 561 else { 562 ($AtomX, $AtomY, $AtomZ, $AtomSymbol) = unpack("A10A10A10", $Line); 563 } 564 return ($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity); 565 } 566 567 # Map MDL charge value used in SD and MOL files to internal charge used by MayaChemTools. 568 # 569 sub MDLChargeToInternalCharge { 570 my($MDLCharge) = @_; 571 my($InternalCharge); 572 573 CHARGE: { 574 if ($MDLCharge == 0) { $InternalCharge = 0; last CHARGE;} 575 if ($MDLCharge == 1) { $InternalCharge = 3; last CHARGE;} 576 if ($MDLCharge == 2) { $InternalCharge = 2; last CHARGE;} 577 if ($MDLCharge == 3) { $InternalCharge = 1; last CHARGE;} 578 if ($MDLCharge == 5) { $InternalCharge = -1; last CHARGE;} 579 if ($MDLCharge == 6) { $InternalCharge = -2; last CHARGE;} 580 if ($MDLCharge == 7) { $InternalCharge = -3; last CHARGE;} 581 # All other MDL charge values, including 4 corresponding to "doublet radical", 582 # are assigned internal value of 0. 583 $InternalCharge = 0; 584 carp "Warning: MDLChargeToInternalCharge: MDL charge value, $MDLCharge, is not supported: An internal charge value, 0, has been assigned..."; 585 } 586 return $InternalCharge; 587 } 588 589 # Map internal charge used by MayaChemTools to MDL charge value used in SD and MOL files. 590 # 591 sub InternalChargeToMDLCharge { 592 my($InternalCharge) = @_; 593 my($MDLCharge); 594 595 CHARGE: { 596 if ($InternalCharge == 3) { $MDLCharge = 1; last CHARGE;} 597 if ($InternalCharge == 2) { $MDLCharge = 2; last CHARGE;} 598 if ($InternalCharge == 1) { $MDLCharge = 3; last CHARGE;} 599 if ($InternalCharge == -1) { $MDLCharge = 5; last CHARGE;} 600 if ($InternalCharge == -2) { $MDLCharge = 6; last CHARGE;} 601 if ($InternalCharge == -3) { $MDLCharge = 7; last CHARGE;} 602 # All other MDL charge values, including 4 corresponding to "doublet radical", 603 # are assigned internal value of 0. 604 $MDLCharge = 0; 605 } 606 return $MDLCharge; 607 } 608 609 # Ctab lines: Bond block 610 # 611 # Format: 111222tttsssxxxrrrccc 612 # 613 # 111: First atom number. 614 # 222: Second atom number. 615 # ttt: Bond type. 1 = Single, 2 = Double, 3 = Triple, 4 = Aromatic, 5 = Single or Double, 616 # 6 = Single or Aromatic, 7 = Double or Aromatic, 8 = Any 617 # sss: Bond stereo. Single bonds: 0 = not stereo, 1 = Up, 4 = Either, 6 = Down, 618 # Double bonds: 0 = Use x-, y-, z-coords from atom block to determine cis or trans, 619 # 3 = Cis or trans (either) double bond 620 # xxx: Not used 621 # rrr: Bond topology. 0 = Either, 1 = Ring, 2 = Chain 622 # ccc: Reacting center status. 0 = unmarked, 1 = a center, -1 = not a center, 623 # Additional: 2 = no change,4 = bond made/broken, 8 = bond order changes 12 = 4+8 624 # (both made/broken and changes); 5 = (4 + 1), 9 = (8 + 1), and 13 = (12 + 1) are also possible 625 sub ParseCmpdBondLine { 626 my($Line) = @_; 627 my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo); 628 629 ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = map {s/ //g; $_} unpack("A3A3A3A3", $Line); 630 return ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo); 631 } 632 633 # Map MDL bond type value used in SD and MOL files to internal bond order values used by MayaChemTools... 634 # 635 sub MDLBondTypeToInternalBondOrder { 636 my($MDLBondType) = @_; 637 my($InternalBondOrder); 638 639 BONDTYPE: { 640 if ($MDLBondType == 1) { $InternalBondOrder = 1; last BONDTYPE;} 641 if ($MDLBondType == 2) { $InternalBondOrder = 2; last BONDTYPE;} 642 if ($MDLBondType == 3) { $InternalBondOrder = 3; last BONDTYPE;} 643 if ($MDLBondType == 4) { $InternalBondOrder = 1.5; last BONDTYPE;} # Aromatic 644 # 645 # Although MDL aromatic bond values are used for query only and explicit Kekule bond order 646 # values must be assigned, internal value of 1.5 is allowed to indicate aromatic bond orders. 647 # 648 # All other MDL bond type values - 5 = Single or Double, 6 = Single or Aromatic, 7 = Double or Aromatic, 649 # 8 = Any - are assigned internal value of 1: These are meant to be used for structure queries by MDL products. 650 # 651 $InternalBondOrder = 1; 652 carp "Warning: MDLBondTypeToInternalBondOrder: MDL bond type value, $MDLBondType, is not supported: An internal bond order value, 1, has been assigned..."; 653 } 654 return $InternalBondOrder; 655 } 656 657 # Map internal bond order values used by MayaChemTools to MDL bond type value used in SD and MOL files... 658 # 659 sub InternalBondOrderToMDLBondType { 660 my($InternalBondOrder) = @_; 661 my($MDLBondType); 662 663 BONDTYPE: { 664 if ($InternalBondOrder == 1) { $MDLBondType = 1; last BONDTYPE;} 665 if ($InternalBondOrder == 2) { $MDLBondType = 2; last BONDTYPE;} 666 if ($InternalBondOrder == 3) { $MDLBondType = 3; last BONDTYPE;} 667 if ($InternalBondOrder == 1.5) { $MDLBondType = 4; last BONDTYPE;} # Aromatic 668 $MDLBondType = 1; 669 } 670 return $MDLBondType; 671 } 672 673 # Third line: Comments - A blank line is also allowed. 674 sub ParseCmpdCommentsLine { 675 my($Line) = @_; 676 my($Comments); 677 678 $Comments = unpack("A80", $Line); 679 680 return ($Comments); 681 } 682 683 # Map MDL bond stereo value used in SD and MOL files to internal bond type values used by MayaChemTools... 684 # 685 sub MDLBondStereoToInternalBondType { 686 my($MDLBondStereo) = @_; 687 my($InternalBondType); 688 689 $InternalBondType = ''; 690 691 BONDSTEREO: { 692 if ($MDLBondStereo == 1) { $InternalBondType = 'SingleUp'; last BONDSTEREO;} 693 if ($MDLBondStereo == 4) { $InternalBondType = 'SingleWavy'; last BONDSTEREO;} 694 if ($MDLBondStereo == 6) { $InternalBondType = 'SingleDown'; last BONDSTEREO;} 695 if ($MDLBondStereo == 3) { $InternalBondType = 'DoubleCross'; last BONDSTEREO;} 696 $InternalBondType = ''; 697 carp "Warning: MDLBondStereoToInternalBondType: MDL bond stereo value, $MDLBondStereo, is not supported: It has been ignored and bond order would be used to determine bond type..."; 698 } 699 return $InternalBondType; 700 } 701 702 # Map internal bond type values used by MayaChemTools to MDL bond stereo value used in SD and MOL files... 703 # 704 sub InternalBondTypeToMDLBondStereo { 705 my($InternalBondType) = @_; 706 my($MDLBondStereo); 707 708 $MDLBondStereo = 0; 709 710 BONDSTEREO: { 711 if ($InternalBondType =~ /^SingleUp$/i) { $MDLBondStereo = 1; last BONDSTEREO;} 712 if ($InternalBondType =~ /^SingleWavy$/i) { $MDLBondStereo = 4; last BONDSTEREO;} 713 if ($InternalBondType =~ /^SingleDown$/) { $MDLBondStereo = 6; last BONDSTEREO;} 714 if ($InternalBondType =~ /^DoubleCross$/) { $MDLBondStereo = 3; last BONDSTEREO;} 715 $MDLBondStereo = 0; 716 } 717 return $MDLBondStereo; 718 } 719 720 # Fourth line: Counts 721 # 722 # Format: aaabbblllfffcccsssxxxrrrpppiiimmmvvvvvv 723 # 724 # aaa: number of atoms; bbb: number of bonds; lll: number of atom lists; fff: (obsolete) 725 # ccc: chiral flag: 0=not chiral, 1=chiral; sss: number of stext entries; xxx,rrr,ppp,iii: 726 # (obsolete); mmm: number of lines of additional properties, including the M END line, No 727 # longer supported, default is set to 999; vvvvvv: version 728 729 sub ParseCmpdCountsLine { 730 my($Line) = @_; 731 my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version); 732 733 ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = unpack("A3A3x3x3A3x3x3x3x3x3A3A6", $Line); 734 735 if ($Version =~ /V3000/i) { 736 # Current version of MayaChemTools modules and classes for processing MDL MOL and SD don't support 737 # V3000. So instead of relying on callers, just exit with an error to disable any processing of V3000 738 # format. 739 croak "Error: SDFileUtil::ParseCmpdCountsLine: The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools..."; 740 } 741 742 return ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version); 743 } 744 745 # Second line: Misc info 746 # 747 # Format: IIPPPPPPPPMMDDYYHHmmddSSssssssssssEEEEEEEEEEEERRRRRR 748 # A2A8 A10 A2I2A10 A12 A6 749 # User's first and last initials (I), program name (P), date/time (M/D/Y,H:m), 750 # dimensional codes - 2D or 3D (d),scaling factors (S, s), energy (E) if modeling program input, 751 # internal registry number (R) if input through MDL form. A blank line is also allowed. 752 sub ParseCmpdMiscInfoLine { 753 my($Line) = @_; 754 my($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum); 755 756 ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum) = unpack("A2A8A10A2A2A10A12A6", $Line); 757 return ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum); 758 } 759 760 # First line: Molecule name. This line is unformatted, but like all other lines in a 761 # molfile may not extend beyond column 80. A blank line is also allowed. 762 sub ParseCmpdMolNameLine { 763 my($Line) = @_; 764 my($MolName); 765 766 $MolName = unpack("A80", $Line); 767 768 return ($MolName); 769 } 770 771 # Parse charge property line in CTAB generic properties block. 772 # 773 # Charge property line format: 774 # 775 # M CHGnn8 aaa vvv ... 776 # 777 # nn8: Number of value pairs. Maximum of 8 pairs allowed. 778 # aaa: Atom number 779 # vvv: -15 to +15. Default of 0 = uncharged atom. When present, this property supersedes 780 # all charge and radical values in the atom block, forcing a 0 charge on all atoms not 781 # listed in an M CHG or M RAD line. 782 783 sub ParseCmpdChargePropertyLine { 784 my($Line) = @_; 785 786 return _ParseCmpdGenericPropertyLine('Charge', $Line); 787 } 788 789 790 # Parse isotope property line in CTAB generic properties block. 791 # 792 # Isoptope property line format: 793 # 794 # M ISOnn8 aaa vvv ... 795 # 796 # nn8: Number of value paris. Maximum of 8 pairs allowed. 797 # aaa: Atom number 798 # vvv: Absolute mass of the atom isotope as a positive integer. When present, this property 799 # supersedes all isotope values in the atom block. Default (no entry) means natural 800 # abundance. The difference between this absolute mass value and the natural 801 # abundance value specified in the PTABLE.DAT file must be within the range of -18 802 # to +12 803 # 804 # Notes: 805 # . Values correspond to mass numbers... 806 # 807 sub ParseCmpdIsotopePropertyLine { 808 my($Line) = @_; 809 810 return _ParseCmpdGenericPropertyLine('Isotope', $Line); 811 } 812 813 # Parse radical property line in CTAB generic properties block. 814 # 815 # Radical property line format: 816 # 817 # M RADnn8 aaa vvv ... 818 # 819 # nn8: Number of value paris. Maximum of 8 pairs allowed. 820 # aaa: Atom number 821 # vvv: Default of 0 = no radical, 1 = singlet (:), 2 = doublet ( . or ^), 3 = triplet (^^). When 822 # present, this property supersedes all charge and radical values in the atom block, 823 # forcing a 0 (zero) charge and radical on all atoms not listed in an M CHG or 824 # M RAD line. 825 # 826 sub ParseCmpdRadicalPropertyLine { 827 my($Line) = @_; 828 829 return _ParseCmpdGenericPropertyLine('Radical', $Line); 830 } 831 832 # Map MDL radical stereo value used in SD and MOL files to internal spin multiplicity values used by MayaChemTools... 833 # 834 sub MDLRadicalToInternalSpinMultiplicity { 835 my($MDLRadical) = @_; 836 my($InternalSpinMultiplicity); 837 838 $InternalSpinMultiplicity = ''; 839 840 BONDSTEREO: { 841 if ($MDLRadical == 0) { $InternalSpinMultiplicity = 0; last BONDSTEREO;} 842 if ($MDLRadical == 1) { $InternalSpinMultiplicity = 1; last BONDSTEREO;} 843 if ($MDLRadical == 2) { $InternalSpinMultiplicity = 2; last BONDSTEREO;} 844 if ($MDLRadical == 3) { $InternalSpinMultiplicity = 3; last BONDSTEREO;} 845 $InternalSpinMultiplicity = ''; 846 carp "Warning: MDLRadicalToInternalSpinMultiplicity: MDL radical value, $MDLRadical, specifed on line M RAD is not supported..."; 847 } 848 return $InternalSpinMultiplicity; 849 } 850 851 # Map internal spin multiplicity values used by MayaChemTools to MDL radical stereo value used in SD and MOL files... 852 # 853 sub InternalSpinMultiplicityToMDLRadical { 854 my($InternalSpinMultiplicity) = @_; 855 my($MDLRadical); 856 857 $MDLRadical = 0; 858 859 BONDSTEREO: { 860 if ($InternalSpinMultiplicity == 1) { $MDLRadical = 1; last BONDSTEREO;} 861 if ($InternalSpinMultiplicity == 2) { $MDLRadical = 2; last BONDSTEREO;} 862 if ($InternalSpinMultiplicity == 3) { $MDLRadical = 3; last BONDSTEREO;} 863 $MDLRadical = 0; 864 } 865 return $MDLRadical; 866 } 867 868 # Process generic CTAB property line... 869 sub _ParseCmpdGenericPropertyLine { 870 my($PropertyName, $Line) = @_; 871 872 my($Label, $PropertyLabel, $ValuesCount, $ValuePairsCount, @ValuePairs); 873 874 @ValuePairs = (); 875 ($Label, $PropertyLabel, $ValuesCount, @ValuePairs) = split(' ', $Line); 876 $ValuePairsCount = (scalar @ValuePairs)/2; 877 if ($ValuesCount != $ValuePairsCount) { 878 carp "Warning: _ParseCmpdGenericPropertyLine: Number of atom number and $PropertyName value paris specified on $Label $PropertyLabel property line, $ValuePairsCount, is does not match expected value of $ValuesCount..."; 879 } 880 881 return (@ValuePairs); 882 } 883 884 # Generic CTAB property lines for charge, istope and radical properties... 885 # 886 sub _GenerateCmpdGenericPropertyLines { 887 my($PropertyName, $PropertyValuePairsRef) = @_; 888 my($Index, $PropertyLabel, $Line, $PropertyCount, $AtomNum, $PropertyValue, @PropertyLines); 889 890 @PropertyLines = (); 891 NAME: { 892 if ($PropertyName =~ /^Charge$/i) { $PropertyLabel = "M CHG"; last NAME; } 893 if ($PropertyName =~ /^Isotope$/i) { $PropertyLabel = "M ISO"; last NAME; } 894 if ($PropertyName =~ /^Radical$/i) { $PropertyLabel = "M RAD"; last NAME; } 895 carp "Warning: _GenerateCmpdGenericPropertyLines: Unknown property name, $PropertyName, specified..."; 896 return @PropertyLines; 897 } 898 899 # A maximum of 8 property pair values allowed per line... 900 $PropertyCount = 0; 901 $Line = ''; 902 for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) { 903 if ($PropertyCount > 8) { 904 # Setup property line... 905 $Line = "${PropertyLabel} 8${Line}"; 906 push @PropertyLines, $Line; 907 908 $PropertyCount = 0; 909 $Line = ''; 910 } 911 $PropertyCount++; 912 $AtomNum = $PropertyValuePairsRef->[$Index]; 913 $PropertyValue = $PropertyValuePairsRef->[$Index + 1]; 914 $Line .= sprintf " %3i %3i", $AtomNum, $PropertyValue; 915 } 916 if ($Line) { 917 $Line = "${PropertyLabel} ${PropertyCount}${Line}"; 918 push @PropertyLines, $Line; 919 } 920 return @PropertyLines; 921 } 922 923 # 924 # Read compound data into a string and return its value 925 sub ReadCmpdString { 926 my($SDFileRef) = @_; 927 my($CmpdString); 928 929 $CmpdString = ""; 930 LINE: while (defined($_ = <$SDFileRef>)) { 931 # Change Windows and Mac new line char to UNIX... 932 s/(\r\n)|(\r)/\n/g; 933 934 if (/\$\$\$\$/) { 935 chomp; 936 $CmpdString .= $_; 937 last LINE; 938 } 939 else { 940 $CmpdString .= $_; 941 } 942 } 943 return $CmpdString; 944 } 945 946 # Find out the number of fragements in the compouns. And for the compound with 947 # more than one fragment, remove all the others besides the largest one. 948 sub WashCmpd { 949 my($CmpdLines) = @_; 950 my($WashedCmpdString, $FragmentCount, $Fragments); 951 952 $WashedCmpdString = ""; 953 ($FragmentCount, $Fragments) = GetCmpdFragments($CmpdLines); 954 if ($FragmentCount > 1) { 955 # Go over the compound data for the largest fragment including property 956 # data... 957 my (@AllFragments, @LargestFragment, %LargestFragmentAtoms, @WashedCmpdLines, $Index, $LineIndex, $AtomCount, $BondCount, $NewAtomCount, $NewBondCount, $FirstAtomNum, $AtomNum, $ChiralFlag); 958 959 @AllFragments = (); @LargestFragment = (); 960 %LargestFragmentAtoms = (); 961 @AllFragments = split "\n", $Fragments; 962 @LargestFragment = split " ", $AllFragments[0]; 963 for $Index (0 .. $#LargestFragment) { 964 $LargestFragmentAtoms{$LargestFragment[$Index]} = $LargestFragment[$Index]; 965 } 966 @WashedCmpdLines = (); 967 push @WashedCmpdLines, @$CmpdLines[0], @$CmpdLines[1], @$CmpdLines[2], @$CmpdLines[3]; 968 ($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine(@$CmpdLines[3]); 969 $NewAtomCount = @LargestFragment; 970 $NewBondCount = 0; 971 $AtomNum = 0; 972 # Retrieve the largest fragment atom lines... 973 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) { 974 $AtomNum++; 975 if ($LargestFragmentAtoms{$AtomNum}) { 976 push @WashedCmpdLines, @$CmpdLines[$LineIndex]; 977 } 978 } 979 # Retrieve the largest fragment bond lines... 980 for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) { 981 ($FirstAtomNum) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]); 982 if ($LargestFragmentAtoms{$FirstAtomNum}) { 983 $NewBondCount++; 984 push @WashedCmpdLines, @$CmpdLines[$LineIndex]; 985 } 986 } 987 # Retrieve the property data... 988 for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) { 989 push @WashedCmpdLines, @$CmpdLines[$LineIndex]; 990 } 991 # Update atom and bond count line... 992 $WashedCmpdLines[3] = GenerateCmpdCountsLine($NewAtomCount, $NewBondCount, $ChiralFlag); 993 994 $WashedCmpdString = join "\n", @WashedCmpdLines; 995 } 996 return ($FragmentCount, $Fragments, $WashedCmpdString); 997 } 998