MayaChemTools

   1 package SDFileUtil;
   2 #
   3 # $RCSfile: SDFileUtil.pm,v $
   4 # $Date: 2008/04/25 00:00:46 $
   5 # $Revision: 1.26 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 use 5.006;
  29 use strict;
  30 use Exporter;
  31 use Carp;
  32 use PeriodicTable qw(IsElement);
  33 
  34 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  35 
  36 $VERSION = '1.00';
  37 @ISA = qw(Exporter);
  38 @EXPORT = qw(GenerateCmpdAtomLine GenerateCmpdBondLine GenerateCmpdChargePropertyLines GenerateCmpdCommentsLine GenerateCmpdCountsLine GenerateCmpdIsotopePropertyLines GenerateCmpdDataHeaderLabelsAndValuesLines GenerateCmpdMiscInfoLine GenerateCmpdRadicalPropertyLines GenerateCmpdMolNameLine GenerateEmptyCtabBlockLines GenerateMiscLineDateStamp GetAllAndCommonCmpdDataHeaderLabels GetCmpdDataHeaderLabels GetCmpdDataHeaderLabelsAndValues GetCmpdFragments GetCtabLinesCount GetUnknownAtoms MDLChargeToInternalCharge InternalChargeToMDLCharge MDLBondTypeToInternalBondOrder InternalBondOrderToMDLBondType InternalSpinMultiplicityToMDLRadical MDLRadicalToInternalSpinMultiplicity ParseCmpdAtomLine ParseCmpdBondLine ParseCmpdCommentsLine ParseCmpdCountsLine ParseCmpdMiscInfoLine ParseCmpdMolNameLine ParseCmpdChargePropertyLine ParseCmpdIsotopePropertyLine ParseCmpdRadicalPropertyLine ReadCmpdString WashCmpd);
  39 @EXPORT_OK = qw();
  40 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  41 
  42 # Format data for compounds count line...
  43 sub GenerateCmpdCountsLine {
  44   my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version, $Line);
  45 
  46   if (@_ == 5) {
  47     ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = @_;
  48   }
  49   elsif (@_ == 3) {
  50     ($AtomCount, $BondCount, $ChiralFlag) = @_;
  51     $PropertyCount = 999;
  52     $Version = "V2000";
  53   }
  54   else {
  55     ($AtomCount, $BondCount) = @_;
  56     $ChiralFlag = 0;
  57     $PropertyCount = 999;
  58     $Version = "V2000";
  59   }
  60   if ($AtomCount > 999) {
  61     croak "Error: SDFileUtil::GenerateCmpdCountsLine: The atom count, $AtomCount, exceeds maximum of 999 allowed for CTAB version 2000. The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools...";
  62   }
  63   $Line = sprintf "%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%6s", $AtomCount, $BondCount, 0, 0, $ChiralFlag, 0, 0, 0, 0, 0, $PropertyCount, $Version;
  64 
  65   return ($Line);
  66 }
  67 
  68 # Generate comments line...
  69 sub GenerateCmpdCommentsLine {
  70   my($Comments) = @_;
  71   my($Line);
  72 
  73   $Line = (length($Comments) > 80) ? substr($Comments, 0, 80) : $Comments;
  74 
  75   return $Line;
  76 }
  77 
  78 # Generate molname line...
  79 sub GenerateCmpdMolNameLine {
  80   my($MolName) = @_;
  81   my($Line);
  82 
  83   $Line = (length($MolName) > 80) ? substr($MolName, 0, 80) : $MolName;
  84 
  85   return $Line;
  86 }
  87 
  88 # Generate data for compounds misc info line...
  89 sub GenerateCmpdMiscInfoLine {
  90   my($ProgramName, $UserInitial, $Code) = @_;
  91   my($Date, $Line);
  92 
  93   if (!(defined($ProgramName) && $ProgramName)) {
  94     $ProgramName = "MayaChem";
  95   }
  96   if (!(defined($UserInitial) && $UserInitial)) {
  97     $UserInitial = "  ";
  98   }
  99   if (!(defined($Code) && $Code)) {
 100     $Code = "2D";
 101   }
 102 
 103   if (length($ProgramName) > 8) {
 104     $ProgramName = substr($ProgramName, 0, 8);
 105   }
 106   if (length($UserInitial) > 2) {
 107     $UserInitial = substr($UserInitial, 0, 2);
 108   }
 109   if (length($Code) > 2) {
 110     $Code = substr($Code, 0, 2);
 111   }
 112   $Date = GenerateMiscLineDateStamp();
 113 
 114   $Line = "${UserInitial}${ProgramName}${Date}${Code}";
 115 
 116   return $Line;
 117 }
 118 
 119 # Generate data for compounds misc info line...
 120 sub GenerateEmptyCtabBlockLines {
 121   my($Date, $Lines);
 122 
 123   if (@_ == 1) {
 124     ($Date) = @_;
 125   }
 126   else {
 127     $Date = GenerateMiscLineDateStamp();
 128   }
 129   # First line: Blank molname line...
 130   # Second line: Misc info...
 131   # Third line: Blank comments line...
 132   # Fourth line: Counts line reflecting empty structure data block...
 133   $Lines = "\n";
 134   $Lines .= "  MayaChem${Date}2D\n";
 135   $Lines .= "\n";
 136   $Lines .= GenerateCmpdCountsLine(0, 0, 0) . "\n";
 137   $Lines .= "M  END";
 138 
 139   return $Lines;
 140 }
 141 
 142 # Generate SD file data stamp...
 143 sub GenerateMiscLineDateStamp {
 144   my($Date, $Sec, $Min, $Hour, $MDay, $Mon, $Year, $WDay, $YDay, $IsDst);
 145 
 146   ($Sec, $Min, $Hour, $MDay, $Mon, $Year, $WDay, $YDay, $IsDst) = localtime;
 147   $Mon += 1;
 148   $Year = substr($Year, -1, 2);
 149   $Date = sprintf "%2i%2i%2i%2i%2i", $Mon, $MDay, $Year, $Hour, $Min;
 150   $Date =~ s/ /0/g;
 151 
 152   return $Date;
 153 }
 154 
 155 # Generate data for compound atom line...
 156 #
 157 sub GenerateCmpdAtomLine {
 158   my($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity) = @_;
 159   my($Line);
 160 
 161   if (!defined $MassDifference) {
 162     $MassDifference = 0;
 163   }
 164   if (!defined $Charge) {
 165     $Charge = 0;
 166   }
 167   if (!defined $StereoParity) {
 168     $StereoParity = 0;
 169   }
 170   $Line = sprintf "%10.4f%10.4f%10.4f %-3s%2i%3i%3i  0  0  0  0  0  0  0  0  0", $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity;
 171 
 172   return $Line
 173 }
 174 
 175 # Generate data for compound bond line...
 176 #
 177 sub GenerateCmpdBondLine {
 178   my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = @_;
 179   my($Line);
 180 
 181   if (!defined $BondStereo) {
 182     $BondStereo = 0;
 183   }
 184   $Line = sprintf "%3i%3i%3i%3i  0  0  0", $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo;
 185 
 186   return $Line
 187 }
 188 
 189 # Generate charge property lines for CTAB block...
 190 #
 191 sub GenerateCmpdChargePropertyLines {
 192   my($ChargeValuePairsRef) = @_;
 193 
 194   return _GenerateCmpdGenericPropertyLines('Charge', $ChargeValuePairsRef);
 195 }
 196 
 197 # Generate isotope property lines for CTAB block...
 198 #
 199 sub GenerateCmpdIsotopePropertyLines {
 200   my($IsotopeValuePairsRef) = @_;
 201 
 202   return _GenerateCmpdGenericPropertyLines('Isotope', $IsotopeValuePairsRef);
 203 }
 204 
 205 # Generate radical property line property lines for CTAB block...
 206 #
 207 sub GenerateCmpdRadicalPropertyLines {
 208   my($RadicalValuePairsRef) = @_;
 209 
 210   return _GenerateCmpdGenericPropertyLines('Radical', $RadicalValuePairsRef);
 211 }
 212 
 213 # Generate data header labels and values lines...
 214 #
 215 sub GenerateCmpdDataHeaderLabelsAndValuesLines {
 216   my($DataHeaderLabelsRef, $DataHeaderLabelsAndValuesRef, $SortDataLabels) = @_;
 217   my($DataLabel, $DataValue, @DataLabels, @DataLines);
 218 
 219   if (!defined $SortDataLabels) {
 220     $SortDataLabels = 0;
 221   }
 222 
 223   @DataLines = ();
 224   @DataLabels = ();
 225   if ($SortDataLabels) {
 226     push @DataLabels, sort @{$DataHeaderLabelsRef};
 227   }
 228   else {
 229     push @DataLabels,  @{$DataHeaderLabelsRef};
 230   }
 231   for $DataLabel (@DataLabels) {
 232     $DataValue = '';
 233     if (exists $DataHeaderLabelsAndValuesRef->{$DataLabel}) {
 234       $DataValue = $DataHeaderLabelsAndValuesRef->{$DataLabel};
 235     }
 236     push @DataLines, (">  <${DataLabel}>", "$DataValue", "");
 237   }
 238   return @DataLines;
 239 }
 240 
 241 # Parse data field header in SD file and return lists of all and common data field
 242 # labels.
 243 sub GetAllAndCommonCmpdDataHeaderLabels {
 244   my($SDFileRef) = @_;
 245   my($CmpdCount, $CmpdString, $Label, @CmpdLines, @DataFieldLabels, @CommonDataFieldLabels, %DataFieldLabelsMap);
 246 
 247   $CmpdCount = 0;
 248   @DataFieldLabels = ();
 249   @CommonDataFieldLabels = ();
 250   %DataFieldLabelsMap = ();
 251 
 252   while ($CmpdString = ReadCmpdString($SDFileRef)) {
 253     $CmpdCount++;
 254     @CmpdLines = split "\n", $CmpdString;
 255     # Process compound data header labels and figure out which ones are present for
 256     # all the compounds...
 257     if (@DataFieldLabels) {
 258       my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);
 259       my(%CmpdDataFieldLabelsMap) = ();
 260       # Setup a map for the current labels...
 261       for $Label (@CmpdDataFieldLabels) {
 262 	$CmpdDataFieldLabelsMap{$Label} = "PresentInSome";
 263       }
 264       # Check the presence old labels for this compound; otherwise, mark 'em new...
 265       for $Label (@DataFieldLabels) {
 266 	if (!$CmpdDataFieldLabelsMap{$Label}) {
 267 	  $DataFieldLabelsMap{$Label} = "PresentInSome";
 268 	}
 269       }
 270       # Check the presence this compound in the old labels; otherwise, add 'em...
 271       for $Label (@CmpdDataFieldLabels ) {
 272 	if (!$DataFieldLabelsMap{$Label}) {
 273 	  # It's a new label...
 274 	  push @DataFieldLabels, $Label;
 275 	  $DataFieldLabelsMap{$Label} = "PresentInSome";
 276 	}
 277       }
 278     }
 279     else {
 280       # Get the initial label set and set up a map...
 281       @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 282       for $Label (@DataFieldLabels) {
 283 	$DataFieldLabelsMap{$Label} = "PresentInAll";
 284       }
 285     }
 286   }
 287   # Identify the common data field labels...
 288   @CommonDataFieldLabels = ();
 289   for $Label (@DataFieldLabels) {
 290     if ($DataFieldLabelsMap{$Label} eq "PresentInAll") {
 291       push @CommonDataFieldLabels, $Label;
 292     }
 293   }
 294   return ($CmpdCount, \@DataFieldLabels, \@CommonDataFieldLabels);
 295 }
 296 
 297 # Parse all the data header labels and return 'em as an list...
 298 #
 299 # Format:
 300 #
 301 #> Data header line
 302 #Data line(s)
 303 #Blank line
 304 #
 305 # [Data Header] (one line) precedes each item of data, starts with a greater than (>) sign, and
 306 # contains at least one of the following:
 307 #  The field name enclosed in angle brackets. For example: <melting.point>
 308 #  The field number, DTn , where n represents the number assigned to the field in a MACCS-II database
 309 #
 310 #Optional information for the data header includes:
 311 #  The compound’s external and internal registry numbers. External registry numbers must be enclosed in parentheses.
 312 #  Any combination of information
 313 #
 314 #The following are examples of valid data headers:
 315 #> <MELTING.POINT>
 316 #> 55 (MD-08974) <BOILING.POINT> DT12
 317 #> DT12 55
 318 #> (MD-0894) <BOILING.POINT> FROM ARCHIVES
 319 #
 320 #Notes: Sometimes last blank line is missing and can be just followed by $$$$
 321 #
 322 sub GetCmpdDataHeaderLabels {
 323   my($CmpdLines) = @_;
 324   my($CmpdLine, $Label, @Labels);
 325 
 326   @Labels = ();
 327   CMPDLINE: for $CmpdLine (@$CmpdLines) {
 328     if ($CmpdLine !~ /^>/) {
 329       next CMPDLINE;
 330     }
 331     # Does the line contains field name enclosed in angular brackets?
 332     ($Label) = $CmpdLine =~ /<.*?>/g;
 333     if (!defined($Label)) {
 334       next CMPDLINE;
 335     }
 336     $Label =~ s/(<|>)//g;
 337     push @Labels, $Label;
 338   }
 339   return (@Labels);
 340 }
 341 
 342 # Parse all the data header labels and values
 343 sub GetCmpdDataHeaderLabelsAndValues {
 344   my($CmpdLines) = @_;
 345   my($CmpdLine, $CurrentLabel, $Label, $Value, $ValueCount, $ProcessingLabelData, @Values, %DataFields);
 346 
 347   %DataFields = ();
 348   $ProcessingLabelData = 0;
 349   $ValueCount = 0;
 350   CMPDLINE: for $CmpdLine (@$CmpdLines) {
 351     if ($CmpdLine =~ /\$\$\$\$/) {
 352       last CMPDLINE;
 353     }
 354     if ($CmpdLine =~ /^>/) {
 355       # Does the line contains field name enclosed in angular brackets?
 356       ($Label) = $CmpdLine =~ /<.*?>/g;
 357       if (defined $Label) {
 358 	$CurrentLabel = $Label;
 359 	$CurrentLabel =~ s/(<|>)//g;
 360 	$ProcessingLabelData = 0;
 361 	$ValueCount = 0;
 362 
 363 	if ($CurrentLabel) {
 364 	  $ProcessingLabelData = 1;
 365 	  $DataFields{$CurrentLabel} = '';
 366 	  next CMPDLINE;
 367 	}
 368       }
 369       else {
 370 	if (!$ProcessingLabelData) {
 371 	  # Data line containing no <label> as allowed by SDF format. Just ignore it...
 372 	  next CMPDLINE;
 373 	}
 374       }
 375     }
 376     if (!$ProcessingLabelData) {
 377       next CMPDLINE;
 378     }
 379     if (!(defined($CmpdLine) && length($CmpdLine))) {
 380       # Blank line terminates value for a label...
 381       $CurrentLabel = '';
 382       $ValueCount = 0;
 383       $ProcessingLabelData = 0;
 384       next CMPDLINE;
 385     }
 386     $ValueCount++;
 387     $Value = $CmpdLine;
 388 
 389     if ($ValueCount > 1) {
 390       $DataFields{$CurrentLabel} .= "\n" . $Value;
 391     }
 392     else {
 393       $DataFields{$CurrentLabel} = $Value;
 394     }
 395   }
 396   return (%DataFields);
 397 }
 398 
 399 #
 400 # Using bond blocks, figure out the number of disconnected fragments  and
 401 # return their values along with the atom numbers in a string delimited by new
 402 # line character.
 403 #
 404 sub GetCmpdFragments {
 405   my($CmpdLines) = @_;
 406   my($AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, @AtomConnections, $BondType, $FragmentString, $FragmentCount, $LineIndex, $Index, $AtomNum, $NbrAtomNum, @ProcessedAtoms, $ProcessedAtomCount, $ProcessAtomNum, @ProcessingAtoms, @ConnectedAtoms, %Fragments, $FragmentNum, $AFragmentString);
 407 
 408   # Setup the connection table for each atom...
 409   @AtomConnections = ();
 410   ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
 411   for $AtomNum (1 .. $AtomCount) {
 412     %{$AtomConnections[$AtomNum]} = ();
 413   }
 414   for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
 415     ($FirstAtomNum, $SecondAtomNum, $BondType) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
 416     if (!$AtomConnections[$FirstAtomNum]{$SecondAtomNum}) {
 417       $AtomConnections[$FirstAtomNum]{$SecondAtomNum} = $BondType;
 418     }
 419     if (!$AtomConnections[$SecondAtomNum]{$FirstAtomNum}) {
 420       $AtomConnections[$SecondAtomNum]{$FirstAtomNum} = $BondType;
 421     }
 422   }
 423 
 424   #Get set to count fragments...
 425   $ProcessedAtomCount = 0;
 426   $FragmentNum = 0;
 427   %Fragments = ();
 428   @ProcessedAtoms = ();
 429   for $AtomNum (1 .. $AtomCount) {
 430     $ProcessedAtoms[$AtomNum] = 0;
 431   }
 432   while ($ProcessedAtomCount < $AtomCount) {
 433     @ProcessingAtoms = ();
 434     @ConnectedAtoms = ();
 435   ATOMNUM: for $AtomNum (1 .. $AtomCount) {
 436       if (!$ProcessedAtoms[$AtomNum]) {
 437 	$ProcessedAtomCount++;
 438 	$ProcessedAtoms[$AtomNum] = 1;
 439 	push @ProcessingAtoms, $AtomNum;
 440 	$FragmentNum++;
 441 	@{$Fragments{$FragmentNum} } = ();
 442 	push @{$Fragments{$FragmentNum} }, $AtomNum;
 443 	last ATOMNUM;
 444       }
 445     }
 446 
 447     # Go over the neighbors and follow the connection trail while collecting the
 448     # atoms numbers present in the connected fragment...
 449     while (@ProcessingAtoms) {
 450       for ($Index = 0; $Index < @ProcessingAtoms; $Index++) {
 451 	$ProcessAtomNum = $ProcessingAtoms[$Index];
 452 	for $NbrAtomNum (keys %{$AtomConnections[$ProcessAtomNum]})  {
 453 	  if (!$ProcessedAtoms[$NbrAtomNum]) {
 454 	    $ProcessedAtomCount++;
 455 	    $ProcessedAtoms[$NbrAtomNum] = 1;
 456 	    push @ConnectedAtoms, $NbrAtomNum;
 457 	    push @{ $Fragments{$FragmentNum} }, $NbrAtomNum;
 458 	  }
 459 	}
 460       }
 461       @ProcessingAtoms = ();
 462       @ProcessingAtoms = @ConnectedAtoms;
 463       @ConnectedAtoms = ();
 464     }
 465   }
 466   $FragmentCount = $FragmentNum;
 467   $FragmentString = "";
 468 
 469   # Sort out the fragments by size...
 470   for $FragmentNum (sort { @{$Fragments{$b}} <=> @{$Fragments{$a}}  } keys %Fragments ) {
 471     # Sort the atoms in a fragment by their numbers...
 472     $AFragmentString = join " ", sort { $a <=> $b } @{ $Fragments{$FragmentNum} };
 473     if ($FragmentString) {
 474       $FragmentString .=  "\n" . $AFragmentString;
 475     }
 476     else {
 477       $FragmentString = $AFragmentString;
 478     }
 479   }
 480   return ($FragmentCount, $FragmentString);
 481 }
 482 
 483 # Count number of lines present in between 4th and line containg "M END"
 484 sub GetCtabLinesCount {
 485   my($CmpdLines) = @_;
 486   my($LineIndex, $CtabLinesCount);
 487 
 488   $CtabLinesCount = 0;
 489  LINE: for ($LineIndex = 4; $LineIndex < @$CmpdLines; $LineIndex++) {
 490     if ((@$CmpdLines[$LineIndex] =~ /^M /i) || (@$CmpdLines[$LineIndex] =~ /^M  END/i)) {
 491       $CtabLinesCount = $LineIndex - 4;
 492       last LINE;
 493     }
 494   }
 495   return $CtabLinesCount;
 496 }
 497 
 498 # Using atom blocks, count the number of atoms which contain special element
 499 # symbols not present in the periodic table.
 500 sub GetUnknownAtoms {
 501   my($CmpdLines) = @_;
 502   my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines, $LineIndex, $AtomCount, $AtomSymbol);
 503 
 504   $UnknownAtomCount = 0;
 505   $UnknownAtoms = "";
 506   $UnknownAtomLines = "";
 507   ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
 508   for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
 509     ($AtomSymbol) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]);
 510     if (!IsElement($AtomSymbol)) {
 511       $UnknownAtomCount++;
 512       $UnknownAtoms .= " $AtomSymbol";
 513       if ($UnknownAtomLines) {
 514 	$UnknownAtomLines .= "\n" . @$CmpdLines[$LineIndex];
 515       }
 516       else {
 517 	$UnknownAtomLines = @$CmpdLines[$LineIndex];
 518       }
 519     }
 520   }
 521   return ($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines);
 522 }
 523 
 524 # Ctab lines: Atom block
 525 #
 526 # Format: xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee
 527 #         A10       A10       A10       xA3 A2A3 A3 A3 A3 A3 A3 A3 A3 A3 A3 A3
 528 # x,y,z: Atom coordinates
 529 # aaa: Atom symbol. Entry in periodic table or L for atom list, A, Q, * for unspecified
 530 #      atom, and LP for lone pair, or R# for Rgroup label
 531 # dd: Mass difference. -3, -2, -1, 0, 1, 2, 3, 4 (0 for value beyond these limits)
 532 # ccc: Charge. 0 = uncharged or value other than these, 1 = +3, 2 = +2, 3 = +1,
 533 #      4 = doublet radical, 5 = -1, 6 = -2, 7 = -3
 534 # sss: Atom stereo parity. 0 = not stereo, 1 = odd, 2 = even, 3 = either or unmarked stereo center
 535 # hhh: Hydrogen count + 1. 1 = H0, 2 = H1, 3 = H2, 4 = H3, 5 = H4
 536 # bbb: Stereo care box. 0 = ignore stereo configuration of this double bond atom, 1 = stereo
 537 #      configuration of double bond atom must match
 538 # vvv: Valence. 0 = no marking (default)(1 to 14) = (1 to 14) 15 = zero valence
 539 # HHH: H0 designator. 0 = not specified, 1 = no H atoms allowed (redundant due to hhh)
 540 # rrr: Not used
 541 # iii: Not used
 542 # mmm: Atom-atom mapping number. 1 - number of atoms
 543 # nnn: Inversion/retention flag. 0 = property not applied, 1 = configuration is inverted,
 544 #      2 = configuration is retained.
 545 # eee: Exact change flag. 0 = property not applied, 1 = change on atom must be
 546 #      exactly as shown
 547 #
 548 # Notes:
 549 #  . StereoParity: 1 - ClockwiseStereo, 2 - AntiClockwiseStereo; 3 - Either; 0 - none. These
 550 #    values determine chirailty around the chiral center; a non zero value indicates atom
 551 #    has been marked as chiral center.
 552 #
 553 sub ParseCmpdAtomLine {
 554   my($Line) = @_;
 555   my ($LineIndex, $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity);
 556 
 557   ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = ('') x 7;
 558   if (length($Line) > 31) {
 559     ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = unpack("A10A10A10xA3A2A3A3", $Line);
 560   }
 561   else {
 562     ($AtomX, $AtomY, $AtomZ, $AtomSymbol) = unpack("A10A10A10", $Line);
 563   }
 564   return ($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity);
 565 }
 566 
 567 # Map MDL charge value used in SD and MOL files to internal charge used by MayaChemTools.
 568 #
 569 sub MDLChargeToInternalCharge {
 570   my($MDLCharge) = @_;
 571   my($InternalCharge);
 572 
 573   CHARGE: {
 574     if ($MDLCharge == 0) { $InternalCharge = 0; last CHARGE;}
 575     if ($MDLCharge == 1) { $InternalCharge = 3; last CHARGE;}
 576     if ($MDLCharge == 2) { $InternalCharge = 2; last CHARGE;}
 577     if ($MDLCharge == 3) { $InternalCharge = 1; last CHARGE;}
 578     if ($MDLCharge == 5) { $InternalCharge = -1; last CHARGE;}
 579     if ($MDLCharge == 6) { $InternalCharge = -2; last CHARGE;}
 580     if ($MDLCharge == 7) { $InternalCharge = -3; last CHARGE;}
 581     # All other MDL charge values, including 4 corresponding to "doublet radical",
 582     # are assigned internal value of 0.
 583     $InternalCharge = 0;
 584     carp "Warning: MDLChargeToInternalCharge: MDL charge value, $MDLCharge, is not supported: An internal charge value, 0, has been assigned...";
 585   }
 586   return $InternalCharge;
 587 }
 588 
 589 # Map internal charge used by MayaChemTools to MDL charge value used in SD and MOL files.
 590 #
 591 sub InternalChargeToMDLCharge {
 592   my($InternalCharge) = @_;
 593   my($MDLCharge);
 594 
 595   CHARGE: {
 596     if ($InternalCharge == 3) { $MDLCharge = 1; last CHARGE;}
 597     if ($InternalCharge == 2) { $MDLCharge = 2; last CHARGE;}
 598     if ($InternalCharge == 1) { $MDLCharge = 3; last CHARGE;}
 599     if ($InternalCharge == -1) { $MDLCharge = 5; last CHARGE;}
 600     if ($InternalCharge == -2) { $MDLCharge = 6; last CHARGE;}
 601     if ($InternalCharge == -3) { $MDLCharge = 7; last CHARGE;}
 602     # All other MDL charge values, including 4 corresponding to "doublet radical",
 603     # are assigned internal value of 0.
 604     $MDLCharge = 0;
 605   }
 606   return $MDLCharge;
 607 }
 608 
 609 # Ctab lines: Bond block
 610 #
 611 # Format: 111222tttsssxxxrrrccc
 612 #
 613 # 111: First atom number.
 614 # 222: Second atom number.
 615 # ttt: Bond type. 1 = Single, 2 = Double, 3 = Triple, 4 = Aromatic, 5 = Single or Double,
 616 #      6 = Single or Aromatic, 7 = Double or Aromatic, 8 = Any
 617 # sss: Bond stereo. Single bonds: 0 = not stereo, 1 = Up, 4 = Either, 6 = Down,
 618 #      Double bonds: 0 = Use x-, y-, z-coords from atom block to determine cis or trans,
 619 #      3 = Cis or trans (either) double bond
 620 # xxx: Not used
 621 # rrr: Bond topology. 0 = Either, 1 = Ring, 2 = Chain
 622 # ccc: Reacting center status. 0 = unmarked, 1 = a center, -1 = not a center,
 623 #      Additional: 2 = no change,4 = bond made/broken, 8 = bond order changes 12 = 4+8
 624 #      (both made/broken and changes); 5 = (4 + 1), 9 = (8 + 1), and 13 = (12 + 1) are also possible
 625 sub ParseCmpdBondLine {
 626   my($Line) = @_;
 627   my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo);
 628 
 629   ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = map {s/ //g; $_} unpack("A3A3A3A3", $Line);
 630   return ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo);
 631 }
 632 
 633 # Map MDL bond type value used in SD and MOL files to internal bond order values used by MayaChemTools...
 634 #
 635 sub MDLBondTypeToInternalBondOrder {
 636   my($MDLBondType) = @_;
 637   my($InternalBondOrder);
 638 
 639   BONDTYPE: {
 640     if ($MDLBondType == 1) { $InternalBondOrder = 1; last BONDTYPE;}
 641     if ($MDLBondType == 2) { $InternalBondOrder = 2; last BONDTYPE;}
 642     if ($MDLBondType == 3) { $InternalBondOrder = 3; last BONDTYPE;}
 643     if ($MDLBondType == 4) { $InternalBondOrder = 1.5; last BONDTYPE;} # Aromatic
 644     #
 645     # Although MDL aromatic bond values are used for query only and explicit Kekule bond order
 646     # values must be assigned, internal value of 1.5 is allowed to indicate aromatic bond orders.
 647     #
 648     # All other MDL bond type values -  5 = Single or Double, 6 = Single or Aromatic, 7 = Double or Aromatic,
 649     # 8 = Any - are assigned internal value of 1: These are meant to be used for structure queries by MDL products.
 650     #
 651     $InternalBondOrder = 1;
 652     carp "Warning: MDLBondTypeToInternalBondOrder: MDL bond type value, $MDLBondType, is not supported: An internal bond order value, 1, has been assigned...";
 653   }
 654   return $InternalBondOrder;
 655 }
 656 
 657 # Map internal bond order values used by MayaChemTools to MDL bond type value used in SD and MOL files...
 658 #
 659 sub InternalBondOrderToMDLBondType {
 660   my($InternalBondOrder) = @_;
 661   my($MDLBondType);
 662 
 663   BONDTYPE: {
 664     if ($InternalBondOrder == 1) { $MDLBondType = 1; last BONDTYPE;}
 665     if ($InternalBondOrder == 2) { $MDLBondType = 2; last BONDTYPE;}
 666     if ($InternalBondOrder == 3) { $MDLBondType = 3; last BONDTYPE;}
 667     if ($InternalBondOrder == 1.5) { $MDLBondType = 4; last BONDTYPE;} # Aromatic
 668     $MDLBondType = 1;
 669   }
 670   return $MDLBondType;
 671 }
 672 
 673 # Third line: Comments - A blank line is also allowed.
 674 sub ParseCmpdCommentsLine {
 675   my($Line) = @_;
 676   my($Comments);
 677 
 678   $Comments = unpack("A80", $Line);
 679 
 680   return ($Comments);
 681 }
 682 
 683 # Map MDL bond stereo value used in SD and MOL files to internal bond type values used by MayaChemTools...
 684 #
 685 sub MDLBondStereoToInternalBondType {
 686   my($MDLBondStereo) = @_;
 687   my($InternalBondType);
 688 
 689   $InternalBondType = '';
 690 
 691   BONDSTEREO: {
 692     if ($MDLBondStereo == 1) { $InternalBondType = 'SingleUp'; last BONDSTEREO;}
 693     if ($MDLBondStereo == 4) { $InternalBondType = 'SingleWavy'; last BONDSTEREO;}
 694     if ($MDLBondStereo == 6) { $InternalBondType = 'SingleDown'; last BONDSTEREO;}
 695     if ($MDLBondStereo == 3) { $InternalBondType = 'DoubleCross'; last BONDSTEREO;}
 696     $InternalBondType = '';
 697     carp "Warning: MDLBondStereoToInternalBondType: MDL bond stereo value, $MDLBondStereo, is not supported: It has been ignored and bond order would be used to determine bond type...";
 698   }
 699   return $InternalBondType;
 700 }
 701 
 702 # Map internal bond type values used by MayaChemTools to MDL bond stereo value used in SD and MOL files...
 703 #
 704 sub InternalBondTypeToMDLBondStereo {
 705   my($InternalBondType) = @_;
 706   my($MDLBondStereo);
 707 
 708   $MDLBondStereo = 0;
 709 
 710   BONDSTEREO: {
 711     if ($InternalBondType =~ /^SingleUp$/i) { $MDLBondStereo = 1; last BONDSTEREO;}
 712     if ($InternalBondType =~ /^SingleWavy$/i) { $MDLBondStereo = 4; last BONDSTEREO;}
 713     if ($InternalBondType =~ /^SingleDown$/) { $MDLBondStereo = 6; last BONDSTEREO;}
 714     if ($InternalBondType =~ /^DoubleCross$/) { $MDLBondStereo = 3; last BONDSTEREO;}
 715     $MDLBondStereo = 0;
 716   }
 717   return $MDLBondStereo;
 718 }
 719 
 720 # Fourth line: Counts
 721 #
 722 # Format: aaabbblllfffcccsssxxxrrrpppiiimmmvvvvvv
 723 #
 724 # aaa: number of atoms; bbb: number of bonds; lll: number of atom lists; fff: (obsolete)
 725 # ccc: chiral flag: 0=not chiral, 1=chiral; sss: number of stext entries; xxx,rrr,ppp,iii:
 726 # (obsolete); mmm: number of lines of additional properties, including the M END line, No
 727 # longer supported, default is set to 999; vvvvvv: version
 728 
 729 sub ParseCmpdCountsLine {
 730   my($Line) = @_;
 731   my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version);
 732 
 733   ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = unpack("A3A3x3x3A3x3x3x3x3x3A3A6", $Line);
 734 
 735   if ($Version =~ /V3000/i) {
 736     # Current version of MayaChemTools modules and classes for processing MDL MOL and SD don't support
 737     # V3000. So instead of relying on callers, just exit with an error to disable any processing of V3000
 738     # format.
 739     croak "Error: SDFileUtil::ParseCmpdCountsLine: The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools...";
 740   }
 741 
 742   return ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version);
 743 }
 744 
 745 # Second line: Misc info
 746 #
 747 # Format: IIPPPPPPPPMMDDYYHHmmddSSssssssssssEEEEEEEEEEEERRRRRR
 748 #         A2A8      A10       A2I2A10       A12         A6
 749 # User's first and last initials (I), program name (P), date/time (M/D/Y,H:m),
 750 # dimensional codes - 2D or 3D (d),scaling factors (S, s), energy (E) if modeling program input,
 751 # internal registry number (R) if input through MDL form. A blank line is also allowed.
 752 sub ParseCmpdMiscInfoLine {
 753   my($Line) = @_;
 754   my($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum);
 755 
 756   ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum) = unpack("A2A8A10A2A2A10A12A6", $Line);
 757   return ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum);
 758 }
 759 
 760 # First line: Molecule name. This line is unformatted, but like all other lines in a
 761 # molfile may not extend beyond column 80. A blank line is also allowed.
 762 sub ParseCmpdMolNameLine {
 763   my($Line) = @_;
 764   my($MolName);
 765 
 766   $MolName = unpack("A80", $Line);
 767 
 768   return ($MolName);
 769 }
 770 
 771 # Parse charge property line in CTAB generic properties block.
 772 #
 773 # Charge property line format:
 774 #
 775 # M  CHGnn8 aaa vvv ...
 776 #
 777 #    nn8: Number of value pairs. Maximum of 8 pairs allowed.
 778 #    aaa: Atom number
 779 #    vvv: -15 to +15. Default of 0 = uncharged atom. When present, this property supersedes
 780 #    all charge and radical values in the atom block, forcing a 0 charge on all atoms not
 781 #    listed in an M CHG or M RAD line.
 782 
 783 sub ParseCmpdChargePropertyLine {
 784   my($Line) = @_;
 785 
 786   return _ParseCmpdGenericPropertyLine('Charge', $Line);
 787 }
 788 
 789 
 790 # Parse isotope property line in CTAB generic properties block.
 791 #
 792 # Isoptope property line format:
 793 #
 794 # M  ISOnn8 aaa vvv ...
 795 #
 796 #    nn8: Number of value paris. Maximum of 8 pairs allowed.
 797 #    aaa: Atom number
 798 #    vvv: Absolute mass of the atom isotope as a positive integer. When present, this property
 799 #    supersedes all isotope values in the atom block. Default (no entry) means natural
 800 #    abundance. The difference between this absolute mass value and the natural
 801 #    abundance value specified in the PTABLE.DAT file must be within the range of -18
 802 #    to +12
 803 #
 804 # Notes:
 805 #  . Values correspond to mass numbers...
 806 #
 807 sub ParseCmpdIsotopePropertyLine {
 808   my($Line) = @_;
 809 
 810   return _ParseCmpdGenericPropertyLine('Isotope', $Line);
 811 }
 812 
 813 # Parse radical property line in CTAB generic properties block.
 814 #
 815 # Radical property line format:
 816 #
 817 # M  RADnn8 aaa vvv ...
 818 #
 819 #    nn8: Number of value paris. Maximum of 8 pairs allowed.
 820 #    aaa: Atom number
 821 #    vvv: Default of 0 = no radical, 1 = singlet (:), 2 = doublet ( . or ^), 3 = triplet (^^). When
 822 #    present, this property supersedes all charge and radical values in the atom block,
 823 #    forcing a 0 (zero) charge and radical on all atoms not listed in an M CHG or
 824 #    M RAD line.
 825 #
 826 sub ParseCmpdRadicalPropertyLine {
 827   my($Line) = @_;
 828 
 829   return _ParseCmpdGenericPropertyLine('Radical', $Line);
 830 }
 831 
 832 # Map MDL radical stereo value used in SD and MOL files to internal spin multiplicity values used by MayaChemTools...
 833 #
 834 sub MDLRadicalToInternalSpinMultiplicity {
 835   my($MDLRadical) = @_;
 836   my($InternalSpinMultiplicity);
 837 
 838   $InternalSpinMultiplicity = '';
 839 
 840   BONDSTEREO: {
 841     if ($MDLRadical == 0) { $InternalSpinMultiplicity = 0; last BONDSTEREO;}
 842     if ($MDLRadical == 1) { $InternalSpinMultiplicity = 1; last BONDSTEREO;}
 843     if ($MDLRadical == 2) { $InternalSpinMultiplicity = 2; last BONDSTEREO;}
 844     if ($MDLRadical == 3) { $InternalSpinMultiplicity = 3; last BONDSTEREO;}
 845     $InternalSpinMultiplicity = '';
 846     carp "Warning: MDLRadicalToInternalSpinMultiplicity: MDL radical value, $MDLRadical, specifed on line M  RAD is not supported...";
 847   }
 848   return $InternalSpinMultiplicity;
 849 }
 850 
 851 # Map internal spin multiplicity values used by MayaChemTools to MDL radical stereo value used in SD and MOL files...
 852 #
 853 sub InternalSpinMultiplicityToMDLRadical {
 854   my($InternalSpinMultiplicity) = @_;
 855   my($MDLRadical);
 856 
 857   $MDLRadical = 0;
 858 
 859   BONDSTEREO: {
 860     if ($InternalSpinMultiplicity == 1) { $MDLRadical = 1; last BONDSTEREO;}
 861     if ($InternalSpinMultiplicity == 2) { $MDLRadical = 2; last BONDSTEREO;}
 862     if ($InternalSpinMultiplicity == 3) { $MDLRadical = 3; last BONDSTEREO;}
 863     $MDLRadical = 0;
 864   }
 865   return $MDLRadical;
 866 }
 867 
 868 # Process generic CTAB property line...
 869 sub _ParseCmpdGenericPropertyLine {
 870   my($PropertyName, $Line) = @_;
 871 
 872   my($Label, $PropertyLabel, $ValuesCount, $ValuePairsCount, @ValuePairs);
 873 
 874   @ValuePairs = ();
 875   ($Label, $PropertyLabel, $ValuesCount, @ValuePairs) = split(' ', $Line);
 876   $ValuePairsCount = (scalar @ValuePairs)/2;
 877   if ($ValuesCount != $ValuePairsCount) {
 878     carp "Warning: _ParseCmpdGenericPropertyLine: Number of atom number and $PropertyName value paris specified on $Label $PropertyLabel property line, $ValuePairsCount, is does not match expected value of $ValuesCount...";
 879   }
 880 
 881   return (@ValuePairs);
 882 }
 883 
 884 # Generic CTAB property lines for charge, istope and radical properties...
 885 #
 886 sub _GenerateCmpdGenericPropertyLines {
 887   my($PropertyName, $PropertyValuePairsRef) = @_;
 888   my($Index, $PropertyLabel, $Line, $PropertyCount, $AtomNum, $PropertyValue, @PropertyLines);
 889 
 890   @PropertyLines = ();
 891   NAME: {
 892     if ($PropertyName =~ /^Charge$/i) { $PropertyLabel = "M  CHG"; last NAME; }
 893     if ($PropertyName =~ /^Isotope$/i) { $PropertyLabel = "M  ISO"; last NAME; }
 894     if ($PropertyName =~ /^Radical$/i) { $PropertyLabel = "M  RAD"; last NAME; }
 895     carp "Warning: _GenerateCmpdGenericPropertyLines: Unknown property name, $PropertyName, specified...";
 896     return @PropertyLines;
 897   }
 898 
 899   # A maximum of 8 property pair values allowed per line...
 900   $PropertyCount = 0;
 901   $Line = '';
 902   for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) {
 903     if ($PropertyCount > 8) {
 904       # Setup property line...
 905       $Line = "${PropertyLabel}  8${Line}";
 906       push @PropertyLines, $Line;
 907 
 908       $PropertyCount = 0;
 909       $Line = '';
 910     }
 911     $PropertyCount++;
 912     $AtomNum = $PropertyValuePairsRef->[$Index];
 913     $PropertyValue = $PropertyValuePairsRef->[$Index + 1];
 914     $Line .= sprintf " %3i %3i", $AtomNum, $PropertyValue;
 915   }
 916   if ($Line) {
 917     $Line = "${PropertyLabel}  ${PropertyCount}${Line}";
 918     push @PropertyLines, $Line;
 919   }
 920   return @PropertyLines;
 921 }
 922 
 923 #
 924 # Read compound data into a string and return its value
 925 sub ReadCmpdString {
 926   my($SDFileRef) = @_;
 927   my($CmpdString);
 928 
 929   $CmpdString = "";
 930  LINE: while (defined($_ = <$SDFileRef>)) {
 931     # Change Windows and Mac new line char to UNIX...
 932     s/(\r\n)|(\r)/\n/g;
 933 
 934     if (/\$\$\$\$/) {
 935       chomp;
 936       $CmpdString .=  $_;
 937       last LINE;
 938     }
 939     else {
 940       $CmpdString .=  $_;
 941     }
 942   }
 943   return $CmpdString;
 944 }
 945 
 946 # Find out the number of fragements in the compouns. And for the compound with
 947 # more than one fragment, remove all the others besides the largest one.
 948 sub WashCmpd {
 949   my($CmpdLines) = @_;
 950   my($WashedCmpdString, $FragmentCount, $Fragments);
 951 
 952   $WashedCmpdString = "";
 953   ($FragmentCount, $Fragments) = GetCmpdFragments($CmpdLines);
 954   if ($FragmentCount > 1) {
 955     # Go over the compound data for the largest fragment including property
 956     # data...
 957     my (@AllFragments, @LargestFragment, %LargestFragmentAtoms, @WashedCmpdLines, $Index, $LineIndex, $AtomCount, $BondCount, $NewAtomCount, $NewBondCount, $FirstAtomNum, $AtomNum, $ChiralFlag);
 958 
 959     @AllFragments = (); @LargestFragment = ();
 960     %LargestFragmentAtoms = ();
 961     @AllFragments = split "\n", $Fragments;
 962     @LargestFragment = split " ", $AllFragments[0];
 963     for $Index (0 .. $#LargestFragment) {
 964       $LargestFragmentAtoms{$LargestFragment[$Index]} = $LargestFragment[$Index];
 965     }
 966     @WashedCmpdLines = ();
 967     push @WashedCmpdLines, @$CmpdLines[0], @$CmpdLines[1], @$CmpdLines[2], @$CmpdLines[3];
 968     ($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine(@$CmpdLines[3]);
 969     $NewAtomCount = @LargestFragment;
 970     $NewBondCount = 0;
 971     $AtomNum = 0;
 972     # Retrieve the largest fragment atom lines...
 973     for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
 974       $AtomNum++;
 975       if ($LargestFragmentAtoms{$AtomNum}) {
 976 	push @WashedCmpdLines, @$CmpdLines[$LineIndex];
 977       }
 978     }
 979     # Retrieve the largest fragment bond lines...
 980     for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
 981       ($FirstAtomNum) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
 982       if ($LargestFragmentAtoms{$FirstAtomNum}) {
 983 	$NewBondCount++;
 984 	push @WashedCmpdLines, @$CmpdLines[$LineIndex];
 985       }
 986     }
 987     # Retrieve the property data...
 988     for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) {
 989       push @WashedCmpdLines, @$CmpdLines[$LineIndex];
 990     }
 991     # Update atom and bond count line...
 992     $WashedCmpdLines[3] = GenerateCmpdCountsLine($NewAtomCount, $NewBondCount, $ChiralFlag);
 993 
 994     $WashedCmpdString = join "\n", @WashedCmpdLines;
 995   }
 996   return ($FragmentCount, $Fragments, $WashedCmpdString);
 997 }
 998