MayaChemTools

   1 package SDFileUtil;
   2 #
   3 # $RCSfile: SDFileUtil.pm,v $
   4 # $Date: 2011/12/26 21:53:29 $
   5 # $Revision: 1.41 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2012 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use Exporter;
  31 use Carp;
  32 use PeriodicTable qw(IsElement);
  33 use TimeUtil ();
  34 
  35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  36 
  37 @ISA = qw(Exporter);
  38 @EXPORT = qw(GenerateCmpdAtomLine GenerateCmpdBondLine GenerateCmpdChargePropertyLines GenerateCmpdCommentsLine GenerateCmpdCountsLine GenerateCmpdAtomAliasPropertyLines GenerateCmpdIsotopePropertyLines GenerateCmpdDataHeaderLabelsAndValuesLines GenerateCmpdMiscInfoLine GenerateCmpdRadicalPropertyLines GenerateCmpdMolNameLine GenerateEmptyCtabBlockLines GenerateMiscLineDateStamp GetAllAndCommonCmpdDataHeaderLabels GetCmpdDataHeaderLabels GetCmpdDataHeaderLabelsAndValues GetCmpdFragments GetCtabLinesCount GetUnknownAtoms GetInvalidAtomNumbers MDLChargeToInternalCharge InternalChargeToMDLCharge MDLBondTypeToInternalBondOrder InternalBondOrderToMDLBondType InternalSpinMultiplicityToMDLRadical MDLRadicalToInternalSpinMultiplicity IsCmpd3D IsCmpd2D ParseCmpdAtomLine ParseCmpdBondLine ParseCmpdCommentsLine ParseCmpdCountsLine ParseCmpdMiscInfoLine ParseCmpdMolNameLine ParseCmpdAtomAliasPropertyLine ParseCmpdChargePropertyLine ParseCmpdIsotopePropertyLine ParseCmpdRadicalPropertyLine ReadCmpdString RemoveCmpdDataHeaderLabelAndValue WashCmpd);
  39 @EXPORT_OK = qw();
  40 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  41 
  42 # Format data for compounds count line...
  43 sub GenerateCmpdCountsLine {
  44   my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version, $Line);
  45 
  46   if (@_ == 5) {
  47     ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = @_;
  48   }
  49   elsif (@_ == 3) {
  50     ($AtomCount, $BondCount, $ChiralFlag) = @_;
  51     $PropertyCount = 999;
  52     $Version = "V2000";
  53   }
  54   else {
  55     ($AtomCount, $BondCount) = @_;
  56     $ChiralFlag = 0;
  57     $PropertyCount = 999;
  58     $Version = "V2000";
  59   }
  60   if ($AtomCount > 999) {
  61     croak "Error: SDFileUtil::GenerateCmpdCountsLine: The atom count, $AtomCount, exceeds maximum of 999 allowed for CTAB version 2000. The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools...";
  62   }
  63   $Line = sprintf "%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%6s", $AtomCount, $BondCount, 0, 0, $ChiralFlag, 0, 0, 0, 0, 0, $PropertyCount, $Version;
  64 
  65   return ($Line);
  66 }
  67 
  68 # Generate comments line...
  69 sub GenerateCmpdCommentsLine {
  70   my($Comments) = @_;
  71   my($Line);
  72 
  73   $Line = (length($Comments) > 80) ? substr($Comments, 0, 80) : $Comments;
  74 
  75   return $Line;
  76 }
  77 
  78 # Generate molname line...
  79 sub GenerateCmpdMolNameLine {
  80   my($MolName) = @_;
  81   my($Line);
  82 
  83   $Line = (length($MolName) > 80) ? substr($MolName, 0, 80) : $MolName;
  84 
  85   return $Line;
  86 }
  87 
  88 # Generate data for compounds misc info line...
  89 sub GenerateCmpdMiscInfoLine {
  90   my($ProgramName, $UserInitial, $Code) = @_;
  91   my($Date, $Line);
  92 
  93   if (!(defined($ProgramName) && $ProgramName)) {
  94     $ProgramName = "MayaChem";
  95   }
  96   if (!(defined($UserInitial) && $UserInitial)) {
  97     $UserInitial = "  ";
  98   }
  99   if (!(defined($Code) && $Code)) {
 100     $Code = "2D";
 101   }
 102 
 103   if (length($ProgramName) > 8) {
 104     $ProgramName = substr($ProgramName, 0, 8);
 105   }
 106   if (length($UserInitial) > 2) {
 107     $UserInitial = substr($UserInitial, 0, 2);
 108   }
 109   if (length($Code) > 2) {
 110     $Code = substr($Code, 0, 2);
 111   }
 112   $Date = GenerateMiscLineDateStamp();
 113 
 114   $Line = "${UserInitial}${ProgramName}${Date}${Code}";
 115 
 116   return $Line;
 117 }
 118 
 119 # Generate data for compounds misc info line...
 120 sub GenerateEmptyCtabBlockLines {
 121   my($Date, $Lines);
 122 
 123   if (@_ == 1) {
 124     ($Date) = @_;
 125   }
 126   else {
 127     $Date = GenerateMiscLineDateStamp();
 128   }
 129   # First line: Blank molname line...
 130   # Second line: Misc info...
 131   # Third line: Blank comments line...
 132   # Fourth line: Counts line reflecting empty structure data block...
 133   $Lines = "\n";
 134   $Lines .= "  MayaChem${Date}2D\n";
 135   $Lines .= "\n";
 136   $Lines .= GenerateCmpdCountsLine(0, 0, 0) . "\n";
 137   $Lines .= "M  END";
 138 
 139   return $Lines;
 140 }
 141 
 142 # Generate SD file data stamp...
 143 sub GenerateMiscLineDateStamp {
 144   return TimeUtil::SDFileTimeStamp();
 145 }
 146 
 147 # Generate data for compound atom line...
 148 #
 149 sub GenerateCmpdAtomLine {
 150   my($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity) = @_;
 151   my($Line);
 152 
 153   if (!defined $MassDifference) {
 154     $MassDifference = 0;
 155   }
 156   if (!defined $Charge) {
 157     $Charge = 0;
 158   }
 159   if (!defined $StereoParity) {
 160     $StereoParity = 0;
 161   }
 162   $Line = sprintf "%10.4f%10.4f%10.4f %-3s%2i%3i%3i  0  0  0  0  0  0  0  0  0", $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity;
 163 
 164   return $Line
 165 }
 166 
 167 # Generate data for compound bond line...
 168 #
 169 sub GenerateCmpdBondLine {
 170   my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = @_;
 171   my($Line);
 172 
 173   if (!defined $BondStereo) {
 174     $BondStereo = 0;
 175   }
 176   $Line = sprintf "%3i%3i%3i%3i  0  0  0", $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo;
 177 
 178   return $Line
 179 }
 180 
 181 # Generate charge property lines for CTAB block...
 182 #
 183 sub GenerateCmpdChargePropertyLines {
 184   my($ChargeValuePairsRef) = @_;
 185 
 186   return _GenerateCmpdGenericPropertyLines('Charge', $ChargeValuePairsRef);
 187 }
 188 
 189 # Generate isotope property lines for CTAB block...
 190 #
 191 sub GenerateCmpdIsotopePropertyLines {
 192   my($IsotopeValuePairsRef) = @_;
 193 
 194   return _GenerateCmpdGenericPropertyLines('Isotope', $IsotopeValuePairsRef);
 195 }
 196 
 197 # Generate radical property line property lines for CTAB block...
 198 #
 199 sub GenerateCmpdRadicalPropertyLines {
 200   my($RadicalValuePairsRef) = @_;
 201 
 202   return _GenerateCmpdGenericPropertyLines('Radical', $RadicalValuePairsRef);
 203 }
 204 
 205 # Generate atom alias property line property lines for CTAB block...
 206 #
 207 # Atom alias property line format:
 208 #
 209 # A  aaa
 210 # x...
 211 #
 212 #    aaa: Atom number
 213 #    x: Atom alias in next line
 214 #
 215 sub GenerateCmpdAtomAliasPropertyLines {
 216   my($PropertyValuePairsRef) = @_;
 217   my($Index, $AtomNum, $AtomAlias, $Line, @PropertyLines);
 218 
 219   @PropertyLines = ();
 220 
 221   for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) {
 222     $AtomNum = $PropertyValuePairsRef->[$Index];
 223     $AtomAlias = $PropertyValuePairsRef->[$Index + 1];
 224 
 225     $Line = "A  " . sprintf "%3i", $AtomNum;
 226 
 227     push @PropertyLines, $Line;
 228     push @PropertyLines, $AtomAlias;
 229   }
 230 
 231   return @PropertyLines;
 232 }
 233 
 234 # Generate data header labels and values lines...
 235 #
 236 sub GenerateCmpdDataHeaderLabelsAndValuesLines {
 237   my($DataHeaderLabelsRef, $DataHeaderLabelsAndValuesRef, $SortDataLabels) = @_;
 238   my($DataLabel, $DataValue, @DataLabels, @DataLines);
 239 
 240   if (!defined $SortDataLabels) {
 241     $SortDataLabels = 0;
 242   }
 243 
 244   @DataLines = ();
 245   @DataLabels = ();
 246   if ($SortDataLabels) {
 247     push @DataLabels, sort @{$DataHeaderLabelsRef};
 248   }
 249   else {
 250     push @DataLabels,  @{$DataHeaderLabelsRef};
 251   }
 252   for $DataLabel (@DataLabels) {
 253     $DataValue = '';
 254     if (exists $DataHeaderLabelsAndValuesRef->{$DataLabel}) {
 255       $DataValue = $DataHeaderLabelsAndValuesRef->{$DataLabel};
 256     }
 257     push @DataLines, (">  <${DataLabel}>", "$DataValue", "");
 258   }
 259   return @DataLines;
 260 }
 261 
 262 # Parse data field header in SD file and return lists of all and common data field
 263 # labels.
 264 sub GetAllAndCommonCmpdDataHeaderLabels {
 265   my($SDFileRef) = @_;
 266   my($CmpdCount, $CmpdString, $Label, @CmpdLines, @DataFieldLabels, @CommonDataFieldLabels, %DataFieldLabelsMap);
 267 
 268   $CmpdCount = 0;
 269   @DataFieldLabels = ();
 270   @CommonDataFieldLabels = ();
 271   %DataFieldLabelsMap = ();
 272 
 273   while ($CmpdString = ReadCmpdString($SDFileRef)) {
 274     $CmpdCount++;
 275     @CmpdLines = split "\n", $CmpdString;
 276     # Process compound data header labels and figure out which ones are present for
 277     # all the compounds...
 278     if (@DataFieldLabels) {
 279       my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);
 280       my(%CmpdDataFieldLabelsMap) = ();
 281       # Setup a map for the current labels...
 282       for $Label (@CmpdDataFieldLabels) {
 283         $CmpdDataFieldLabelsMap{$Label} = "PresentInSome";
 284       }
 285       # Check the presence old labels for this compound; otherwise, mark 'em new...
 286       for $Label (@DataFieldLabels) {
 287         if (!$CmpdDataFieldLabelsMap{$Label}) {
 288           $DataFieldLabelsMap{$Label} = "PresentInSome";
 289         }
 290       }
 291       # Check the presence this compound in the old labels; otherwise, add 'em...
 292       for $Label (@CmpdDataFieldLabels ) {
 293         if (!$DataFieldLabelsMap{$Label}) {
 294           # It's a new label...
 295           push @DataFieldLabels, $Label;
 296           $DataFieldLabelsMap{$Label} = "PresentInSome";
 297         }
 298       }
 299     }
 300     else {
 301       # Get the initial label set and set up a map...
 302       @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 303       for $Label (@DataFieldLabels) {
 304         $DataFieldLabelsMap{$Label} = "PresentInAll";
 305       }
 306     }
 307   }
 308   # Identify the common data field labels...
 309   @CommonDataFieldLabels = ();
 310   for $Label (@DataFieldLabels) {
 311     if ($DataFieldLabelsMap{$Label} eq "PresentInAll") {
 312       push @CommonDataFieldLabels, $Label;
 313     }
 314   }
 315   return ($CmpdCount, \@DataFieldLabels, \@CommonDataFieldLabels);
 316 }
 317 
 318 # Parse all the data header labels and return 'em as an list...
 319 #
 320 # Format:
 321 #
 322 #> Data header line
 323 #Data line(s)
 324 #Blank line
 325 #
 326 # [Data Header] (one line) precedes each item of data, starts with a greater than (>) sign, and
 327 # contains at least one of the following:
 328 #  The field name enclosed in angle brackets. For example: <melting.point>
 329 #  The field number, DTn , where n represents the number assigned to the field in a MACCS-II database
 330 #
 331 #Optional information for the data header includes:
 332 #  The compound’s external and internal registry numbers. External registry numbers must be enclosed in parentheses.
 333 #  Any combination of information
 334 #
 335 #The following are examples of valid data headers:
 336 #> <MELTING.POINT>
 337 #> 55 (MD-08974) <BOILING.POINT> DT12
 338 #> DT12 55
 339 #> (MD-0894) <BOILING.POINT> FROM ARCHIVES
 340 #
 341 #Notes: Sometimes last blank line is missing and can be just followed by $$$$
 342 #
 343 sub GetCmpdDataHeaderLabels {
 344   my($CmpdLines) = @_;
 345   my($CmpdLine, $Label, @Labels);
 346 
 347   @Labels = ();
 348   CMPDLINE: for $CmpdLine (@$CmpdLines) {
 349     if ($CmpdLine !~ /^>/) {
 350       next CMPDLINE;
 351     }
 352     # Does the line contains field name enclosed in angular brackets?
 353     ($Label) = $CmpdLine =~ /<.*?>/g;
 354     if (!defined($Label)) {
 355       next CMPDLINE;
 356     }
 357     $Label =~ s/(<|>)//g;
 358     push @Labels, $Label;
 359   }
 360   return (@Labels);
 361 }
 362 
 363 # Parse all the data header labels and values
 364 sub GetCmpdDataHeaderLabelsAndValues {
 365   my($CmpdLines) = @_;
 366   my($CmpdLine, $CurrentLabel, $Label, $Value, $ValueCount, $ProcessingLabelData, @Values, %DataFields);
 367 
 368   %DataFields = ();
 369   $ProcessingLabelData = 0;
 370   $ValueCount = 0;
 371   CMPDLINE: for $CmpdLine (@$CmpdLines) {
 372     if ($CmpdLine =~ /^\$\$\$\$/) {
 373       last CMPDLINE;
 374     }
 375     if ($CmpdLine =~ /^>/) {
 376       # Does the line contains field name enclosed in angular brackets?
 377       ($Label) = $CmpdLine =~ /<.*?>/g;
 378       if (defined $Label) {
 379         $CurrentLabel = $Label;
 380         $CurrentLabel =~ s/(<|>)//g;
 381         $ProcessingLabelData = 0;
 382         $ValueCount = 0;
 383 
 384         if ($CurrentLabel) {
 385           $ProcessingLabelData = 1;
 386           $DataFields{$CurrentLabel} = '';
 387           next CMPDLINE;
 388         }
 389       }
 390       else {
 391         if (!$ProcessingLabelData) {
 392           # Data line containing no <label> as allowed by SDF format. Just ignore it...
 393           next CMPDLINE;
 394         }
 395       }
 396     }
 397     if (!$ProcessingLabelData) {
 398       next CMPDLINE;
 399     }
 400     if (!(defined($CmpdLine) && length($CmpdLine))) {
 401       # Blank line terminates value for a label...
 402       $CurrentLabel = '';
 403       $ValueCount = 0;
 404       $ProcessingLabelData = 0;
 405       next CMPDLINE;
 406     }
 407     $ValueCount++;
 408     $Value = $CmpdLine;
 409 
 410     if ($ValueCount > 1) {
 411       $DataFields{$CurrentLabel} .= "\n" . $Value;
 412     }
 413     else {
 414       $DataFields{$CurrentLabel} = $Value;
 415     }
 416   }
 417   return (%DataFields);
 418 }
 419 
 420 # Return an updated compoud string after removing  data header label along with its
 421 # value from the specified compound string...
 422 #
 423 sub RemoveCmpdDataHeaderLabelAndValue {
 424   my($CmpdString, $DataHeaderLabel) = @_;
 425   my($Line, $PorcessingDataHeaderLabel, @CmpdLines);
 426 
 427   @CmpdLines = ();
 428   $PorcessingDataHeaderLabel = 0;
 429 
 430   CMPDLINE: for $Line (split "\n", $CmpdString) {
 431     if ($Line =~ /^>/ && $Line =~ /<$DataHeaderLabel>/i) {
 432       $PorcessingDataHeaderLabel = 1;
 433       next CMPDLINE;
 434     }
 435 
 436     if ($PorcessingDataHeaderLabel) {
 437       # Blank line indicates end of fingerprints data value...
 438       if ($Line =~ /^\$\$\$\$/) {
 439         push @CmpdLines, $Line;
 440         $PorcessingDataHeaderLabel = 0;
 441       }
 442       elsif (!length($Line)) {
 443         $PorcessingDataHeaderLabel = 0;
 444       }
 445       next CMPDLINE;
 446     }
 447 
 448     # Track compound lines without fingerprints data...
 449     push @CmpdLines, $Line;
 450   }
 451 
 452   return join "\n", @CmpdLines;
 453 }
 454 
 455 #
 456 # Using bond blocks, figure out the number of disconnected fragments  and
 457 # return their values along with the atom numbers in a string delimited by new
 458 # line character.
 459 #
 460 sub GetCmpdFragments {
 461   my($CmpdLines) = @_;
 462   my($AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, @AtomConnections, $BondType, $FragmentString, $FragmentCount, $LineIndex, $Index, $AtomNum, $NbrAtomNum, @ProcessedAtoms, $ProcessedAtomCount, $ProcessAtomNum, @ProcessingAtoms, @ConnectedAtoms, %Fragments, $FragmentNum, $AFragmentString);
 463 
 464   # Setup the connection table for each atom...
 465   @AtomConnections = ();
 466   ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
 467   for $AtomNum (1 .. $AtomCount) {
 468     %{$AtomConnections[$AtomNum]} = ();
 469   }
 470   for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
 471     ($FirstAtomNum, $SecondAtomNum, $BondType) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
 472     if (!$AtomConnections[$FirstAtomNum]{$SecondAtomNum}) {
 473       $AtomConnections[$FirstAtomNum]{$SecondAtomNum} = $BondType;
 474     }
 475     if (!$AtomConnections[$SecondAtomNum]{$FirstAtomNum}) {
 476       $AtomConnections[$SecondAtomNum]{$FirstAtomNum} = $BondType;
 477     }
 478   }
 479 
 480   #Get set to count fragments...
 481   $ProcessedAtomCount = 0;
 482   $FragmentNum = 0;
 483   %Fragments = ();
 484   @ProcessedAtoms = ();
 485   for $AtomNum (1 .. $AtomCount) {
 486     $ProcessedAtoms[$AtomNum] = 0;
 487   }
 488   while ($ProcessedAtomCount < $AtomCount) {
 489     @ProcessingAtoms = ();
 490     @ConnectedAtoms = ();
 491     ATOMNUM: for $AtomNum (1 .. $AtomCount) {
 492       if (!$ProcessedAtoms[$AtomNum]) {
 493         $ProcessedAtomCount++;
 494         $ProcessedAtoms[$AtomNum] = 1;
 495         push @ProcessingAtoms, $AtomNum;
 496         $FragmentNum++;
 497         @{$Fragments{$FragmentNum} } = ();
 498         push @{$Fragments{$FragmentNum} }, $AtomNum;
 499         last ATOMNUM;
 500       }
 501     }
 502 
 503     # Go over the neighbors and follow the connection trail while collecting the
 504     # atoms numbers present in the connected fragment...
 505     while (@ProcessingAtoms) {
 506       for ($Index = 0; $Index < @ProcessingAtoms; $Index++) {
 507         $ProcessAtomNum = $ProcessingAtoms[$Index];
 508         for $NbrAtomNum (keys %{$AtomConnections[$ProcessAtomNum]})  {
 509           if (!$ProcessedAtoms[$NbrAtomNum]) {
 510             $ProcessedAtomCount++;
 511             $ProcessedAtoms[$NbrAtomNum] = 1;
 512             push @ConnectedAtoms, $NbrAtomNum;
 513             push @{ $Fragments{$FragmentNum} }, $NbrAtomNum;
 514           }
 515         }
 516       }
 517       @ProcessingAtoms = ();
 518       @ProcessingAtoms = @ConnectedAtoms;
 519       @ConnectedAtoms = ();
 520     }
 521   }
 522   $FragmentCount = $FragmentNum;
 523   $FragmentString = "";
 524 
 525   # Sort out the fragments by size...
 526   for $FragmentNum (sort { @{$Fragments{$b}} <=> @{$Fragments{$a}}  } keys %Fragments ) {
 527     # Sort the atoms in a fragment by their numbers...
 528     $AFragmentString = join " ", sort { $a <=> $b } @{ $Fragments{$FragmentNum} };
 529     if ($FragmentString) {
 530       $FragmentString .=  "\n" . $AFragmentString;
 531     }
 532     else {
 533       $FragmentString = $AFragmentString;
 534     }
 535   }
 536   return ($FragmentCount, $FragmentString);
 537 }
 538 
 539 # Count number of lines present in between 4th and line containg "M END"
 540 sub GetCtabLinesCount {
 541   my($CmpdLines) = @_;
 542   my($LineIndex, $CtabLinesCount);
 543 
 544   $CtabLinesCount = 0;
 545  LINE: for ($LineIndex = 4; $LineIndex < @$CmpdLines; $LineIndex++) {
 546     #
 547     # Any line after atom and bond data starting with anything other than space or
 548     # a digit indicates end of Ctab atom/bond data block...
 549     #
 550     if (@$CmpdLines[$LineIndex] !~ /^[0-9 ]/) {
 551       $CtabLinesCount = $LineIndex - 4;
 552       last LINE;
 553     }
 554   }
 555   return $CtabLinesCount;
 556 }
 557 
 558 # Using atom blocks, count the number of atoms which contain special element
 559 # symbols not present in the periodic table.
 560 sub GetUnknownAtoms {
 561   my($CmpdLines) = @_;
 562   my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines, $LineIndex, $AtomCount, $AtomSymbol);
 563 
 564   $UnknownAtomCount = 0;
 565   $UnknownAtoms = "";
 566   $UnknownAtomLines = "";
 567   ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
 568   for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
 569     ($AtomSymbol) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]);
 570     if (!IsElement($AtomSymbol)) {
 571       $UnknownAtomCount++;
 572       $UnknownAtoms .= " $AtomSymbol";
 573       if ($UnknownAtomLines) {
 574         $UnknownAtomLines .= "\n" . @$CmpdLines[$LineIndex];
 575       }
 576       else {
 577         $UnknownAtomLines = @$CmpdLines[$LineIndex];
 578       }
 579     }
 580   }
 581   return ($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines);
 582 }
 583 
 584 # Check z coordinates of all atoms to see whether any of them is non-zero
 585 # which makes the compound geometry three dimensional...
 586 #
 587 sub IsCmpd3D {
 588   my($CmpdLines) = @_;
 589   my($LineIndex, $AtomCount, $AtomSymbol, $AtomX, $AtomY, $AtomZ);
 590 
 591   ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
 592   for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
 593     ($AtomSymbol, $AtomX, $AtomY, $AtomZ) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]);
 594     if ($AtomZ != 0) {
 595       return 1;
 596     }
 597   }
 598   return 0;
 599 }
 600 
 601 # Check whether it's a 2D compound...
 602 #
 603 sub IsCmpd2D {
 604   my($CmpdLines) = @_;
 605 
 606   return IsCmpd3D($CmpdLines) ? 0 : 1;
 607 }
 608 
 609 # Using bond blocks, count the number of bond lines which contain atom numbers
 610 # greater than atom count specified in compound count line...
 611 #
 612 sub GetInvalidAtomNumbers {
 613   my($CmpdLines) = @_;
 614   my($LineIndex, $AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, $InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines, $Line, $InvalidAtomPropertyLine, $ValuePairIndex, $AtomNum, $Value, @ValuePairs);
 615 
 616   ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
 617 
 618   $InvalidAtomNumbersCount = 0;
 619   $InvalidAtomNumbers = "";
 620   $InvalidAtomNumberLines = "";
 621 
 622   # Go over bond block lines...
 623   LINE: for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
 624     ($FirstAtomNum, $SecondAtomNum) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
 625     if ($FirstAtomNum <= $AtomCount && $SecondAtomNum <= $AtomCount) {
 626       next LINE;
 627     }
 628     if ($FirstAtomNum > $AtomCount) {
 629       $InvalidAtomNumbersCount++;
 630       $InvalidAtomNumbers .= " $FirstAtomNum";
 631     }
 632     if ($SecondAtomNum > $AtomCount) {
 633       $InvalidAtomNumbersCount++;
 634       $InvalidAtomNumbers .= " $SecondAtomNum";
 635     }
 636     if ($InvalidAtomNumberLines) {
 637       $InvalidAtomNumberLines .= "\n" . @$CmpdLines[$LineIndex];
 638     }
 639     else {
 640       $InvalidAtomNumberLines = @$CmpdLines[$LineIndex];
 641     }
 642   }
 643   # Go over property lines before M  END...
 644   #
 645   LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) {
 646     $Line = @$CmpdLines[$LineIndex];
 647     @ValuePairs = ();
 648     if ($Line =~ /^M  END/i) {
 649       last LINE;
 650     }
 651     @ValuePairs = ();
 652     if ($Line =~ /^M  CHG/i) {
 653       @ValuePairs = ParseCmpdChargePropertyLine($Line);
 654     }
 655     elsif ($Line =~ /^M  RAD/i) {
 656       @ValuePairs = ParseCmpdRadicalPropertyLine($Line);
 657     }
 658     elsif ($Line =~ /^M  ISO/i) {
 659       @ValuePairs = ParseCmpdIsotopePropertyLine($Line);
 660     }
 661     elsif ($Line =~ /^A  /i) {
 662       my($NextLine);
 663       $LineIndex++;
 664       $NextLine = @$CmpdLines[$LineIndex];
 665       @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine);
 666     }
 667     else {
 668       next LINE;
 669     }
 670 
 671     $InvalidAtomPropertyLine = 0;
 672     for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) {
 673       $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1];
 674       if ($AtomNum > $AtomCount) {
 675         $InvalidAtomPropertyLine = 1;
 676         $InvalidAtomNumbersCount++;
 677         $InvalidAtomNumbers .= " $AtomNum";
 678       }
 679     }
 680     if ($InvalidAtomPropertyLine) {
 681       if ($InvalidAtomNumberLines) {
 682         $InvalidAtomNumberLines .= "\n" . $Line;
 683       }
 684       else {
 685         $InvalidAtomNumberLines = $Line;
 686       }
 687     }
 688   }
 689 
 690   return ($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines);
 691 }
 692 
 693 # Ctab lines: Atom block
 694 #
 695 # Format: xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee
 696 #         A10       A10       A10       xA3 A2A3 A3 A3 A3 A3 A3 A3 A3 A3 A3 A3
 697 # x,y,z: Atom coordinates
 698 # aaa: Atom symbol. Entry in periodic table or L for atom list, A, Q, * for unspecified
 699 #      atom, and LP for lone pair, or R# for Rgroup label
 700 # dd: Mass difference. -3, -2, -1, 0, 1, 2, 3, 4 (0 for value beyond these limits)
 701 # ccc: Charge. 0 = uncharged or value other than these, 1 = +3, 2 = +2, 3 = +1,
 702 #      4 = doublet radical, 5 = -1, 6 = -2, 7 = -3
 703 # sss: Atom stereo parity. 0 = not stereo, 1 = odd, 2 = even, 3 = either or unmarked stereo center
 704 # hhh: Hydrogen count + 1. 1 = H0, 2 = H1, 3 = H2, 4 = H3, 5 = H4
 705 # bbb: Stereo care box. 0 = ignore stereo configuration of this double bond atom, 1 = stereo
 706 #      configuration of double bond atom must match
 707 # vvv: Valence. 0 = no marking (default)(1 to 14) = (1 to 14) 15 = zero valence
 708 # HHH: H0 designator. 0 = not specified, 1 = no H atoms allowed (redundant due to hhh)
 709 # rrr: Not used
 710 # iii: Not used
 711 # mmm: Atom-atom mapping number. 1 - number of atoms
 712 # nnn: Inversion/retention flag. 0 = property not applied, 1 = configuration is inverted,
 713 #      2 = configuration is retained.
 714 # eee: Exact change flag. 0 = property not applied, 1 = change on atom must be
 715 #      exactly as shown
 716 #
 717 # Notes:
 718 #  . StereoParity: 1 - ClockwiseStereo, 2 - AntiClockwiseStereo; 3 - Either; 0 - none. These
 719 #    values determine chirailty around the chiral center; a non zero value indicates atom
 720 #    has been marked as chiral center.
 721 #
 722 sub ParseCmpdAtomLine {
 723   my($Line) = @_;
 724   my ($LineIndex, $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity);
 725 
 726   ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = ('') x 7;
 727   if (length($Line) > 31) {
 728     ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = unpack("A10A10A10xA3A2A3A3", $Line);
 729   }
 730   else {
 731     ($AtomX, $AtomY, $AtomZ, $AtomSymbol) = unpack("A10A10A10", $Line);
 732   }
 733   return ($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity);
 734 }
 735 
 736 # Map MDL charge value used in SD and MOL files to internal charge used by MayaChemTools.
 737 #
 738 sub MDLChargeToInternalCharge {
 739   my($MDLCharge) = @_;
 740   my($InternalCharge);
 741 
 742   CHARGE: {
 743     if ($MDLCharge == 0) { $InternalCharge = 0; last CHARGE;}
 744     if ($MDLCharge == 1) { $InternalCharge = 3; last CHARGE;}
 745     if ($MDLCharge == 2) { $InternalCharge = 2; last CHARGE;}
 746     if ($MDLCharge == 3) { $InternalCharge = 1; last CHARGE;}
 747     if ($MDLCharge == 5) { $InternalCharge = -1; last CHARGE;}
 748     if ($MDLCharge == 6) { $InternalCharge = -2; last CHARGE;}
 749     if ($MDLCharge == 7) { $InternalCharge = -3; last CHARGE;}
 750     # All other MDL charge values, including 4 corresponding to "doublet radical",
 751     # are assigned internal value of 0.
 752     $InternalCharge = 0;
 753     if ($MDLCharge != 4) {
 754       carp "Warning: MDLChargeToInternalCharge: MDL charge value, $MDLCharge, is not supported: An internal charge value, 0, has been assigned...";
 755     }
 756   }
 757   return $InternalCharge;
 758 }
 759 
 760 # Map internal charge used by MayaChemTools to MDL charge value used in SD and MOL files.
 761 #
 762 sub InternalChargeToMDLCharge {
 763   my($InternalCharge) = @_;
 764   my($MDLCharge);
 765 
 766   CHARGE: {
 767     if ($InternalCharge == 3) { $MDLCharge = 1; last CHARGE;}
 768     if ($InternalCharge == 2) { $MDLCharge = 2; last CHARGE;}
 769     if ($InternalCharge == 1) { $MDLCharge = 3; last CHARGE;}
 770     if ($InternalCharge == -1) { $MDLCharge = 5; last CHARGE;}
 771     if ($InternalCharge == -2) { $MDLCharge = 6; last CHARGE;}
 772     if ($InternalCharge == -3) { $MDLCharge = 7; last CHARGE;}
 773     # All other MDL charge values, including 4 corresponding to "doublet radical",
 774     # are assigned internal value of 0.
 775     $MDLCharge = 0;
 776   }
 777   return $MDLCharge;
 778 }
 779 
 780 # Ctab lines: Bond block
 781 #
 782 # Format: 111222tttsssxxxrrrccc
 783 #
 784 # 111: First atom number.
 785 # 222: Second atom number.
 786 # ttt: Bond type. 1 = Single, 2 = Double, 3 = Triple, 4 = Aromatic, 5 = Single or Double,
 787 #      6 = Single or Aromatic, 7 = Double or Aromatic, 8 = Any
 788 # sss: Bond stereo. Single bonds: 0 = not stereo, 1 = Up, 4 = Either, 6 = Down,
 789 #      Double bonds: 0 = Use x-, y-, z-coords from atom block to determine cis or trans,
 790 #      3 = Cis or trans (either) double bond
 791 # xxx: Not used
 792 # rrr: Bond topology. 0 = Either, 1 = Ring, 2 = Chain
 793 # ccc: Reacting center status. 0 = unmarked, 1 = a center, -1 = not a center,
 794 #      Additional: 2 = no change,4 = bond made/broken, 8 = bond order changes 12 = 4+8
 795 #      (both made/broken and changes); 5 = (4 + 1), 9 = (8 + 1), and 13 = (12 + 1) are also possible
 796 sub ParseCmpdBondLine {
 797   my($Line) = @_;
 798   my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo);
 799 
 800   ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = map {s/ //g; $_} unpack("A3A3A3A3", $Line);
 801   return ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo);
 802 }
 803 
 804 # Map MDL bond type value used in SD and MOL files to internal bond order values used by MayaChemTools...
 805 #
 806 sub MDLBondTypeToInternalBondOrder {
 807   my($MDLBondType) = @_;
 808   my($InternalBondOrder);
 809 
 810   BONDTYPE: {
 811     if ($MDLBondType == 1) { $InternalBondOrder = 1; last BONDTYPE;}
 812     if ($MDLBondType == 2) { $InternalBondOrder = 2; last BONDTYPE;}
 813     if ($MDLBondType == 3) { $InternalBondOrder = 3; last BONDTYPE;}
 814     if ($MDLBondType == 4) { $InternalBondOrder = 1.5; last BONDTYPE;} # Aromatic
 815     #
 816     # Although MDL aromatic bond values are used for query only and explicit Kekule bond order
 817     # values must be assigned, internal value of 1.5 is allowed to indicate aromatic bond orders.
 818     #
 819     # All other MDL bond type values -  5 = Single or Double, 6 = Single or Aromatic, 7 = Double or Aromatic,
 820     # 8 = Any - are assigned internal value of 1: These are meant to be used for structure queries by MDL products.
 821     #
 822     $InternalBondOrder = 1;
 823     carp "Warning: MDLBondTypeToInternalBondOrder: MDL bond type value, $MDLBondType, is not supported: An internal bond order value, 1, has been assigned...";
 824   }
 825   return $InternalBondOrder;
 826 }
 827 
 828 # Map internal bond order values used by MayaChemTools to MDL bond type value used in SD and MOL files...
 829 #
 830 sub InternalBondOrderToMDLBondType {
 831   my($InternalBondOrder) = @_;
 832   my($MDLBondType);
 833 
 834   BONDTYPE: {
 835     if ($InternalBondOrder == 1) { $MDLBondType = 1; last BONDTYPE;}
 836     if ($InternalBondOrder == 2) { $MDLBondType = 2; last BONDTYPE;}
 837     if ($InternalBondOrder == 3) { $MDLBondType = 3; last BONDTYPE;}
 838     if ($InternalBondOrder == 1.5) { $MDLBondType = 4; last BONDTYPE;} # Aromatic
 839     $MDLBondType = 1;
 840   }
 841   return $MDLBondType;
 842 }
 843 
 844 # Third line: Comments - A blank line is also allowed.
 845 sub ParseCmpdCommentsLine {
 846   my($Line) = @_;
 847   my($Comments);
 848 
 849   $Comments = unpack("A80", $Line);
 850 
 851   return ($Comments);
 852 }
 853 
 854 # Map MDL bond stereo value used in SD and MOL files to internal bond type values used by MayaChemTools...
 855 #
 856 sub MDLBondStereoToInternalBondType {
 857   my($MDLBondStereo) = @_;
 858   my($InternalBondType);
 859 
 860   $InternalBondType = '';
 861 
 862   BONDSTEREO: {
 863     if ($MDLBondStereo == 1) { $InternalBondType = 'SingleUp'; last BONDSTEREO;}
 864     if ($MDLBondStereo == 4) { $InternalBondType = 'SingleWavy'; last BONDSTEREO;}
 865     if ($MDLBondStereo == 6) { $InternalBondType = 'SingleDown'; last BONDSTEREO;}
 866     if ($MDLBondStereo == 3) { $InternalBondType = 'DoubleCross'; last BONDSTEREO;}
 867     $InternalBondType = '';
 868     carp "Warning: MDLBondStereoToInternalBondType: MDL bond stereo value, $MDLBondStereo, is not supported: It has been ignored and bond order would be used to determine bond type...";
 869   }
 870   return $InternalBondType;
 871 }
 872 
 873 # Map internal bond type values used by MayaChemTools to MDL bond stereo value used in SD and MOL files...
 874 #
 875 sub InternalBondTypeToMDLBondStereo {
 876   my($InternalBondType) = @_;
 877   my($MDLBondStereo);
 878 
 879   $MDLBondStereo = 0;
 880 
 881   BONDSTEREO: {
 882     if ($InternalBondType =~ /^SingleUp$/i) { $MDLBondStereo = 1; last BONDSTEREO;}
 883     if ($InternalBondType =~ /^SingleWavy$/i) { $MDLBondStereo = 4; last BONDSTEREO;}
 884     if ($InternalBondType =~ /^SingleDown$/) { $MDLBondStereo = 6; last BONDSTEREO;}
 885     if ($InternalBondType =~ /^DoubleCross$/) { $MDLBondStereo = 3; last BONDSTEREO;}
 886     $MDLBondStereo = 0;
 887   }
 888   return $MDLBondStereo;
 889 }
 890 
 891 # Fourth line: Counts
 892 #
 893 # Format: aaabbblllfffcccsssxxxrrrpppiiimmmvvvvvv
 894 #
 895 # aaa: number of atoms; bbb: number of bonds; lll: number of atom lists; fff: (obsolete)
 896 # ccc: chiral flag: 0=not chiral, 1=chiral; sss: number of stext entries; xxx,rrr,ppp,iii:
 897 # (obsolete); mmm: number of lines of additional properties, including the M END line, No
 898 # longer supported, default is set to 999; vvvvvv: version
 899 
 900 sub ParseCmpdCountsLine {
 901   my($Line) = @_;
 902   my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version);
 903 
 904   if (length($Line) >= 39) {
 905     ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = unpack("A3A3x3x3A3x3x3x3x3x3A3A6", $Line);
 906   }
 907   elsif (length($Line) >= 15) {
 908     ($PropertyCount, $Version) = ("999", "v2000");
 909     ($AtomCount, $BondCount, $ChiralFlag) = unpack("A3A3x3x3A3", $Line);
 910   }
 911   else {
 912     ($ChiralFlag, $PropertyCount, $Version) = ("0", "999", "v2000");
 913     ($AtomCount, $BondCount) = unpack("A3A3", $Line);
 914   }
 915 
 916   if ($Version =~ /V3000/i) {
 917     # Current version of MayaChemTools modules and classes for processing MDL MOL and SD don't support
 918     # V3000. So instead of relying on callers, just exit with an error to disable any processing of V3000
 919     # format.
 920     croak "Error: SDFileUtil::ParseCmpdCountsLine: The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools...";
 921   }
 922 
 923   return ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version);
 924 }
 925 
 926 # Second line: Misc info
 927 #
 928 # Format: IIPPPPPPPPMMDDYYHHmmddSSssssssssssEEEEEEEEEEEERRRRRR
 929 #         A2A8      A10       A2I2A10       A12         A6
 930 # User's first and last initials (I), program name (P), date/time (M/D/Y,H:m),
 931 # dimensional codes - 2D or 3D (d),scaling factors (S, s), energy (E) if modeling program input,
 932 # internal registry number (R) if input through MDL form. A blank line is also allowed.
 933 sub ParseCmpdMiscInfoLine {
 934   my($Line) = @_;
 935   my($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum);
 936 
 937   ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum) = unpack("A2A8A10A2A2A10A12A6", $Line);
 938   return ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum);
 939 }
 940 
 941 # First line: Molecule name. This line is unformatted, but like all other lines in a
 942 # molfile may not extend beyond column 80. A blank line is also allowed.
 943 sub ParseCmpdMolNameLine {
 944   my($Line) = @_;
 945   my($MolName);
 946 
 947   $MolName = unpack("A80", $Line);
 948 
 949   return ($MolName);
 950 }
 951 
 952 # Parse atom alias property line in CTAB generic properties block.
 953 #
 954 # Atom alias property line format:
 955 #
 956 # A  aaa
 957 # x...
 958 #
 959 #    aaa: Atom number
 960 #    x: Atom alias in next line
 961 #
 962 sub ParseCmpdAtomAliasPropertyLine {
 963   my($Line, $NextLine) = @_;
 964   my($Label, $AtomNumber, $AtomAlias);
 965 
 966   ($Label, $AtomNumber) = split(' ', $Line);
 967   $AtomAlias = $NextLine;
 968 
 969   if (!$AtomAlias) {
 970     carp "Warning: _ParseCmpdAtomAliasPropertyLine: No atom alias value specified on the line following atom alias property line...";
 971   }
 972 
 973   return ($AtomNumber, $AtomAlias);
 974 }
 975 
 976 # Parse charge property line in CTAB generic properties block.
 977 #
 978 # Charge property line format:
 979 #
 980 # M  CHGnn8 aaa vvv ...
 981 #
 982 #    nn8: Number of value pairs. Maximum of 8 pairs allowed.
 983 #    aaa: Atom number
 984 #    vvv: -15 to +15. Default of 0 = uncharged atom. When present, this property supersedes
 985 #    all charge and radical values in the atom block, forcing a 0 charge on all atoms not
 986 #    listed in an M  CHG or M  RAD line.
 987 #
 988 sub ParseCmpdChargePropertyLine {
 989   my($Line) = @_;
 990 
 991   return _ParseCmpdGenericPropertyLine('Charge', $Line);
 992 }
 993 
 994 
 995 # Parse isotope property line in CTAB generic properties block.
 996 #
 997 # Isoptope property line format:
 998 #
 999 # M  ISOnn8 aaa vvv ...
1000 #
1001 #    nn8: Number of value paris. Maximum of 8 pairs allowed.
1002 #    aaa: Atom number
1003 #    vvv: Absolute mass of the atom isotope as a positive integer. When present, this property
1004 #    supersedes all isotope values in the atom block. Default (no entry) means natural
1005 #    abundance. The difference between this absolute mass value and the natural
1006 #    abundance value specified in the PTABLE.DAT file must be within the range of -18
1007 #    to +12
1008 #
1009 # Notes:
1010 #  . Values correspond to mass numbers...
1011 #
1012 sub ParseCmpdIsotopePropertyLine {
1013   my($Line) = @_;
1014 
1015   return _ParseCmpdGenericPropertyLine('Isotope', $Line);
1016 }
1017 
1018 # Parse radical property line in CTAB generic properties block.
1019 #
1020 # Radical property line format:
1021 #
1022 # M  RADnn8 aaa vvv ...
1023 #
1024 #    nn8: Number of value paris. Maximum of 8 pairs allowed.
1025 #    aaa: Atom number
1026 #    vvv: Default of 0 = no radical, 1 = singlet (:), 2 = doublet ( . or ^), 3 = triplet (^^). When
1027 #    present, this property supersedes all charge and radical values in the atom block,
1028 #    forcing a 0 (zero) charge and radical on all atoms not listed in an M  CHG or
1029 #    M  RAD line.
1030 #
1031 sub ParseCmpdRadicalPropertyLine {
1032   my($Line) = @_;
1033 
1034   return _ParseCmpdGenericPropertyLine('Radical', $Line);
1035 }
1036 
1037 # Map MDL radical stereo value used in SD and MOL files to internal spin multiplicity values used by MayaChemTools...
1038 #
1039 sub MDLRadicalToInternalSpinMultiplicity {
1040   my($MDLRadical) = @_;
1041   my($InternalSpinMultiplicity);
1042 
1043   $InternalSpinMultiplicity = '';
1044 
1045   BONDSTEREO: {
1046     if ($MDLRadical == 0) { $InternalSpinMultiplicity = 0; last BONDSTEREO;}
1047     if ($MDLRadical == 1) { $InternalSpinMultiplicity = 1; last BONDSTEREO;}
1048     if ($MDLRadical == 2) { $InternalSpinMultiplicity = 2; last BONDSTEREO;}
1049     if ($MDLRadical == 3) { $InternalSpinMultiplicity = 3; last BONDSTEREO;}
1050     $InternalSpinMultiplicity = '';
1051     carp "Warning: MDLRadicalToInternalSpinMultiplicity: MDL radical value, $MDLRadical, specifed on line M  RAD is not supported...";
1052   }
1053   return $InternalSpinMultiplicity;
1054 }
1055 
1056 # Map internal spin multiplicity values used by MayaChemTools to MDL radical stereo value used in SD and MOL files...
1057 #
1058 sub InternalSpinMultiplicityToMDLRadical {
1059   my($InternalSpinMultiplicity) = @_;
1060   my($MDLRadical);
1061 
1062   $MDLRadical = 0;
1063 
1064   BONDSTEREO: {
1065     if ($InternalSpinMultiplicity == 1) { $MDLRadical = 1; last BONDSTEREO;}
1066     if ($InternalSpinMultiplicity == 2) { $MDLRadical = 2; last BONDSTEREO;}
1067     if ($InternalSpinMultiplicity == 3) { $MDLRadical = 3; last BONDSTEREO;}
1068     $MDLRadical = 0;
1069   }
1070   return $MDLRadical;
1071 }
1072 
1073 # Process generic CTAB property line...
1074 sub _ParseCmpdGenericPropertyLine {
1075   my($PropertyName, $Line) = @_;
1076 
1077   my($Label, $PropertyLabel, $ValuesCount, $ValuePairsCount, @ValuePairs);
1078 
1079   @ValuePairs = ();
1080   ($Label, $PropertyLabel, $ValuesCount, @ValuePairs) = split(' ', $Line);
1081   $ValuePairsCount = (scalar @ValuePairs)/2;
1082   if ($ValuesCount != $ValuePairsCount) {
1083     carp "Warning: _ParseCmpdGenericPropertyLine: Number of atom number and $PropertyName value paris specified on $Label $PropertyLabel property line, $ValuePairsCount, does not match expected value of $ValuesCount...";
1084   }
1085 
1086   return (@ValuePairs);
1087 }
1088 
1089 # Generic CTAB property lines for charge, istope and radical properties...
1090 #
1091 sub _GenerateCmpdGenericPropertyLines {
1092   my($PropertyName, $PropertyValuePairsRef) = @_;
1093   my($Index, $PropertyLabel, $Line, $PropertyCount, $AtomNum, $PropertyValue, @PropertyLines);
1094 
1095   @PropertyLines = ();
1096   NAME: {
1097     if ($PropertyName =~ /^Charge$/i) { $PropertyLabel = "M  CHG"; last NAME; }
1098     if ($PropertyName =~ /^Isotope$/i) { $PropertyLabel = "M  ISO"; last NAME; }
1099     if ($PropertyName =~ /^Radical$/i) { $PropertyLabel = "M  RAD"; last NAME; }
1100     carp "Warning: _GenerateCmpdGenericPropertyLines: Unknown property name, $PropertyName, specified...";
1101     return @PropertyLines;
1102   }
1103 
1104   # A maximum of 8 property pair values allowed per line...
1105   $PropertyCount = 0;
1106   $Line = '';
1107   for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) {
1108     if ($PropertyCount > 8) {
1109       # Setup property line...
1110       $Line = "${PropertyLabel}  8${Line}";
1111       push @PropertyLines, $Line;
1112 
1113       $PropertyCount = 0;
1114       $Line = '';
1115     }
1116     $PropertyCount++;
1117     $AtomNum = $PropertyValuePairsRef->[$Index];
1118     $PropertyValue = $PropertyValuePairsRef->[$Index + 1];
1119     $Line .= sprintf " %3i %3i", $AtomNum, $PropertyValue;
1120   }
1121   if ($Line) {
1122     $Line = "${PropertyLabel}  ${PropertyCount}${Line}";
1123     push @PropertyLines, $Line;
1124   }
1125   return @PropertyLines;
1126 }
1127 
1128 #
1129 # Read compound data into a string and return its value
1130 sub ReadCmpdString {
1131   my($SDFileRef) = @_;
1132   my($CmpdString);
1133 
1134   $CmpdString = "";
1135  LINE: while (defined($_ = <$SDFileRef>)) {
1136     # Change Windows and Mac new line char to UNIX...
1137     s/(\r\n)|(\r)/\n/g;
1138 
1139     if (/^\$\$\$\$/) {
1140       chomp;
1141       $CmpdString .=  $_;
1142       last LINE;
1143     }
1144     else {
1145       $CmpdString .=  $_;
1146     }
1147   }
1148   return $CmpdString;
1149 }
1150 
1151 # Find out the number of fragements in the compounds. And for the compound with
1152 # more than one fragment, remove all the others besides the largest one.
1153 sub WashCmpd {
1154   my($CmpdLines) = @_;
1155   my($WashedCmpdString, $FragmentCount, $Fragments);
1156 
1157   $WashedCmpdString = "";
1158   ($FragmentCount, $Fragments) = GetCmpdFragments($CmpdLines);
1159   if ($FragmentCount > 1) {
1160     # Go over the compound data for the largest fragment including property
1161     # data...
1162     my (@AllFragments, @LargestFragment, %LargestFragmentAtoms, @WashedCmpdLines, $Index, $LineIndex, $AtomCount, $BondCount, $NewAtomCount, $NewBondCount, $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo, $FirstNewAtomNum, $SecondNewAtomNum, $AtomNum, $ChiralFlag, $BondLine, $MENDLineIndex, $Line, $Value, @ValuePairs, @NewValuePairs, $ValuePairIndex, $NewAtomNum, @NewPropertyLines);
1163 
1164     @AllFragments = (); @LargestFragment = ();
1165     %LargestFragmentAtoms = ();
1166     @AllFragments = split "\n", $Fragments;
1167     @LargestFragment = split " ", $AllFragments[0];
1168     for $Index (0 .. $#LargestFragment) {
1169       # Map old atom numbers to new atom numbers as the fragment atom numbers are sorted
1170       # from lowest to highest old atom numbers...
1171       $LargestFragmentAtoms{$LargestFragment[$Index]} = $Index + 1;
1172     }
1173     @WashedCmpdLines = ();
1174     push @WashedCmpdLines, @$CmpdLines[0], @$CmpdLines[1], @$CmpdLines[2], @$CmpdLines[3];
1175     ($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine(@$CmpdLines[3]);
1176     $NewAtomCount = @LargestFragment;
1177     $NewBondCount = 0;
1178     $AtomNum = 0;
1179     # Retrieve the largest fragment atom lines...
1180     for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
1181       $AtomNum++;
1182       if ($LargestFragmentAtoms{$AtomNum}) {
1183         push @WashedCmpdLines, @$CmpdLines[$LineIndex];
1184       }
1185     }
1186     # Retrieve the largest fragment bond lines...
1187     for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
1188       ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
1189       if ($LargestFragmentAtoms{$FirstAtomNum} && $LargestFragmentAtoms{$SecondAtomNum}) {
1190         $NewBondCount++;
1191         # Set up bond line with new atom number mapping...
1192         $FirstNewAtomNum =  $LargestFragmentAtoms{$FirstAtomNum};
1193         $SecondNewAtomNum =  $LargestFragmentAtoms{$SecondAtomNum};
1194         $BondLine = GenerateCmpdBondLine($FirstNewAtomNum, $SecondNewAtomNum, $BondType, $BondStereo);
1195         push @WashedCmpdLines, $BondLine;
1196       }
1197     }
1198     # Get property lines for CHG, ISO and RAD label and map the old atom numbers to new
1199     # atom numners; Others, property lines before M  END line are skipped as atom numbers for
1200     # other properties might not valid anymore...
1201     #
1202     $MENDLineIndex = $LineIndex;
1203     LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) {
1204       $Line = @$CmpdLines[$LineIndex];
1205       if ($Line =~ /^M  END/i) {
1206         push @WashedCmpdLines, "M  END";
1207         $MENDLineIndex = $LineIndex;
1208         last LINE;
1209       }
1210 
1211       @ValuePairs = ();
1212       if ($Line =~ /^M  CHG/i) {
1213         @ValuePairs = ParseCmpdChargePropertyLine($Line);
1214       }
1215       elsif ($Line =~ /^M  RAD/i) {
1216         @ValuePairs = ParseCmpdRadicalPropertyLine($Line);
1217       }
1218       elsif ($Line =~ /^M  ISO/i) {
1219         @ValuePairs = ParseCmpdIsotopePropertyLine($Line);
1220       }
1221       elsif ($Line =~ /^A  /i) {
1222         my($NextLine);
1223         $LineIndex++;
1224         $NextLine = @$CmpdLines[$LineIndex];
1225         @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine);
1226       }
1227       else {
1228         next LINE;
1229       }
1230 
1231       if (!@ValuePairs) {
1232         next LINE;
1233       }
1234 
1235       # Collect values for valid atom numbers with mapping to new atom numbers...
1236       @NewValuePairs = ();
1237       VALUEINDEX: for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) {
1238         $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1];
1239         if (!exists $LargestFragmentAtoms{$AtomNum}) {
1240           next VALUEINDEX;
1241         }
1242         $NewAtomNum = $LargestFragmentAtoms{$AtomNum};
1243         push @NewValuePairs, ($NewAtomNum, $Value)
1244       }
1245       if (!@NewValuePairs) {
1246         next LINE;
1247       }
1248       @NewPropertyLines = ();
1249       if ($Line =~ /^M  CHG/i) {
1250         @NewPropertyLines = GenerateCmpdChargePropertyLines(\@NewValuePairs);
1251       }
1252       elsif ($Line =~ /^M  RAD/i) {
1253         @NewPropertyLines = GenerateCmpdRadicalPropertyLines(\@NewValuePairs);
1254       }
1255       elsif ($Line =~ /^M  ISO/i) {
1256         @NewPropertyLines = GenerateCmpdIsotopePropertyLines(\@NewValuePairs);
1257       }
1258       elsif ($Line =~ /^A  /i) {
1259         @NewPropertyLines = GenerateCmpdAtomAliasPropertyLines(\@NewValuePairs);
1260       }
1261       push @WashedCmpdLines, @NewPropertyLines;
1262     }
1263 
1264     # Retrieve rest of the data label and value property data...
1265     for ($LineIndex = (1 + $MENDLineIndex); $LineIndex < @$CmpdLines; $LineIndex++) {
1266       push @WashedCmpdLines, @$CmpdLines[$LineIndex];
1267     }
1268     # Update atom and bond count line...
1269     $WashedCmpdLines[3] = GenerateCmpdCountsLine($NewAtomCount, $NewBondCount, $ChiralFlag);
1270 
1271     $WashedCmpdString = join "\n", @WashedCmpdLines;
1272   }
1273   return ($FragmentCount, $Fragments, $WashedCmpdString);
1274 }
1275