MayaChemTools

   1 package MolecularFormula;
   2 #
   3 # $RCSfile: MolecularFormula.pm,v $
   4 # $Date: 2008/04/19 16:11:01 $
   5 # $Revision: 1.13 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 use 5.006;
  29 use strict;
  30 use Carp;
  31 use Text::ParseWords;
  32 use TextUtil;
  33 use PeriodicTable;
  34 
  35 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  36 
  37 $VERSION = '1.00';
  38 @ISA = qw(Exporter);
  39 @EXPORT = qw();
  40 @EXPORT_OK = qw(CalculateMolecularWeight CalculateExactMass CalculateElementalComposition FormatCompositionInfomation GetElementsAndCount IsMolecularFormula);
  41 
  42 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  43 
  44 #
  45 # Calculate molecular weight assuming its a valid molecular formula...
  46 #
  47 sub CalculateMolecularWeight {
  48   my($MolecularFormula) = @_;
  49   my($Index, $MolecularWeight, $ElementSymbol, $ElementCount, $AtomicWeight, $FormulaElementsRef, $FormulaElementCountRef);
  50 
  51   ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula);
  52   if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) {
  53     return undef;
  54   }
  55 
  56   $MolecularWeight = 0;
  57 
  58   for $Index (0 .. $#{$FormulaElementsRef}) {
  59     $ElementSymbol = $FormulaElementsRef->[$Index];
  60     $ElementCount = $FormulaElementCountRef->[$Index];
  61     $AtomicWeight = PeriodicTable::GetElementAtomicWeight($ElementSymbol);
  62     $MolecularWeight += $AtomicWeight * $ElementCount;
  63   }
  64   return $MolecularWeight;
  65 }
  66 
  67 #
  68 # Calculate exact mass assuming it's a valid formula...
  69 #
  70 sub CalculateExactMass {
  71   my($MolecularFormula) = @_;
  72   my($Index, $ElementSymbol, $ElementCount, $ExactMass, $RelativeAtomicMass, $FormulaElementsRef, $FormulaElementCountRef);
  73 
  74   ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula);
  75   if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) {
  76     return undef;
  77   }
  78   $ExactMass = 0;
  79 
  80   for $Index (0 .. $#{$FormulaElementsRef}) {
  81     $ElementSymbol = $FormulaElementsRef->[$Index];
  82     $ElementCount = $FormulaElementCountRef->[$Index];
  83     $RelativeAtomicMass = PeriodicTable::GetElementMostAbundantNaturalIsotopeMass($ElementSymbol);
  84     if (!defined($RelativeAtomicMass)) {
  85       next ELEMENT;
  86     }
  87     $ExactMass += $RelativeAtomicMass * $ElementCount;
  88   }
  89   return $ExactMass;
  90 }
  91 
  92 
  93 #
  94 # Calculate elemental composition and return reference to arrays
  95 # containing elements and their percent composition...
  96 #
  97 sub CalculateElementalComposition {
  98   my($MolecularFormula) = @_;
  99   my($Index, $MolecularWeight, $ElementSymbol, $ElementCount, $AtomicWeight, $Composition, $CompositionMultiplier, $FormulaElementsRef, $FormulaElementCountRef, @FormulaElements, @FormulaElementComposition);
 100 
 101   $MolecularWeight = CalculateMolecularWeight($MolecularFormula);
 102   if (! defined $MolecularWeight) {
 103     return (undef, undef);
 104   }
 105   ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula);
 106 
 107   @FormulaElements = ();
 108   @FormulaElementComposition = ();
 109 
 110   if (!$MolecularWeight) {
 111     return ( \@FormulaElements, \@FormulaElementComposition);
 112   }
 113 
 114   $CompositionMultiplier = 100 / $MolecularWeight;
 115 
 116   for $Index (0 .. $#{$FormulaElementsRef}) {
 117     $ElementSymbol = $FormulaElementsRef->[$Index];
 118     $ElementCount = $FormulaElementCountRef->[$Index];
 119     $AtomicWeight = PeriodicTable::GetElementAtomicWeight($ElementSymbol);
 120     $Composition = ($AtomicWeight * $ElementCount) * $CompositionMultiplier;
 121 
 122     push @FormulaElements, $ElementSymbol;
 123     push @FormulaElementComposition, $Composition;
 124   }
 125 
 126   return ( \@FormulaElements, \@FormulaElementComposition);
 127 }
 128 
 129 # Using refernece to element and its composition arrays, format composition information
 130 # as: Element: Composition;...
 131 #
 132 sub FormatCompositionInfomation {
 133   my($Index, $ElementSymbol, $ElementComposition, $ElementsRef, $ElementCompositionRef, $Precision, $Composition);
 134 
 135   $Precision = 2;
 136   if (@_ == 3) {
 137     ($ElementsRef, $ElementCompositionRef, $Precision) = @_;
 138   }
 139   else {
 140     ($ElementsRef, $ElementCompositionRef) = @_;
 141   }
 142 
 143   $Composition = '';
 144   for $Index (0 .. $#{$ElementsRef}) {
 145     $ElementSymbol = $ElementsRef->[$Index];
 146     $ElementComposition = $ElementCompositionRef->[$Index];
 147     $ElementComposition = sprintf("%.${Precision}f", $ElementComposition);
 148 
 149     $Composition .= ($Composition) ? '; ' : '';
 150     $Composition .=  "${ElementSymbol}: ${ElementComposition}%";
 151   }
 152 
 153   return $Composition;
 154 }
 155 
 156 #
 157 # Get elements and their count...
 158 #
 159 sub GetElementsAndCount {
 160   my($MolecularFormula) = @_;
 161   my($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg);
 162 
 163   ($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg) = _ProcessMolecularFormula($MolecularFormula);
 164 
 165   return ($FormulaElementsRef, $FormulaElementCountRef);
 166 }
 167 
 168 #
 169 # Is it a valid molecular formula?
 170 #
 171 sub IsMolecularFormula {
 172   my($MolecularFormula, $PrintErrorMsg, $Status, $FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg);
 173 
 174   ($MolecularFormula) = @_;
 175 
 176   ($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg) = _ProcessMolecularFormula($MolecularFormula);
 177   $Status = (defined($FormulaElementsRef) && defined($FormulaElementCountRef)) ? 1 : 0;
 178 
 179   return (wantarray ? ($Status, $ErrorMsg) : $Status);
 180 }
 181 
 182 #
 183 # Process molecular formula. For a valid formula, return references to arrays conatining elements
 184 # and element count; otherwsie, return undef.
 185 #
 186 sub _ProcessMolecularFormula {
 187   my($MolecularFormula) = @_;
 188   my($ErrorMsg) = '';
 189 
 190   $MolecularFormula = _CleanUpFormula($MolecularFormula);
 191 
 192   # Make sure it only contains numbers and letters...
 193   if ($MolecularFormula =~ /[^a-zA-Z0-9\(\)\[\]]/) {
 194     $ErrorMsg = 'Molecular formula contains characters other than a-zA-Z0-9';
 195     return (undef, undef, $ErrorMsg);
 196   }
 197 
 198   # Parse the formula...
 199   my($ElementSpec, $FormulaElementSpec, $Spec, $ElementSymbol, $ElementCount,  @FormulaElements, @ElementCount, %FormulaElementsToCountMap, @SubFormulaElements, %SubFormulaElementsToCountMap);
 200 
 201   @FormulaElements = (); @ElementCount = ();
 202   %FormulaElementsToCountMap = ();
 203 
 204 # Setup element symbol and count regular expression...
 205 # IUPAC: http://www.iupac.org/reports/provisional/abstract04/RB-prs310804/Chap4-3.04.pdf
 206 #
 207 
 208   $FormulaElementSpec = qr/
 209                                       \G(                         # $1
 210                                                   (?:
 211                                                       ([A-Z][a-z]?)   # Two or one letter element symbol; $2
 212                                                       ([0-9]*)          # Optionally followed by element count; $3
 213                                                   )
 214                                                   | \( | \[
 215                                                   | \)[0-9]* | \][0-9]*
 216                                                   | .
 217                                             )
 218                                       /x;
 219 
 220   my($ProcessingParenthesis);
 221   $ProcessingParenthesis = 0;
 222   # Go over the formula...
 223   FORMULA: while ($MolecularFormula =~ /$FormulaElementSpec/gx) {
 224     ($Spec, $ElementSymbol, $ElementCount) = ($1, $2, $3);
 225 
 226     # Handle parenthesis in formula to indicate repeating units...
 227     if ($Spec =~ /^(\(|\[)/) {
 228       if ($ProcessingParenthesis) {
 229 	$ErrorMsg = "Molecular formula contains multiple level of () or []";
 230 	return (undef, undef, $ErrorMsg);
 231       }
 232       $ProcessingParenthesis = 1;
 233       @SubFormulaElements = ();
 234       %SubFormulaElementsToCountMap = ();
 235       next FORMULA;
 236     }
 237     elsif ($Spec =~ /^(\)|\])/) {
 238       $ProcessingParenthesis = 0;
 239 
 240       # Retrieve repeat count and move data to @FormulaElements and %FormulaElementsToCountMap;
 241       my($RepeatCount, $Symbol, $Count);
 242       $RepeatCount = $Spec;
 243       $RepeatCount =~  s/(\)|\])//g;
 244       if (!$RepeatCount) {
 245 	$RepeatCount = 1;
 246       }
 247       # Copy data...
 248       for $Symbol (@SubFormulaElements) {
 249 	$Count = $SubFormulaElementsToCountMap{$Symbol} * $RepeatCount;
 250 	_SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $Symbol, $Count);
 251       }
 252 
 253       # Get ready again...
 254       @SubFormulaElements = ();
 255       %SubFormulaElementsToCountMap = ();
 256 
 257       next FORMULA;
 258     }
 259 
 260     # Retrieve element symbol and count...
 261     $ElementSymbol = ($Spec && !$ElementSymbol) ? $Spec : ($ElementSymbol ? $ElementSymbol : '');
 262     $ElementCount = $ElementCount ? $ElementCount : 1;
 263     if (!PeriodicTable::IsElement($ElementSymbol)) {
 264       $ErrorMsg = "Molecular formula contains unknown elemental symbol $ElementSymbol";
 265       return (undef, undef, $ErrorMsg);
 266     }
 267 
 268     if ($ProcessingParenthesis) {
 269       _SetupFormulaElementData(\@SubFormulaElements, \%SubFormulaElementsToCountMap, $ElementSymbol, $ElementCount);
 270     }
 271     else {
 272       _SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $ElementSymbol, $ElementCount);
 273     }
 274   }
 275 
 276   # Setup element count array...
 277   for $ElementSymbol (@FormulaElements) {
 278     $ElementCount = $FormulaElementsToCountMap{$ElementSymbol};
 279     push @ElementCount, $ElementCount;
 280   }
 281 
 282   # Make sure it all adds up to 100%; otherwise, adjust the last value..
 283 
 284   return (\@FormulaElements, \@ElementCount, $ErrorMsg);
 285 }
 286 
 287 # Clean it up...
 288 sub _CleanUpFormula {
 289   my($MolecularFormula) = @_;
 290   #Take out any spaces...
 291   $MolecularFormula =~ s/ //g;
 292 
 293   # Eliminate any charge specifications: +, - or [1-9]+[+-]
 294   # e.g NO+ [Al(H2O)6]3+ [H2NO3]+
 295   if ($MolecularFormula =~ /[\+\-]/) {
 296     if ($MolecularFormula =~ /\][0-9]+[\+\-]/) {
 297       # Bracket followed optionally by number and then, +/- ...
 298       # [Al(H2O)6]3+ ...
 299       $MolecularFormula =~ s/\][0-9]+[\+\-]/\]/g;
 300     }
 301     elsif ($MolecularFormula =~ /[\+\-][0-9]*/) {
 302       # +/- followed optionally by a number...
 303       # C37H42N2O6+2, Cu+
 304       $MolecularFormula =~ s/[\+\-][0-9]*//g;
 305     }
 306   }
 307 
 308   # Eliminate any brackets - ] or ) - not followed by numbers:
 309   # e.g. Li[H2PO4]
 310   if ($MolecularFormula !~ /\][0-9]+/) {
 311     $MolecularFormula =~ s/[\[\]]//g;
 312   }
 313   if ($MolecularFormula !~ /\)[0-9]+/) {
 314     $MolecularFormula =~ s/[\(\)]//g;
 315   }
 316   # Change adducts to parenthesis format...
 317   # Na2CO3.10H2O -> Na2CO3(H2O)10
 318   # 3CdSO4.8H2O -> (CdSO4)3(H2O)8
 319   if ($MolecularFormula =~ /\./) {
 320     my($SubFormula, $Count, $Spec);
 321     my(@MolecularFormulaSplits) = split /\./, $MolecularFormula;
 322     $MolecularFormula = '';
 323     for $SubFormula (@MolecularFormulaSplits) {
 324       ($Count, $Spec) = $SubFormula =~ /^([0-9]*)(.*?)$/;
 325       if ($Count) {
 326 	$MolecularFormula .= "(${Spec})${Count}";
 327       }
 328       else {
 329 	$MolecularFormula .= $Spec;
 330       }
 331     }
 332   }
 333 
 334   return $MolecularFormula;
 335 }
 336 
 337 # Store the element and count...
 338 sub _SetupFormulaElementData {
 339   my($ElementsRef, $ElementsToCountMapRef, $Element, $Count) = @_;
 340 
 341   if (exists $ElementsToCountMapRef->{$Element}) {
 342     $ElementsToCountMapRef->{$Element} += $Count;
 343   }
 344   else {
 345     push @{$ElementsRef}, $Element;
 346     $ElementsToCountMapRef->{$Element} = $Count;
 347   }
 348 }
 349