1 package MolecularFormula; 2 # 3 # $RCSfile: MolecularFormula.pm,v $ 4 # $Date: 2008/04/19 16:11:01 $ 5 # $Revision: 1.13 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 use 5.006; 29 use strict; 30 use Carp; 31 use Text::ParseWords; 32 use TextUtil; 33 use PeriodicTable; 34 35 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 36 37 $VERSION = '1.00'; 38 @ISA = qw(Exporter); 39 @EXPORT = qw(); 40 @EXPORT_OK = qw(CalculateMolecularWeight CalculateExactMass CalculateElementalComposition FormatCompositionInfomation GetElementsAndCount IsMolecularFormula); 41 42 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 43 44 # 45 # Calculate molecular weight assuming its a valid molecular formula... 46 # 47 sub CalculateMolecularWeight { 48 my($MolecularFormula) = @_; 49 my($Index, $MolecularWeight, $ElementSymbol, $ElementCount, $AtomicWeight, $FormulaElementsRef, $FormulaElementCountRef); 50 51 ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula); 52 if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) { 53 return undef; 54 } 55 56 $MolecularWeight = 0; 57 58 for $Index (0 .. $#{$FormulaElementsRef}) { 59 $ElementSymbol = $FormulaElementsRef->[$Index]; 60 $ElementCount = $FormulaElementCountRef->[$Index]; 61 $AtomicWeight = PeriodicTable::GetElementAtomicWeight($ElementSymbol); 62 $MolecularWeight += $AtomicWeight * $ElementCount; 63 } 64 return $MolecularWeight; 65 } 66 67 # 68 # Calculate exact mass assuming it's a valid formula... 69 # 70 sub CalculateExactMass { 71 my($MolecularFormula) = @_; 72 my($Index, $ElementSymbol, $ElementCount, $ExactMass, $RelativeAtomicMass, $FormulaElementsRef, $FormulaElementCountRef); 73 74 ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula); 75 if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) { 76 return undef; 77 } 78 $ExactMass = 0; 79 80 for $Index (0 .. $#{$FormulaElementsRef}) { 81 $ElementSymbol = $FormulaElementsRef->[$Index]; 82 $ElementCount = $FormulaElementCountRef->[$Index]; 83 $RelativeAtomicMass = PeriodicTable::GetElementMostAbundantNaturalIsotopeMass($ElementSymbol); 84 if (!defined($RelativeAtomicMass)) { 85 next ELEMENT; 86 } 87 $ExactMass += $RelativeAtomicMass * $ElementCount; 88 } 89 return $ExactMass; 90 } 91 92 93 # 94 # Calculate elemental composition and return reference to arrays 95 # containing elements and their percent composition... 96 # 97 sub CalculateElementalComposition { 98 my($MolecularFormula) = @_; 99 my($Index, $MolecularWeight, $ElementSymbol, $ElementCount, $AtomicWeight, $Composition, $CompositionMultiplier, $FormulaElementsRef, $FormulaElementCountRef, @FormulaElements, @FormulaElementComposition); 100 101 $MolecularWeight = CalculateMolecularWeight($MolecularFormula); 102 if (! defined $MolecularWeight) { 103 return (undef, undef); 104 } 105 ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula); 106 107 @FormulaElements = (); 108 @FormulaElementComposition = (); 109 110 if (!$MolecularWeight) { 111 return ( \@FormulaElements, \@FormulaElementComposition); 112 } 113 114 $CompositionMultiplier = 100 / $MolecularWeight; 115 116 for $Index (0 .. $#{$FormulaElementsRef}) { 117 $ElementSymbol = $FormulaElementsRef->[$Index]; 118 $ElementCount = $FormulaElementCountRef->[$Index]; 119 $AtomicWeight = PeriodicTable::GetElementAtomicWeight($ElementSymbol); 120 $Composition = ($AtomicWeight * $ElementCount) * $CompositionMultiplier; 121 122 push @FormulaElements, $ElementSymbol; 123 push @FormulaElementComposition, $Composition; 124 } 125 126 return ( \@FormulaElements, \@FormulaElementComposition); 127 } 128 129 # Using refernece to element and its composition arrays, format composition information 130 # as: Element: Composition;... 131 # 132 sub FormatCompositionInfomation { 133 my($Index, $ElementSymbol, $ElementComposition, $ElementsRef, $ElementCompositionRef, $Precision, $Composition); 134 135 $Precision = 2; 136 if (@_ == 3) { 137 ($ElementsRef, $ElementCompositionRef, $Precision) = @_; 138 } 139 else { 140 ($ElementsRef, $ElementCompositionRef) = @_; 141 } 142 143 $Composition = ''; 144 for $Index (0 .. $#{$ElementsRef}) { 145 $ElementSymbol = $ElementsRef->[$Index]; 146 $ElementComposition = $ElementCompositionRef->[$Index]; 147 $ElementComposition = sprintf("%.${Precision}f", $ElementComposition); 148 149 $Composition .= ($Composition) ? '; ' : ''; 150 $Composition .= "${ElementSymbol}: ${ElementComposition}%"; 151 } 152 153 return $Composition; 154 } 155 156 # 157 # Get elements and their count... 158 # 159 sub GetElementsAndCount { 160 my($MolecularFormula) = @_; 161 my($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg); 162 163 ($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg) = _ProcessMolecularFormula($MolecularFormula); 164 165 return ($FormulaElementsRef, $FormulaElementCountRef); 166 } 167 168 # 169 # Is it a valid molecular formula? 170 # 171 sub IsMolecularFormula { 172 my($MolecularFormula, $PrintErrorMsg, $Status, $FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg); 173 174 ($MolecularFormula) = @_; 175 176 ($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg) = _ProcessMolecularFormula($MolecularFormula); 177 $Status = (defined($FormulaElementsRef) && defined($FormulaElementCountRef)) ? 1 : 0; 178 179 return (wantarray ? ($Status, $ErrorMsg) : $Status); 180 } 181 182 # 183 # Process molecular formula. For a valid formula, return references to arrays conatining elements 184 # and element count; otherwsie, return undef. 185 # 186 sub _ProcessMolecularFormula { 187 my($MolecularFormula) = @_; 188 my($ErrorMsg) = ''; 189 190 $MolecularFormula = _CleanUpFormula($MolecularFormula); 191 192 # Make sure it only contains numbers and letters... 193 if ($MolecularFormula =~ /[^a-zA-Z0-9\(\)\[\]]/) { 194 $ErrorMsg = 'Molecular formula contains characters other than a-zA-Z0-9'; 195 return (undef, undef, $ErrorMsg); 196 } 197 198 # Parse the formula... 199 my($ElementSpec, $FormulaElementSpec, $Spec, $ElementSymbol, $ElementCount, @FormulaElements, @ElementCount, %FormulaElementsToCountMap, @SubFormulaElements, %SubFormulaElementsToCountMap); 200 201 @FormulaElements = (); @ElementCount = (); 202 %FormulaElementsToCountMap = (); 203 204 # Setup element symbol and count regular expression... 205 # IUPAC: http://www.iupac.org/reports/provisional/abstract04/RB-prs310804/Chap4-3.04.pdf 206 # 207 208 $FormulaElementSpec = qr/ 209 \G( # $1 210 (?: 211 ([A-Z][a-z]?) # Two or one letter element symbol; $2 212 ([0-9]*) # Optionally followed by element count; $3 213 ) 214 | \( | \[ 215 | \)[0-9]* | \][0-9]* 216 | . 217 ) 218 /x; 219 220 my($ProcessingParenthesis); 221 $ProcessingParenthesis = 0; 222 # Go over the formula... 223 FORMULA: while ($MolecularFormula =~ /$FormulaElementSpec/gx) { 224 ($Spec, $ElementSymbol, $ElementCount) = ($1, $2, $3); 225 226 # Handle parenthesis in formula to indicate repeating units... 227 if ($Spec =~ /^(\(|\[)/) { 228 if ($ProcessingParenthesis) { 229 $ErrorMsg = "Molecular formula contains multiple level of () or []"; 230 return (undef, undef, $ErrorMsg); 231 } 232 $ProcessingParenthesis = 1; 233 @SubFormulaElements = (); 234 %SubFormulaElementsToCountMap = (); 235 next FORMULA; 236 } 237 elsif ($Spec =~ /^(\)|\])/) { 238 $ProcessingParenthesis = 0; 239 240 # Retrieve repeat count and move data to @FormulaElements and %FormulaElementsToCountMap; 241 my($RepeatCount, $Symbol, $Count); 242 $RepeatCount = $Spec; 243 $RepeatCount =~ s/(\)|\])//g; 244 if (!$RepeatCount) { 245 $RepeatCount = 1; 246 } 247 # Copy data... 248 for $Symbol (@SubFormulaElements) { 249 $Count = $SubFormulaElementsToCountMap{$Symbol} * $RepeatCount; 250 _SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $Symbol, $Count); 251 } 252 253 # Get ready again... 254 @SubFormulaElements = (); 255 %SubFormulaElementsToCountMap = (); 256 257 next FORMULA; 258 } 259 260 # Retrieve element symbol and count... 261 $ElementSymbol = ($Spec && !$ElementSymbol) ? $Spec : ($ElementSymbol ? $ElementSymbol : ''); 262 $ElementCount = $ElementCount ? $ElementCount : 1; 263 if (!PeriodicTable::IsElement($ElementSymbol)) { 264 $ErrorMsg = "Molecular formula contains unknown elemental symbol $ElementSymbol"; 265 return (undef, undef, $ErrorMsg); 266 } 267 268 if ($ProcessingParenthesis) { 269 _SetupFormulaElementData(\@SubFormulaElements, \%SubFormulaElementsToCountMap, $ElementSymbol, $ElementCount); 270 } 271 else { 272 _SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $ElementSymbol, $ElementCount); 273 } 274 } 275 276 # Setup element count array... 277 for $ElementSymbol (@FormulaElements) { 278 $ElementCount = $FormulaElementsToCountMap{$ElementSymbol}; 279 push @ElementCount, $ElementCount; 280 } 281 282 # Make sure it all adds up to 100%; otherwise, adjust the last value.. 283 284 return (\@FormulaElements, \@ElementCount, $ErrorMsg); 285 } 286 287 # Clean it up... 288 sub _CleanUpFormula { 289 my($MolecularFormula) = @_; 290 #Take out any spaces... 291 $MolecularFormula =~ s/ //g; 292 293 # Eliminate any charge specifications: +, - or [1-9]+[+-] 294 # e.g NO+ [Al(H2O)6]3+ [H2NO3]+ 295 if ($MolecularFormula =~ /[\+\-]/) { 296 if ($MolecularFormula =~ /\][0-9]+[\+\-]/) { 297 # Bracket followed optionally by number and then, +/- ... 298 # [Al(H2O)6]3+ ... 299 $MolecularFormula =~ s/\][0-9]+[\+\-]/\]/g; 300 } 301 elsif ($MolecularFormula =~ /[\+\-][0-9]*/) { 302 # +/- followed optionally by a number... 303 # C37H42N2O6+2, Cu+ 304 $MolecularFormula =~ s/[\+\-][0-9]*//g; 305 } 306 } 307 308 # Eliminate any brackets - ] or ) - not followed by numbers: 309 # e.g. Li[H2PO4] 310 if ($MolecularFormula !~ /\][0-9]+/) { 311 $MolecularFormula =~ s/[\[\]]//g; 312 } 313 if ($MolecularFormula !~ /\)[0-9]+/) { 314 $MolecularFormula =~ s/[\(\)]//g; 315 } 316 # Change adducts to parenthesis format... 317 # Na2CO3.10H2O -> Na2CO3(H2O)10 318 # 3CdSO4.8H2O -> (CdSO4)3(H2O)8 319 if ($MolecularFormula =~ /\./) { 320 my($SubFormula, $Count, $Spec); 321 my(@MolecularFormulaSplits) = split /\./, $MolecularFormula; 322 $MolecularFormula = ''; 323 for $SubFormula (@MolecularFormulaSplits) { 324 ($Count, $Spec) = $SubFormula =~ /^([0-9]*)(.*?)$/; 325 if ($Count) { 326 $MolecularFormula .= "(${Spec})${Count}"; 327 } 328 else { 329 $MolecularFormula .= $Spec; 330 } 331 } 332 } 333 334 return $MolecularFormula; 335 } 336 337 # Store the element and count... 338 sub _SetupFormulaElementData { 339 my($ElementsRef, $ElementsToCountMapRef, $Element, $Count) = @_; 340 341 if (exists $ElementsToCountMapRef->{$Element}) { 342 $ElementsToCountMapRef->{$Element} += $Count; 343 } 344 else { 345 push @{$ElementsRef}, $Element; 346 $ElementsToCountMapRef->{$Element} = $Count; 347 } 348 } 349