package NucleicAcids;
#
# $RCSfile: NucleicAcids.pm,v $
# $Date: 2008/04/25 00:00:46 $
# $Revision: 1.13 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2008 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#
use 5.006;
use strict;
use Carp;
# Loaded explicitly because @ISA names Exporter below; without this line the
# module's import() only works if some other module has already loaded Exporter.
use Exporter;
use Text::ParseWords;
use TextUtil;
use FileUtil;

use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

$VERSION = '1.00';
@ISA = qw(Exporter);

# Nothing is exported by default; callers request functions by name or via the :all tag.
@EXPORT = qw();
@EXPORT_OK = qw(GetNucleicAcids GetNucleicAcidsByType GetNucleicAcidPropertiesData GetNucleicAcidPropertiesNames IsNucleicAcid IsNucleicAcidProperty IsNucleicAcidType);

%EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);

#
# Load nucleic acids data...
#
# These file-scoped lexicals are shared by every sub in this file and are
# populated once, when the module is loaded, by the call below.
#
my(%NucleicAcidDataMap, %NucleicAcidCodeMap, %NucleicAcidOtherCodeMap, %NucleicAcidNameMap, @NucleicAcidCodes, @NucleicAcidPropertyNames, %NucleicAcidPropertyNamesMap, %NucleicAcidTypesMap);
_LoadNucleicAcidsData();

#
# Get a list of all known nucleic acids as one of these values:
# code or nucleic acid name...
#
# GetNucleicAcids([$NameType])
#
#   $NameType - 'Name' (matched case-insensitively) returns nucleic acid names;
#               any other value, or no argument, returns codes. Default: 'Code'.
#
# All loaded nucleic acids are returned in the order their codes were stored at
# load time: as a list in list context, otherwise as a reference to an array.
#
sub GetNucleicAcids {
  my($NameType, $UseNames, @NucleicAcidNames);

  $NameType = 'Code';
  if (@_ >= 1) {
    ($NameType) = @_;
  }

  # The name type does not change per entry, so it is tested once up front...
  $UseNames = ($NameType =~ /^Name$/i) ? 1 : 0;

  # Collect names...
  @NucleicAcidNames = ();
  for my $Code (@NucleicAcidCodes) {
    push @NucleicAcidNames, ($UseNames ? $NucleicAcidDataMap{$Code}{Name} : $Code);
  }

  return (wantarray ? @NucleicAcidNames : \@NucleicAcidNames);
}

#
# Get a list of all known nucleic acids by one of these specified types:
# Nucleobase, Nucleoside, Deoxynucleoside, Nucleotide, Deoxynucleotide. Default: Nucleoside
#
# GetNucleicAcidsByType([$Type, [$NameType]])
#
#   $Type     - type to select, compared case-insensitively against the Type
#               property of each nucleic acid. Default: 'Nucleoside'.
#   $NameType - 'Name' (matched case-insensitively) returns nucleic acid names;
#               any other value returns codes. Default: 'Code'.
#
# The defaults are kept for both parameters unless exactly one or two arguments
# are passed. Returns a list in list context, otherwise a reference to an array.
#
sub GetNucleicAcidsByType {
  my($NameType, $Type, $UseNames, @NucleicAcidNames);

  $Type = 'Nucleoside';
  $NameType = 'Code';
  if (@_ == 2) {
    ($Type, $NameType) = @_;
  }
  elsif (@_ == 1) {
    ($Type) = @_;
  }

  $UseNames = ($NameType =~ /^Name$/i) ? 1 : 0;

  # Collect names...
  @NucleicAcidNames = ();
  CODE: for my $Code (@NucleicAcidCodes) {
    # \Q...\E makes the caller-supplied type match literally: without it, regex
    # metacharacters in $Type could select unintended types or fail to compile.
    if ($NucleicAcidDataMap{$Code}{Type} !~ /^\Q$Type\E$/i) {
      next CODE;
    }
    push @NucleicAcidNames, ($UseNames ? $NucleicAcidDataMap{$Code}{Name} : $Code);
  }

  return (wantarray ? @NucleicAcidNames : \@NucleicAcidNames);
}

#
# Get all available properties data for a nucleic acid using any of these symbols:
# code, other code or name.
#
# A reference to a hash array is returned with keys and values representing property
# name and its values respectively. The reference points at this module's own data
# for the nucleic acid, not at a copy, so changes made through it are seen by later
# calls. undef is returned for an unknown nucleic acid ID.
#
sub GetNucleicAcidPropertiesData {
  my($NucleicAcidID) = @_;

  my $Code = _ValidateNucleicAcidID($NucleicAcidID);
  if (!$Code) {
    # Explicit undef (rather than a bare return) is kept so that existing
    # list-context callers still receive exactly one value.
    return undef;
  }
  return $NucleicAcidDataMap{$Code};
}

#
# Get names of all available nucleic acid properties. A reference to an array containing
# names of all available properties is returned.
#
# GetNucleicAcidPropertiesNames([$Mode])
#
#   $Mode - 'Alphabetical' (matched case-insensitively) lists Code, OtherCodes and
#           Name first, followed by the remaining property names in sorted order;
#           any other value, or a call without exactly one argument, returns the
#           names in the order of the data file's column labels. Default: 'ByGroup'.
#
# Returns a list in list context, otherwise a reference to an array.
#
sub GetNucleicAcidPropertiesNames {
  my($Mode, @PropertyNames);

  $Mode = 'ByGroup';
  if (@_ == 1) {
    ($Mode) = @_;
  }

  @PropertyNames = ();
  if ($Mode =~ /^Alphabetical$/i) {
    # Code, OtherCodes and Name are always listed first...
    push @PropertyNames, qw(Code OtherCodes Name);
    for my $PropertyName (sort keys %NucleicAcidPropertyNamesMap) {
      if ($PropertyName !~ /^(Code|OtherCodes|Name)$/) {
        push @PropertyNames, $PropertyName;
      }
    }
  }
  else {
    push @PropertyNames, @NucleicAcidPropertyNames;
  }
  return (wantarray ? @PropertyNames : \@PropertyNames);
}

#
# Is it a known nucleic acid? Input is a code, an other code or a name,
# compared case-insensitively. Returns 1 or 0.
#
sub IsNucleicAcid {
  my($NucleicAcidID) = @_;
  my($Status);

  $Status = (_ValidateNucleicAcidID($NucleicAcidID)) ? 1 : 0;

  return $Status;
}

#
# Is it an available nucleic acid property? The property name is compared
# exactly (case-sensitively) against the data file's column labels. Returns 1 or 0.
#
sub IsNucleicAcidProperty {
  my($PropertyName) = @_;
  my($Status);

  $Status = (exists($NucleicAcidPropertyNamesMap{$PropertyName})) ? 1 : 0;

  return $Status;
}

#
# Is it an available nucleic acid type? The type is compared case-insensitively
# against the types found in the data file. Returns 1 or 0.
#
sub IsNucleicAcidType {
  my($Type) = @_;
  my($Status);

  $Status = (exists($NucleicAcidTypesMap{lc($Type)})) ? 1 : 0;

  return $Status;
}

#
# Implements GetNucleicAcid<PropertyName> for a valid property name.
#
# The first argument is a nucleic acid ID (code, other code or name). The value of
# the requested property is returned, or undef for an unknown nucleic acid ID.
# Croaks when the called function name does not start with Get or when the text
# left after removing the GetNucleicAcid prefix is not a known property name.
#
sub AUTOLOAD {
  my($NucleicAcidID) = @_;
  my($FunctionName, $PropertyName, $Code);

  use vars qw($AUTOLOAD);

  # $AUTOLOAD holds the fully qualified name; keep only the function part...
  $FunctionName = $AUTOLOAD;
  $FunctionName =~ s/.*:://;

  # Only Get<PropertyName> functions are supported...
  if ($FunctionName !~ /^Get/) {
    croak "Error: Function, NucleicAcids::$FunctionName, is not supported by AUTOLOAD in NucleicAcids module: Only Get<PropertyName> functions are implemented...";
  }

  $PropertyName = $FunctionName;
  $PropertyName =~ s/^GetNucleicAcid//;
  if (!exists $NucleicAcidPropertyNamesMap{$PropertyName}) {
    croak "Error: Function, NucleicAcids::$FunctionName, is not supported by AUTOLOAD in NucleicAcids module: Unknown nucleic acid property name, $PropertyName, specified...";
  }

  if (!($Code = _ValidateNucleicAcidID($NucleicAcidID))) {
    return undef;
  }
  return $NucleicAcidDataMap{$Code}{$PropertyName};
}

#
# Load NucleicAcidsData.csv file from <MayaChemTools>/lib/data directory...
#
# Croaks when the data file does not exist.
#
sub _LoadNucleicAcidsData {
  my($NucleicAcidsDataFile, $MayaChemToolsLibDir);

  $MayaChemToolsLibDir = GetMayaChemToolsLibDirName();

  $NucleicAcidsDataFile = "$MayaChemToolsLibDir" . "/data/NucleicAcidsData.csv";

  if (! -e "$NucleicAcidsDataFile") {
    croak "Error: MayaChemTools package file, $NucleicAcidsDataFile, is missing: Possible installation problems...";
  }

  _LoadData($NucleicAcidsDataFile);
}

#
# Load the specified nucleic acids data file into the module level maps...
#
# Every map is reset first. The first line not starting with # supplies the
# property names (column labels); each following line not starting with # supplies
# the property values for one nucleic acid, keyed by the value in its first column.
#
# Croaks when the file can't be opened or when a data line does not have as many
# fields as there are column labels. A line whose code has already been loaded is
# skipped with a warning.
#
sub _LoadData {
  my($NucleicAcidsDataFile) = @_;

  %NucleicAcidDataMap = ();
  @NucleicAcidCodes = ();
  @NucleicAcidPropertyNames = ();
  %NucleicAcidPropertyNamesMap = ();
  %NucleicAcidCodeMap = ();
  %NucleicAcidOtherCodeMap = ();
  %NucleicAcidNameMap = ();
  %NucleicAcidTypesMap = ();

  # Load property data for all nucleic acids...
  #
  # File Format:
  # "Code","OtherCodes","BasePair","Name","Type","ChemicalFormula","ChemicalFormulaAtpH7.5","MolecularWeight","ExactMass","ElementalComposition"
  #
  my($Code, $Line, $NumOfCols, $NumOfFields, $InDelim, $Name, $DataFileHandle, @LineWords, @ColLabels);

  $InDelim = "\,";

  # Three-argument open: the file name is never interpreted as an open mode. The
  # lexical handle is a glob reference, the same kind of value as \*FILEHANDLE,
  # and is closed automatically should this sub croak while reading.
  open $DataFileHandle, '<', $NucleicAcidsDataFile or croak "Couldn't open $NucleicAcidsDataFile: $! ...";

  # Skip lines up to column labels...
  LINE: while ($Line = GetTextLine($DataFileHandle)) {
    if ($Line !~ /^#/) {
      last LINE;
    }
  }
  @ColLabels = quotewords($InDelim, 0, $Line);
  $NumOfCols = @ColLabels;

  # Extract property names from column labels...
  @NucleicAcidPropertyNames = @ColLabels;
  for $Name (@ColLabels) {
    # Store property names...
    $NucleicAcidPropertyNamesMap{$Name} = $Name;
  }

  # Process nucleic acid data...
  LINE: while ($Line = GetTextLine($DataFileHandle)) {
    if ($Line =~ /^#/) {
      next LINE;
    }
    @LineWords = quotewords($InDelim, 0, $Line);

    # Report the field count itself; interpolating the array would print the
    # field values instead...
    $NumOfFields = @LineWords;
    if ($NumOfFields != $NumOfCols) {
      croak "Error: The number of data fields, $NumOfFields, in $NucleicAcidsDataFile must be $NumOfCols.\nLine: $Line...";
    }
    $Code = $LineWords[0];
    if (exists $NucleicAcidDataMap{$Code}) {
      carp "Warning: Ignoring data for nucleic acid $Code: It has already been loaded.\nLine: $Line....";
      next LINE;
    }

    # Store all the values: property names and fields line up by column index,
    # and the check above guarantees both lists have the same length...
    push @NucleicAcidCodes, $Code;
    %{$NucleicAcidDataMap{$Code}} = ();
    @{$NucleicAcidDataMap{$Code}}{@NucleicAcidPropertyNames} = @LineWords;
  }
  close $DataFileHandle;

  # Setup code, other code, name and type maps...
  _SetupNucleicAcidIDMap();
}

#
# Setup lowercase code, other codes, name and type maps pointing
# to code (or type) as shown in data file.
#
# Nucleic acids are processed in the order they were loaded, so when two of them
# share a name or an other code, the one loaded last is the one the ID resolves to.
# For types, the spelling seen first is the one kept.
#
sub _SetupNucleicAcidIDMap {
  my($Code, @OtherCodes, $OtherCode, $NucleicAcidName, $NucleicAcidType);

  %NucleicAcidCodeMap = ();
  %NucleicAcidOtherCodeMap = ();
  %NucleicAcidNameMap = ();
  %NucleicAcidTypesMap = ();

  # @NucleicAcidCodes holds exactly the keys of %NucleicAcidDataMap; going through
  # it instead of keys() makes the outcome independent of hash ordering...
  for $Code (@NucleicAcidCodes) {
    $NucleicAcidCodeMap{lc($Code)} = $Code;

    $NucleicAcidName = $NucleicAcidDataMap{$Code}{Name};
    $NucleicAcidNameMap{lc($NucleicAcidName)} = $Code;

    # The existence check has to use the same lowercase key as the store...
    $NucleicAcidType = $NucleicAcidDataMap{$Code}{Type};
    if (! exists $NucleicAcidTypesMap{lc($NucleicAcidType)}) {
      $NucleicAcidTypesMap{lc($NucleicAcidType)} = $NucleicAcidType;
    }

    @OtherCodes = split /\,/, $NucleicAcidDataMap{$Code}{OtherCodes};
    OTHERCODE: for $OtherCode (@OtherCodes) {
      if (!$OtherCode) {
        next OTHERCODE;
      }
      $OtherCode = RemoveLeadingAndTrailingWhiteSpaces($OtherCode);

      # A whitespace-only entry is empty once trimmed: don't let the empty
      # string become a valid nucleic acid ID...
      if (!length($OtherCode)) {
        next OTHERCODE;
      }
      $NucleicAcidOtherCodeMap{lc($OtherCode)} = $Code;
    }
  }
}

#
# Validate Nucleic acid ID...
#
# The ID is looked up case-insensitively as a code, then as an other code and
# finally as a name. The code as shown in the data file is returned for a known
# ID; undef is returned for an unknown or undefined ID.
#
sub _ValidateNucleicAcidID {
  my($NucleicAcidID) = @_;
  my($ID, $Map);

  if (!defined $NucleicAcidID) {
    return undef;
  }
  $ID = lc($NucleicAcidID);

  # Lookup order matters: a code takes precedence over an other code, which in
  # turn takes precedence over a name...
  for $Map (\%NucleicAcidCodeMap, \%NucleicAcidOtherCodeMap, \%NucleicAcidNameMap) {
    if (exists $Map->{$ID}) {
      return $Map->{$ID};
    }
  }
  return undef;
}
