MayaChemTools

   1 package NucleicAcids;
   2 #
   3 # $RCSfile: NucleicAcids.pm,v $
   4 # $Date: 2008/04/25 00:00:46 $
   5 # $Revision: 1.13 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 use 5.006;
  29 use strict;
  30 use Carp;
  31 use Text::ParseWords;
  32 use TextUtil;
  33 use FileUtil;
  34 
  35 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  36 
  37 $VERSION = '1.00';
  38 @ISA = qw(Exporter);
  39 @EXPORT = qw();
  40 @EXPORT_OK = qw(GetNucleicAcids GetNucleicAcidsByType GetNucleicAcidPropertiesData GetNucleicAcidPropertiesNames IsNucleicAcid IsNucleicAcidProperty IsNucleicAcidType);
  41 
  42 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  43 
  44 #
  45 # Load nucleic acids data...
  46 #
  47 my(%NucleicAcidDataMap, %NucleicAcidCodeMap, %NucleicAcidOtherCodeMap, %NucleicAcidNameMap, @NucleicAcidCodes, @NucleicAcidPropertyNames, %NucleicAcidPropertyNamesMap, %NucleicAcidTypesMap);
  48 _LoadNucleicAcidsData();
  49 
  50 #
  51 # Get a list of all known nucleic acids as one of these values:
  52 # code or nucleic acid name...
  53 #
  54 sub GetNucleicAcids {
  55   my($NameType, $Code, $Name, @NucleicAcidNames);
  56 
  57   $NameType = 'Code';
  58   if (@_ >= 1) {
  59     ($NameType) = @_;
  60   }
  61 
  62   # Collect names...
  63   @NucleicAcidNames = ();
  64   for $Code (@NucleicAcidCodes) {
  65     NAME : {
  66       if ($NameType =~ /^Name$/i) {$Name = $NucleicAcidDataMap{$Code}{Name}; last NAME; }
  67       $Name = $Code;
  68     }
  69     push @NucleicAcidNames, $Name;
  70   }
  71 
  72   return (wantarray ? @NucleicAcidNames : \@NucleicAcidNames);
  73 }
  74 
  75 #
  76 # Get a list of all known nucleic acids by one of these specified types:
  77 # Nucleobase, Nucleoside, Deoxynucleoside, Nucleotide, Deoxynucleotide. Default: Nucleoside
  78 #
  79 sub GetNucleicAcidsByType {
  80   my($NameType, $Type, $Code, $Name, @NucleicAcidNames);
  81 
  82   $Type = 'Nucleoside';
  83   $NameType = 'Code';
  84   if (@_ == 2) {
  85     ($Type, $NameType) = @_;
  86   }
  87   elsif (@_ == 1) {
  88     ($Type) = @_;
  89   }
  90 
  91   # Collect names...
  92   @NucleicAcidNames = ();
  93   CODE: for $Code (@NucleicAcidCodes) {
  94     if ($NucleicAcidDataMap{$Code}{Type} !~ /^$Type$/i ) {
  95       next CODE;
  96     }
  97     NAME : {
  98       if ($NameType =~ /^Name$/i) {$Name = $NucleicAcidDataMap{$Code}{Name}; last NAME; }
  99       $Name = $Code;
 100     }
 101     push @NucleicAcidNames, $Name;
 102   }
 103 
 104   return (wantarray ? @NucleicAcidNames : \@NucleicAcidNames);
 105 }
 106 
 107 #
 108 # Get all available properties data for an nucleic acid using any of these symbols:
 109 # code, other code or name.
 110 #
 111 # A reference to a hash array is returned with keys and values representing property
 112 # name and its values respectively.
 113 #
 114 sub GetNucleicAcidPropertiesData {
 115   my($NucleicAcidID) = @_;
 116   my($Code);
 117 
 118   if ($Code = _ValidateNucleicAcidID($NucleicAcidID)) {
 119     return \%{$NucleicAcidDataMap{$Code}};
 120   }
 121   else {
 122     return undef;
 123   }
 124 }
 125 
 126 #
 127 # Get names of all available nucleic acid properties. A reference to  an array containing
 128 # names of all available properties is returned.
 129 #
 130 sub GetNucleicAcidPropertiesNames {
 131   my($Mode);
 132   my($PropertyName, @PropertyNames);
 133 
 134   $Mode = 'ByGroup';
 135   if (@_ == 1) {
 136     ($Mode) = @_;
 137   }
 138 
 139   @PropertyNames = ();
 140   if ($Mode =~ /^Alphabetical$/i) {
 141     my($PropertyName);
 142     # Code, OtherCodes and Name are always listed first...
 143     push @PropertyNames, qw(Code OtherCodes Name);
 144     for $PropertyName (sort keys %NucleicAcidPropertyNamesMap) {
 145       if ($PropertyName !~ /^(Code|OtherCodes|Name)$/) {
 146 	push @PropertyNames, $PropertyName;
 147       }
 148     }
 149   }
 150   else {
 151     push @PropertyNames, @NucleicAcidPropertyNames;
 152   }
 153   return (wantarray ? @PropertyNames : \@PropertyNames);
 154 }
 155 
 156 #
 157 # Is it a known nucleic acid? Input is either a code or a name
 158 #
 159 sub IsNucleicAcid {
 160   my($NucleicAcidID) = @_;
 161   my($Status);
 162 
 163   $Status = (_ValidateNucleicAcidID($NucleicAcidID)) ? 1 : 0;
 164 
 165   return $Status;
 166 }
 167 
 168 #
 169 # Is it an available nucleic acid property?
 170 #
 171 sub IsNucleicAcidProperty {
 172   my($PropertyName) = @_;
 173   my($Status);
 174 
 175   $Status = (exists($NucleicAcidPropertyNamesMap{$PropertyName})) ? 1 : 0;
 176 
 177   return $Status;
 178 }
 179 
 180 #
 181 # Is it an available nucleic acid type?
 182 #
 183 sub IsNucleicAcidType {
 184   my($Type) = @_;
 185   my($Status);
 186 
 187   $Status = (exists($NucleicAcidTypesMap{lc($Type)})) ? 1 : 0;
 188 
 189   return $Status;
 190 }
 191 
 192 #
 193 # Implents GetNucleicAcid<PropertyName> for a valid proprty name.
 194 #
 195 sub AUTOLOAD {
 196   my($NucleicAcidID) = @_;
 197   my($FunctionName, $PropertyName, $PropertyValue, $Code);
 198 
 199   $PropertyValue = undef;
 200 
 201   use vars qw($AUTOLOAD);
 202   $FunctionName = $AUTOLOAD;
 203   $FunctionName =~ s/.*:://;
 204 
 205   # Only Get<PropertyName> functions are supported...
 206   if ($FunctionName !~ /^Get/) {
 207     croak "Error: Function, NucleicAcid::$FunctionName, is not supported by AUTOLOAD in NucleicAcid module: Only Get<PropertyName> functions are implemented...";
 208   }
 209 
 210   $PropertyName = $FunctionName;
 211   $PropertyName =~  s/^GetNucleicAcid//;
 212   if (!exists $NucleicAcidPropertyNamesMap{$PropertyName}) {
 213     croak "Error: Function, NucleicAcid::$FunctionName, is not supported by AUTOLOAD in NucleicAcid module: Unknown nucleic acid property name, $PropertyName, specified...";
 214   }
 215 
 216   if (!($Code = _ValidateNucleicAcidID($NucleicAcidID))) {
 217     return undef;
 218   }
 219   $PropertyValue = $NucleicAcidDataMap{$Code}{$PropertyName};
 220   return $PropertyValue;
 221 }
 222 
 223 #
 224 # Load NucleicAcidsData.csv files from <MayaChemTools>/lib directory...
 225 #
 226 sub _LoadNucleicAcidsData {
 227   my($NucleicAcidsDataFile, $MayaChemToolsLibDir);
 228 
 229   $MayaChemToolsLibDir = GetMayaChemToolsLibDirName();
 230 
 231   $NucleicAcidsDataFile =  "$MayaChemToolsLibDir" . "/data/NucleicAcidsData.csv";
 232 
 233   if (! -e "$NucleicAcidsDataFile") {
 234     croak "Error: MayaChemTools package file, $NucleicAcidsDataFile, is missing: Possible installation problems...";
 235   }
 236 
 237   _LoadData($NucleicAcidsDataFile);
 238 }
 239 
 240 #
 241 # Load NucleicAcidsData.csv file from <MayaChemTools>/lib directory...
 242 #
 243 sub _LoadData {
 244   my($NucleicAcidsDataFile) = @_;
 245 
 246   %NucleicAcidDataMap = ();
 247   @NucleicAcidCodes = ();
 248   @NucleicAcidPropertyNames = ();
 249   %NucleicAcidPropertyNamesMap = ();
 250   %NucleicAcidCodeMap = ();
 251   %NucleicAcidOtherCodeMap = ();
 252   %NucleicAcidNameMap = ();
 253   %NucleicAcidTypesMap = ();
 254 
 255   # Load property data for all nucleic acids...
 256   #
 257   # File Format:
 258   # "Code","OtherCodes","BasePair","Name","Type","ChemicalFormula","ChemicalFormulaAtpH7.5","MolecularWeight","ExactMass","ElementalComposition"
 259   #
 260   my($Code, $OtherCodes, $NucleicAcidName, $Line, $NumOfCols, $InDelim, $Index, $Name, $Value, $Units, @LineWords, @ColLabels);
 261 
 262   $InDelim = "\,";
 263   open NUCLEICACIDSDATAFILE, "$NucleicAcidsDataFile" or croak "Couldn't open $NucleicAcidsDataFile: $! ...";
 264 
 265   # Skip lines up to column labels...
 266   LINE: while ($Line = GetTextLine(\*NUCLEICACIDSDATAFILE)) {
 267     if ($Line !~ /^#/) {
 268       last LINE;
 269     }
 270   }
 271   @ColLabels= quotewords($InDelim, 0, $Line);
 272   $NumOfCols = @ColLabels;
 273 
 274   # Extract property names from column labels...
 275   @NucleicAcidPropertyNames = ();
 276   for $Index (0 .. $#ColLabels) {
 277     $Name = $ColLabels[$Index];
 278     push @NucleicAcidPropertyNames, $Name;
 279 
 280     # Store property names...
 281     $NucleicAcidPropertyNamesMap{$Name} = $Name;
 282   }
 283 
 284   # Process nucleic acid data...
 285   LINE: while ($Line = GetTextLine(\*NUCLEICACIDSDATAFILE)) {
 286     if ($Line =~ /^#/) {
 287       next LINE;
 288     }
 289     @LineWords = ();
 290     @LineWords = quotewords($InDelim, 0, $Line);
 291     if (@LineWords != $NumOfCols) {
 292       croak "Error: The number of data fields, @LineWords, in $NucleicAcidsDataFile must be $NumOfCols.\nLine: $Line...";
 293     }
 294     $Code = $LineWords[0]; $OtherCodes = $LineWords[1]; $NucleicAcidName = $LineWords[3];
 295     if (exists $NucleicAcidDataMap{$Code}) {
 296       carp "Warning: Ignoring data for nucleic acid $Code: It has already been loaded.\nLine: $Line....";
 297       next LINE;
 298     }
 299 
 300     # Store all the values...
 301     push @NucleicAcidCodes, $Code;
 302     %{$NucleicAcidDataMap{$Code}} = ();
 303     for $Index (0 .. $#LineWords) {
 304       $Name = $NucleicAcidPropertyNames[$Index];
 305       $Value = $LineWords[$Index];
 306       $NucleicAcidDataMap{$Code}{$Name} = $Value;
 307     }
 308   }
 309   close NUCLEICACIDSDATAFILE;
 310 
 311   # Setup one letter and nucleic acid name maps...
 312   _SetupNucleicAcidIDMap();
 313 }
 314 
 315 #
 316 # Setup lowercase other codes and name maps pointing
 317 # to code as show in data file.
 318 #
 319 sub _SetupNucleicAcidIDMap {
 320   my($Code, @OtherCodes, $OtherCode, $NucleicAcidName, $NucleicAcidType);
 321 
 322   %NucleicAcidCodeMap = ();
 323   %NucleicAcidOtherCodeMap = ();
 324   %NucleicAcidNameMap = ();
 325   %NucleicAcidTypesMap = ();
 326 
 327   for $Code (keys %NucleicAcidDataMap) {
 328     $NucleicAcidCodeMap{lc($Code)} = $Code;
 329 
 330     $NucleicAcidName = $NucleicAcidDataMap{$Code}{Name};
 331     $NucleicAcidNameMap{lc($NucleicAcidName)} = $Code;
 332 
 333     $NucleicAcidType = $NucleicAcidDataMap{$Code}{Type};
 334     if (! exists $NucleicAcidTypesMap{$NucleicAcidType}) {
 335       $NucleicAcidTypesMap{lc($NucleicAcidType)} = $NucleicAcidType;
 336     }
 337 
 338     @OtherCodes = split /\,/, $NucleicAcidDataMap{$Code}{OtherCodes};
 339     OTHERCODE: for $OtherCode (@OtherCodes) {
 340       if (!$OtherCode) {
 341 	next OTHERCODE;
 342       }
 343       $OtherCode = RemoveLeadingAndTrailingWhiteSpaces($OtherCode);
 344       $NucleicAcidOtherCodeMap{lc($OtherCode)} = $Code;
 345     }
 346   }
 347 }
 348 
 349 # Validate Nucleic acid ID...
 350 sub _ValidateNucleicAcidID {
 351   my($NucleicAcidID) = @_;
 352   my($Code) = undef;
 353 
 354   if (exists $NucleicAcidCodeMap{lc($NucleicAcidID)}) {
 355     $Code = $NucleicAcidCodeMap{lc($NucleicAcidID)};
 356   }
 357   elsif (exists $NucleicAcidOtherCodeMap{lc($NucleicAcidID)}) {
 358     $Code = $NucleicAcidOtherCodeMap{lc($NucleicAcidID)};
 359   }
 360   elsif (exists $NucleicAcidNameMap{lc($NucleicAcidID)}) {
 361     $Code = $NucleicAcidNameMap{lc($NucleicAcidID)};
 362   }
 363   return $Code;
 364 }
 365 
 366