MayaChemTools

   1 package SequenceFileUtil;
   2 #
   3 # $RCSfile: SequenceFileUtil.pm,v $
   4 # $Date: 2008/04/19 16:11:03 $
   5 # $Revision: 1.21 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 use 5.006;
  29 use strict;
  30 use Exporter;
  31 use Text::ParseWords;
  32 use TextUtil;
  33 use FileUtil;
  34 
  35 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  36 
  37 $VERSION = '1.00';
  38 @ISA = qw(Exporter);
  39 @EXPORT = qw(AreSequenceLengthsIdentical CalcuatePercentSequenceIdentity CalculatePercentSequenceIdentityMatrix GetLongestSequence GetShortestSequence GetSequenceLength IsGapResidue IsSupportedSequenceFile IsClustalWSequenceFile IsPearsonFastaSequenceFile IsMSFSequenceFile ReadSequenceFile RemoveSequenceGaps RemoveSequenceAlignmentGapColumns WritePearsonFastaSequenceFile);
  40 @EXPORT_OK = qw();
  41 
  42 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  43 
  44 # Compare lengths of all sequences...
  45 sub AreSequenceLengthsIdentical {
  46   my($SequencesDataRef) = @_;
  47   my($Status, $ID, $FirstID, $FirstSeqLen, $FirstDifferentLenID, $SeqLen);
  48 
  49   $Status = 1;
  50   $FirstID = '';
  51   $FirstDifferentLenID = '';
  52 
  53   ID: for $ID (@{$SequencesDataRef->{IDs}}) {
  54     if (!$FirstID) {
  55       $FirstID = $ID;
  56       $FirstSeqLen = length($SequencesDataRef->{Sequence}{$ID});
  57       next ID;
  58     }
  59     $SeqLen = length($SequencesDataRef->{Sequence}{$ID});
  60     if ($SeqLen != $FirstSeqLen) {
  61       $Status = 0;
  62       $FirstDifferentLenID = $ID;
  63       last ID;
  64     }
  65   }
  66   return ($Status);
  67 }
  68 
  69 # Calculate percent identity between two sequences. By default, gaps are ignored.
  70 sub CalcuatePercentSequenceIdentity {
  71   my($Sequence1, $Sequence2, $PercentIdentity, $IgnoreGaps, $Precision);
  72 
  73   $PercentIdentity = '';
  74   $Precision = 1;
  75   $IgnoreGaps = 1;
  76   if (@_ == 4) {
  77     ($Sequence1, $Sequence2, $IgnoreGaps, $Precision) = @_;
  78   }
  79   elsif (@_ == 3) {
  80     ($Sequence1, $Sequence2, $IgnoreGaps) = @_;
  81   }
  82   elsif (@_ == 2) {
  83     ($Sequence1, $Sequence2) = @_;
  84   }
  85   else {
  86     return $PercentIdentity;
  87   }
  88   if (!(IsNotEmpty($Sequence1) && IsNotEmpty($Sequence2))) {
  89     return $PercentIdentity;
  90   }
  91   my($Index, $Identity, $Sequence1Len, $Sequence2Len, $Residue1, $Residue2, $ResMatchCount, $ResCount);
  92 
  93   $Sequence1Len = length($Sequence1);
  94   $Sequence2Len = length($Sequence2);
  95 
  96   $ResMatchCount = 0;
  97   $ResCount = 0;
  98   RESIDUE: for $Index (0 .. ($Sequence1Len - 1)) {
  99     $Residue1 = substr($Sequence1, $Index, 1);
 100     $Residue2 = ($Index < $Sequence2Len) ? substr($Sequence2, $Index, 1) : '';
 101     if ($IgnoreGaps) {
 102       if ($Residue1 !~ /[A-Z]/i || $Residue2 !~ /[A-Z]/i) {
 103 	next RESIDUE;
 104       }
 105     }
 106     if ($Residue1 eq $Residue2) {
 107       $ResMatchCount++;
 108     }
 109     $ResCount++;
 110   }
 111   $Identity = $ResCount ? ($ResMatchCount/$ResCount) : 0.0;
 112   $PercentIdentity = sprintf("%.${Precision}f", ($Identity * 100));
 113 
 114   return $PercentIdentity;
 115 }
 116 
 117 # Calculate pairwise identify matrix for all the sequences and return a reference
 118 # to a hash with the following keys:
 119 #
 120 # {IDs} - Sequence IDs
 121 # {Count} - Number of IDs
 122 # {PercentIdentity}{$RowID}{$ColID} - Percent identify for a pair of sequences
 123 #
 124 sub CalculatePercentSequenceIdentityMatrix {
 125   my($SequencesDataRef, $IgnoreGaps, , $Precision, $ID, $RowID, $ColID, $RowIDSeq, $ColIDSeq, $PercentIdentity, %IdentityMatrixData);
 126 
 127   $IgnoreGaps = 1;
 128   $Precision = 1;
 129   if (@_ == 3) {
 130     ($SequencesDataRef, $IgnoreGaps, $Precision) = @_;
 131   }
 132   elsif (@_ == 2) {
 133     ($SequencesDataRef, $IgnoreGaps) = @_;
 134   }
 135   else {
 136     ($SequencesDataRef) = @_;
 137   }
 138 
 139   %IdentityMatrixData = ();
 140   @{$IdentityMatrixData{IDs}} = ();
 141   %{$IdentityMatrixData{PercentIdentity}} = ();
 142   $IdentityMatrixData{Count} = 0;
 143 
 144   for $ID (@{$SequencesDataRef->{IDs}}) {
 145     push @{$IdentityMatrixData{IDs}}, $ID;
 146     $IdentityMatrixData{Count} += 1;
 147   }
 148   # Initialize and calculate percent identity data values...
 149   for $RowID (@{$SequencesDataRef->{IDs}}) {
 150     %{$IdentityMatrixData{PercentIdentity}{$RowID}} = ();
 151     $RowIDSeq = $SequencesDataRef->{Sequence}{$RowID};
 152     for $ColID (@{$SequencesDataRef->{IDs}}) {
 153       $IdentityMatrixData{$RowID}{$ColID} = '';
 154       $ColIDSeq = $SequencesDataRef->{Sequence}{$ColID};
 155       $PercentIdentity = CalcuatePercentSequenceIdentity($RowIDSeq, $ColIDSeq, $IgnoreGaps, $Precision);
 156       $IdentityMatrixData{PercentIdentity}{$RowID}{$ColID} = $PercentIdentity;
 157     }
 158   }
 159   return \%IdentityMatrixData;
 160 }
 161 
 162 # Retrieve information about shortest sequence...
 163 sub GetShortestSequence {
 164   my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description);
 165 
 166   $IgnoreGaps = 1;
 167   if (@_ == 2) {
 168     ($SequencesDataRef, $IgnoreGaps) = @_;
 169   }
 170   else {
 171     ($SequencesDataRef) = @_;
 172   }
 173 
 174   ($ID, $Sequence, $SeqLen, $Description) =  _GetShortestOrLongestSequence($SequencesDataRef, 'Shortest', $IgnoreGaps);
 175   return ($ID, $Sequence, $SeqLen, $Description);
 176 }
 177 
 178 # Retrieve information about longest sequence..
 179 sub GetLongestSequence {
 180   my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description);
 181 
 182   $IgnoreGaps = 1;
 183   if (@_ == 2) {
 184     ($SequencesDataRef, $IgnoreGaps) = @_;
 185   }
 186   else {
 187     ($SequencesDataRef) = @_;
 188   }
 189 
 190   ($ID, $Sequence, $SeqLen, $Description) =  _GetShortestOrLongestSequence($SequencesDataRef, 'Longest', $IgnoreGaps);
 191   return ($ID, $Sequence, $SeqLen, $Description);
 192 }
 193 
 194 # Get sequence length...
 195 sub GetSequenceLength {
 196   my($Seq, $SeqLen, $IgnoreGaps);
 197 
 198   $SeqLen = ''; $IgnoreGaps = 1;
 199   if (@_ == 2) {
 200     ($Seq, $IgnoreGaps) = @_;
 201   }
 202   else {
 203     ($Seq) = @_;
 204   }
 205   if ($IgnoreGaps) {
 206     my($Index, $Residue);
 207     $SeqLen = 0;
 208     for $Index (0 .. (length($Seq) - 1)) {
 209       $Residue = substr($Seq, $Index, 1);
 210       if ($Residue =~ /[A-Z]/i) {
 211 	$SeqLen++;
 212       }
 213     }
 214   }
 215   else {
 216     $SeqLen = length($Seq);
 217   }
 218 
 219   return $SeqLen;
 220 }
 221 
 222 # Is it a gap residue...
 223 sub IsGapResidue {
 224   my($Residue) = @_;
 225   my($Status);
 226 
 227   $Status = ($Residue !~ /[A-Z]/i ) ? 1 : 0;
 228 
 229   return $Status;
 230 }
 231 
 232 # Is it a supported sequence file?
 233 #
 234 # Supported seqence formats are:
 235 #
 236 # ALN/ClustalW   .aln
 237 # GCG/MSF         .msf
 238 # PILEUP/MSF     .msf
 239 # Fasts(Pearson) .fasta, .fta
 240 # NBRF/PIR         .pir
 241 #
 242 sub IsSupportedSequenceFile {
 243   my($SequenceFile) = @_;
 244   my($Status, $SequenceFormat);
 245   $Status = 0; $SequenceFormat = 'NotSupported';
 246 
 247   SEQFORMAT: {
 248       if (IsClustalWSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'ClustalW'; last SEQFORMAT}
 249       if (IsPearsonFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'Pearson'; last SEQFORMAT}
 250       if (IsPIRFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'PIR'; last SEQFORMAT}
 251       if (IsMSFSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'MSF'; last SEQFORMAT}
 252       $Status = 0; $SequenceFormat = 'NotSupported';
 253   }
 254   return ($Status, $SequenceFormat);
 255 }
 256 
 257 # Is it a ClustalW multiple sequence sequence file...
 258 sub IsClustalWSequenceFile {
 259   my($SequenceFile) = @_;
 260   my($Status, $Line);
 261 
 262   $Status = 0;
 263 
 264   open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n";
 265   $Line = GetTextLine(\*SEQUENCEFILE);
 266   $Status = ($Line =~ /(ClustalW|Clustal W|Clustal)/i ) ? 1 : 0;
 267   close SEQUENCEFILE;
 268 
 269   return $Status;
 270 }
 271 
 272 # Is it a valid Pearson fasta sequence or alignment file?
 273 #
 274 sub IsPearsonFastaSequenceFile {
 275   my($FastaFile, $Line, $Status);
 276 
 277   ($FastaFile) = @_;
 278   $Status = 0;
 279 
 280   open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n";
 281   $Line = GetTextLine(\*FASTAFILE);
 282 
 283   # First line starts with > and the fourth character is not ';'; otherwise, it's
 284   # PIR FASTA format.
 285   if ($Line =~ /^>/) {
 286     my($FourthChar);
 287     $FourthChar = substr($Line, 3, 1);
 288     $Status = ($FourthChar !~ /\;/) ? 1 : 0;
 289   }
 290   close FASTAFILE;
 291 
 292   return $Status;
 293 }
 294 
 295 # Is it a valid NBRF/PIR fasta sequence or alignment file?
 296 #
 297 sub IsPIRFastaSequenceFile {
 298   my($FastaFile, $Line, $Status);
 299 
 300   ($FastaFile) = @_;
 301   $Status = 0;
 302 
 303   open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n";
 304   $Line = GetTextLine(\*FASTAFILE);
 305 
 306   # First line starts with > and the fourth character is ';'; otherwise, it's
 307   # a Pearson FASTA format.
 308   if ($Line =~ /^>/) {
 309     my($FourthChar);
 310     $FourthChar = substr($Line, 3, 1);
 311     $Status = ($FourthChar =~ /\;/) ? 1 : 0;
 312   }
 313   close FASTAFILE;
 314 
 315   return $Status;
 316 }
 317 
 318 # Is it a valid MSF sequence or alignment file?
 319 #
 320 sub IsMSFSequenceFile {
 321   my($MSFFile) = @_;
 322 
 323   open MSFFILE, "$MSFFile" or die "Couldn't open $MSFFile: $!\n";
 324 
 325   my($Line, $Status);
 326 
 327   $Status = 0;
 328   # Find a line that contains MSF: keyword and ends with '..'
 329   LINE: while ($Line = GetTextLine(\*MSFFILE)) {
 330     $Line = RemoveLeadingWhiteSpaces($Line);
 331     if ($Line =~ /MSF:/i && $Line =~ /\.\.[ ]*$/) {
 332       $Status = 1;
 333       last LINE;
 334     }
 335     elsif ($Line =~ /(!!AA_MULTIPLE_ALIGNMENT|!!NA_MULTIPLE_ALIGNMENT|PILEUP)/i) {
 336       # Pileup MSF...
 337       $Status = 1;
 338       last LINE;
 339     }
 340   }
 341   close MSFFILE;
 342 
 343   return $Status;
 344 }
 345 
 346 # Read sequence or sequence alignment file...
 347 sub ReadSequenceFile {
 348   my($SequenceFile) = @_;
 349 
 350   if (IsPearsonFastaSequenceFile($SequenceFile)) {
 351     return ReadPearsonFastaSequenceFile($SequenceFile);
 352   }
 353   elsif (IsPIRFastaSequenceFile($SequenceFile)) {
 354     return ReadPIRFastaSequenceFile($SequenceFile);
 355   }
 356   elsif (IsMSFSequenceFile($SequenceFile)) {
 357     return ReadMSFSequenceFile($SequenceFile);
 358   }
 359   elsif (IsClustalWSequenceFile($SequenceFile)) {
 360     return ReadClustalWSequenceFile($SequenceFile);
 361   }
 362   else {
 363     return undef;
 364   }
 365 }
 366 
 367 # Read file and setup alignment data...
 368 sub ReadClustalWSequenceFile {
 369   my($SequenceFile) = @_;
 370 
 371   return _ReadFileAndSetupSequencesData($SequenceFile, 'ClustalW');
 372 }
 373 
 374 # Read file and setup alignment data...
 375 sub ReadPearsonFastaSequenceFile {
 376   my($SequenceFile) = @_;
 377 
 378   return _ReadFileAndSetupSequencesData($SequenceFile, 'Pearson');
 379 }
 380 
 381 # Read file and setup alignment data...
 382 sub ReadPIRFastaSequenceFile {
 383   my($SequenceFile) = @_;
 384 
 385   return _ReadFileAndSetupSequencesData($SequenceFile, 'PIR');
 386 }
 387 
 388 
 389 # Read file and setup sequence data...
 390 sub ReadMSFSequenceFile {
 391   my($SequenceFile) = @_;
 392 
 393   return _ReadFileAndSetupSequencesData($SequenceFile, 'MSF');
 394 }
 395 
 396 # Write out a Pearson FASTA file...
 397 sub WritePearsonFastaSequenceFile {
 398   my($SequenceFileName, $SequenceDataRef, $MaxLength, $ID, $Description, $Sequence, $WrappedSequence);
 399 
 400   $MaxLength = 80;
 401   if (@_ == 3) {
 402     ($SequenceFileName, $SequenceDataRef, $MaxLength) = @_;
 403   }
 404   elsif (@_ == 2) {
 405     ($SequenceFileName, $SequenceDataRef) = @_;
 406   }
 407   open SEQUENCEFILE, ">$SequenceFileName" or die "Can't open $SequenceFileName: $!\n";
 408   for $ID (@{$SequenceDataRef->{IDs}}) {
 409     $Description = $SequenceDataRef->{Description}{$ID};
 410     $Sequence = $SequenceDataRef->{Sequence}{$ID};
 411     $WrappedSequence = WrapText($Sequence, $MaxLength, "\n");
 412 
 413     # Description also contains ID...
 414     print SEQUENCEFILE ">$Description\n";
 415     print SEQUENCEFILE "$WrappedSequence\n";
 416   }
 417   close SEQUENCEFILE;
 418 }
 419 
 420 # Get ID, Sequence and Length for smallest or longest sequence
 421 sub _GetShortestOrLongestSequence {
 422   my($SequencesDataRef, $SequenceType, $IgnoreGaps) = @_;
 423   my($ID, $Seq, $SeqLen, $Description, $FirstID, $FirstSeqLen, $CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
 424 
 425   ($ID, $Seq, $SeqLen) = ('', '', '');
 426   $FirstID = '';
 427 
 428   ID: for $CurrentID (@{$SequencesDataRef->{IDs}}) {
 429     $CurrentSeq = $IgnoreGaps ? RemoveSequenceGaps($SequencesDataRef->{Sequence}{$CurrentID}) : $SequencesDataRef->{Sequence}{$CurrentID};
 430     $CurrentSeqLen = GetSequenceLength($CurrentSeq, $IgnoreGaps);
 431     $CurrentDescription = $SequencesDataRef->{Description}{$CurrentID};
 432     if (!$FirstID) {
 433       $FirstID = $ID; $FirstSeqLen = $CurrentSeqLen;
 434       ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
 435       next ID;
 436     }
 437     if ($CurrentSeqLen != $SeqLen) {
 438       if (($SequenceType =~ /Shortest/i) && ($CurrentSeqLen < $SeqLen)) {
 439 	($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
 440       }
 441       elsif (($SequenceType =~ /Longest/i) && ($CurrentSeqLen > $SeqLen) ) {
 442 	($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
 443       }
 444     }
 445   }
 446   return ($ID, $Seq, $SeqLen, $Description);
 447 }
 448 
 449 # Remove gaps in the sequence and return new sequence...
 450 sub RemoveSequenceGaps {
 451   my($Seq) = @_;
 452   my($SeqWithoutGaps, $SeqLen, $Index, $Residue);
 453 
 454   $SeqWithoutGaps = '';
 455   $SeqLen = length($Seq);
 456   for $Index (0 .. ($SeqLen - 1)) {
 457     $Residue = substr($Seq, $Index, 1);
 458     if ($Residue =~ /[A-Z]/i) {
 459       $SeqWithoutGaps .= $Residue;
 460     }
 461   }
 462 
 463   return $SeqWithoutGaps;
 464 }
 465 
 466 # Using input alignment data map ref containing following keys, generate
 467 # a new hash with same set of keys after residue columns containg only
 468 # gaps have been removed:
 469 #
 470 # {IDs} : Array of IDs in order as they appear in file
 471 # {Count}: ID count...
 472 # {Description}{$ID} : Description data...
 473 # {Sequence}{$ID} : Sequence data...
 474 #
 475 sub RemoveSequenceAlignmentGapColumns {
 476   my($ID, $AlignmentDataMapRef, %NewAlignmentDataMap);
 477 
 478   ($AlignmentDataMapRef) = @_;
 479 
 480   %NewAlignmentDataMap = ();
 481   @{$NewAlignmentDataMap{IDs}} =();
 482   %{$NewAlignmentDataMap{Description}} =();
 483   %{$NewAlignmentDataMap{Sequence}} =();
 484   $NewAlignmentDataMap{Count} = 0;
 485 
 486   # Transfer ID and count information...
 487   for $ID (@{$AlignmentDataMapRef->{IDs}}) {
 488     push @{$NewAlignmentDataMap{IDs}}, $ID;
 489     $NewAlignmentDataMap{Description}{$ID} = $AlignmentDataMapRef->{Description}{$ID};
 490     $NewAlignmentDataMap{Sequence}{$ID} = '';
 491     $NewAlignmentDataMap{Count} += 1;
 492   }
 493 
 494   # Go over residue columns and transfer the data...
 495   my($FirstID, $FirstSeq, $FirstSeqLen, $Index, $Res, $GapColumn);
 496 
 497   $FirstID = $AlignmentDataMapRef->{IDs}[0];
 498   $FirstSeq = $AlignmentDataMapRef->{Sequence}{$FirstID};
 499   $FirstSeqLen = length($FirstSeq);
 500 
 501   RES: for $Index (0 .. ($FirstSeqLen - 1)) {
 502     # Is this a gap column?
 503     $GapColumn = 1;
 504     ID: for $ID (@{$AlignmentDataMapRef->{IDs}}) {
 505       $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1);
 506       if ($Res =~ /[A-Z]/i) {
 507 	$GapColumn = 0;
 508 	last ID;
 509       }
 510     }
 511     if ($GapColumn) {
 512       next RES;
 513     }
 514     # Transfer this residue...
 515     for $ID (@{$AlignmentDataMapRef->{IDs}}) {
 516       $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1);
 517       $NewAlignmentDataMap{Sequence}{$ID} .= $Res;
 518     }
 519   }
 520 
 521   return (\%NewAlignmentDataMap);
 522 }
 523 
 524 #
 525 # Read sequences file and return a reference to hash with the following keys:
 526 #
 527 # {IDs} - Array of sequence IDs
 528 # {Count} - Number of sequences
 529 # {Description}{$ID} - Sequence description
 530 # {Sequence}{$ID} - Sequence for a specific ID
 531 # {InputFileType} - Sequence file format
 532 # {ConservedAnnotation} - Conserved residue annonation
 533 #
 534 # Note:
 535 #   . Conserved residue annotation either exist in the input sequence alignment file or set
 536 #     for a file containing same number of residues for all the sequence using the following
 537 #     notation: * - Residue conserved; ' ' - Residue not conserved.
 538 #
 539 sub _ReadFileAndSetupSequencesData {
 540   my($SequenceFile, $SequenceType) = @_;
 541   my($SequenceDataMapRef);
 542 
 543   $SequenceDataMapRef = undef;
 544 
 545   # Read sequence file...
 546   $SequenceDataMapRef = '';
 547   if ($SequenceType =~ /^ClustalW$/i) {
 548     $SequenceDataMapRef = _ReadClustalWFile($SequenceFile);
 549   }
 550   elsif ($SequenceType =~ /^Pearson$/i) {
 551     $SequenceDataMapRef = _ReadPearsonFastaFile($SequenceFile);
 552   }
 553   elsif ($SequenceType =~ /^PIR$/i) {
 554     $SequenceDataMapRef = _ReadPIRFastaFile($SequenceFile);
 555   }
 556   elsif ($SequenceType =~ /^MSF$/i) {
 557     $SequenceDataMapRef = _ReadMSFFile($SequenceFile);
 558   }
 559   else {
 560     return $SequenceDataMapRef;
 561   }
 562 
 563   if (exists $SequenceDataMapRef->{ConservedAnnotation}) {
 564     return ($SequenceDataMapRef);
 565   }
 566   if (!(($SequenceDataMapRef->{Count} > 1) && (AreSequenceLengthsIdentical($SequenceDataMapRef)))) {
 567     return ($SequenceDataMapRef);
 568   }
 569 
 570   # Use the first sequence to setup an empty ConservedAnnotation key...
 571   # And mark fully conserved residues...
 572   #
 573   my($ID, $Sequence, $FirstSequence, $FirstSeqLen, $Res, $FirstRes, $ResConserved, $Index);
 574   $ID = $SequenceDataMapRef->{IDs}[0];
 575   $FirstSequence = $SequenceDataMapRef->{Sequence}{$ID};
 576   $FirstSeqLen = length($FirstSequence);
 577   $SequenceDataMapRef->{ConservedAnnotation} = '';
 578   for $Index (0 .. ($FirstSeqLen - 1)) {
 579     $FirstRes = '';
 580     $ResConserved = 1;
 581     ID: for $ID (@{$SequenceDataMapRef->{IDs}}) {
 582       $Sequence = $SequenceDataMapRef->{Sequence}{$ID};
 583       $Res = substr($Sequence, $Index, 1);
 584       if (!$FirstRes) {
 585 	$FirstRes = $Res;
 586 	next ID;
 587       }
 588       if (($Res !~ /[A-Z]/i) || ($Res ne $FirstRes)) {
 589 	$ResConserved = 0;
 590 	last ID;
 591       }
 592     }
 593     if ($ResConserved) {
 594       $SequenceDataMapRef->{ConservedAnnotation} .= '*';
 595     }
 596     else {
 597       $SequenceDataMapRef->{ConservedAnnotation} .= ' ';
 598     }
 599   }
 600 
 601   return ($SequenceDataMapRef);
 602 }
 603 
 604 # Read sequence data in ClustalW multiple sequence alignment file and
 605 # return a reference to hash with these keys and values:
 606 #
 607 # {IDs} - Array of sequence IDs
 608 # {Count} - Number of sequences
 609 # {Description}{$ID} - Sequence description
 610 # {Sequence}{$ID} - Sequence for a specific ID
 611 # {InputFileType} - Sequence file format
 612 # {ConservedAnnotation} - Conserved residue annonations: space, *, : , .
 613 #
 614 #
 615 #
 616 # And based on ClustalW/X manual, here is what the ConservedAnnonations mean:
 617 #
 618 # '*' indicates positions which have a single, fully conserved residue
 619 #
 620 # ':' indicates that one of the following 'strong' groups is fully conserved: STA
 621 #    NEQK NHQK NDEQ QHRK MILV MILF HY FYW
 622 
 623 # '.' indicates that one of the following 'weaker' groups is fully conserved:
 624 #     CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY
 625 #
 626 # These are all the positively scoring groups that occur in the Gonnet Pam250
 627 # matrix. The strong and weak groups are defined as strong score >0.5 and weak
 628 # score =<0.5 respectively.
 629 #
 630 sub _ReadClustalWFile {
 631   my($SequenceFile) = @_;
 632   my(%SequencesDataMap);
 633 
 634   # Initialize data...
 635   %SequencesDataMap = ();
 636   @{$SequencesDataMap{IDs}} = ();
 637   %{$SequencesDataMap{Description}} = ();
 638   %{$SequencesDataMap{Sequence}} = ();
 639   $SequencesDataMap{Count} = 0;
 640   $SequencesDataMap{ConservedAnnotation} = '';
 641   $SequencesDataMap{InputFileType} = 'ClustalW';
 642 
 643   open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n";
 644 
 645   my($Line, $LineLength, $AnnotationStart, $AnnotationLength, $Annotation, $Sequence, $SequenceLength, $ID, $IDIndex);
 646 
 647   # Ignore the header line...
 648   $Line = <SEQUENCEFILE>;
 649 
 650   LINE: while ($Line = GetTextLine(\*SEQUENCEFILE)) {
 651     if (($Line =~ /^[ \*\:\.]/) && ($Line !~ /[A-Z]/i)) {
 652       # Annotation for sequences: fully conserverd, weaker or stronger group conserverd.
 653       # Extract it and save...
 654       $LineLength = length($Line);
 655       $AnnotationStart = $LineLength - $SequenceLength;
 656       $AnnotationLength = $SequenceLength;
 657       $Annotation = substr($Line, $AnnotationStart, $AnnotationLength);
 658       $SequencesDataMap{ConservedAnnotation} .= $Annotation;
 659     }
 660     else {
 661       # Extract ID and sequences...
 662       ($ID, $Sequence)= $Line =~ /^[ ]*(.*?)[ ]+(.*?)[ 01-9]*$/;
 663       $Sequence =~ s/ //g;
 664       if (!($ID && $Sequence)) {
 665 	next LINE;
 666       }
 667 
 668       if (exists $SequencesDataMap{Sequence}{$ID}) {
 669 	# Append to existing alignment value...
 670 	$SequenceLength = length($Sequence);
 671 	$SequencesDataMap{Sequence}{$ID} .= $Sequence;
 672       }
 673       else {
 674 	# New alignment data...
 675 	$SequencesDataMap{Count} += 1;
 676 	push @{$SequencesDataMap{IDs}}, $ID;
 677 	$SequencesDataMap{Description}{$ID} = $ID;
 678 	$SequencesDataMap{Sequence}{$ID} = $Sequence;
 679 	$SequenceLength = length($Sequence);
 680       }
 681     }
 682   }
 683   close SEQUENCEFILE;
 684   return (\%SequencesDataMap);
 685 }
 686 
 687 # Read Pearson fasta file and return a reference to hash with these keys:
 688 #
 689 # {IDs} - Array of sequence IDs
 690 # {Count} - Number of sequences
 691 # {Description}{$ID} - Sequence description
 692 # {Sequence}{$ID} - Sequence for a specific ID
 693 # {InputFileType} - Sequence file format
 694 # {ConservedAnnotation} - Conserved residue annonation
 695 #
 696 sub _ReadPearsonFastaFile {
 697   my($FastaFileName, $ID, $Description, $Line, $IgnoreID, @LineWords, %FastaDataMap);
 698 
 699   ($FastaFileName) = @_;
 700 
 701   %FastaDataMap = ();
 702   @{$FastaDataMap{IDs}} =();
 703   %{$FastaDataMap{Description}} =();
 704   %{$FastaDataMap{Sequence}} =();
 705   $FastaDataMap{Count} = 0;
 706   $FastaDataMap{InputFileType} = 'Pearson';
 707 
 708   open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n";
 709   $ID = '';
 710   $IgnoreID = 0;
 711   LINE: while ($Line = GetTextLine(\*FASTAFILE)) {
 712     if ($Line =~ /^\>/) {
 713       # Start of a new ID...
 714       $Line =~ s/^\>//;
 715       $Line = RemoveLeadingWhiteSpaces($Line);
 716       @LineWords = ();
 717       @LineWords = split / /, $Line;
 718 
 719       $ID = $LineWords[0];
 720       $ID =~ s/ //g;
 721       $Description = $Line;
 722 
 723       $IgnoreID = 0;
 724       if (exists $FastaDataMap{Sequence}{$ID}) {
 725 	$IgnoreID = 1;
 726 	warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
 727 	next LINE;
 728       }
 729       push @{$FastaDataMap{IDs}}, $ID;
 730       $FastaDataMap{Description}{$ID} = $Description;
 731       $FastaDataMap{Count} += 1;
 732       next LINE;
 733     }
 734     if ($IgnoreID) { next LINE; }
 735 
 736     # Remove any spaces in the sequence...
 737     $Line =~ s/ //g;
 738     # Sequence data for active ID...
 739     if (exists $FastaDataMap{Sequence}{$ID}) {
 740       $FastaDataMap{Sequence}{$ID} .= $Line;
 741     }
 742     else {
 743       $FastaDataMap{Sequence}{$ID} = $Line;
 744     }
 745   }
 746   close FASTAFILE;
 747   return \%FastaDataMap;
 748 }
 749 
 750 # Read PIR fasta file and return a reference to hash with these keys:
 751 #
 752 # {IDs} - Array of sequence IDs
 753 # {Count} - Number of sequences
 754 # {Description}{$ID} - Sequence description
 755 # {Sequence}{$ID} - Sequence for a specific ID
 756 # {InputFileType} - Sequence file format
 757 # {ConservedAnnotation} - Conserved residue annonation
 758 #
 759 # Format:
 760 # A sequence in PIR format consists of:
 761 # One line starting with
 762 #   a ">" (greater-than) sign, followed by
 763 #   a two-letter code describing the sequence type code (P1, F1, DL, DC, RL, RC, N3, N1 or XX), followed by
 764 #   a semicolon, followed by
 765 #   the sequence identification code (the database ID-code).
 766 # One line containing a textual description of the sequence.
 767 # One or more lines containing the sequence itself. The end of the
 768 # sequence is marked by a "*" (asterisk) character.
 769 #
 770 # A file in PIR format may comprise more than one sequence.
 771 #
 772 # The PIR format is also often referred to as the NBRF format.
 773 #
 774 # Code SequenceType
 775 # P1    Protein (complete)
 776 # F1    Protein (fragment)
 777 # DL    DNA (linear)
 778 # DC    DNA (circular)
 779 # RL    RNA (linear)
 780 # RC   RNA (circular)
 781 # N3    tRNA
 782 # N1    Other functional RNA
 783 #
 784 
 785 sub _ReadPIRFastaFile {
 786   my($FastaFileName, $ID, $Description, $Line, $SequenceTypeCode, $ReadingSequenceData, %FastaDataMap);
 787 
 788   ($FastaFileName) = @_;
 789 
 790   %FastaDataMap = ();
 791   @{$FastaDataMap{IDs}} =();
 792   %{$FastaDataMap{Description}} =();
 793   %{$FastaDataMap{Sequence}} =();
 794   %{$FastaDataMap{SequenceTypeCode}} =();
 795   $FastaDataMap{Count} = 0;
 796   $FastaDataMap{InputFileType} = 'PIR';
 797 
 798   open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n";
 799   $ID = '';
 800   $ReadingSequenceData = 0;
 801   LINE: while ($Line = GetTextLine(\*FASTAFILE)) {
 802     if ($Line =~ /^\>/) {
 803       # Start of a new ID...
 804       $Line =~ s/^\>//;
 805       $Line = RemoveLeadingWhiteSpaces($Line);
 806       ($SequenceTypeCode, $ID) = /^\>(.*?)\;(.*?)$/;
 807 
 808       # Use next line to retrieve sequence description...
 809       $Line = GetTextLine(\*FASTAFILE);
 810       $Line = RemoveLeadingWhiteSpaces($Line);
 811       $Description = $Line;
 812 
 813       if (exists $FastaDataMap{Sequence}{$ID}) {
 814 	warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
 815 	next LINE;
 816       }
 817       $ReadingSequenceData = 1;
 818       push @{$FastaDataMap{IDs}}, $ID;
 819       $FastaDataMap{SequenceTypeCode}{$ID} = $SequenceTypeCode;
 820       $FastaDataMap{Description}{$ID} = $Description;
 821       $FastaDataMap{Count} += 1;
 822       next LINE;
 823     }
 824     if (!$ReadingSequenceData) { next LINE; }
 825 
 826     # Remove any spaces in the sequence...
 827     $Line =~ s/ //g;
 828     if ($Line =~ /[\*]$/) {
 829       # End of sequence...
 830       $ReadingSequenceData = 0;
 831       $Line =~ s/[\*]$//;
 832     }
 833     # Sequence data for active ID...
 834     if (exists $FastaDataMap{Sequence}{$ID}) {
 835       $FastaDataMap{Sequence}{$ID} .= $Line;
 836     }
 837     else {
 838       $FastaDataMap{Sequence}{$ID} = $Line;
 839     }
 840   }
 841   close FASTAFILE;
 842   return \%FastaDataMap;
 843 }
 844 
 845 # Read MSF file and return a reference to hash with these keys:
 846 #
 847 # {IDs} : Array of IDs in order as they appear in file
 848 # {Count}: ID count...
 849 # {Description}{$ID} : Description data...
 850 # {Sequence}{$ID} : Sequence data...
 851 #
 852 sub _ReadMSFFile {
 853   my($MSFFileName, $Line, @LineWords, %MSFDataMap);
 854 
 855   ($MSFFileName) = @_;
 856 
 857   %MSFDataMap = ();
 858   @{$MSFDataMap{IDs}} =();
 859   %{$MSFDataMap{Description}} =();
 860   %{$MSFDataMap{Sequence}} =();
 861   $MSFDataMap{Count} = 0;
 862   $MSFDataMap{InputFileType} = 'MSF';
 863 
 864   open MSFFILE, "$MSFFileName" or die "Couldn't open $MSFFileName: $!\n";
 865 
 866   # Collect sequences and IDs...
 867   #
 868   # '//' after the name fields indicates end of header list and start of sequence data.
 869   #
 870   my($ID, $Len, $Check, $Weight, $Sequence, $NameFieldsFound, %MSFIDsMap);
 871   %MSFIDsMap = ();
 872   $NameFieldsFound = 0;
 873   LINE: while ($Line = GetTextLine(\*MSFFILE)) {
 874     if ($Line =~ /Name:/) {
 875       $NameFieldsFound++;
 876       ($ID, $Len, $Check, $Weight) = $Line =~ /^[ ]*Name:[ ]+(.*?)[ ]+Len:[ ]+(.*?)[ ]+Check:[ ]+(.*?)[ ]+Weight:[ ]+(.*?)[ ]*$/;
 877       if ($ID =~ / /) {
 878 	($ID) = $ID =~ /^(.*?)[ ]+/
 879       }
 880       if (exists $MSFIDsMap{$ID}) {
 881 	warn "Warning: ID, $ID, in MSF file already exists. Ignoring ID and sequence data...\n";
 882 	next LINE;
 883       }
 884       $MSFIDsMap{$ID} = $ID;
 885       push @{$MSFDataMap{IDs}}, $ID;
 886       $MSFDataMap{Description}{$ID} = $ID;
 887       $MSFDataMap{Count} += 1;
 888     }
 889     elsif ( /\/\// && $NameFieldsFound) {
 890       # End of header list...
 891       last LINE;
 892     }
 893   }
 894   # Collect all sequences...
 895   #
 896   my($FirstField, $SecondField);
 897   while ($Line = GetTextLine(\*MSFFILE)) {
 898     ($FirstField, $SecondField) = $Line =~ /^[ ]*(.*?)[ ]+(.*?)$/;
 899     if (exists $MSFIDsMap{$FirstField}) {
 900       # It's ID and sequence data...
 901       $ID = $FirstField;
 902       $Sequence = $SecondField;
 903       # Take out spaces and leave the gap characters...
 904       $Sequence =~ s/ //g;
 905       if ($MSFDataMap{Sequence}{$ID}) {
 906 	$MSFDataMap{Sequence}{$ID} .= $Sequence;
 907       }
 908       else {
 909 	$MSFDataMap{Sequence}{$ID} = $Sequence;
 910       }
 911     }
 912   }
 913 
 914   close MSFFILE;
 915   return \%MSFDataMap;
 916 }
 917 
 918