MayaChemTools:Code:SequenceFileUtil.pm

   1 package SequenceFileUtil;
   2 #
   3 # File: SequenceFileUtil.pm
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability of fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use Exporter;
  28 use Text::ParseWords;
  29 use TextUtil;
  30 use FileUtil;
  31 
  32 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  33 
  34 @ISA = qw(Exporter);
  35 @EXPORT = qw(AreSequenceLengthsIdentical CalcuatePercentSequenceIdentity CalculatePercentSequenceIdentityMatrix GetLongestSequence GetShortestSequence GetSequenceLength IsGapResidue IsSupportedSequenceFile IsClustalWSequenceFile IsPearsonFastaSequenceFile IsMSFSequenceFile ReadSequenceFile RemoveSequenceGaps RemoveSequenceAlignmentGapColumns WritePearsonFastaSequenceFile);
  36 @EXPORT_OK = qw();
  37 
  38 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  39 
  40 # Compare lengths of all sequences...
  41 sub AreSequenceLengthsIdentical {
  42   my($SequencesDataRef) = @_;
  43   my($Status, $ID, $FirstID, $FirstSeqLen, $FirstDifferentLenID, $SeqLen);
  44 
  45   $Status = 1;
  46   $FirstID = '';
  47   $FirstDifferentLenID = '';
  48 
  49   ID: for $ID (@{$SequencesDataRef->{IDs}}) {
  50     if (!$FirstID) {
  51       $FirstID = $ID;
  52       $FirstSeqLen = length($SequencesDataRef->{Sequence}{$ID});
  53       next ID;
  54     }
  55     $SeqLen = length($SequencesDataRef->{Sequence}{$ID});
  56     if ($SeqLen != $FirstSeqLen) {
  57       $Status = 0;
  58       $FirstDifferentLenID = $ID;
  59       last ID;
  60     }
  61   }
  62   return ($Status);
  63 }
  64 
  65 # Calculate percent identity between two sequences. By default, gaps are ignored.
  66 sub CalcuatePercentSequenceIdentity {
  67   my($Sequence1, $Sequence2, $PercentIdentity, $IgnoreGaps, $Precision);
  68 
  69   $PercentIdentity = '';
  70   $Precision = 1;
  71   $IgnoreGaps = 1;
  72   if (@_ == 4) {
  73     ($Sequence1, $Sequence2, $IgnoreGaps, $Precision) = @_;
  74   }
  75   elsif (@_ == 3) {
  76     ($Sequence1, $Sequence2, $IgnoreGaps) = @_;
  77   }
  78   elsif (@_ == 2) {
  79     ($Sequence1, $Sequence2) = @_;
  80   }
  81   else {
  82     return $PercentIdentity;
  83   }
  84   if (!(IsNotEmpty($Sequence1) && IsNotEmpty($Sequence2))) {
  85     return $PercentIdentity;
  86   }
  87   my($Index, $Identity, $Sequence1Len, $Sequence2Len, $Residue1, $Residue2, $ResMatchCount, $ResCount);
  88 
  89   $Sequence1Len = length($Sequence1);
  90   $Sequence2Len = length($Sequence2);
  91 
  92   $ResMatchCount = 0;
  93   $ResCount = 0;
  94   RESIDUE: for $Index (0 .. ($Sequence1Len - 1)) {
  95     $Residue1 = substr($Sequence1, $Index, 1);
  96     $Residue2 = ($Index < $Sequence2Len) ? substr($Sequence2, $Index, 1) : '';
  97     if ($IgnoreGaps) {
  98       if ($Residue1 !~ /[A-Z]/i || $Residue2 !~ /[A-Z]/i) {
  99         next RESIDUE;
 100       }
 101     }
 102     if ($Residue1 eq $Residue2) {
 103       $ResMatchCount++;
 104     }
 105     $ResCount++;
 106   }
 107   $Identity = $ResCount ? ($ResMatchCount/$ResCount) : 0.0;
 108   $PercentIdentity = sprintf("%.${Precision}f", ($Identity * 100));
 109 
 110   return $PercentIdentity;
 111 }
 112 
 113 # Calculate pairwise identify matrix for all the sequences and return a reference
 114 # to a hash with the following keys:
 115 #
 116 # {IDs} - Sequence IDs
 117 # {Count} - Number of IDs
 118 # {PercentIdentity}{$RowID}{$ColID} - Percent identify for a pair of sequences
 119 #
 120 sub CalculatePercentSequenceIdentityMatrix {
 121   my($SequencesDataRef, $IgnoreGaps, , $Precision, $ID, $RowID, $ColID, $RowIDSeq, $ColIDSeq, $PercentIdentity, %IdentityMatrixData);
 122 
 123   $IgnoreGaps = 1;
 124   $Precision = 1;
 125   if (@_ == 3) {
 126     ($SequencesDataRef, $IgnoreGaps, $Precision) = @_;
 127   }
 128   elsif (@_ == 2) {
 129     ($SequencesDataRef, $IgnoreGaps) = @_;
 130   }
 131   else {
 132     ($SequencesDataRef) = @_;
 133   }
 134 
 135   %IdentityMatrixData = ();
 136   @{$IdentityMatrixData{IDs}} = ();
 137   %{$IdentityMatrixData{PercentIdentity}} = ();
 138   $IdentityMatrixData{Count} = 0;
 139 
 140   for $ID (@{$SequencesDataRef->{IDs}}) {
 141     push @{$IdentityMatrixData{IDs}}, $ID;
 142     $IdentityMatrixData{Count} += 1;
 143   }
 144   # Initialize and calculate percent identity data values...
 145   for $RowID (@{$SequencesDataRef->{IDs}}) {
 146     %{$IdentityMatrixData{PercentIdentity}{$RowID}} = ();
 147     $RowIDSeq = $SequencesDataRef->{Sequence}{$RowID};
 148     for $ColID (@{$SequencesDataRef->{IDs}}) {
 149       $IdentityMatrixData{$RowID}{$ColID} = '';
 150       $ColIDSeq = $SequencesDataRef->{Sequence}{$ColID};
 151       $PercentIdentity = CalcuatePercentSequenceIdentity($RowIDSeq, $ColIDSeq, $IgnoreGaps, $Precision);
 152       $IdentityMatrixData{PercentIdentity}{$RowID}{$ColID} = $PercentIdentity;
 153     }
 154   }
 155   return \%IdentityMatrixData;
 156 }
 157 
 158 # Retrieve information about shortest sequence...
 159 sub GetShortestSequence {
 160   my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description);
 161 
 162   $IgnoreGaps = 1;
 163   if (@_ == 2) {
 164     ($SequencesDataRef, $IgnoreGaps) = @_;
 165   }
 166   else {
 167     ($SequencesDataRef) = @_;
 168   }
 169 
 170   ($ID, $Sequence, $SeqLen, $Description) =  _GetShortestOrLongestSequence($SequencesDataRef, 'Shortest', $IgnoreGaps);
 171   return ($ID, $Sequence, $SeqLen, $Description);
 172 }
 173 
 174 # Retrieve information about longest sequence..
 175 sub GetLongestSequence {
 176   my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description);
 177 
 178   $IgnoreGaps = 1;
 179   if (@_ == 2) {
 180     ($SequencesDataRef, $IgnoreGaps) = @_;
 181   }
 182   else {
 183     ($SequencesDataRef) = @_;
 184   }
 185 
 186   ($ID, $Sequence, $SeqLen, $Description) =  _GetShortestOrLongestSequence($SequencesDataRef, 'Longest', $IgnoreGaps);
 187   return ($ID, $Sequence, $SeqLen, $Description);
 188 }
 189 
 190 # Get sequence length...
 191 sub GetSequenceLength {
 192   my($Seq, $SeqLen, $IgnoreGaps);
 193 
 194   $SeqLen = ''; $IgnoreGaps = 1;
 195   if (@_ == 2) {
 196     ($Seq, $IgnoreGaps) = @_;
 197   }
 198   else {
 199     ($Seq) = @_;
 200   }
 201   if ($IgnoreGaps) {
 202     my($Index, $Residue);
 203     $SeqLen = 0;
 204     for $Index (0 .. (length($Seq) - 1)) {
 205       $Residue = substr($Seq, $Index, 1);
 206       if ($Residue =~ /[A-Z]/i) {
 207         $SeqLen++;
 208       }
 209     }
 210   }
 211   else {
 212     $SeqLen = length($Seq);
 213   }
 214 
 215   return $SeqLen;
 216 }
 217 
 218 # Is it a gap residue...
 219 sub IsGapResidue {
 220   my($Residue) = @_;
 221   my($Status);
 222 
 223   $Status = ($Residue !~ /[A-Z]/i ) ? 1 : 0;
 224 
 225   return $Status;
 226 }
 227 
 228 # Is it a supported sequence file?
 229 #
 230 # Supported seqence formats are:
 231 #
 232 # ALN/ClustalW   .aln
 233 # GCG/MSF         .msf
 234 # PILEUP/MSF     .msf
 235 # Fasts(Pearson) .fasta, .fta
 236 # NBRF/PIR         .pir
 237 #
 238 sub IsSupportedSequenceFile {
 239   my($SequenceFile) = @_;
 240   my($Status, $SequenceFormat);
 241   $Status = 0; $SequenceFormat = 'NotSupported';
 242 
 243   SEQFORMAT: {
 244       if (IsClustalWSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'ClustalW'; last SEQFORMAT}
 245       if (IsPearsonFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'Pearson'; last SEQFORMAT}
 246       if (IsPIRFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'PIR'; last SEQFORMAT}
 247       if (IsMSFSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'MSF'; last SEQFORMAT}
 248       $Status = 0; $SequenceFormat = 'NotSupported';
 249   }
 250   return ($Status, $SequenceFormat);
 251 }
 252 
 253 # Is it a ClustalW multiple sequence sequence file...
 254 sub IsClustalWSequenceFile {
 255   my($SequenceFile) = @_;
 256   my($Status, $Line);
 257 
 258   $Status = 0;
 259 
 260   open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n";
 261   $Line = GetTextLine(\*SEQUENCEFILE);
 262   $Status = ($Line =~ /(ClustalW|Clustal W|Clustal)/i ) ? 1 : 0;
 263   close SEQUENCEFILE;
 264 
 265   return $Status;
 266 }
 267 
 268 # Is it a valid Pearson fasta sequence or alignment file?
 269 #
 270 sub IsPearsonFastaSequenceFile {
 271   my($FastaFile, $Line, $Status);
 272 
 273   ($FastaFile) = @_;
 274   $Status = 0;
 275 
 276   open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n";
 277   $Line = GetTextLine(\*FASTAFILE);
 278 
 279   # First line starts with > and the fourth character is not ';'; otherwise, it's
 280   # PIR FASTA format.
 281   if ($Line =~ /^>/) {
 282     my($FourthChar);
 283     $FourthChar = substr($Line, 3, 1);
 284     $Status = ($FourthChar !~ /\;/) ? 1 : 0;
 285   }
 286   close FASTAFILE;
 287 
 288   return $Status;
 289 }
 290 
 291 # Is it a valid NBRF/PIR fasta sequence or alignment file?
 292 #
 293 sub IsPIRFastaSequenceFile {
 294   my($FastaFile, $Line, $Status);
 295 
 296   ($FastaFile) = @_;
 297   $Status = 0;
 298 
 299   open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n";
 300   $Line = GetTextLine(\*FASTAFILE);
 301 
 302   # First line starts with > and the fourth character is ';'; otherwise, it's
 303   # a Pearson FASTA format.
 304   if ($Line =~ /^>/) {
 305     my($FourthChar);
 306     $FourthChar = substr($Line, 3, 1);
 307     $Status = ($FourthChar =~ /\;/) ? 1 : 0;
 308   }
 309   close FASTAFILE;
 310 
 311   return $Status;
 312 }
 313 
 314 # Is it a valid MSF sequence or alignment file?
 315 #
 316 sub IsMSFSequenceFile {
 317   my($MSFFile) = @_;
 318 
 319   open MSFFILE, "$MSFFile" or die "Couldn't open $MSFFile: $!\n";
 320 
 321   my($Line, $Status);
 322 
 323   $Status = 0;
 324   # Find a line that contains MSF: keyword and ends with '..'
 325   LINE: while ($Line = GetTextLine(\*MSFFILE)) {
 326     $Line = RemoveLeadingWhiteSpaces($Line);
 327     if ($Line =~ /MSF:/i && $Line =~ /\.\.[ ]*$/) {
 328       $Status = 1;
 329       last LINE;
 330     }
 331     elsif ($Line =~ /(!!AA_MULTIPLE_ALIGNMENT|!!NA_MULTIPLE_ALIGNMENT|PILEUP)/i) {
 332       # Pileup MSF...
 333       $Status = 1;
 334       last LINE;
 335     }
 336   }
 337   close MSFFILE;
 338 
 339   return $Status;
 340 }
 341 
 342 # Read sequence or sequence alignment file...
 343 sub ReadSequenceFile {
 344   my($SequenceFile) = @_;
 345 
 346   if (IsPearsonFastaSequenceFile($SequenceFile)) {
 347     return ReadPearsonFastaSequenceFile($SequenceFile);
 348   }
 349   elsif (IsPIRFastaSequenceFile($SequenceFile)) {
 350     return ReadPIRFastaSequenceFile($SequenceFile);
 351   }
 352   elsif (IsMSFSequenceFile($SequenceFile)) {
 353     return ReadMSFSequenceFile($SequenceFile);
 354   }
 355   elsif (IsClustalWSequenceFile($SequenceFile)) {
 356     return ReadClustalWSequenceFile($SequenceFile);
 357   }
 358   else {
 359     return undef;
 360   }
 361 }
 362 
 363 # Read file and setup alignment data...
 364 sub ReadClustalWSequenceFile {
 365   my($SequenceFile) = @_;
 366 
 367   return _ReadFileAndSetupSequencesData($SequenceFile, 'ClustalW');
 368 }
 369 
 370 # Read file and setup alignment data...
 371 sub ReadPearsonFastaSequenceFile {
 372   my($SequenceFile) = @_;
 373 
 374   return _ReadFileAndSetupSequencesData($SequenceFile, 'Pearson');
 375 }
 376 
 377 # Read file and setup alignment data...
 378 sub ReadPIRFastaSequenceFile {
 379   my($SequenceFile) = @_;
 380 
 381   return _ReadFileAndSetupSequencesData($SequenceFile, 'PIR');
 382 }
 383 
 384 
 385 # Read file and setup sequence data...
 386 sub ReadMSFSequenceFile {
 387   my($SequenceFile) = @_;
 388 
 389   return _ReadFileAndSetupSequencesData($SequenceFile, 'MSF');
 390 }
 391 
 392 # Write out a Pearson FASTA file...
 393 sub WritePearsonFastaSequenceFile {
 394   my($SequenceFileName, $SequenceDataRef, $MaxLength, $ID, $Description, $Sequence, $WrappedSequence);
 395 
 396   $MaxLength = 80;
 397   if (@_ == 3) {
 398     ($SequenceFileName, $SequenceDataRef, $MaxLength) = @_;
 399   }
 400   elsif (@_ == 2) {
 401     ($SequenceFileName, $SequenceDataRef) = @_;
 402   }
 403   open SEQUENCEFILE, ">$SequenceFileName" or die "Can't open $SequenceFileName: $!\n";
 404   for $ID (@{$SequenceDataRef->{IDs}}) {
 405     $Description = $SequenceDataRef->{Description}{$ID};
 406     $Sequence = $SequenceDataRef->{Sequence}{$ID};
 407     $WrappedSequence = WrapText($Sequence, $MaxLength, "\n");
 408 
 409     # Description also contains ID...
 410     print SEQUENCEFILE ">$Description\n";
 411     print SEQUENCEFILE "$WrappedSequence\n";
 412   }
 413   close SEQUENCEFILE;
 414 }
 415 
 416 # Get ID, Sequence and Length for smallest or longest sequence
 417 sub _GetShortestOrLongestSequence {
 418   my($SequencesDataRef, $SequenceType, $IgnoreGaps) = @_;
 419   my($ID, $Seq, $SeqLen, $Description, $FirstID, $FirstSeqLen, $CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
 420 
 421   ($ID, $Seq, $SeqLen) = ('', '', '');
 422   $FirstID = '';
 423 
 424   ID: for $CurrentID (@{$SequencesDataRef->{IDs}}) {
 425     $CurrentSeq = $IgnoreGaps ? RemoveSequenceGaps($SequencesDataRef->{Sequence}{$CurrentID}) : $SequencesDataRef->{Sequence}{$CurrentID};
 426     $CurrentSeqLen = GetSequenceLength($CurrentSeq, $IgnoreGaps);
 427     $CurrentDescription = $SequencesDataRef->{Description}{$CurrentID};
 428     if (!$FirstID) {
 429       $FirstID = $ID; $FirstSeqLen = $CurrentSeqLen;
 430       ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
 431       next ID;
 432     }
 433     if ($CurrentSeqLen != $SeqLen) {
 434       if (($SequenceType =~ /Shortest/i) && ($CurrentSeqLen < $SeqLen)) {
 435         ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
 436       }
 437       elsif (($SequenceType =~ /Longest/i) && ($CurrentSeqLen > $SeqLen) ) {
 438         ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
 439       }
 440     }
 441   }
 442   return ($ID, $Seq, $SeqLen, $Description);
 443 }
 444 
 445 # Remove gaps in the sequence and return new sequence...
 446 sub RemoveSequenceGaps {
 447   my($Seq) = @_;
 448   my($SeqWithoutGaps, $SeqLen, $Index, $Residue);
 449 
 450   $SeqWithoutGaps = '';
 451   $SeqLen = length($Seq);
 452   for $Index (0 .. ($SeqLen - 1)) {
 453     $Residue = substr($Seq, $Index, 1);
 454     if ($Residue =~ /[A-Z]/i) {
 455       $SeqWithoutGaps .= $Residue;
 456     }
 457   }
 458 
 459   return $SeqWithoutGaps;
 460 }
 461 
 462 # Using input alignment data map ref containing following keys, generate
 463 # a new hash with same set of keys after residue columns containg only
 464 # gaps have been removed:
 465 #
 466 # {IDs} : Array of IDs in order as they appear in file
 467 # {Count}: ID count...
 468 # {Description}{$ID} : Description data...
 469 # {Sequence}{$ID} : Sequence data...
 470 #
 471 sub RemoveSequenceAlignmentGapColumns {
 472   my($ID, $AlignmentDataMapRef, %NewAlignmentDataMap);
 473 
 474   ($AlignmentDataMapRef) = @_;
 475 
 476   %NewAlignmentDataMap = ();
 477   @{$NewAlignmentDataMap{IDs}} =();
 478   %{$NewAlignmentDataMap{Description}} =();
 479   %{$NewAlignmentDataMap{Sequence}} =();
 480   $NewAlignmentDataMap{Count} = 0;
 481 
 482   # Transfer ID and count information...
 483   for $ID (@{$AlignmentDataMapRef->{IDs}}) {
 484     push @{$NewAlignmentDataMap{IDs}}, $ID;
 485     $NewAlignmentDataMap{Description}{$ID} = $AlignmentDataMapRef->{Description}{$ID};
 486     $NewAlignmentDataMap{Sequence}{$ID} = '';
 487     $NewAlignmentDataMap{Count} += 1;
 488   }
 489 
 490   # Go over residue columns and transfer the data...
 491   my($FirstID, $FirstSeq, $FirstSeqLen, $Index, $Res, $GapColumn);
 492 
 493   $FirstID = $AlignmentDataMapRef->{IDs}[0];
 494   $FirstSeq = $AlignmentDataMapRef->{Sequence}{$FirstID};
 495   $FirstSeqLen = length($FirstSeq);
 496 
 497   RES: for $Index (0 .. ($FirstSeqLen - 1)) {
 498     # Is this a gap column?
 499     $GapColumn = 1;
 500     ID: for $ID (@{$AlignmentDataMapRef->{IDs}}) {
 501       $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1);
 502       if ($Res =~ /[A-Z]/i) {
 503         $GapColumn = 0;
 504         last ID;
 505       }
 506     }
 507     if ($GapColumn) {
 508       next RES;
 509     }
 510     # Transfer this residue...
 511     for $ID (@{$AlignmentDataMapRef->{IDs}}) {
 512       $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1);
 513       $NewAlignmentDataMap{Sequence}{$ID} .= $Res;
 514     }
 515   }
 516 
 517   return (\%NewAlignmentDataMap);
 518 }
 519 
 520 #
 521 # Read sequences file and return a reference to hash with the following keys:
 522 #
 523 # {IDs} - Array of sequence IDs
 524 # {Count} - Number of sequences
 525 # {Description}{$ID} - Sequence description
 526 # {Sequence}{$ID} - Sequence for a specific ID
 527 # {InputFileType} - Sequence file format
 528 # {ConservedAnnotation} - Conserved residue annonation
 529 #
 530 # Note:
 531 #   . Conserved residue annotation either exist in the input sequence alignment file or set
 532 #     for a file containing same number of residues for all the sequence using the following
 533 #     notation: * - Residue conserved; ' ' - Residue not conserved.
 534 #
 535 sub _ReadFileAndSetupSequencesData {
 536   my($SequenceFile, $SequenceType) = @_;
 537   my($SequenceDataMapRef);
 538 
 539   $SequenceDataMapRef = undef;
 540 
 541   # Read sequence file...
 542   $SequenceDataMapRef = '';
 543   if ($SequenceType =~ /^ClustalW$/i) {
 544     $SequenceDataMapRef = _ReadClustalWFile($SequenceFile);
 545   }
 546   elsif ($SequenceType =~ /^Pearson$/i) {
 547     $SequenceDataMapRef = _ReadPearsonFastaFile($SequenceFile);
 548   }
 549   elsif ($SequenceType =~ /^PIR$/i) {
 550     $SequenceDataMapRef = _ReadPIRFastaFile($SequenceFile);
 551   }
 552   elsif ($SequenceType =~ /^MSF$/i) {
 553     $SequenceDataMapRef = _ReadMSFFile($SequenceFile);
 554   }
 555   else {
 556     return $SequenceDataMapRef;
 557   }
 558 
 559   if (exists $SequenceDataMapRef->{ConservedAnnotation}) {
 560     return ($SequenceDataMapRef);
 561   }
 562   if (!(($SequenceDataMapRef->{Count} > 1) && (AreSequenceLengthsIdentical($SequenceDataMapRef)))) {
 563     return ($SequenceDataMapRef);
 564   }
 565 
 566   # Use the first sequence to setup an empty ConservedAnnotation key...
 567   # And mark fully conserved residues...
 568   #
 569   my($ID, $Sequence, $FirstSequence, $FirstSeqLen, $Res, $FirstRes, $ResConserved, $Index);
 570   $ID = $SequenceDataMapRef->{IDs}[0];
 571   $FirstSequence = $SequenceDataMapRef->{Sequence}{$ID};
 572   $FirstSeqLen = length($FirstSequence);
 573   $SequenceDataMapRef->{ConservedAnnotation} = '';
 574   for $Index (0 .. ($FirstSeqLen - 1)) {
 575     $FirstRes = '';
 576     $ResConserved = 1;
 577     ID: for $ID (@{$SequenceDataMapRef->{IDs}}) {
 578       $Sequence = $SequenceDataMapRef->{Sequence}{$ID};
 579       $Res = substr($Sequence, $Index, 1);
 580       if (!$FirstRes) {
 581         $FirstRes = $Res;
 582         next ID;
 583       }
 584       if (($Res !~ /[A-Z]/i) || ($Res ne $FirstRes)) {
 585         $ResConserved = 0;
 586         last ID;
 587       }
 588     }
 589     if ($ResConserved) {
 590       $SequenceDataMapRef->{ConservedAnnotation} .= '*';
 591     }
 592     else {
 593       $SequenceDataMapRef->{ConservedAnnotation} .= ' ';
 594     }
 595   }
 596 
 597   return ($SequenceDataMapRef);
 598 }
 599 
 600 # Read sequence data in ClustalW multiple sequence alignment file and
 601 # return a reference to hash with these keys and values:
 602 #
 603 # {IDs} - Array of sequence IDs
 604 # {Count} - Number of sequences
 605 # {Description}{$ID} - Sequence description
 606 # {Sequence}{$ID} - Sequence for a specific ID
 607 # {InputFileType} - Sequence file format
 608 # {ConservedAnnotation} - Conserved residue annonations: space, *, : , .
 609 #
 610 #
 611 #
 612 # And based on ClustalW/X manual, here is what the ConservedAnnonations mean:
 613 #
 614 # '*' indicates positions which have a single, fully conserved residue
 615 #
 616 # ':' indicates that one of the following 'strong' groups is fully conserved: STA
 617 #    NEQK NHQK NDEQ QHRK MILV MILF HY FYW
 618 
 619 # '.' indicates that one of the following 'weaker' groups is fully conserved:
 620 #     CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY
 621 #
 622 # These are all the positively scoring groups that occur in the Gonnet Pam250
 623 # matrix. The strong and weak groups are defined as strong score >0.5 and weak
 624 # score =<0.5 respectively.
 625 #
 626 sub _ReadClustalWFile {
 627   my($SequenceFile) = @_;
 628   my(%SequencesDataMap);
 629 
 630   # Initialize data...
 631   %SequencesDataMap = ();
 632   @{$SequencesDataMap{IDs}} = ();
 633   %{$SequencesDataMap{Description}} = ();
 634   %{$SequencesDataMap{Sequence}} = ();
 635   $SequencesDataMap{Count} = 0;
 636   $SequencesDataMap{ConservedAnnotation} = '';
 637   $SequencesDataMap{InputFileType} = 'ClustalW';
 638 
 639   open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n";
 640 
 641   my($Line, $LineLength, $AnnotationStart, $AnnotationLength, $Annotation, $Sequence, $SequenceLength, $ID, $IDIndex);
 642 
 643   # Ignore the header line...
 644   $Line = <SEQUENCEFILE>;
 645 
 646   LINE: while ($Line = GetTextLine(\*SEQUENCEFILE)) {
 647     if (($Line =~ /^[ \*\:\.]/) && ($Line !~ /[A-Z]/i)) {
 648       # Annotation for sequences: fully conserverd, weaker or stronger group conserverd.
 649       # Extract it and save...
 650       $LineLength = length($Line);
 651       $AnnotationStart = $LineLength - $SequenceLength;
 652       $AnnotationLength = $SequenceLength;
 653       $Annotation = substr($Line, $AnnotationStart, $AnnotationLength);
 654       $SequencesDataMap{ConservedAnnotation} .= $Annotation;
 655     }
 656     else {
 657       # Extract ID and sequences...
 658       ($ID, $Sequence)= $Line =~ /^[ ]*(.*?)[ ]+(.*?)[ 01-9]*$/;
 659       $Sequence =~ s/ //g;
 660       if (!($ID && $Sequence)) {
 661         next LINE;
 662       }
 663 
 664       if (exists $SequencesDataMap{Sequence}{$ID}) {
 665         # Append to existing alignment value...
 666         $SequenceLength = length($Sequence);
 667         $SequencesDataMap{Sequence}{$ID} .= $Sequence;
 668       }
 669       else {
 670         # New alignment data...
 671         $SequencesDataMap{Count} += 1;
 672         push @{$SequencesDataMap{IDs}}, $ID;
 673         $SequencesDataMap{Description}{$ID} = $ID;
 674         $SequencesDataMap{Sequence}{$ID} = $Sequence;
 675         $SequenceLength = length($Sequence);
 676       }
 677     }
 678   }
 679   close SEQUENCEFILE;
 680   return (\%SequencesDataMap);
 681 }
 682 
 683 # Read Pearson fasta file and return a reference to hash with these keys:
 684 #
 685 # {IDs} - Array of sequence IDs
 686 # {Count} - Number of sequences
 687 # {Description}{$ID} - Sequence description
 688 # {Sequence}{$ID} - Sequence for a specific ID
 689 # {InputFileType} - Sequence file format
 690 # {ConservedAnnotation} - Conserved residue annonation
 691 #
 692 sub _ReadPearsonFastaFile {
 693   my($FastaFileName, $ID, $Description, $Line, $IgnoreID, @LineWords, %FastaDataMap);
 694 
 695   ($FastaFileName) = @_;
 696 
 697   %FastaDataMap = ();
 698   @{$FastaDataMap{IDs}} =();
 699   %{$FastaDataMap{Description}} =();
 700   %{$FastaDataMap{Sequence}} =();
 701   $FastaDataMap{Count} = 0;
 702   $FastaDataMap{InputFileType} = 'Pearson';
 703 
 704   open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n";
 705   $ID = '';
 706   $IgnoreID = 0;
 707   LINE: while ($Line = GetTextLine(\*FASTAFILE)) {
 708     if ($Line =~ /^\>/) {
 709       # Start of a new ID...
 710       $Line =~ s/^\>//;
 711       $Line = RemoveLeadingWhiteSpaces($Line);
 712       @LineWords = ();
 713       @LineWords = split / /, $Line;
 714 
 715       $ID = $LineWords[0];
 716       $ID =~ s/ //g;
 717       $Description = $Line;
 718 
 719       $IgnoreID = 0;
 720       if (exists $FastaDataMap{Sequence}{$ID}) {
 721         $IgnoreID = 1;
 722         warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
 723         next LINE;
 724       }
 725       push @{$FastaDataMap{IDs}}, $ID;
 726       $FastaDataMap{Description}{$ID} = $Description;
 727       $FastaDataMap{Count} += 1;
 728       next LINE;
 729     }
 730     if ($IgnoreID) { next LINE; }
 731 
 732     # Remove any spaces in the sequence...
 733     $Line =~ s/ //g;
 734     # Sequence data for active ID...
 735     if (exists $FastaDataMap{Sequence}{$ID}) {
 736       $FastaDataMap{Sequence}{$ID} .= $Line;
 737     }
 738     else {
 739       $FastaDataMap{Sequence}{$ID} = $Line;
 740     }
 741   }
 742   close FASTAFILE;
 743   return \%FastaDataMap;
 744 }
 745 
 746 # Read PIR fasta file and return a reference to hash with these keys:
 747 #
 748 # {IDs} - Array of sequence IDs
 749 # {Count} - Number of sequences
 750 # {Description}{$ID} - Sequence description
 751 # {Sequence}{$ID} - Sequence for a specific ID
 752 # {InputFileType} - Sequence file format
 753 # {ConservedAnnotation} - Conserved residue annonation
 754 #
 755 # Format:
 756 # A sequence in PIR format consists of:
 757 # One line starting with
 758 #   a ">" (greater-than) sign, followed by
 759 #   a two-letter code describing the sequence type code (P1, F1, DL, DC, RL, RC, N3, N1 or XX), followed by
 760 #   a semicolon, followed by
 761 #   the sequence identification code (the database ID-code).
 762 # One line containing a textual description of the sequence.
 763 # One or more lines containing the sequence itself. The end of the
 764 # sequence is marked by a "*" (asterisk) character.
 765 #
 766 # A file in PIR format may comprise more than one sequence.
 767 #
 768 # The PIR format is also often referred to as the NBRF format.
 769 #
 770 # Code SequenceType
 771 # P1    Protein (complete)
 772 # F1    Protein (fragment)
 773 # DL    DNA (linear)
 774 # DC    DNA (circular)
 775 # RL    RNA (linear)
 776 # RC   RNA (circular)
 777 # N3    tRNA
 778 # N1    Other functional RNA
 779 #
 780 
 781 sub _ReadPIRFastaFile {
 782   my($FastaFileName, $ID, $Description, $Line, $SequenceTypeCode, $ReadingSequenceData, %FastaDataMap);
 783 
 784   ($FastaFileName) = @_;
 785 
 786   %FastaDataMap = ();
 787   @{$FastaDataMap{IDs}} =();
 788   %{$FastaDataMap{Description}} =();
 789   %{$FastaDataMap{Sequence}} =();
 790   %{$FastaDataMap{SequenceTypeCode}} =();
 791   $FastaDataMap{Count} = 0;
 792   $FastaDataMap{InputFileType} = 'PIR';
 793 
 794   open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n";
 795   $ID = '';
 796   $ReadingSequenceData = 0;
 797   LINE: while ($Line = GetTextLine(\*FASTAFILE)) {
 798     if ($Line =~ /^\>/) {
 799       # Start of a new ID...
 800       $Line =~ s/^\>//;
 801       $Line = RemoveLeadingWhiteSpaces($Line);
 802       ($SequenceTypeCode, $ID) = /^\>(.*?)\;(.*?)$/;
 803 
 804       # Use next line to retrieve sequence description...
 805       $Line = GetTextLine(\*FASTAFILE);
 806       $Line = RemoveLeadingWhiteSpaces($Line);
 807       $Description = $Line;
 808 
 809       if (exists $FastaDataMap{Sequence}{$ID}) {
 810         warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
 811         next LINE;
 812       }
 813       $ReadingSequenceData = 1;
 814       push @{$FastaDataMap{IDs}}, $ID;
 815       $FastaDataMap{SequenceTypeCode}{$ID} = $SequenceTypeCode;
 816       $FastaDataMap{Description}{$ID} = $Description;
 817       $FastaDataMap{Count} += 1;
 818       next LINE;
 819     }
 820     if (!$ReadingSequenceData) { next LINE; }
 821 
 822     # Remove any spaces in the sequence...
 823     $Line =~ s/ //g;
 824     if ($Line =~ /[\*]$/) {
 825       # End of sequence...
 826       $ReadingSequenceData = 0;
 827       $Line =~ s/[\*]$//;
 828     }
 829     # Sequence data for active ID...
 830     if (exists $FastaDataMap{Sequence}{$ID}) {
 831       $FastaDataMap{Sequence}{$ID} .= $Line;
 832     }
 833     else {
 834       $FastaDataMap{Sequence}{$ID} = $Line;
 835     }
 836   }
 837   close FASTAFILE;
 838   return \%FastaDataMap;
 839 }
 840 
 841 # Read MSF file and return a reference to hash with these keys:
 842 #
 843 # {IDs} : Array of IDs in order as they appear in file
 844 # {Count}: ID count...
 845 # {Description}{$ID} : Description data...
 846 # {Sequence}{$ID} : Sequence data...
 847 #
 848 sub _ReadMSFFile {
 849   my($MSFFileName, $Line, @LineWords, %MSFDataMap);
 850 
 851   ($MSFFileName) = @_;
 852 
 853   %MSFDataMap = ();
 854   @{$MSFDataMap{IDs}} =();
 855   %{$MSFDataMap{Description}} =();
 856   %{$MSFDataMap{Sequence}} =();
 857   $MSFDataMap{Count} = 0;
 858   $MSFDataMap{InputFileType} = 'MSF';
 859 
 860   open MSFFILE, "$MSFFileName" or die "Couldn't open $MSFFileName: $!\n";
 861 
 862   # Collect sequences and IDs...
 863   #
 864   # '//' after the name fields indicates end of header list and start of sequence data.
 865   #
 866   my($ID, $Len, $Check, $Weight, $Sequence, $NameFieldsFound, %MSFIDsMap);
 867   %MSFIDsMap = ();
 868   $NameFieldsFound = 0;
 869   LINE: while ($Line = GetTextLine(\*MSFFILE)) {
 870     if ($Line =~ /Name:/) {
 871       $NameFieldsFound++;
 872       ($ID, $Len, $Check, $Weight) = $Line =~ /^[ ]*Name:[ ]+(.*?)[ ]+Len:[ ]+(.*?)[ ]+Check:[ ]+(.*?)[ ]+Weight:[ ]+(.*?)[ ]*$/;
 873       if ($ID =~ / /) {
 874         ($ID) = $ID =~ /^(.*?)[ ]+/
 875       }
 876       if (exists $MSFIDsMap{$ID}) {
 877         warn "Warning: ID, $ID, in MSF file already exists. Ignoring ID and sequence data...\n";
 878         next LINE;
 879       }
 880       $MSFIDsMap{$ID} = $ID;
 881       push @{$MSFDataMap{IDs}}, $ID;
 882       $MSFDataMap{Description}{$ID} = $ID;
 883       $MSFDataMap{Count} += 1;
 884     }
 885     elsif ( /\/\// && $NameFieldsFound) {
 886       # End of header list...
 887       last LINE;
 888     }
 889   }
 890   # Collect all sequences...
 891   #
 892   my($FirstField, $SecondField);
 893   while ($Line = GetTextLine(\*MSFFILE)) {
 894     ($FirstField, $SecondField) = $Line =~ /^[ ]*(.*?)[ ]+(.*?)$/;
 895     if (exists $MSFIDsMap{$FirstField}) {
 896       # It's ID and sequence data...
 897       $ID = $FirstField;
 898       $Sequence = $SecondField;
 899       # Take out spaces and leave the gap characters...
 900       $Sequence =~ s/ //g;
 901       if ($MSFDataMap{Sequence}{$ID}) {
 902         $MSFDataMap{Sequence}{$ID} .= $Sequence;
 903       }
 904       else {
 905         $MSFDataMap{Sequence}{$ID} = $Sequence;
 906       }
 907     }
 908   }
 909 
 910   close MSFFILE;
 911   return \%MSFDataMap;
 912 }
 913 
 914