1 package SequenceFileUtil; 2 # 3 # $RCSfile: SequenceFileUtil.pm,v $ 4 # $Date: 2008/04/19 16:11:03 $ 5 # $Revision: 1.21 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 use 5.006; 29 use strict; 30 use Exporter; 31 use Text::ParseWords; 32 use TextUtil; 33 use FileUtil; 34 35 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 36 37 $VERSION = '1.00'; 38 @ISA = qw(Exporter); 39 @EXPORT = qw(AreSequenceLengthsIdentical CalcuatePercentSequenceIdentity CalculatePercentSequenceIdentityMatrix GetLongestSequence GetShortestSequence GetSequenceLength IsGapResidue IsSupportedSequenceFile IsClustalWSequenceFile IsPearsonFastaSequenceFile IsMSFSequenceFile ReadSequenceFile RemoveSequenceGaps RemoveSequenceAlignmentGapColumns WritePearsonFastaSequenceFile); 40 @EXPORT_OK = qw(); 41 42 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 43 44 # Compare lengths of all sequences... 45 sub AreSequenceLengthsIdentical { 46 my($SequencesDataRef) = @_; 47 my($Status, $ID, $FirstID, $FirstSeqLen, $FirstDifferentLenID, $SeqLen); 48 49 $Status = 1; 50 $FirstID = ''; 51 $FirstDifferentLenID = ''; 52 53 ID: for $ID (@{$SequencesDataRef->{IDs}}) { 54 if (!$FirstID) { 55 $FirstID = $ID; 56 $FirstSeqLen = length($SequencesDataRef->{Sequence}{$ID}); 57 next ID; 58 } 59 $SeqLen = length($SequencesDataRef->{Sequence}{$ID}); 60 if ($SeqLen != $FirstSeqLen) { 61 $Status = 0; 62 $FirstDifferentLenID = $ID; 63 last ID; 64 } 65 } 66 return ($Status); 67 } 68 69 # Calculate percent identity between two sequences. By default, gaps are ignored. 70 sub CalcuatePercentSequenceIdentity { 71 my($Sequence1, $Sequence2, $PercentIdentity, $IgnoreGaps, $Precision); 72 73 $PercentIdentity = ''; 74 $Precision = 1; 75 $IgnoreGaps = 1; 76 if (@_ == 4) { 77 ($Sequence1, $Sequence2, $IgnoreGaps, $Precision) = @_; 78 } 79 elsif (@_ == 3) { 80 ($Sequence1, $Sequence2, $IgnoreGaps) = @_; 81 } 82 elsif (@_ == 2) { 83 ($Sequence1, $Sequence2) = @_; 84 } 85 else { 86 return $PercentIdentity; 87 } 88 if (!(IsNotEmpty($Sequence1) && IsNotEmpty($Sequence2))) { 89 return $PercentIdentity; 90 } 91 my($Index, $Identity, $Sequence1Len, $Sequence2Len, $Residue1, $Residue2, $ResMatchCount, $ResCount); 92 93 $Sequence1Len = length($Sequence1); 94 $Sequence2Len = length($Sequence2); 95 96 $ResMatchCount = 0; 97 $ResCount = 0; 98 RESIDUE: for $Index (0 .. ($Sequence1Len - 1)) { 99 $Residue1 = substr($Sequence1, $Index, 1); 100 $Residue2 = ($Index < $Sequence2Len) ? substr($Sequence2, $Index, 1) : ''; 101 if ($IgnoreGaps) { 102 if ($Residue1 !~ /[A-Z]/i || $Residue2 !~ /[A-Z]/i) { 103 next RESIDUE; 104 } 105 } 106 if ($Residue1 eq $Residue2) { 107 $ResMatchCount++; 108 } 109 $ResCount++; 110 } 111 $Identity = $ResCount ? ($ResMatchCount/$ResCount) : 0.0; 112 $PercentIdentity = sprintf("%.${Precision}f", ($Identity * 100)); 113 114 return $PercentIdentity; 115 } 116 117 # Calculate pairwise identify matrix for all the sequences and return a reference 118 # to a hash with the following keys: 119 # 120 # {IDs} - Sequence IDs 121 # {Count} - Number of IDs 122 # {PercentIdentity}{$RowID}{$ColID} - Percent identify for a pair of sequences 123 # 124 sub CalculatePercentSequenceIdentityMatrix { 125 my($SequencesDataRef, $IgnoreGaps, , $Precision, $ID, $RowID, $ColID, $RowIDSeq, $ColIDSeq, $PercentIdentity, %IdentityMatrixData); 126 127 $IgnoreGaps = 1; 128 $Precision = 1; 129 if (@_ == 3) { 130 ($SequencesDataRef, $IgnoreGaps, $Precision) = @_; 131 } 132 elsif (@_ == 2) { 133 ($SequencesDataRef, $IgnoreGaps) = @_; 134 } 135 else { 136 ($SequencesDataRef) = @_; 137 } 138 139 %IdentityMatrixData = (); 140 @{$IdentityMatrixData{IDs}} = (); 141 %{$IdentityMatrixData{PercentIdentity}} = (); 142 $IdentityMatrixData{Count} = 0; 143 144 for $ID (@{$SequencesDataRef->{IDs}}) { 145 push @{$IdentityMatrixData{IDs}}, $ID; 146 $IdentityMatrixData{Count} += 1; 147 } 148 # Initialize and calculate percent identity data values... 149 for $RowID (@{$SequencesDataRef->{IDs}}) { 150 %{$IdentityMatrixData{PercentIdentity}{$RowID}} = (); 151 $RowIDSeq = $SequencesDataRef->{Sequence}{$RowID}; 152 for $ColID (@{$SequencesDataRef->{IDs}}) { 153 $IdentityMatrixData{$RowID}{$ColID} = ''; 154 $ColIDSeq = $SequencesDataRef->{Sequence}{$ColID}; 155 $PercentIdentity = CalcuatePercentSequenceIdentity($RowIDSeq, $ColIDSeq, $IgnoreGaps, $Precision); 156 $IdentityMatrixData{PercentIdentity}{$RowID}{$ColID} = $PercentIdentity; 157 } 158 } 159 return \%IdentityMatrixData; 160 } 161 162 # Retrieve information about shortest sequence... 163 sub GetShortestSequence { 164 my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description); 165 166 $IgnoreGaps = 1; 167 if (@_ == 2) { 168 ($SequencesDataRef, $IgnoreGaps) = @_; 169 } 170 else { 171 ($SequencesDataRef) = @_; 172 } 173 174 ($ID, $Sequence, $SeqLen, $Description) = _GetShortestOrLongestSequence($SequencesDataRef, 'Shortest', $IgnoreGaps); 175 return ($ID, $Sequence, $SeqLen, $Description); 176 } 177 178 # Retrieve information about longest sequence.. 179 sub GetLongestSequence { 180 my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description); 181 182 $IgnoreGaps = 1; 183 if (@_ == 2) { 184 ($SequencesDataRef, $IgnoreGaps) = @_; 185 } 186 else { 187 ($SequencesDataRef) = @_; 188 } 189 190 ($ID, $Sequence, $SeqLen, $Description) = _GetShortestOrLongestSequence($SequencesDataRef, 'Longest', $IgnoreGaps); 191 return ($ID, $Sequence, $SeqLen, $Description); 192 } 193 194 # Get sequence length... 195 sub GetSequenceLength { 196 my($Seq, $SeqLen, $IgnoreGaps); 197 198 $SeqLen = ''; $IgnoreGaps = 1; 199 if (@_ == 2) { 200 ($Seq, $IgnoreGaps) = @_; 201 } 202 else { 203 ($Seq) = @_; 204 } 205 if ($IgnoreGaps) { 206 my($Index, $Residue); 207 $SeqLen = 0; 208 for $Index (0 .. (length($Seq) - 1)) { 209 $Residue = substr($Seq, $Index, 1); 210 if ($Residue =~ /[A-Z]/i) { 211 $SeqLen++; 212 } 213 } 214 } 215 else { 216 $SeqLen = length($Seq); 217 } 218 219 return $SeqLen; 220 } 221 222 # Is it a gap residue... 223 sub IsGapResidue { 224 my($Residue) = @_; 225 my($Status); 226 227 $Status = ($Residue !~ /[A-Z]/i ) ? 1 : 0; 228 229 return $Status; 230 } 231 232 # Is it a supported sequence file? 233 # 234 # Supported seqence formats are: 235 # 236 # ALN/ClustalW .aln 237 # GCG/MSF .msf 238 # PILEUP/MSF .msf 239 # Fasts(Pearson) .fasta, .fta 240 # NBRF/PIR .pir 241 # 242 sub IsSupportedSequenceFile { 243 my($SequenceFile) = @_; 244 my($Status, $SequenceFormat); 245 $Status = 0; $SequenceFormat = 'NotSupported'; 246 247 SEQFORMAT: { 248 if (IsClustalWSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'ClustalW'; last SEQFORMAT} 249 if (IsPearsonFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'Pearson'; last SEQFORMAT} 250 if (IsPIRFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'PIR'; last SEQFORMAT} 251 if (IsMSFSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'MSF'; last SEQFORMAT} 252 $Status = 0; $SequenceFormat = 'NotSupported'; 253 } 254 return ($Status, $SequenceFormat); 255 } 256 257 # Is it a ClustalW multiple sequence sequence file... 258 sub IsClustalWSequenceFile { 259 my($SequenceFile) = @_; 260 my($Status, $Line); 261 262 $Status = 0; 263 264 open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n"; 265 $Line = GetTextLine(\*SEQUENCEFILE); 266 $Status = ($Line =~ /(ClustalW|Clustal W|Clustal)/i ) ? 1 : 0; 267 close SEQUENCEFILE; 268 269 return $Status; 270 } 271 272 # Is it a valid Pearson fasta sequence or alignment file? 273 # 274 sub IsPearsonFastaSequenceFile { 275 my($FastaFile, $Line, $Status); 276 277 ($FastaFile) = @_; 278 $Status = 0; 279 280 open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n"; 281 $Line = GetTextLine(\*FASTAFILE); 282 283 # First line starts with > and the fourth character is not ';'; otherwise, it's 284 # PIR FASTA format. 285 if ($Line =~ /^>/) { 286 my($FourthChar); 287 $FourthChar = substr($Line, 3, 1); 288 $Status = ($FourthChar !~ /\;/) ? 1 : 0; 289 } 290 close FASTAFILE; 291 292 return $Status; 293 } 294 295 # Is it a valid NBRF/PIR fasta sequence or alignment file? 296 # 297 sub IsPIRFastaSequenceFile { 298 my($FastaFile, $Line, $Status); 299 300 ($FastaFile) = @_; 301 $Status = 0; 302 303 open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n"; 304 $Line = GetTextLine(\*FASTAFILE); 305 306 # First line starts with > and the fourth character is ';'; otherwise, it's 307 # a Pearson FASTA format. 308 if ($Line =~ /^>/) { 309 my($FourthChar); 310 $FourthChar = substr($Line, 3, 1); 311 $Status = ($FourthChar =~ /\;/) ? 1 : 0; 312 } 313 close FASTAFILE; 314 315 return $Status; 316 } 317 318 # Is it a valid MSF sequence or alignment file? 319 # 320 sub IsMSFSequenceFile { 321 my($MSFFile) = @_; 322 323 open MSFFILE, "$MSFFile" or die "Couldn't open $MSFFile: $!\n"; 324 325 my($Line, $Status); 326 327 $Status = 0; 328 # Find a line that contains MSF: keyword and ends with '..' 329 LINE: while ($Line = GetTextLine(\*MSFFILE)) { 330 $Line = RemoveLeadingWhiteSpaces($Line); 331 if ($Line =~ /MSF:/i && $Line =~ /\.\.[ ]*$/) { 332 $Status = 1; 333 last LINE; 334 } 335 elsif ($Line =~ /(!!AA_MULTIPLE_ALIGNMENT|!!NA_MULTIPLE_ALIGNMENT|PILEUP)/i) { 336 # Pileup MSF... 337 $Status = 1; 338 last LINE; 339 } 340 } 341 close MSFFILE; 342 343 return $Status; 344 } 345 346 # Read sequence or sequence alignment file... 347 sub ReadSequenceFile { 348 my($SequenceFile) = @_; 349 350 if (IsPearsonFastaSequenceFile($SequenceFile)) { 351 return ReadPearsonFastaSequenceFile($SequenceFile); 352 } 353 elsif (IsPIRFastaSequenceFile($SequenceFile)) { 354 return ReadPIRFastaSequenceFile($SequenceFile); 355 } 356 elsif (IsMSFSequenceFile($SequenceFile)) { 357 return ReadMSFSequenceFile($SequenceFile); 358 } 359 elsif (IsClustalWSequenceFile($SequenceFile)) { 360 return ReadClustalWSequenceFile($SequenceFile); 361 } 362 else { 363 return undef; 364 } 365 } 366 367 # Read file and setup alignment data... 368 sub ReadClustalWSequenceFile { 369 my($SequenceFile) = @_; 370 371 return _ReadFileAndSetupSequencesData($SequenceFile, 'ClustalW'); 372 } 373 374 # Read file and setup alignment data... 375 sub ReadPearsonFastaSequenceFile { 376 my($SequenceFile) = @_; 377 378 return _ReadFileAndSetupSequencesData($SequenceFile, 'Pearson'); 379 } 380 381 # Read file and setup alignment data... 382 sub ReadPIRFastaSequenceFile { 383 my($SequenceFile) = @_; 384 385 return _ReadFileAndSetupSequencesData($SequenceFile, 'PIR'); 386 } 387 388 389 # Read file and setup sequence data... 390 sub ReadMSFSequenceFile { 391 my($SequenceFile) = @_; 392 393 return _ReadFileAndSetupSequencesData($SequenceFile, 'MSF'); 394 } 395 396 # Write out a Pearson FASTA file... 397 sub WritePearsonFastaSequenceFile { 398 my($SequenceFileName, $SequenceDataRef, $MaxLength, $ID, $Description, $Sequence, $WrappedSequence); 399 400 $MaxLength = 80; 401 if (@_ == 3) { 402 ($SequenceFileName, $SequenceDataRef, $MaxLength) = @_; 403 } 404 elsif (@_ == 2) { 405 ($SequenceFileName, $SequenceDataRef) = @_; 406 } 407 open SEQUENCEFILE, ">$SequenceFileName" or die "Can't open $SequenceFileName: $!\n"; 408 for $ID (@{$SequenceDataRef->{IDs}}) { 409 $Description = $SequenceDataRef->{Description}{$ID}; 410 $Sequence = $SequenceDataRef->{Sequence}{$ID}; 411 $WrappedSequence = WrapText($Sequence, $MaxLength, "\n"); 412 413 # Description also contains ID... 414 print SEQUENCEFILE ">$Description\n"; 415 print SEQUENCEFILE "$WrappedSequence\n"; 416 } 417 close SEQUENCEFILE; 418 } 419 420 # Get ID, Sequence and Length for smallest or longest sequence 421 sub _GetShortestOrLongestSequence { 422 my($SequencesDataRef, $SequenceType, $IgnoreGaps) = @_; 423 my($ID, $Seq, $SeqLen, $Description, $FirstID, $FirstSeqLen, $CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); 424 425 ($ID, $Seq, $SeqLen) = ('', '', ''); 426 $FirstID = ''; 427 428 ID: for $CurrentID (@{$SequencesDataRef->{IDs}}) { 429 $CurrentSeq = $IgnoreGaps ? RemoveSequenceGaps($SequencesDataRef->{Sequence}{$CurrentID}) : $SequencesDataRef->{Sequence}{$CurrentID}; 430 $CurrentSeqLen = GetSequenceLength($CurrentSeq, $IgnoreGaps); 431 $CurrentDescription = $SequencesDataRef->{Description}{$CurrentID}; 432 if (!$FirstID) { 433 $FirstID = $ID; $FirstSeqLen = $CurrentSeqLen; 434 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); 435 next ID; 436 } 437 if ($CurrentSeqLen != $SeqLen) { 438 if (($SequenceType =~ /Shortest/i) && ($CurrentSeqLen < $SeqLen)) { 439 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); 440 } 441 elsif (($SequenceType =~ /Longest/i) && ($CurrentSeqLen > $SeqLen) ) { 442 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); 443 } 444 } 445 } 446 return ($ID, $Seq, $SeqLen, $Description); 447 } 448 449 # Remove gaps in the sequence and return new sequence... 450 sub RemoveSequenceGaps { 451 my($Seq) = @_; 452 my($SeqWithoutGaps, $SeqLen, $Index, $Residue); 453 454 $SeqWithoutGaps = ''; 455 $SeqLen = length($Seq); 456 for $Index (0 .. ($SeqLen - 1)) { 457 $Residue = substr($Seq, $Index, 1); 458 if ($Residue =~ /[A-Z]/i) { 459 $SeqWithoutGaps .= $Residue; 460 } 461 } 462 463 return $SeqWithoutGaps; 464 } 465 466 # Using input alignment data map ref containing following keys, generate 467 # a new hash with same set of keys after residue columns containg only 468 # gaps have been removed: 469 # 470 # {IDs} : Array of IDs in order as they appear in file 471 # {Count}: ID count... 472 # {Description}{$ID} : Description data... 473 # {Sequence}{$ID} : Sequence data... 474 # 475 sub RemoveSequenceAlignmentGapColumns { 476 my($ID, $AlignmentDataMapRef, %NewAlignmentDataMap); 477 478 ($AlignmentDataMapRef) = @_; 479 480 %NewAlignmentDataMap = (); 481 @{$NewAlignmentDataMap{IDs}} =(); 482 %{$NewAlignmentDataMap{Description}} =(); 483 %{$NewAlignmentDataMap{Sequence}} =(); 484 $NewAlignmentDataMap{Count} = 0; 485 486 # Transfer ID and count information... 487 for $ID (@{$AlignmentDataMapRef->{IDs}}) { 488 push @{$NewAlignmentDataMap{IDs}}, $ID; 489 $NewAlignmentDataMap{Description}{$ID} = $AlignmentDataMapRef->{Description}{$ID}; 490 $NewAlignmentDataMap{Sequence}{$ID} = ''; 491 $NewAlignmentDataMap{Count} += 1; 492 } 493 494 # Go over residue columns and transfer the data... 495 my($FirstID, $FirstSeq, $FirstSeqLen, $Index, $Res, $GapColumn); 496 497 $FirstID = $AlignmentDataMapRef->{IDs}[0]; 498 $FirstSeq = $AlignmentDataMapRef->{Sequence}{$FirstID}; 499 $FirstSeqLen = length($FirstSeq); 500 501 RES: for $Index (0 .. ($FirstSeqLen - 1)) { 502 # Is this a gap column? 503 $GapColumn = 1; 504 ID: for $ID (@{$AlignmentDataMapRef->{IDs}}) { 505 $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1); 506 if ($Res =~ /[A-Z]/i) { 507 $GapColumn = 0; 508 last ID; 509 } 510 } 511 if ($GapColumn) { 512 next RES; 513 } 514 # Transfer this residue... 515 for $ID (@{$AlignmentDataMapRef->{IDs}}) { 516 $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1); 517 $NewAlignmentDataMap{Sequence}{$ID} .= $Res; 518 } 519 } 520 521 return (\%NewAlignmentDataMap); 522 } 523 524 # 525 # Read sequences file and return a reference to hash with the following keys: 526 # 527 # {IDs} - Array of sequence IDs 528 # {Count} - Number of sequences 529 # {Description}{$ID} - Sequence description 530 # {Sequence}{$ID} - Sequence for a specific ID 531 # {InputFileType} - Sequence file format 532 # {ConservedAnnotation} - Conserved residue annonation 533 # 534 # Note: 535 # . Conserved residue annotation either exist in the input sequence alignment file or set 536 # for a file containing same number of residues for all the sequence using the following 537 # notation: * - Residue conserved; ' ' - Residue not conserved. 538 # 539 sub _ReadFileAndSetupSequencesData { 540 my($SequenceFile, $SequenceType) = @_; 541 my($SequenceDataMapRef); 542 543 $SequenceDataMapRef = undef; 544 545 # Read sequence file... 546 $SequenceDataMapRef = ''; 547 if ($SequenceType =~ /^ClustalW$/i) { 548 $SequenceDataMapRef = _ReadClustalWFile($SequenceFile); 549 } 550 elsif ($SequenceType =~ /^Pearson$/i) { 551 $SequenceDataMapRef = _ReadPearsonFastaFile($SequenceFile); 552 } 553 elsif ($SequenceType =~ /^PIR$/i) { 554 $SequenceDataMapRef = _ReadPIRFastaFile($SequenceFile); 555 } 556 elsif ($SequenceType =~ /^MSF$/i) { 557 $SequenceDataMapRef = _ReadMSFFile($SequenceFile); 558 } 559 else { 560 return $SequenceDataMapRef; 561 } 562 563 if (exists $SequenceDataMapRef->{ConservedAnnotation}) { 564 return ($SequenceDataMapRef); 565 } 566 if (!(($SequenceDataMapRef->{Count} > 1) && (AreSequenceLengthsIdentical($SequenceDataMapRef)))) { 567 return ($SequenceDataMapRef); 568 } 569 570 # Use the first sequence to setup an empty ConservedAnnotation key... 571 # And mark fully conserved residues... 572 # 573 my($ID, $Sequence, $FirstSequence, $FirstSeqLen, $Res, $FirstRes, $ResConserved, $Index); 574 $ID = $SequenceDataMapRef->{IDs}[0]; 575 $FirstSequence = $SequenceDataMapRef->{Sequence}{$ID}; 576 $FirstSeqLen = length($FirstSequence); 577 $SequenceDataMapRef->{ConservedAnnotation} = ''; 578 for $Index (0 .. ($FirstSeqLen - 1)) { 579 $FirstRes = ''; 580 $ResConserved = 1; 581 ID: for $ID (@{$SequenceDataMapRef->{IDs}}) { 582 $Sequence = $SequenceDataMapRef->{Sequence}{$ID}; 583 $Res = substr($Sequence, $Index, 1); 584 if (!$FirstRes) { 585 $FirstRes = $Res; 586 next ID; 587 } 588 if (($Res !~ /[A-Z]/i) || ($Res ne $FirstRes)) { 589 $ResConserved = 0; 590 last ID; 591 } 592 } 593 if ($ResConserved) { 594 $SequenceDataMapRef->{ConservedAnnotation} .= '*'; 595 } 596 else { 597 $SequenceDataMapRef->{ConservedAnnotation} .= ' '; 598 } 599 } 600 601 return ($SequenceDataMapRef); 602 } 603 604 # Read sequence data in ClustalW multiple sequence alignment file and 605 # return a reference to hash with these keys and values: 606 # 607 # {IDs} - Array of sequence IDs 608 # {Count} - Number of sequences 609 # {Description}{$ID} - Sequence description 610 # {Sequence}{$ID} - Sequence for a specific ID 611 # {InputFileType} - Sequence file format 612 # {ConservedAnnotation} - Conserved residue annonations: space, *, : , . 613 # 614 # 615 # 616 # And based on ClustalW/X manual, here is what the ConservedAnnonations mean: 617 # 618 # '*' indicates positions which have a single, fully conserved residue 619 # 620 # ':' indicates that one of the following 'strong' groups is fully conserved: STA 621 # NEQK NHQK NDEQ QHRK MILV MILF HY FYW 622 623 # '.' indicates that one of the following 'weaker' groups is fully conserved: 624 # CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY 625 # 626 # These are all the positively scoring groups that occur in the Gonnet Pam250 627 # matrix. The strong and weak groups are defined as strong score >0.5 and weak 628 # score =<0.5 respectively. 629 # 630 sub _ReadClustalWFile { 631 my($SequenceFile) = @_; 632 my(%SequencesDataMap); 633 634 # Initialize data... 635 %SequencesDataMap = (); 636 @{$SequencesDataMap{IDs}} = (); 637 %{$SequencesDataMap{Description}} = (); 638 %{$SequencesDataMap{Sequence}} = (); 639 $SequencesDataMap{Count} = 0; 640 $SequencesDataMap{ConservedAnnotation} = ''; 641 $SequencesDataMap{InputFileType} = 'ClustalW'; 642 643 open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n"; 644 645 my($Line, $LineLength, $AnnotationStart, $AnnotationLength, $Annotation, $Sequence, $SequenceLength, $ID, $IDIndex); 646 647 # Ignore the header line... 648 $Line = <SEQUENCEFILE>; 649 650 LINE: while ($Line = GetTextLine(\*SEQUENCEFILE)) { 651 if (($Line =~ /^[ \*\:\.]/) && ($Line !~ /[A-Z]/i)) { 652 # Annotation for sequences: fully conserverd, weaker or stronger group conserverd. 653 # Extract it and save... 654 $LineLength = length($Line); 655 $AnnotationStart = $LineLength - $SequenceLength; 656 $AnnotationLength = $SequenceLength; 657 $Annotation = substr($Line, $AnnotationStart, $AnnotationLength); 658 $SequencesDataMap{ConservedAnnotation} .= $Annotation; 659 } 660 else { 661 # Extract ID and sequences... 662 ($ID, $Sequence)= $Line =~ /^[ ]*(.*?)[ ]+(.*?)[ 01-9]*$/; 663 $Sequence =~ s/ //g; 664 if (!($ID && $Sequence)) { 665 next LINE; 666 } 667 668 if (exists $SequencesDataMap{Sequence}{$ID}) { 669 # Append to existing alignment value... 670 $SequenceLength = length($Sequence); 671 $SequencesDataMap{Sequence}{$ID} .= $Sequence; 672 } 673 else { 674 # New alignment data... 675 $SequencesDataMap{Count} += 1; 676 push @{$SequencesDataMap{IDs}}, $ID; 677 $SequencesDataMap{Description}{$ID} = $ID; 678 $SequencesDataMap{Sequence}{$ID} = $Sequence; 679 $SequenceLength = length($Sequence); 680 } 681 } 682 } 683 close SEQUENCEFILE; 684 return (\%SequencesDataMap); 685 } 686 687 # Read Pearson fasta file and return a reference to hash with these keys: 688 # 689 # {IDs} - Array of sequence IDs 690 # {Count} - Number of sequences 691 # {Description}{$ID} - Sequence description 692 # {Sequence}{$ID} - Sequence for a specific ID 693 # {InputFileType} - Sequence file format 694 # {ConservedAnnotation} - Conserved residue annonation 695 # 696 sub _ReadPearsonFastaFile { 697 my($FastaFileName, $ID, $Description, $Line, $IgnoreID, @LineWords, %FastaDataMap); 698 699 ($FastaFileName) = @_; 700 701 %FastaDataMap = (); 702 @{$FastaDataMap{IDs}} =(); 703 %{$FastaDataMap{Description}} =(); 704 %{$FastaDataMap{Sequence}} =(); 705 $FastaDataMap{Count} = 0; 706 $FastaDataMap{InputFileType} = 'Pearson'; 707 708 open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n"; 709 $ID = ''; 710 $IgnoreID = 0; 711 LINE: while ($Line = GetTextLine(\*FASTAFILE)) { 712 if ($Line =~ /^\>/) { 713 # Start of a new ID... 714 $Line =~ s/^\>//; 715 $Line = RemoveLeadingWhiteSpaces($Line); 716 @LineWords = (); 717 @LineWords = split / /, $Line; 718 719 $ID = $LineWords[0]; 720 $ID =~ s/ //g; 721 $Description = $Line; 722 723 $IgnoreID = 0; 724 if (exists $FastaDataMap{Sequence}{$ID}) { 725 $IgnoreID = 1; 726 warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n"; 727 next LINE; 728 } 729 push @{$FastaDataMap{IDs}}, $ID; 730 $FastaDataMap{Description}{$ID} = $Description; 731 $FastaDataMap{Count} += 1; 732 next LINE; 733 } 734 if ($IgnoreID) { next LINE; } 735 736 # Remove any spaces in the sequence... 737 $Line =~ s/ //g; 738 # Sequence data for active ID... 739 if (exists $FastaDataMap{Sequence}{$ID}) { 740 $FastaDataMap{Sequence}{$ID} .= $Line; 741 } 742 else { 743 $FastaDataMap{Sequence}{$ID} = $Line; 744 } 745 } 746 close FASTAFILE; 747 return \%FastaDataMap; 748 } 749 750 # Read PIR fasta file and return a reference to hash with these keys: 751 # 752 # {IDs} - Array of sequence IDs 753 # {Count} - Number of sequences 754 # {Description}{$ID} - Sequence description 755 # {Sequence}{$ID} - Sequence for a specific ID 756 # {InputFileType} - Sequence file format 757 # {ConservedAnnotation} - Conserved residue annonation 758 # 759 # Format: 760 # A sequence in PIR format consists of: 761 # One line starting with 762 # a ">" (greater-than) sign, followed by 763 # a two-letter code describing the sequence type code (P1, F1, DL, DC, RL, RC, N3, N1 or XX), followed by 764 # a semicolon, followed by 765 # the sequence identification code (the database ID-code). 766 # One line containing a textual description of the sequence. 767 # One or more lines containing the sequence itself. The end of the 768 # sequence is marked by a "*" (asterisk) character. 769 # 770 # A file in PIR format may comprise more than one sequence. 771 # 772 # The PIR format is also often referred to as the NBRF format. 773 # 774 # Code SequenceType 775 # P1 Protein (complete) 776 # F1 Protein (fragment) 777 # DL DNA (linear) 778 # DC DNA (circular) 779 # RL RNA (linear) 780 # RC RNA (circular) 781 # N3 tRNA 782 # N1 Other functional RNA 783 # 784 785 sub _ReadPIRFastaFile { 786 my($FastaFileName, $ID, $Description, $Line, $SequenceTypeCode, $ReadingSequenceData, %FastaDataMap); 787 788 ($FastaFileName) = @_; 789 790 %FastaDataMap = (); 791 @{$FastaDataMap{IDs}} =(); 792 %{$FastaDataMap{Description}} =(); 793 %{$FastaDataMap{Sequence}} =(); 794 %{$FastaDataMap{SequenceTypeCode}} =(); 795 $FastaDataMap{Count} = 0; 796 $FastaDataMap{InputFileType} = 'PIR'; 797 798 open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n"; 799 $ID = ''; 800 $ReadingSequenceData = 0; 801 LINE: while ($Line = GetTextLine(\*FASTAFILE)) { 802 if ($Line =~ /^\>/) { 803 # Start of a new ID... 804 $Line =~ s/^\>//; 805 $Line = RemoveLeadingWhiteSpaces($Line); 806 ($SequenceTypeCode, $ID) = /^\>(.*?)\;(.*?)$/; 807 808 # Use next line to retrieve sequence description... 809 $Line = GetTextLine(\*FASTAFILE); 810 $Line = RemoveLeadingWhiteSpaces($Line); 811 $Description = $Line; 812 813 if (exists $FastaDataMap{Sequence}{$ID}) { 814 warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n"; 815 next LINE; 816 } 817 $ReadingSequenceData = 1; 818 push @{$FastaDataMap{IDs}}, $ID; 819 $FastaDataMap{SequenceTypeCode}{$ID} = $SequenceTypeCode; 820 $FastaDataMap{Description}{$ID} = $Description; 821 $FastaDataMap{Count} += 1; 822 next LINE; 823 } 824 if (!$ReadingSequenceData) { next LINE; } 825 826 # Remove any spaces in the sequence... 827 $Line =~ s/ //g; 828 if ($Line =~ /[\*]$/) { 829 # End of sequence... 830 $ReadingSequenceData = 0; 831 $Line =~ s/[\*]$//; 832 } 833 # Sequence data for active ID... 834 if (exists $FastaDataMap{Sequence}{$ID}) { 835 $FastaDataMap{Sequence}{$ID} .= $Line; 836 } 837 else { 838 $FastaDataMap{Sequence}{$ID} = $Line; 839 } 840 } 841 close FASTAFILE; 842 return \%FastaDataMap; 843 } 844 845 # Read MSF file and return a reference to hash with these keys: 846 # 847 # {IDs} : Array of IDs in order as they appear in file 848 # {Count}: ID count... 849 # {Description}{$ID} : Description data... 850 # {Sequence}{$ID} : Sequence data... 851 # 852 sub _ReadMSFFile { 853 my($MSFFileName, $Line, @LineWords, %MSFDataMap); 854 855 ($MSFFileName) = @_; 856 857 %MSFDataMap = (); 858 @{$MSFDataMap{IDs}} =(); 859 %{$MSFDataMap{Description}} =(); 860 %{$MSFDataMap{Sequence}} =(); 861 $MSFDataMap{Count} = 0; 862 $MSFDataMap{InputFileType} = 'MSF'; 863 864 open MSFFILE, "$MSFFileName" or die "Couldn't open $MSFFileName: $!\n"; 865 866 # Collect sequences and IDs... 867 # 868 # '//' after the name fields indicates end of header list and start of sequence data. 869 # 870 my($ID, $Len, $Check, $Weight, $Sequence, $NameFieldsFound, %MSFIDsMap); 871 %MSFIDsMap = (); 872 $NameFieldsFound = 0; 873 LINE: while ($Line = GetTextLine(\*MSFFILE)) { 874 if ($Line =~ /Name:/) { 875 $NameFieldsFound++; 876 ($ID, $Len, $Check, $Weight) = $Line =~ /^[ ]*Name:[ ]+(.*?)[ ]+Len:[ ]+(.*?)[ ]+Check:[ ]+(.*?)[ ]+Weight:[ ]+(.*?)[ ]*$/; 877 if ($ID =~ / /) { 878 ($ID) = $ID =~ /^(.*?)[ ]+/ 879 } 880 if (exists $MSFIDsMap{$ID}) { 881 warn "Warning: ID, $ID, in MSF file already exists. Ignoring ID and sequence data...\n"; 882 next LINE; 883 } 884 $MSFIDsMap{$ID} = $ID; 885 push @{$MSFDataMap{IDs}}, $ID; 886 $MSFDataMap{Description}{$ID} = $ID; 887 $MSFDataMap{Count} += 1; 888 } 889 elsif ( /\/\// && $NameFieldsFound) { 890 # End of header list... 891 last LINE; 892 } 893 } 894 # Collect all sequences... 895 # 896 my($FirstField, $SecondField); 897 while ($Line = GetTextLine(\*MSFFILE)) { 898 ($FirstField, $SecondField) = $Line =~ /^[ ]*(.*?)[ ]+(.*?)$/; 899 if (exists $MSFIDsMap{$FirstField}) { 900 # It's ID and sequence data... 901 $ID = $FirstField; 902 $Sequence = $SecondField; 903 # Take out spaces and leave the gap characters... 904 $Sequence =~ s/ //g; 905 if ($MSFDataMap{Sequence}{$ID}) { 906 $MSFDataMap{Sequence}{$ID} .= $Sequence; 907 } 908 else { 909 $MSFDataMap{Sequence}{$ID} = $Sequence; 910 } 911 } 912 } 913 914 close MSFFILE; 915 return \%MSFDataMap; 916 } 917 918