MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: AnalyzeSequenceFilesData.pl,v $
   4 # $Date: 2008/01/30 21:44:43 $
   5 # $Revision: 1.20 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileUtil;
  37 use TextUtil;
  38 use SequenceFileUtil;
  39 use AminoAcids;
  40 use NucleicAcids;
  41 
  42 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  43 
  44 # Autoflush STDOUT
  45 $| = 1;
  46 
  47 # Starting message...
  48 $ScriptName = basename($0);
  49 print "\n$ScriptName: Starting...\n\n";
  50 $StartTime = new Benchmark;
  51 
  52 # Setup script usage message...
  53 SetupScriptUsage();
  54 if ($Options{help} || @ARGV < 1) {
  55   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  56 }
  57 
  58 # Expand wild card file names...
  59 my(@SequenceFilesList);
  60 @SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir");
  61 
  62 # Process options...
  63 my(%OptionsInfo);
  64 print "Processing options...\n";
  65 ProcessOptions();
  66 
  67 # Set up information about input files...
  68 print "Checking input sequence file(s)...\n";
  69 my(%SequenceFilesInfo);
  70 RetrieveSequenceFilesInfo();
  71 SetupSequenceRegionsData();
  72 
  73 # Process input files..
  74 my($FileIndex, $SequenceFile, $FileProcessingMsg);
  75 $FileProcessingMsg = "Processing file";
  76 if (@SequenceFilesList > 1) {
  77   print "Processing sequence files...\n";
  78   $FileProcessingMsg = "\n$FileProcessingMsg";
  79 }
  80 
  81 for $FileIndex (0 .. $#SequenceFilesList) {
  82   if ($SequenceFilesInfo{FilesOkay}[$FileIndex]) {
  83     $SequenceFile = $SequenceFilesList[$FileIndex];
  84     print "$FileProcessingMsg $SequenceFile...\n";
  85     AnalyzeSequenceFileData($FileIndex);
  86   }
  87 }
  88 
  89 print "$ScriptName:Done...\n\n";
  90 
  91 $EndTime = new Benchmark;
  92 $TotalTime = timediff ($EndTime, $StartTime);
  93 print "Total time: ", timestr($TotalTime), "\n";
  94 
  95 ###############################################################################
  96 
  97 # Analyze sequence file...
  98 sub AnalyzeSequenceFileData {
  99   my($FileIndex) = @_;
 100   my($SequenceFile, $SequenceDataRef);
 101 
 102   $SequenceFile = $SequenceFilesList[$FileIndex];
 103 
 104   open SEQUENCEFILE, "$SequenceFile" or die "Error: Can't open $SequenceFile: $! \n";
 105   $SequenceDataRef = ReadSequenceFile($SequenceFile);
 106   close SEQUENCEFILE;
 107 
 108   if ($OptionsInfo{CalculatePercentIdentityMatrix}) {
 109     CalculatePercentIdentityMatrix($FileIndex, $SequenceDataRef);
 110   }
 111   if ($OptionsInfo{PerformResidueFrequencyAnalysis}) {
 112     PerformResidueFrequencyAnalysis($FileIndex, $SequenceDataRef);
 113   }
 114 }
 115 
 116 # Calculate percent identity matrix...
 117 sub CalculatePercentIdentityMatrix {
 118   my($FileIndex, $SequenceDataRef) = @_;
 119   my($PercentIdentity, $PercentIdentityMatrixFile, $PercentIdentityMatrixRef, $RowID, $ColID, $Line, @LineWords);
 120 
 121   $PercentIdentityMatrixFile = $SequenceFilesInfo{OutFileRoot}[$FileIndex] . 'PercentIdentityMatrix.' . $SequenceFilesInfo{OutFileExt}[$FileIndex];
 122   $PercentIdentityMatrixRef = CalculatePercentSequenceIdentityMatrix($SequenceDataRef, $OptionsInfo{IgnoreGaps}, $OptionsInfo{Precision});
 123 
 124   print "Generating percent identity matrix file $PercentIdentityMatrixFile...\n";
 125   open OUTFILE, ">$PercentIdentityMatrixFile" or die "Can't open $PercentIdentityMatrixFile: $!\n";
 126 
 127   # Write out column labels...
 128   @LineWords = ();
 129   push @LineWords, '';
 130   for $ColID (@{$PercentIdentityMatrixRef->{IDs}}) {
 131     push @LineWords, $ColID;
 132   }
 133   $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 134   print OUTFILE "$Line\n";
 135 
 136   # Write out rows...
 137   for $RowID (@{$PercentIdentityMatrixRef->{IDs}}) {
 138     @LineWords = ();
 139     push @LineWords, $RowID;
 140     for $ColID (@{$PercentIdentityMatrixRef->{IDs}}) {
 141       $PercentIdentity = $PercentIdentityMatrixRef->{PercentIdentity}{$RowID}{$ColID};
 142       push @LineWords, $PercentIdentity;
 143     }
 144     $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 145     print OUTFILE "$Line\n";
 146   }
 147   close OUTFILE;
 148 }
 149 
 150 # Perform frequency analysis...
 151 sub PerformResidueFrequencyAnalysis {
 152   my($FileIndex, $SequenceDataRef) = @_;
 153 
 154   CountResiduesInRegions($FileIndex, $SequenceDataRef);
 155   CalculatePercentResidueFrequencyInRegions($FileIndex, $SequenceDataRef);
 156   GeneratePercentResidueFrequencyOutFilesForRegions($FileIndex, $SequenceDataRef);
 157 }
 158 
 159 # Count residues...
 160 sub CountResiduesInRegions {
 161   my($FileIndex, $SequenceDataRef) = @_;
 162 
 163   # Setup rerfernce sequence data...
 164   my($RefereceSequenceID, $RefereceSequence);
 165   $RefereceSequenceID = $SequenceFilesInfo{RefereceSequenceID}[$FileIndex];
 166   $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$FileIndex];
 167 
 168   # Count residues...
 169   my($RegionID, $StartResNum, $EndResNum, $ResNum, $ResIndex, $ID, $Sequence, $Residue);
 170   for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) {
 171     $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum};
 172     $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum};
 173     RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
 174       $ResIndex = $ResNum - 1;
 175       if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
 176 	next RESNUM;
 177       }
 178       # Go over residues in column $ResNum in all the sequences...
 179       ID: for $ID (@{$SequenceDataRef->{IDs}}) {
 180 	$Sequence = $SequenceDataRef->{Sequence}{$ID};
 181 	$Residue = substr($Sequence, $ResIndex, 1);
 182 	if (IsGapResidue($Residue)) {
 183 	  $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{Gap} += 1;
 184 	}
 185 	else {
 186 	  if (exists $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue}) {
 187 	    $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue} += 1;
 188 	  }
 189 	  else {
 190 	    # Internal error...
 191 	    print "Warning: Ignoring residue $Residue in sequence $ID during ResidueFrequencyAnalysis calculation: Unknown residue...\n";
 192 	  }
 193 	}
 194       }
 195     }
 196   }
 197 }
 198 
 199 # Calculate percent frequency for various residues in the sequence regions...
 200 sub CalculatePercentResidueFrequencyInRegions {
 201   my($FileIndex, $SequenceDataRef) = @_;
 202   my($RegionID, $StartResNum, $EndResNum, $ResNum, $Residue, $Count, $PercentCount, $MaxResiduesCount, $Precision);
 203 
 204   $MaxResiduesCount = $SequenceDataRef->{Count};
 205   $Precision = $OptionsInfo{Precision};
 206   for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) {
 207     $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum};
 208     $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum};
 209     RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
 210       if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
 211 	next RESNUM;
 212       }
 213       for $Residue (keys %{$SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}}) {
 214 	$Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue};
 215 	$PercentCount = ($Count / $MaxResiduesCount) * 100;
 216 	$PercentCount = sprintf("%.${Precision}f", $PercentCount) + 0;
 217 	$SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue} = $PercentCount;
 218       }
 219     }
 220   }
 221 }
 222 
 223 # Generate output files...
 224 sub GeneratePercentResidueFrequencyOutFilesForRegions {
 225   my($FileIndex, $SequenceDataRef) = @_;
 226 
 227   # Setup rerfernce sequence data...
 228   my($RefereceSequenceID, $RefereceSequence);
 229   $RefereceSequenceID = $SequenceFilesInfo{RefereceSequenceID}[$FileIndex];
 230   $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$FileIndex];
 231 
 232   my($RegionID, $StartResNum, $EndResNum, $ResNum, $Count, $PercentCount, $Residue, $RegionNum, $RegionOutFile, $PercentRegionOutFile, $OutFileRoot, $OutFileExt, $Line, @LineWords, @PercentLineWords);
 233 
 234   $OutFileRoot = $SequenceFilesInfo{OutFileRoot}[$FileIndex];
 235   $OutFileExt = $SequenceFilesInfo{OutFileExt}[$FileIndex];
 236   $RegionNum = 0;
 237   for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) {
 238     $RegionNum++;
 239     $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum};
 240     $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum};
 241 
 242     $RegionOutFile = "${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}";
 243     $PercentRegionOutFile = "${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}";
 244 
 245     print "Generating $RegionOutFile and $PercentRegionOutFile...\n";
 246     open REGIONOUTFILE, ">$RegionOutFile" or die "Error: Can't open $RegionOutFile: $! \n";
 247     open PERCENTREGIONOUTFILE, ">$PercentRegionOutFile" or die "Error: Can't open $PercentRegionOutFile: $! \n";
 248 
 249     # Write out reference residue positions as column values....
 250     @LineWords = ();
 251     push @LineWords, '';
 252     RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
 253       if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
 254 	next RESNUM;
 255       }
 256       push @LineWords, $ResNum;
 257     }
 258     $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 259     print REGIONOUTFILE "$Line\n";
 260     print PERCENTREGIONOUTFILE "$Line\n";
 261 
 262 
 263     # Write out row data for each residue; Gap residue is written last...
 264     RESIDUE: for $Residue (sort @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]}) {
 265       if ($Residue =~ /^Gap$/i) {
 266 	next RESIDUE;
 267       }
 268       @LineWords = ();
 269       push @LineWords, $Residue;
 270       @PercentLineWords = ();
 271       push @PercentLineWords, $Residue;
 272 
 273       RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
 274 	if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
 275 	  next RESNUM;
 276 	}
 277 	$Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue};
 278 	push @LineWords, $Count;
 279 	$PercentCount = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue};
 280 	push @PercentLineWords, $PercentCount;
 281       }
 282       $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 283       print REGIONOUTFILE "$Line\n";
 284 
 285       $Line = JoinWords(\@PercentLineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 286       print PERCENTREGIONOUTFILE "$Line\n";
 287     }
 288 
 289     # Write out data for gap...
 290     $Residue = 'Gap';
 291     @LineWords = ();
 292     push @LineWords, $Residue;
 293     @PercentLineWords = ();
 294     push @PercentLineWords, $Residue;
 295 
 296     RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
 297       if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
 298 	next RESNUM;
 299       }
 300       $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue};
 301       push @LineWords, $Count;
 302 
 303       $PercentCount = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue};
 304       push @PercentLineWords, $PercentCount;
 305     }
 306     $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 307     print REGIONOUTFILE "$Line\n";
 308 
 309     $Line = JoinWords(\@PercentLineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 310     print PERCENTREGIONOUTFILE "$Line\n";
 311 
 312     close REGIONOUTFILE;
 313     close PERCENTREGIONOUTFILE;
 314   }
 315 }
 316 
 317 # Process option values...
 318 sub ProcessOptions {
 319   %OptionsInfo = ();
 320 
 321   # Setup analysis mode...
 322   $OptionsInfo{CalculatePercentIdentityMatrix} = ($Options{mode} =~ /^(PercentIdentityMatrix|All)$/i) ? 1 : 0;
 323   $OptionsInfo{PerformResidueFrequencyAnalysis} = ($Options{mode} =~ /^(ResidueFrequencyAnalysis|All)$/i) ? 1 : 0;
 324 
 325   # Setup delimiter and quotes...
 326   $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
 327   $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0;
 328 
 329   # Setup reference sequence and regions for residue frequence analysis...
 330   $OptionsInfo{SpecifiedRefereceSequence} = $Options{referencesequence};
 331   $OptionsInfo{SpecifiedRegion} = $Options{region};
 332   @{$OptionsInfo{SpecifiedRegions}} = ();
 333 
 334   my(@SpecifiedRegions);
 335   @SpecifiedRegions = ();
 336   if ($Options{region} =~ /\,/) {
 337     @SpecifiedRegions = split /\,/, $OptionsInfo{SpecifiedRegion};
 338     if (@SpecifiedRegions % 2) {
 339       die "Error: The value specified, $Options{region}, for option \"--region\" is not valid. Allowed values: \"StartResNum,EndResNum,[StartResNum,EndResNum...]\" or UseCompleteSequence\n";
 340     }
 341     # Make sure EndResNum > StartResNum...
 342     my($StartResNum, $EndResNum, $Index, $RegionNum);
 343     $RegionNum = 0;
 344     for ($Index = 0; $Index <= $#SpecifiedRegions; $Index += 2) {
 345       $StartResNum = $SpecifiedRegions[$Index];
 346       $EndResNum = $SpecifiedRegions[$Index + 1];
 347       $RegionNum++;
 348       if (!IsPositiveInteger($StartResNum)) {
 349 	die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be a positive integer for region $RegionNum.\n";
 350       }
 351       if (!IsPositiveInteger($EndResNum)) {
 352 	die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $EndResNum, must be a positive integer for region $RegionNum.\n";
 353       }
 354       if ($StartResNum >= $EndResNum) {
 355 	die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller than end residue number, $EndResNum, for region $RegionNum.\n";
 356       }
 357     }
 358   }
 359   else {
 360     if ($Options{region} !~ /^UseCompleteSequence$/i) {
 361       die "Error: The value specified, $Options{region}, for option \"--region\" is not valid. Allowed values: \"StartResNum,EndResNum,[StartResNum,EndResNum...]\" or UseCompleteSequence\n";
 362     }
 363   }
 364   push @{$OptionsInfo{SpecifiedRegions}}, @SpecifiedRegions;
 365 
 366   # Miscellaneous options...
 367   $OptionsInfo{Precision} = $Options{precision};
 368   $OptionsInfo{IgnoreGaps} = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0;
 369   $OptionsInfo{RegionResiduesMode} = $Options{regionresiduesmode};
 370 
 371   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 372   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 373 }
 374 
 375 # Retrieve information about sequence files...
 376 sub RetrieveSequenceFilesInfo {
 377   my($Index, $SequenceFile, $FileSupported, $FileFormat, $SequenceCount, $RefereceSequence, $RefereceSequenceID, $RefereceSequenceLen, $RefereceSequenceWithNoGaps, $RefereceSequenceWithNoGapsLen, $RefereceSequenceRegionCount, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFileExt, $SequenceDataRef, $SpecifiedRefereceSequence, @SpecifiedRegions, @RefereceSequenceRegions);
 378 
 379   %SequenceFilesInfo = ();
 380   @{$SequenceFilesInfo{FilesOkay}} = ();
 381   @{$SequenceFilesInfo{OutFileRoot}} = ();
 382   @{$SequenceFilesInfo{OutFileExt}} = ();
 383   @{$SequenceFilesInfo{Format}} = ();
 384   @{$SequenceFilesInfo{SequenceCount}} = ();
 385   @{$SequenceFilesInfo{RefereceSequenceID}} = ();
 386   @{$SequenceFilesInfo{RefereceSequence}} = ();
 387   @{$SequenceFilesInfo{RefereceSequenceLen}} = ();
 388   @{$SequenceFilesInfo{RefereceSequenceWithNoGaps}} = ();
 389   @{$SequenceFilesInfo{RefereceSequenceWithNoGapsLen}} = ();
 390   @{$SequenceFilesInfo{RefereceSequenceRegions}} = ();
 391   @{$SequenceFilesInfo{RefereceSequenceRegionCount}} = ();
 392   @{$SequenceFilesInfo{ResidueCodes}} = ();
 393 
 394   FILELIST: for $Index (0 .. $#SequenceFilesList) {
 395     $SequenceFile = $SequenceFilesList[$Index];
 396     $SequenceFilesInfo{FilesOkay}[$Index] = 0;
 397     $SequenceFilesInfo{OutFileRoot}[$Index] = '';
 398     $SequenceFilesInfo{OutFileExt}[$Index] = '';
 399     $SequenceFilesInfo{Format}[$Index] = 'NotSupported';
 400     $SequenceFilesInfo{SequenceCount}[$Index] = 0;
 401     $SequenceFilesInfo{RefereceSequenceID}[$Index] = '';
 402     $SequenceFilesInfo{RefereceSequence}[$Index] = '';
 403     $SequenceFilesInfo{RefereceSequenceLen}[$Index] = '';
 404     $SequenceFilesInfo{RefereceSequenceWithNoGaps}[$Index] = '';
 405     $SequenceFilesInfo{RefereceSequenceWithNoGapsLen}[$Index] = '';
 406     @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]} = ();
 407     $SequenceFilesInfo{RefereceSequenceRegionCount}[$Index] = 0;
 408     @{$SequenceFilesInfo{ResidueCodes}[$Index]} = ();
 409 
 410     if (! open SEQUENCEFILE, "$SequenceFile") {
 411       warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n";
 412       next FILELIST;
 413     }
 414     close SEQUENCEFILE;
 415 
 416     ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile);
 417     if (!$FileSupported) {
 418       warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n";
 419       next FILELIST;
 420     }
 421 
 422     $SequenceDataRef = ReadSequenceFile($SequenceFile);
 423 
 424     $SequenceCount = $SequenceDataRef->{Count};
 425     if (!$SequenceCount) {
 426       warn "Warning: Ignoring file $SequenceFile: Sequence data is missing.\n";
 427       next FILELIST;
 428     }
 429 
 430     # Make sure all sequence lengths are identical...
 431     if (!AreSequenceLengthsIdentical($SequenceDataRef)) {
 432       warn "Warning: Ignoring file $SequenceFile: Sequence legths are not identical.\n";
 433       next FILELIST;
 434     }
 435     $SpecifiedRefereceSequence = $OptionsInfo{SpecifiedRefereceSequence};
 436     # Make sure reference sequence ID is valid...
 437     if ($SpecifiedRefereceSequence =~ /^UseFirstSequenceID$/i) {
 438       $RefereceSequenceID = $SequenceDataRef->{IDs}[0];
 439     }
 440     else {
 441       if (!exists($SequenceDataRef->{Sequence}{$SpecifiedRefereceSequence})) {
 442 	warn "Warning: Ignoring file $SequenceFile: Rreference sequence ID, $SpecifiedRefereceSequence, specified using option \"--referencesequence\" is missing.\n";
 443 	next FILELIST;
 444       }
 445       $RefereceSequenceID = $SpecifiedRefereceSequence;
 446     }
 447 
 448     # Make sure sequence regions corresponding to reference sequence are valid...
 449     @RefereceSequenceRegions = ();
 450     $RefereceSequenceRegionCount = 0;
 451     $RefereceSequence = $SequenceDataRef->{Sequence}{$RefereceSequenceID};
 452     $RefereceSequenceLen = length $RefereceSequence;
 453 
 454     $RefereceSequenceWithNoGaps = RemoveSequenceGaps($RefereceSequence);
 455     $RefereceSequenceWithNoGapsLen = length $RefereceSequenceWithNoGaps;
 456 
 457     @SpecifiedRegions = ();
 458     push @SpecifiedRegions, @{$OptionsInfo{SpecifiedRegions}};
 459     if (@SpecifiedRegions) {
 460       # Make sure specified regions are valid...
 461       my($StartResNum, $EndResNum, $RegionIndex, $RegionNum);
 462       $RegionNum = 0;
 463       for ($RegionIndex = 0; $RegionIndex <= $#SpecifiedRegions; $RegionIndex += 2) {
 464 	$StartResNum = $SpecifiedRegions[$RegionIndex];
 465 	$EndResNum = $SpecifiedRegions[$RegionIndex + 1];
 466 	$RegionNum++;
 467 	if ($OptionsInfo{IgnoreGaps}) {
 468 	  if ($StartResNum > $RefereceSequenceWithNoGapsLen) {
 469 	    warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller the sequence length, $RefereceSequenceWithNoGapsLen, of reference sequence ID,  $RefereceSequenceID, in region $RegionNum. The reference sequence residue numbers correspond to the sequence with no gaps. Specify \"No\" value for \"-i, --ignoregaps\" option to use residue numbers corresponding to reference sequence including gaps.\n";
 470 	    next FILELIST;
 471 	  }
 472 	  if ($EndResNum > $RefereceSequenceWithNoGapsLen) {
 473 	    warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The end residue number, $EndResNum, must be smaller the sequence length, $RefereceSequenceWithNoGapsLen, of reference sequence ID,  $RefereceSequenceID, in region $RegionNum. The reference sequence residue numbers correspond to the sequence with no gaps. Specify \"No\" value for \"-i, --ignoregaps\" option to use residue numbers corresponding to reference sequence including gaps.\n";
 474 	    next FILELIST;
 475 	  }
 476 	}
 477 	else {
 478 	  if ($StartResNum > $RefereceSequenceLen) {
 479 	    warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller the sequence length, $RefereceSequenceLen, of reference sequence ID,  $RefereceSequenceID, in region $RegionNum.\n";
 480 	    next FILELIST;
 481 	  }
 482 	  if ($EndResNum > $RefereceSequenceLen) {
 483 	    warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The end residue number, $EndResNum, must be smaller the sequence length, $RefereceSequenceLen, of reference sequence ID,  $RefereceSequenceID, in region $RegionNum.\n";
 484 	    next FILELIST;
 485 	  }
 486 	}
 487 	push @RefereceSequenceRegions, ($StartResNum, $EndResNum);
 488       }
 489       $RefereceSequenceRegionCount = $RegionNum;
 490     }
 491     else {
 492       # Use complete sequence corresponding to reference sequence ID...
 493       if ($OptionsInfo{IgnoreGaps}) {
 494 	push @RefereceSequenceRegions, (1, $RefereceSequenceWithNoGapsLen);
 495       }
 496       else {
 497 	push @RefereceSequenceRegions, (1, $RefereceSequenceLen);
 498       }
 499       $RefereceSequenceRegionCount = 1;
 500     }
 501     # Setup output file names...
 502     $FileDir = ""; $FileName = ""; $FileExt = "";
 503     ($FileDir, $FileName, $FileExt) = ParseFileName($SequenceFile);
 504     $FileExt = "csv";
 505     if ($Options{outdelim} =~ /^tab$/i) {
 506       $FileExt = "tsv";
 507     }
 508     $OutFileExt = $FileExt;
 509     if ($OptionsInfo{OutFileRoot} && (@SequenceFilesList == 1)) {
 510       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 511       if ($RootFileName && $RootFileExt) {
 512 	$FileName = $RootFileName;
 513       }
 514       else {
 515 	$FileName = $OptionsInfo{OutFileRoot};
 516       }
 517       $OutFileRoot = $FileName;
 518     }
 519     else {
 520       $OutFileRoot = $FileName;
 521     }
 522     if (!$OptionsInfo{OverwriteFiles}) {
 523       if ($OptionsInfo{CalculatePercentIdentityMatrix}) {
 524 	if (-e "${OutFileRoot}PercentIdentityMatrix.${OutFileExt}") {
 525 	  warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}PercentIdentityMatrix.${OutFileExt} already exists\n";
 526 	  next FILELIST;
 527 	}
 528       }
 529       if ($OptionsInfo{PerformResidueFrequencyAnalysis}) {
 530 	my($RegionNum);
 531 	for $RegionNum (1 .. $RefereceSequenceRegionCount) {
 532 	  if (-e "${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}") {
 533 	    warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt} already exists\n";
 534 	    next FILELIST;
 535 	  }
 536 	  if (-e "${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}") {
 537 	    warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt} already exists\n";
 538 	    next FILELIST;
 539 	  }
 540 	}
 541       }
 542     }
 543 
 544     $SequenceFilesInfo{FilesOkay}[$Index] = 1;
 545     $SequenceFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 546     $SequenceFilesInfo{OutFileExt}[$Index] = $OutFileExt;
 547 
 548     $SequenceFilesInfo{Format}[$Index] = $FileFormat;
 549     $SequenceFilesInfo{SequenceCount}[$Index] = $SequenceCount;
 550     $SequenceFilesInfo{RefereceSequenceID}[$Index] = $RefereceSequenceID;
 551     $SequenceFilesInfo{RefereceSequence}[$Index] = $RefereceSequence;
 552     $SequenceFilesInfo{RefereceSequenceLen}[$Index] = $RefereceSequenceLen;
 553     $SequenceFilesInfo{RefereceSequenceWithNoGaps}[$Index] = $RefereceSequenceWithNoGaps;
 554     $SequenceFilesInfo{RefereceSequenceWithNoGapsLen}[$Index] = $RefereceSequenceWithNoGapsLen;
 555     push @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]}, @RefereceSequenceRegions;
 556     $SequenceFilesInfo{RefereceSequenceRegionCount}[$Index] = $RefereceSequenceRegionCount;
 557 
 558     # Setup residue codes...
 559     SetupSequenceFileResidueCodes($SequenceDataRef, $Index);
 560   }
 561 }
 562 
 563 sub SetupSequenceFileResidueCodes {
 564   my($SequenceDataRef, $FileIndex) = @_;
 565   my($Residue, @ResidueCodesList, %ResidueCodesMap);
 566 
 567   # Initialize
 568   @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]} = ();
 569 
 570   %ResidueCodesMap = ();
 571   @ResidueCodesList = ();
 572 
 573   # Setup standard amino acids and nucleic acids one letter codes...
 574   if ($OptionsInfo{RegionResiduesMode} =~ /^AminoAcids$/i) {
 575     @ResidueCodesList = AminoAcids::GetAminoAcids('OneLetterCode');
 576   }
 577   elsif ($OptionsInfo{RegionResiduesMode} =~ /^NucleicAcids$/i) {
 578     push @ResidueCodesList, ('A', 'G', 'T', 'U', 'C');
 579   }
 580   push @ResidueCodesList, 'Gap';
 581   for $Residue (@ResidueCodesList) {
 582     $ResidueCodesMap{$Residue} = $Residue;
 583   }
 584 
 585   # Go over all the residues in all the sequences and add missing ones to the list...
 586   my($ID, $Sequence, $ResIndex);
 587   for $ID (@{$SequenceDataRef->{IDs}}) {
 588     $Sequence = $SequenceDataRef->{Sequence}{$ID};
 589     RES: for $ResIndex (0 .. (length($Sequence) - 1)) {
 590       $Residue = substr($Sequence, $ResIndex, 1);
 591       if (IsGapResidue($Residue)) {
 592 	next RES;
 593       }
 594       if (exists $ResidueCodesMap{$Residue}) {
 595 	next RES;
 596       }
 597       push @ResidueCodesList, $Residue;
 598       $ResidueCodesMap{$Residue} = $Residue;
 599     }
 600   }
 601   push @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]}, @ResidueCodesList;
 602 }
 603 
 604 # Setup regions data for performing residue frequency analysis...
 605 sub SetupSequenceRegionsData {
 606   my($Index, $RefereceSequence, $RefereceSequenceLen, $RegionID, $StartResNum, $EndResNum, $RegionIndex, $RegionNum, $NoGapResNum, $ResNum, $ResIndex, $Residue, $ResidueCode, @RefereceSequenceRegions);
 607 
 608 
 609   @{$SequenceFilesInfo{RefereceSequenceResNums}} = ();
 610   @{$SequenceFilesInfo{RegionsData}} = ();
 611 
 612   FILELIST: for $Index (0 .. $#SequenceFilesList) {
 613     %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}} = ();
 614     %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}} = ();
 615     %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{GapToNoGap}} = ();
 616     %{$SequenceFilesInfo{RegionsData}[$Index]} = ();
 617     @{$SequenceFilesInfo{RegionsData}[$Index]{RegionIDs}} = ();
 618 
 619     if (!$SequenceFilesInfo{FilesOkay}[$Index]) {
 620       next FILELIST;
 621     }
 622     if (!$OptionsInfo{PerformResidueFrequencyAnalysis}) {
 623       next FILELIST;
 624     }
 625 
 626     $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$Index];
 627     $RefereceSequenceLen = $SequenceFilesInfo{RefereceSequenceLen}[$Index];
 628 
 629     # Setup residue number mapping and gap status for refernece sequence...
 630     $NoGapResNum = 0;
 631     $ResNum = 0;
 632     for $ResIndex (0 .. ($RefereceSequenceLen - 1)) {
 633       $ResNum++;
 634       $Residue = substr($RefereceSequence, $ResIndex, 1);
 635       $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}{$ResNum} = 1;
 636       if (!IsGapResidue($Residue)) {
 637 	$NoGapResNum++;
 638 	$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}{$ResNum} = 0;
 639 	$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$NoGapResNum} = $ResNum;
 640 	$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{GapToNoGap}{$ResNum} = $NoGapResNum;
 641       }
 642     }
 643     # Map residue numbers for specified regions to the reference residue in input sequence/alignment files
 644     $RegionNum = 0;
 645     @RefereceSequenceRegions = ();
 646     push @RefereceSequenceRegions, @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]};
 647     for ($RegionIndex = 0; $RegionIndex <= $#RefereceSequenceRegions; $RegionIndex += 2) {
 648       $StartResNum = $RefereceSequenceRegions[$RegionIndex];
 649       $EndResNum = $RefereceSequenceRegions[$RegionIndex + 1];
 650       $RegionNum++;
 651       $RegionID = "Region${RegionNum}";
 652       if ($OptionsInfo{IgnoreGaps}) {
 653 	# Map residue numbers to the reference sequence residue numbers to account for any ignored gaps...
 654 	$StartResNum = $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$StartResNum};
 655 	$EndResNum = $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$EndResNum};
 656       }
 657       push @{$SequenceFilesInfo{RegionsData}[$Index]{RegionIDs}}, $RegionID;
 658       $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{StartResNum} = $StartResNum;
 659       $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{EndResNum} = $EndResNum;
 660 
 661       # Initialize data for residue codes...
 662       for $ResNum ($StartResNum .. $EndResNum) {
 663 	for $ResidueCode (@{$SequenceFilesInfo{ResidueCodes}[$Index]}) {
 664 	  $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{Count}{$ResNum}{$ResidueCode} = 0;
 665 	}
 666       }
 667     }
 668   }
 669 }
 670 
 671 # Setup script usage  and retrieve command line arguments specified using various options...
 672 sub SetupScriptUsage {
 673 
 674   # Retrieve all the options...
 675   %Options = ();
 676   $Options{ignoregaps} = 'yes';
 677   $Options{regionresiduesmode} = 'None';
 678   $Options{mode} = 'PercentIdentityMatrix';
 679   $Options{outdelim} = 'comma';
 680   $Options{precision} = 2;
 681   $Options{quote} = 'yes';
 682   $Options{referencesequence} = 'UseFirstSequenceID';
 683   $Options{region} = 'UseCompleteSequence';
 684 
 685   if (!GetOptions(\%Options, "help|h", "ignoregaps|i=s", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "referencesequence=s", "region=s", "regionresiduesmode=s", "root|r=s", "workingdir|w=s")) {
 686     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 687   }
 688   if ($Options{workingdir}) {
 689     if (! -d $Options{workingdir}) {
 690       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 691     }
 692     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 693   }
 694   if ($Options{ignoregaps} !~ /^(yes|no)$/i) {
 695     die "Error: The value specified, $Options{ignoregaps}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 696   }
 697   if ($Options{regionresiduesmode} !~ /^(AminoAcids|NucleicAcids|None)$/i) {
 698     die "Error: The value specified, $Options{regionresiduesmode}, for option \"--regionresiduesmode\" is not valid. Allowed values: AminoAcids, NucleicAcids or None\n";
 699   }
 700   if ($Options{mode} !~ /^(PercentIdentityMatrix|ResidueFrequencyAnalysis|All)$/i) {
 701     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: PercentIdentityMatrix, ResidueFrequencyAnalysis  or All\n";
 702   }
 703   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 704     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 705   }
 706   if ($Options{quote} !~ /^(yes|no)$/i) {
 707     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 708   }
 709   if (!IsPositiveInteger($Options{precision})) {
 710     die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";
 711   }
 712 }
 713