MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: InfoSequenceFiles.pl,v $
   4 # $Date: 2008/01/30 21:44:47 $
   5 # $Revision: 1.16 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileUtil;
  37 use TextUtil;
  38 use SequenceFileUtil;
  39 use StatisticsUtil;
  40 
  41 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  42 
  43 # Autoflush STDOUT
  44 $| = 1;
  45 
  46 # Starting message...
  47 $ScriptName = basename($0);
  48 print "\n$ScriptName: Starting...\n\n";
  49 $StartTime = new Benchmark;
  50 
  51 # Get the options and setup script...
  52 SetupScriptUsage();
  53 if ($Options{help} || @ARGV < 1) {
  54   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  55 }
  56 
  57 my(@SequenceFilesList);
  58 @SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir");
  59 
  60 my($DetailLevel, $FrequencyAnalysis, $ListLongestSequence, $ListShortestSequence, $NumOfBins, $ListSequenceLengths, $IgnoreGaps, @BinRange);
  61 ProcessOptions();
  62 
  63 my(@SequenceFilesOkay, @SequenceFilesFormat, @SequenceFilesSize, @SequenceFilesLastModified);
  64 RetrieveSequenceFilesInfo();
  65 
  66 my($Index, $SequenceFile);
  67 if (@SequenceFilesList > 1) {
  68   print "Processing sequence files...\n";
  69 }
  70 for $Index (0 .. $#SequenceFilesList) {
  71   if ($SequenceFilesOkay[$Index]) {
  72     $SequenceFile = $SequenceFilesList[$Index];
  73     if (@SequenceFilesList > 1) {
  74       print "\nProcessing file $SequenceFile...\n";
  75     }
  76     else {
  77       print "Processing file $SequenceFile...\n"
  78     }
  79     ListSequenceFileInfo($Index);
  80   }
  81 }
  82 
  83 ListTotalSizeOfFiles();
  84 
  85 print "$ScriptName:Done...\n\n";
  86 
  87 $EndTime = new Benchmark;
  88 $TotalTime = timediff ($EndTime, $StartTime);
  89 print "Total time: ", timestr($TotalTime), "\n";
  90 
  91 ###############################################################################
  92 
  93 # List appropriate information...
  94 sub ListSequenceFileInfo {
  95   my($Index) = @_;
  96   my($SequenceFile, $SequenceDataRef);
  97 
  98   $SequenceFile = $SequenceFilesList[$Index];
  99 
 100   $SequenceDataRef = ReadSequenceFile($SequenceFile);
 101 
 102   my($SequencesCount) = $SequenceDataRef->{Count};
 103   print "\nNumber of sequences: $SequencesCount\n";
 104 
 105   if ($ListShortestSequence && ($SequencesCount > 1)) {
 106     my($ShortestSeqID, $ShortestSeq, $ShortestSeqLen, $Description) = GetShortestSequence($SequenceDataRef, $IgnoreGaps);
 107     print "\nShortest sequence information:\nID: $ShortestSeqID; Length:$ShortestSeqLen\n";
 108     if ($DetailLevel >= 2) {
 109       print "Description: $Description\n";
 110     }
 111     if ($DetailLevel >= 3) {
 112       print "Sequence: $ShortestSeq\n";
 113     }
 114   }
 115   if ($ListLongestSequence && ($SequencesCount > 1)) {
 116     my($LongestSeqID, $LongestSeq, $LongestSeqLen, $Description) = GetLongestSequence($SequenceDataRef, $IgnoreGaps);
 117     print "\nLongest sequence information:\nID: $LongestSeqID; Length: $LongestSeqLen\n";
 118     if ($DetailLevel >= 2) {
 119       print "Description: $Description\n";
 120     }
 121     if ($DetailLevel >= 3) {
 122       print "Sequence: $LongestSeq\n";
 123     }
 124   }
 125   if ($FrequencyAnalysis && ($SequencesCount > 1)) {
 126     PerformLengthFrequencyAnalysis($SequenceDataRef);
 127   }
 128   if ($ListSequenceLengths) {
 129     ListSequenceLengths($SequenceDataRef);
 130   }
 131 
 132   # File size and modification information...
 133   print "\nFile size: ", FormatFileSize($SequenceFilesSize[$Index]), " \n";
 134   print "Last modified: ", $SequenceFilesLastModified[$Index], " \n";
 135 }
 136 
 137 # List information about sequence lengths...
 138 sub ListSequenceLengths {
 139   my($SequenceDataRef) = @_;
 140   my($ID, $SeqLen, $Sequence, $Description);
 141 
 142   print "\nSequence lengths information:\n";
 143   for $ID (@{$SequenceDataRef->{IDs}}) {
 144     $Sequence = $SequenceDataRef->{Sequence}{$ID};
 145     $Description = $SequenceDataRef->{Description}{$ID};
 146     $SeqLen = GetSequenceLength($Sequence, $IgnoreGaps);
 147     if ($IgnoreGaps) {
 148       $Sequence = RemoveSequenceGaps($Sequence);
 149     }
 150     print "ID: $ID; Length:$SeqLen\n";
 151     if ($DetailLevel >= 2) {
 152       print "Description: $Description\n";
 153     }
 154     if ($DetailLevel >= 3) {
 155       print "Sequence: $Sequence\n";
 156     }
 157     if ($DetailLevel >= 2) {
 158       print "\n";
 159     }
 160   }
 161 }
 162 
 163 # Total size of all the fiels...
 164 sub ListTotalSizeOfFiles {
 165   my($FileOkayCount, $TotalSize, $Index);
 166 
 167   $FileOkayCount = 0;
 168   $TotalSize = 0;
 169 
 170   for $Index (0 .. $#SequenceFilesList) {
 171     if ($SequenceFilesOkay[$Index]) {
 172       $FileOkayCount++;
 173       $TotalSize += $SequenceFilesSize[$Index];
 174     }
 175   }
 176   if ($FileOkayCount > 1) {
 177     print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
 178   }
 179 }
 180 
 181 
 182 # Perform frequency analysis of sequence lengths
 183 sub PerformLengthFrequencyAnalysis {
 184   my($SequenceDataRef, $SequenceLengthsRef) = @_;
 185   my ($ID, $SeqLen, $Sequence, $SequenceLenBin, $LenBin, $SequenceLenCount, @SequenceLengths, %SequenceLenFrequency);
 186 
 187   @SequenceLengths = ();
 188   %SequenceLenFrequency = ();
 189   for $ID (@{$SequenceDataRef->{IDs}}) {
 190     $Sequence = $SequenceDataRef->{Sequence}{$ID};
 191     $SeqLen = GetSequenceLength($Sequence, $IgnoreGaps);
 192     push @SequenceLengths, $SeqLen;
 193   }
 194   if (@BinRange) {
 195     %SequenceLenFrequency = Frequency(\@SequenceLengths, \@BinRange);
 196   }
 197   else {
 198     %SequenceLenFrequency = Frequency(\@SequenceLengths, $NumOfBins);
 199   }
 200   print "\nDistribution of sequence lengths (LengthBin => Count):\n";
 201   for $SequenceLenBin (sort { $a <=> $b} keys %SequenceLenFrequency) {
 202     $SequenceLenCount = $SequenceLenFrequency{$SequenceLenBin};
 203     $LenBin = sprintf("%.1f", $SequenceLenBin) + 0;
 204     print "$LenBin => $SequenceLenCount; ";
 205   }
 206   print "\n";
 207 }
 208 
 209 # Process option values...
 210 sub ProcessOptions {
 211   $DetailLevel = $Options{detail};
 212 
 213   $FrequencyAnalysis = ($Options{all} || $Options{frequency}) ? 1 : 0;
 214   $ListLongestSequence = ($Options{all} || $Options{longest}) ? 1 : 0;
 215   $ListShortestSequence = ($Options{all} || $Options{shortest}) ? 1 : 0;
 216   $ListSequenceLengths = ($Options{all} || $Options{sequencelengths}) ? 1 : 0;
 217   $IgnoreGaps = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0;
 218 
 219   # Setup frequency bin values...
 220   $NumOfBins = 4;
 221   @BinRange = ();
 222   if ($Options{frequencybins} =~ /\,/) {
 223     my($BinValue, @SpecifiedBinRange);
 224     @SpecifiedBinRange = split /\,/,  $Options{frequencybins};
 225     if (@SpecifiedBinRange < 2) {
 226       die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n";
 227     }
 228     for $BinValue (@SpecifiedBinRange) {
 229       if (!IsNumerical($BinValue)) {
 230 	die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n";
 231       }
 232     }
 233     my($Index1, $Index2);
 234     for $Index1 (0 .. $#SpecifiedBinRange) {
 235       for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) {
 236 	if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) {
 237 	  die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n";
 238 	}
 239       }
 240     }
 241     push @BinRange, @SpecifiedBinRange;
 242   }
 243   else {
 244     $NumOfBins = $Options{frequencybins};
 245     if (!IsPositiveInteger($NumOfBins)) {
 246       die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n";
 247     }
 248   }
 249 }
 250 
 251 # Retrieve information about sequence files...
 252 sub RetrieveSequenceFilesInfo {
 253   my($Index, $SequenceFile, $FileSupported, $FileFormat, $ModifiedTimeString, $ModifiedDateString);
 254 
 255   @SequenceFilesOkay = ();
 256   @SequenceFilesFormat = ();
 257   @SequenceFilesSize = ();
 258   @SequenceFilesLastModified = ();
 259 
 260   FILELIST: for $Index (0 .. $#SequenceFilesList) {
 261     $SequenceFile = $SequenceFilesList[$Index];
 262     if (! open SEQUENCEFILE, "$SequenceFile") {
 263       warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n";
 264       next FILELIST;
 265     }
 266     close SEQUENCEFILE;
 267 
 268     $SequenceFilesOkay[$Index] = 0;
 269     $SequenceFilesFormat[$Index] = 'NotSupported';
 270     $SequenceFilesSize[$Index] = 0;
 271     $SequenceFilesLastModified[$Index] = '';
 272 
 273     ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile);
 274     if (!$FileSupported) {
 275       warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n";
 276       next FILELIST;
 277     }
 278     $SequenceFilesOkay[$Index] = 1;
 279     $SequenceFilesFormat[$Index] = $FileFormat;
 280     $SequenceFilesSize[$Index] = FileSize($SequenceFile);
 281     ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SequenceFile);
 282     $SequenceFilesLastModified[$Index] = "$ModifiedTimeString; $ModifiedDateString";
 283   }
 284 }
 285 
 286 
 287 # Setup script usage  and retrieve command line arguments specified using various options...
 288 sub SetupScriptUsage {
 289 
 290   # Retrieve all the options...
 291   %Options = ();
 292   $Options{detail} = 1;
 293   $Options{ignoregaps} = 'no';
 294   $Options{frequencybins} = 10;
 295 
 296   if (!GetOptions(\%Options, "all|a", "count|c", "detail|d=i", "frequency|f", "frequencybins=s", "help|h", "ignoregaps|i=s", "longest|l", "shortest|s", "sequencelengths", "workingdir|w=s")) {
 297     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 298   }
 299   if ($Options{workingdir}) {
 300     if (! -d $Options{workingdir}) {
 301       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 302     }
 303     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 304   }
 305   if (!IsPositiveInteger($Options{detail})) {
 306     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
 307   }
 308   if ($Options{ignoregaps} !~ /^(yes|no)$/i) {
 309     die "Error: The value specified, $Options{ignoregaps}, for option \"-i --IgnoreGaps\" is not valid. Allowed values: yes or no\n";
 310   }
 311 }
 312