1 #!/usr/bin/perl -w 2 # 3 # $RCSfile: InfoSequenceFiles.pl,v $ 4 # $Date: 2008/01/30 21:44:47 $ 5 # $Revision: 1.16 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use 5.006; 30 use strict; 31 use FindBin; use lib "$FindBin::Bin/../lib"; 32 use Getopt::Long; 33 use File::Basename; 34 use Text::ParseWords; 35 use Benchmark; 36 use FileUtil; 37 use TextUtil; 38 use SequenceFileUtil; 39 use StatisticsUtil; 40 41 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); 42 43 # Autoflush STDOUT 44 $| = 1; 45 46 # Starting message... 47 $ScriptName = basename($0); 48 print "\n$ScriptName: Starting...\n\n"; 49 $StartTime = new Benchmark; 50 51 # Get the options and setup script... 52 SetupScriptUsage(); 53 if ($Options{help} || @ARGV < 1) { 54 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); 55 } 56 57 my(@SequenceFilesList); 58 @SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir"); 59 60 my($DetailLevel, $FrequencyAnalysis, $ListLongestSequence, $ListShortestSequence, $NumOfBins, $ListSequenceLengths, $IgnoreGaps, @BinRange); 61 ProcessOptions(); 62 63 my(@SequenceFilesOkay, @SequenceFilesFormat, @SequenceFilesSize, @SequenceFilesLastModified); 64 RetrieveSequenceFilesInfo(); 65 66 my($Index, $SequenceFile); 67 if (@SequenceFilesList > 1) { 68 print "Processing sequence files...\n"; 69 } 70 for $Index (0 .. $#SequenceFilesList) { 71 if ($SequenceFilesOkay[$Index]) { 72 $SequenceFile = $SequenceFilesList[$Index]; 73 if (@SequenceFilesList > 1) { 74 print "\nProcessing file $SequenceFile...\n"; 75 } 76 else { 77 print "Processing file $SequenceFile...\n" 78 } 79 ListSequenceFileInfo($Index); 80 } 81 } 82 83 ListTotalSizeOfFiles(); 84 85 print "$ScriptName:Done...\n\n"; 86 87 $EndTime = new Benchmark; 88 $TotalTime = timediff ($EndTime, $StartTime); 89 print "Total time: ", timestr($TotalTime), "\n"; 90 91 ############################################################################### 92 93 # List appropriate information... 94 sub ListSequenceFileInfo { 95 my($Index) = @_; 96 my($SequenceFile, $SequenceDataRef); 97 98 $SequenceFile = $SequenceFilesList[$Index]; 99 100 $SequenceDataRef = ReadSequenceFile($SequenceFile); 101 102 my($SequencesCount) = $SequenceDataRef->{Count}; 103 print "\nNumber of sequences: $SequencesCount\n"; 104 105 if ($ListShortestSequence && ($SequencesCount > 1)) { 106 my($ShortestSeqID, $ShortestSeq, $ShortestSeqLen, $Description) = GetShortestSequence($SequenceDataRef, $IgnoreGaps); 107 print "\nShortest sequence information:\nID: $ShortestSeqID; Length:$ShortestSeqLen\n"; 108 if ($DetailLevel >= 2) { 109 print "Description: $Description\n"; 110 } 111 if ($DetailLevel >= 3) { 112 print "Sequence: $ShortestSeq\n"; 113 } 114 } 115 if ($ListLongestSequence && ($SequencesCount > 1)) { 116 my($LongestSeqID, $LongestSeq, $LongestSeqLen, $Description) = GetLongestSequence($SequenceDataRef, $IgnoreGaps); 117 print "\nLongest sequence information:\nID: $LongestSeqID; Length: $LongestSeqLen\n"; 118 if ($DetailLevel >= 2) { 119 print "Description: $Description\n"; 120 } 121 if ($DetailLevel >= 3) { 122 print "Sequence: $LongestSeq\n"; 123 } 124 } 125 if ($FrequencyAnalysis && ($SequencesCount > 1)) { 126 PerformLengthFrequencyAnalysis($SequenceDataRef); 127 } 128 if ($ListSequenceLengths) { 129 ListSequenceLengths($SequenceDataRef); 130 } 131 132 # File size and modification information... 133 print "\nFile size: ", FormatFileSize($SequenceFilesSize[$Index]), " \n"; 134 print "Last modified: ", $SequenceFilesLastModified[$Index], " \n"; 135 } 136 137 # List information about sequence lengths... 138 sub ListSequenceLengths { 139 my($SequenceDataRef) = @_; 140 my($ID, $SeqLen, $Sequence, $Description); 141 142 print "\nSequence lengths information:\n"; 143 for $ID (@{$SequenceDataRef->{IDs}}) { 144 $Sequence = $SequenceDataRef->{Sequence}{$ID}; 145 $Description = $SequenceDataRef->{Description}{$ID}; 146 $SeqLen = GetSequenceLength($Sequence, $IgnoreGaps); 147 if ($IgnoreGaps) { 148 $Sequence = RemoveSequenceGaps($Sequence); 149 } 150 print "ID: $ID; Length:$SeqLen\n"; 151 if ($DetailLevel >= 2) { 152 print "Description: $Description\n"; 153 } 154 if ($DetailLevel >= 3) { 155 print "Sequence: $Sequence\n"; 156 } 157 if ($DetailLevel >= 2) { 158 print "\n"; 159 } 160 } 161 } 162 163 # Total size of all the fiels... 164 sub ListTotalSizeOfFiles { 165 my($FileOkayCount, $TotalSize, $Index); 166 167 $FileOkayCount = 0; 168 $TotalSize = 0; 169 170 for $Index (0 .. $#SequenceFilesList) { 171 if ($SequenceFilesOkay[$Index]) { 172 $FileOkayCount++; 173 $TotalSize += $SequenceFilesSize[$Index]; 174 } 175 } 176 if ($FileOkayCount > 1) { 177 print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n"; 178 } 179 } 180 181 182 # Perform frequency analysis of sequence lengths 183 sub PerformLengthFrequencyAnalysis { 184 my($SequenceDataRef, $SequenceLengthsRef) = @_; 185 my ($ID, $SeqLen, $Sequence, $SequenceLenBin, $LenBin, $SequenceLenCount, @SequenceLengths, %SequenceLenFrequency); 186 187 @SequenceLengths = (); 188 %SequenceLenFrequency = (); 189 for $ID (@{$SequenceDataRef->{IDs}}) { 190 $Sequence = $SequenceDataRef->{Sequence}{$ID}; 191 $SeqLen = GetSequenceLength($Sequence, $IgnoreGaps); 192 push @SequenceLengths, $SeqLen; 193 } 194 if (@BinRange) { 195 %SequenceLenFrequency = Frequency(\@SequenceLengths, \@BinRange); 196 } 197 else { 198 %SequenceLenFrequency = Frequency(\@SequenceLengths, $NumOfBins); 199 } 200 print "\nDistribution of sequence lengths (LengthBin => Count):\n"; 201 for $SequenceLenBin (sort { $a <=> $b} keys %SequenceLenFrequency) { 202 $SequenceLenCount = $SequenceLenFrequency{$SequenceLenBin}; 203 $LenBin = sprintf("%.1f", $SequenceLenBin) + 0; 204 print "$LenBin => $SequenceLenCount; "; 205 } 206 print "\n"; 207 } 208 209 # Process option values... 210 sub ProcessOptions { 211 $DetailLevel = $Options{detail}; 212 213 $FrequencyAnalysis = ($Options{all} || $Options{frequency}) ? 1 : 0; 214 $ListLongestSequence = ($Options{all} || $Options{longest}) ? 1 : 0; 215 $ListShortestSequence = ($Options{all} || $Options{shortest}) ? 1 : 0; 216 $ListSequenceLengths = ($Options{all} || $Options{sequencelengths}) ? 1 : 0; 217 $IgnoreGaps = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0; 218 219 # Setup frequency bin values... 220 $NumOfBins = 4; 221 @BinRange = (); 222 if ($Options{frequencybins} =~ /\,/) { 223 my($BinValue, @SpecifiedBinRange); 224 @SpecifiedBinRange = split /\,/, $Options{frequencybins}; 225 if (@SpecifiedBinRange < 2) { 226 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n"; 227 } 228 for $BinValue (@SpecifiedBinRange) { 229 if (!IsNumerical($BinValue)) { 230 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n"; 231 } 232 } 233 my($Index1, $Index2); 234 for $Index1 (0 .. $#SpecifiedBinRange) { 235 for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) { 236 if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) { 237 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n"; 238 } 239 } 240 } 241 push @BinRange, @SpecifiedBinRange; 242 } 243 else { 244 $NumOfBins = $Options{frequencybins}; 245 if (!IsPositiveInteger($NumOfBins)) { 246 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n"; 247 } 248 } 249 } 250 251 # Retrieve information about sequence files... 252 sub RetrieveSequenceFilesInfo { 253 my($Index, $SequenceFile, $FileSupported, $FileFormat, $ModifiedTimeString, $ModifiedDateString); 254 255 @SequenceFilesOkay = (); 256 @SequenceFilesFormat = (); 257 @SequenceFilesSize = (); 258 @SequenceFilesLastModified = (); 259 260 FILELIST: for $Index (0 .. $#SequenceFilesList) { 261 $SequenceFile = $SequenceFilesList[$Index]; 262 if (! open SEQUENCEFILE, "$SequenceFile") { 263 warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n"; 264 next FILELIST; 265 } 266 close SEQUENCEFILE; 267 268 $SequenceFilesOkay[$Index] = 0; 269 $SequenceFilesFormat[$Index] = 'NotSupported'; 270 $SequenceFilesSize[$Index] = 0; 271 $SequenceFilesLastModified[$Index] = ''; 272 273 ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile); 274 if (!$FileSupported) { 275 warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n"; 276 next FILELIST; 277 } 278 $SequenceFilesOkay[$Index] = 1; 279 $SequenceFilesFormat[$Index] = $FileFormat; 280 $SequenceFilesSize[$Index] = FileSize($SequenceFile); 281 ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SequenceFile); 282 $SequenceFilesLastModified[$Index] = "$ModifiedTimeString; $ModifiedDateString"; 283 } 284 } 285 286 287 # Setup script usage and retrieve command line arguments specified using various options... 288 sub SetupScriptUsage { 289 290 # Retrieve all the options... 291 %Options = (); 292 $Options{detail} = 1; 293 $Options{ignoregaps} = 'no'; 294 $Options{frequencybins} = 10; 295 296 if (!GetOptions(\%Options, "all|a", "count|c", "detail|d=i", "frequency|f", "frequencybins=s", "help|h", "ignoregaps|i=s", "longest|l", "shortest|s", "sequencelengths", "workingdir|w=s")) { 297 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 298 } 299 if ($Options{workingdir}) { 300 if (! -d $Options{workingdir}) { 301 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 302 } 303 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 304 } 305 if (!IsPositiveInteger($Options{detail})) { 306 die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n"; 307 } 308 if ($Options{ignoregaps} !~ /^(yes|no)$/i) { 309 die "Error: The value specified, $Options{ignoregaps}, for option \"-i --IgnoreGaps\" is not valid. Allowed values: yes or no\n"; 310 } 311 } 312