MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: ExtractFromSequenceFiles.pl,v $
   4 # $Date: 2008/01/30 21:44:46 $
   5 # $Revision: 1.10 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileUtil;
  37 use TextUtil;
  38 use SequenceFileUtil;
  39 
  40 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  41 
  42 # Autoflush STDOUT
  43 $| = 1;
  44 
  45 # Starting message...
  46 $ScriptName = basename($0);
  47 print "\n$ScriptName: Starting...\n\n";
  48 $StartTime = new Benchmark;
  49 
  50 # Setup script usage message...
  51 SetupScriptUsage();
  52 if ($Options{help} || @ARGV < 1) {
  53   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  54 }
  55 
  56 # Expand wild card file names...
  57 my(@SequenceFilesList);
  58 @SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir");
  59 
  60 # Process options...
  61 my(%OptionsInfo);
  62 print "Processing options...\n";
  63 ProcessOptions();
  64 
  65 # Set up information about input files...
  66 print "Checking input sequence file(s)...\n";
  67 my(%SequenceFilesInfo);
  68 RetrieveSequenceFilesInfo();
  69 
  70 # Process input files..
  71 my($FileIndex, $SequenceFile, $FileProcessingMsg);
  72 $FileProcessingMsg = "Processing file";
  73 if (@SequenceFilesList > 1) {
  74   print "Processing sequence files...\n";
  75   $FileProcessingMsg = "\n$FileProcessingMsg";
  76 }
  77 
  78 for $FileIndex (0 .. $#SequenceFilesList) {
  79   if ($SequenceFilesInfo{FilesOkay}[$FileIndex]) {
  80     $SequenceFile = $SequenceFilesList[$FileIndex];
  81     print "$FileProcessingMsg $SequenceFile...\n";
  82     ExtractFromSequenceFiles($FileIndex);
  83   }
  84 }
  85 
  86 print "$ScriptName:Done...\n\n";
  87 
  88 $EndTime = new Benchmark;
  89 $TotalTime = timediff ($EndTime, $StartTime);
  90 print "Total time: ", timestr($TotalTime), "\n";
  91 
  92 ###############################################################################
  93 
  94 # Extract from sequence files...
  95 sub ExtractFromSequenceFiles {
  96   my($FileIndex) = @_;
  97   my($OutSequenceFile, $SequenceFile, $SequenceDataRef, $SpecifiedSequenceDataRef);
  98 
  99   # Read sequence file...
 100   $SequenceFile = $SequenceFilesList[$FileIndex];
 101   open SEQUENCEFILE, "$SequenceFile" or die "Error: Can't open $SequenceFile: $! \n";
 102   $SequenceDataRef = ReadSequenceFile($SequenceFile);
 103   close SEQUENCEFILE;
 104 
 105   $OutSequenceFile = $SequenceFilesInfo{OutFile}[$FileIndex];
 106   print "Generating sequence file $OutSequenceFile...\n";
 107 
 108   # Retrieve sequence data for specified sequences...
 109   $SpecifiedSequenceDataRef = GetSpecifiedSequenceData($SequenceDataRef);
 110 
 111   # Handle gaps...
 112   if ($OptionsInfo{IgnoreGaps}) {
 113     if (@{$SpecifiedSequenceDataRef->{IDs}} > 1) {
 114       if (AreSequenceLengthsIdentical($SpecifiedSequenceDataRef)) {
 115 	$SpecifiedSequenceDataRef = RemoveSequenceAlignmentGapColumns($SpecifiedSequenceDataRef);
 116       }
 117     }
 118     else {
 119       # Remove the gaps from the sequence...
 120       my($ID, $Sequence);
 121       $ID = $SpecifiedSequenceDataRef->{IDs}[0];
 122       $Sequence = $SpecifiedSequenceDataRef->{Sequence}{$ID};
 123       $SpecifiedSequenceDataRef->{Sequence}{$ID} = RemoveSequenceGaps($Sequence);
 124     }
 125   }
 126 
 127   # Write out the file...
 128   WritePearsonFastaSequenceFile($OutSequenceFile, $SpecifiedSequenceDataRef, $OptionsInfo{MaxSequenceLength});
 129 }
 130 
 131 # Get specified sequence data...
 132 sub GetSpecifiedSequenceData {
 133   my($SequenceDataRef) = @_;
 134 
 135   if ($OptionsInfo{Mode} =~ /^SequenceID$/i) {
 136     return GetDataBySequenceIDs($SequenceDataRef);
 137   }
 138   elsif ($Options{mode} =~ /^SequenceNum$/i) {
 139     return GetDataBySequenceNums($SequenceDataRef);
 140   }
 141   elsif ($Options{mode} =~ /^SequenceNumRange$/i) {
 142     return GetDataBySequenceNumRange($SequenceDataRef);
 143   }
 144   else {
 145     return undef;
 146   }
 147 }
 148 
 149 # Get specified sequence data...
 150 sub GetDataBySequenceIDs {
 151   my($SequenceDataRef) = @_;
 152   my($ID, $SequenceCount, $IDMatched, $SpecifiedID, %SpecifiedSequenceDataMap);
 153 
 154   # Go over sequences and collect sequences for writing out a new sequence file...
 155   %SpecifiedSequenceDataMap = ();
 156   @{$SpecifiedSequenceDataMap{IDs}} = ();
 157   %{$SpecifiedSequenceDataMap{Description}} = ();
 158   %{$SpecifiedSequenceDataMap{Sequence}} = ();
 159 
 160   $SequenceCount = 0;
 161   ID: for $ID (@{$SequenceDataRef->{IDs}}) {
 162     if ($OptionsInfo{MatchExactSequenceIDs}) {
 163       if (!exists $OptionsInfo{SpecifiedSequenceIDsMap}{lc($ID)}) {
 164 	next ID;
 165       }
 166       if ($SequenceCount >= scalar @{$OptionsInfo{SpecifiedSequenceIDs}}) {
 167 	last ID;
 168       }
 169       $SequenceCount++;
 170     }
 171     else {
 172       # Does this ID contains specified ID as substring...
 173       $IDMatched = 0;
 174       SPECIFIEDID: for $SpecifiedID (@{$OptionsInfo{SpecifiedSequenceIDs}}) {
 175 	if ($ID =~ /$SpecifiedID/i) {
 176 	  $IDMatched = 1;
 177 	  last SPECIFIEDID;
 178 	}
 179       }
 180       if (!$IDMatched) {
 181 	next ID;
 182       }
 183       $SequenceCount++;
 184     }
 185     # Collect sequence data...
 186     push @{$SpecifiedSequenceDataMap{IDs}}, $ID;
 187     $SpecifiedSequenceDataMap{Description}{$ID} = $SequenceDataRef->{Description}{$ID};
 188     $SpecifiedSequenceDataMap{Sequence}{$ID} = $SequenceDataRef->{Sequence}{$ID};
 189   }
 190 
 191   return \%SpecifiedSequenceDataMap;
 192 }
 193 
 194 # Get specified sequence data...
 195 sub GetDataBySequenceNums {
 196   my($SequenceDataRef) = @_;
 197   my($ID, $SequenceNum, $SequenceCount, %SpecifiedSequenceDataMap);
 198 
 199   # Go over sequences and collect sequences for writing out a new sequence file...
 200   %SpecifiedSequenceDataMap = ();
 201   @{$SpecifiedSequenceDataMap{IDs}} = ();
 202   %{$SpecifiedSequenceDataMap{Description}} = ();
 203   %{$SpecifiedSequenceDataMap{Sequence}} = ();
 204 
 205   $SequenceNum = 0;
 206   $SequenceCount = 0;
 207   ID: for $ID (@{$SequenceDataRef->{IDs}}) {
 208     $SequenceNum++;
 209     if (!exists $OptionsInfo{SpecifiedSequenceIDsMap}{$SequenceNum}) {
 210       next ID;
 211     }
 212     if ($SequenceCount >= scalar @{$OptionsInfo{SpecifiedSequenceIDs}}) {
 213       last ID;
 214     }
 215     $SequenceCount++;
 216 
 217     # Collect sequence data...
 218     push @{$SpecifiedSequenceDataMap{IDs}}, $ID;
 219     $SpecifiedSequenceDataMap{Description}{$ID} = $SequenceDataRef->{Description}{$ID};
 220     $SpecifiedSequenceDataMap{Sequence}{$ID} = $SequenceDataRef->{Sequence}{$ID};
 221   }
 222 
 223   return \%SpecifiedSequenceDataMap;
 224 }
 225 
 226 # Get specified sequence data...
 227 sub GetDataBySequenceNumRange {
 228   my($SequenceDataRef) = @_;
 229   my($ID, $SequenceNum, $SequenceCount, %SpecifiedSequenceDataMap);
 230 
 231   # Go over sequences and collect sequences for writing out a new sequence file...
 232   %SpecifiedSequenceDataMap = ();
 233   @{$SpecifiedSequenceDataMap{IDs}} = ();
 234   %{$SpecifiedSequenceDataMap{Description}} = ();
 235   %{$SpecifiedSequenceDataMap{Sequence}} = ();
 236 
 237   $SequenceNum = 0;
 238   $SequenceCount = 0;
 239   ID: for $ID (@{$SequenceDataRef->{IDs}}) {
 240     $SequenceNum++;
 241 
 242     if (!($SequenceNum >= $OptionsInfo{SpecifiedSequenceIDs}[0] && $SequenceNum <= $OptionsInfo{SpecifiedSequenceIDs}[1])) {
 243       next ID;
 244     }
 245     if ($SequenceNum > $OptionsInfo{SpecifiedSequenceIDs}[1]) {
 246       last ID;
 247     }
 248     $SequenceCount++;
 249     # Collect sequence data...
 250     push @{$SpecifiedSequenceDataMap{IDs}}, $ID;
 251     $SpecifiedSequenceDataMap{Description}{$ID} = $SequenceDataRef->{Description}{$ID};
 252     $SpecifiedSequenceDataMap{Sequence}{$ID} = $SequenceDataRef->{Sequence}{$ID};
 253   }
 254 
 255   return \%SpecifiedSequenceDataMap;
 256 }
 257 
 258 
 259 # Process option values...
 260 sub ProcessOptions {
 261   %OptionsInfo = ();
 262 
 263   # Miscellaneous options...
 264   $OptionsInfo{IgnoreGaps} = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0;
 265 
 266   $OptionsInfo{Mode} = $Options{mode};
 267   $OptionsInfo{MatchExactSequenceIDs} = $Options{sequenceidmatch} =~ /Exact/i ? 1 :0;
 268 
 269   # Check specified sequences value...
 270   $OptionsInfo{SpecifiedSequences} = $Options{sequences};
 271   @{$OptionsInfo{SpecifiedSequenceIDs}} = ();
 272   %{$OptionsInfo{SpecifiedSequenceIDsMap}} = ();
 273 
 274   my(@SpecifiedSequenceIDs) = ();
 275   if ($Options{mode} =~ /^SequenceID$/i) {
 276     if (!$Options{sequences}) {
 277       die "Error: No value specified for option \"-s, --Sequences\" during \"SequenceID\" of \"-m, --mode\" option\n";
 278     }
 279     @SpecifiedSequenceIDs = split /\,/, $Options{sequences};
 280   }
 281   elsif ($Options{mode} =~ /^SequenceNum$/i) {
 282     if ($Options{sequences}) {
 283       @SpecifiedSequenceIDs = split /\,/, $Options{sequences};
 284       my($SequenceNum);
 285       for $SequenceNum (@SpecifiedSequenceIDs) {
 286 	if (!IsPositiveInteger($SequenceNum)) {
 287 	  die "Error: The value specified, $SequenceNum, in \"$Options{sequences}\" for option \"-s, --Sequences\" is not valid: Valid values: > 0\n";
 288 	}
 289       }
 290     }
 291     else {
 292       push @SpecifiedSequenceIDs, "1";
 293     }
 294   }
 295   elsif ($Options{mode} =~ /^SequenceNumRange$/i) {
 296     if (!$Options{sequences}) {
 297       die "Error: No value specified for option \"-s, --Sequences\" during \"SequenceNumRange\" of \"-m, --mode\" option\n";
 298     }
 299     @SpecifiedSequenceIDs = split /\,/, $Options{sequences};
 300     if (@SpecifiedSequenceIDs != 2) {
 301       die "Error: The number of values", scalar @SpecifiedSequenceIDs, " specified, $Options{sequences}, for option \"-s, --Sequences\" are not valid. Number of values must be 2 to indicate starting and ending sequence number.\n";
 302     }
 303     my($SequenceNum);
 304     for $SequenceNum (@SpecifiedSequenceIDs) {
 305       if (!IsPositiveInteger($SequenceNum)) {
 306 	die "Error: The value specified, $SequenceNum, in \"$Options{sequences}\" for option \"-s, --Sequences\" is not valid: Valid values: > 0\n";
 307       }
 308     }
 309     if ($SpecifiedSequenceIDs[0] > $SpecifiedSequenceIDs[1]) {
 310       die "Error: The value specified \"$Options{sequences}\" for option \"-s, --Sequences\" are not valid: Starting sequence number $SpecifiedSequenceIDs[0] must be smaller than ending sequence number $SpecifiedSequenceIDs[1]\n";
 311     }
 312   }
 313   push @{$OptionsInfo{SpecifiedSequenceIDs}}, @SpecifiedSequenceIDs;
 314   my($SequenceID);
 315   for $SequenceID (@SpecifiedSequenceIDs) {
 316     if ($Options{mode} =~ /^SequenceID$/i) {
 317       $OptionsInfo{SpecifiedSequenceIDsMap}{lc($SequenceID)} = $SequenceID;
 318     }
 319     else {
 320       $OptionsInfo{SpecifiedSequenceIDsMap}{$SequenceID} = $SequenceID;
 321     }
 322   }
 323 
 324   $OptionsInfo{MaxSequenceLength} = $Options{sequencelength};
 325   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 326   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 327 }
 328 
 329 # Retrieve information about sequence files...
 330 sub RetrieveSequenceFilesInfo {
 331   my($Index, $SequenceFile, $FileSupported, $FileFormat, $SequenceCount, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFileExt, $OutFileMode, $SequenceDataRef);
 332 
 333   %SequenceFilesInfo = ();
 334   @{$SequenceFilesInfo{FilesOkay}} = ();
 335   @{$SequenceFilesInfo{OutFileRoot}} = ();
 336   @{$SequenceFilesInfo{OutFileExt}} = ();
 337   @{$SequenceFilesInfo{OutFile}} = ();
 338   @{$SequenceFilesInfo{Format}} = ();
 339   @{$SequenceFilesInfo{SequenceCount}} = ();
 340 
 341   FILELIST: for $Index (0 .. $#SequenceFilesList) {
 342     $SequenceFile = $SequenceFilesList[$Index];
 343     $SequenceFilesInfo{FilesOkay}[$Index] = 0;
 344     $SequenceFilesInfo{OutFileRoot}[$Index] = '';
 345     $SequenceFilesInfo{OutFileExt}[$Index] = '';
 346     $SequenceFilesInfo{OutFile}[$Index] = '';
 347     $SequenceFilesInfo{Format}[$Index] = 'NotSupported';
 348     $SequenceFilesInfo{SequenceCount}[$Index] = 0;
 349 
 350     if (! open SEQUENCEFILE, "$SequenceFile") {
 351       warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n";
 352       next FILELIST;
 353     }
 354     close SEQUENCEFILE;
 355 
 356     ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile);
 357     if (!$FileSupported) {
 358       warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n";
 359       next FILELIST;
 360     }
 361     $SequenceDataRef = ReadSequenceFile($SequenceFile);
 362 
 363     $SequenceCount = $SequenceDataRef->{Count};
 364     if (!$SequenceCount) {
 365       warn "Warning: Ignoring file $SequenceFile: Sequence data is missing.\n";
 366       next FILELIST;
 367     }
 368 
 369     # Setup output file names...
 370     $FileDir = ""; $FileName = ""; $FileExt = "";
 371     ($FileDir, $FileName, $FileExt) = ParseFileName($SequenceFile);
 372     $OutFileExt = 'fasta';
 373     if ($OptionsInfo{OutFileRoot} && (@SequenceFilesList == 1)) {
 374       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 375       if ($RootFileName && $RootFileExt) {
 376 	$FileName = $RootFileName;
 377       }
 378       else {
 379 	$FileName = $OptionsInfo{OutFileRoot};
 380       }
 381       $OutFileRoot = $FileName;
 382     }
 383     else {
 384       $OutFileRoot = $FileName;
 385     }
 386     MODE: {
 387 	if ($OptionsInfo{Mode} =~ /^SequenceID$/i) { $OutFileMode = 'SequenceID'; last MODE;}
 388 	if ($OptionsInfo{Mode} =~ /^SequenceNum$/i) { $OutFileMode = 'SequenceNum'; last MODE;}
 389 	if ($OptionsInfo{Mode} =~ /^SequenceNumRange$/i) { $OutFileMode = 'SequenceNumRange'; last MODE;}
 390 	$OutFileMode = '';
 391     }
 392     if (!$OptionsInfo{OverwriteFiles}) {
 393       if (-e "${OutFileRoot}${OutFileMode}.${OutFileExt}") {
 394 	warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}${OutFileMode}.${OutFileExt} already exists\n";
 395 	next FILELIST;
 396       }
 397     }
 398 
 399     $SequenceFilesInfo{FilesOkay}[$Index] = 1;
 400     $SequenceFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 401     $SequenceFilesInfo{OutFileExt}[$Index] = $OutFileExt;
 402     $SequenceFilesInfo{OutFile}[$Index] = "${OutFileRoot}${OutFileMode}.${OutFileExt}";
 403 
 404     $SequenceFilesInfo{Format}[$Index] = $FileFormat;
 405     $SequenceFilesInfo{SequenceCount}[$Index] = $SequenceCount;
 406   }
 407 }
 408 
 409 # Setup script usage  and retrieve command line arguments specified using various options...
 410 sub SetupScriptUsage {
 411 
 412   # Retrieve all the options...
 413   %Options = ();
 414   $Options{ignoregaps} = 'Yes';
 415   $Options{mode} = 'SequenceNum';
 416   $Options{sequenceidmatch} = 'Relaxed';
 417   $Options{sequencelength} = 80;
 418 
 419   if (!GetOptions(\%Options, "help|h", "ignoregaps|i=s", "mode|m=s", "overwrite|o", "root|r=s", "sequences|s=s", "sequenceidmatch=s", "sequencelength=i", "workingdir|w=s")) {
 420     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 421   }
 422   if ($Options{workingdir}) {
 423     if (! -d $Options{workingdir}) {
 424       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 425     }
 426     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 427   }
 428   if ($Options{ignoregaps} !~ /^(yes|no)$/i) {
 429     die "Error: The value specified, $Options{ignoregaps}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 430   }
 431   if ($Options{mode} !~ /^(SequenceID|SequenceNum|SequenceNumRange)$/i) {
 432     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: SequenceID, SequenceNum, or SequenceNumRange\n";
 433   }
 434   if ($Options{sequenceidmatch} !~ /^(Exact|Relaxed)$/i) {
 435     die "Error: The value specified, $Options{sequenceidmatch}, for option \"--SequenceIDMatch\" is not valid. Allowed values: Exact or Relaxed\n";
 436   }
 437   if (!IsPositiveInteger($Options{sequencelength})) {
 438     die "Error: The value specified, $Options{sequencelength}, for option \"--SequenceLength\" is not valid. Allowed values: >0\n";
 439   }
 440 }
 441