MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: InfoAminoAcids.pl,v $
   4 # $Date: 2008/01/30 21:44:46 $
   5 # $Revision: 1.12 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileUtil;
  37 use TextUtil;
  38 use AminoAcids;
  39 
  40 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  41 
  42 # Autoflush STDOUT
  43 $| = 1;
  44 
  45 # Starting message...
  46 $ScriptName = basename($0);
  47 print "\n$ScriptName: Starting...\n\n";
  48 $StartTime = new Benchmark;
  49 
  50 # Get the options and setup script...
  51 SetupScriptUsage();
  52 if ($Options{help}) {
  53   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  54 }
  55 
  56 my($OutDelim, $OutQuote, $AminoAcidRowsOutput, $FileOutput, $Precision, $OutFileName, @SpecifiedAminoAcidIDs, @SpecifiedProperies,);
  57 ProcessOptions();
  58 
  59 ListAminoAcidProperties();
  60 print "$ScriptName:Done...\n\n";
  61 
  62 $EndTime = new Benchmark;
  63 $TotalTime = timediff ($EndTime, $StartTime);
  64 print "Total time: ", timestr($TotalTime), "\n";
  65 
  66 ###############################################################################
  67 
  68 # Get propery names from categories...
  69 sub GetPropertyNamesFromCategories {
  70   my($CategoryName) = @_;
  71   my(@PropertyNames);
  72 
  73   @PropertyNames = ();
  74   if ($CategoryName =~ /^Basic$/i) {
  75     @PropertyNames = ('ThreeLetterCode', 'OneLetterCode', 'AminoAcid', 'DNACodons', 'RNACodons', 'ChemicalFormula','MolecularWeight', 'LinearStructure', 'LinearStructureAtpH7.4');
  76   } elsif ($CategoryName =~ /^BasicPlus$/i) {
  77     @PropertyNames = ('ThreeLetterCode', 'OneLetterCode', 'AminoAcid', 'DNACodons', 'RNACodons', 'AcidicBasic', 'PolarNonpolar', 'Charged', 'Aromatic', 'HydrophobicHydophilic', 'IsoelectricPoint', 'pKCOOH', 'pKNH3+', 'ChemicalFormula', 'MolecularWeight', 'ExactMass', 'ChemicalFormulaMinusH2O', 'MolecularWeightMinusH2O(18.01524)', 'ExactMassMinusH2O(18.01056)','LinearStructure', 'LinearStructureAtpH7.4');
  78   } elsif ($CategoryName =~ /^BasicAndHydrophobicity$/i) {
  79     @PropertyNames = ('ThreeLetterCode', 'OneLetterCode', 'AminoAcid', 'DNACodons', 'RNACodons', 'ChemicalFormula', 'MolecularWeight', 'LinearStructure', 'LinearStructureAtpH7.4', 'HydrophobicityEisenbergAndOthers', 'HydrophobicityHoppAndWoods', 'HydrophobicityJanin', 'HydrophobicityKyteAndDoolittle', 'HydrophobicityRoseAndOthers', 'HydrophobicityWolfendenAndOthers');
  80   } elsif ($CategoryName =~ /^BasicAndHydrophobicityPlus$/i) {
  81     @PropertyNames = ('ThreeLetterCode', 'OneLetterCode', 'AminoAcid', 'DNACodons', 'RNACodons', 'ChemicalFormula', 'MolecularWeight', 'LinearStructure', 'LinearStructureAtpH7.4', 'HydrophobicityAbrahamAndLeo', 'HydrophobicityBlack', 'HydrophobicityBullAndBreese', 'HydrophobicityChothia', 'HydrophobicityEisenbergAndOthers', 'HydrophobicityFauchereAndOthers', 'HydrophobicityGuy', 'HydrophobicityHPLCAtpH3.4Cowan', 'HydrophobicityHPLCAtpH7.5Cowan', 'HydrophobicityHPLCParkerAndOthers', 'HydrophobicityHPLCWilsonAndOthers', 'HydrophobicityHoppAndWoods', 'HydrophobicityJanin', 'HydrophobicityKyteAndDoolittle', 'HydrophobicityManavalanAndOthers', 'HydrophobicityMiyazawaAndOthers', 'HydrophobicityOMHSweetAndOthers', 'HydrophobicityRaoAndArgos', 'HydrophobicityRfMobility', 'HydrophobicityRoseAndOthers', 'HydrophobicityRoseman', 'HydrophobicityWellingAndOthers', 'HydrophobicityWolfendenAndOthers');
  82   }
  83 
  84   return @PropertyNames;
  85 }
  86 
  87 # List data for an amino acid...
  88 sub ListAminoAcidData {
  89   my($DataLabelRef, $DataValueRef) = @_;
  90   my($Index, $Line, $Value);
  91 
  92   if ($AminoAcidRowsOutput) {
  93     $Line = '';
  94     # Format data...
  95     if ($OutQuote || $Options{outdelim} !~ /^comma$/i) {
  96       $Line = JoinWords($DataValueRef, $OutDelim, $OutQuote);
  97     }
  98     else {
  99       # Always quote values containing commas...
 100       $Line = ($DataValueRef->[0] =~ /\,/) ? qq("$DataValueRef->[0]") : $DataValueRef->[0];
 101       for $Index (1 .. $#{$DataValueRef} ) {
 102 	$Value = $DataValueRef->[$Index];
 103 	if ($Value =~ /\,/) {
 104 	  $Value = qq("$Value");
 105 	}
 106 	$Line .= $OutDelim . $Value;
 107       }
 108     }
 109     if ($FileOutput) {
 110       print OUTFILE "$Line\n";
 111     }
 112     else {
 113       print "$Line\n";
 114     }
 115   }
 116   else {
 117     # Format and list data...
 118     $Line = '';
 119     for $Index (0 .. $#{$DataLabelRef} ) {
 120       $Line = $DataLabelRef->[$Index] . ' ' . $DataValueRef->[$Index];
 121       if ($FileOutput) {
 122 	print OUTFILE "$Line\n";
 123       }
 124       else {
 125 	print "$Line\n";
 126       }
 127     }
 128   }
 129 }
 130 
 131 # List data for an amino acid...
 132 sub ListHeaderRowData {
 133   my($DataLabelRef) = @_;
 134   my($Line);
 135 
 136   # Format data...
 137   $Line = JoinWords($DataLabelRef, $OutDelim, $OutQuote);
 138   $Line =~ s/\://g;
 139   # List data...
 140   if ($FileOutput) {
 141     print OUTFILE "$Line\n";
 142   }
 143   else {
 144     print "$Line\n";
 145   }
 146 }
 147 
 148 # List properties for amino acids...
 149 sub ListAminoAcidProperties {
 150   my($AminoAcidID, $AminoAcidDataRef, $PropertyName, $PropertyValue, @PropertyLabels, @PropertyValues);
 151 
 152   print "Listing information for amino acids(s)...\n";
 153 
 154   if ($FileOutput) {
 155     print "Generating file $OutFileName...\n";
 156     open OUTFILE, ">$OutFileName" or die "Couldn't open $OutFileName: $!\n";
 157   }
 158 
 159   # Setup property labels...
 160   @PropertyLabels = ();
 161   for $PropertyName (@SpecifiedProperies) {
 162     push @PropertyLabels, ("$PropertyName:");
 163   }
 164 
 165   if ($AminoAcidRowsOutput) {
 166     ListHeaderRowData(\@PropertyLabels);
 167   }
 168 
 169   # Go over specified properties...
 170   for $AminoAcidID (@SpecifiedAminoAcidIDs) {
 171     $AminoAcidDataRef = AminoAcids::GetAminoAcidPropertiesData($AminoAcidID);
 172 
 173     if (!$AminoAcidRowsOutput) {
 174       if ($FileOutput) {
 175 	print OUTFILE "\nListing properties for amino acid $AminoAcidID...\n\n";
 176       }
 177       else {
 178 	print "\nListing properties for amino acid $AminoAcidID...\n\n";
 179       }
 180     }
 181 
 182     # Collect data..
 183     @PropertyValues = ();
 184     for $PropertyName (@SpecifiedProperies) {
 185       $PropertyValue = $AminoAcidDataRef->{$PropertyName};
 186       if (IsFloat($PropertyValue)) {
 187 	$PropertyValue = sprintf("%.${Precision}f", $PropertyValue) + 0;
 188       }
 189       push @PropertyValues, $PropertyValue;
 190     }
 191     # List data...
 192     ListAminoAcidData(\@PropertyLabels, \@PropertyValues);
 193   }
 194   if ($FileOutput) {
 195     close OUTFILE;
 196   }
 197   print "\n";
 198 }
 199 
 200 # Process option values...
 201 sub ProcessOptions {
 202   $OutDelim = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
 203   $OutQuote = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
 204 
 205   $AminoAcidRowsOutput = ($Options{outputstyle} =~ /^AminoAcidRows$/i) ? 1 : 0;
 206   $FileOutput = ($Options{output} =~ /^File$/i) ? 1 : 0;
 207 
 208   $Precision = $Options{precision};
 209 
 210   my($AminoAcidID, @AminoAcidIDs);
 211 
 212   @SpecifiedAminoAcidIDs = ();
 213 
 214   # Set up Amino Acids IDs except for All mode...
 215   @AminoAcidIDs = ();
 216 
 217   if (@ARGV >= 1) {
 218     push @AminoAcidIDs, @ARGV;
 219   }
 220   else {
 221     # Setup mode specified default values...
 222     push @AminoAcidIDs, 'Ala';
 223   }
 224 
 225   # Generate list of amino acids...
 226   if (@ARGV == 1 && $ARGV[0] =~ /^All$/i) {
 227     push @SpecifiedAminoAcidIDs, AminoAcids::GetAminoAcids();
 228   }
 229   else {
 230     ID: for $AminoAcidID (@AminoAcidIDs) {
 231       if (AminoAcids::IsAminoAcid($AminoAcidID)) {
 232 	push @SpecifiedAminoAcidIDs, $AminoAcidID;
 233       }
 234       else {
 235 	warn "Ignoring amino acid ID, $AminoAcidID, specified using command line parameter option: Unknown amino acid ID...\n";
 236 	next ID;
 237       }
 238     }
 239   }
 240   SetupSpecifiedProperties();
 241 
 242   # Setup output file name...
 243   $OutFileName = '';
 244   if ($FileOutput) {
 245     my($OutFileRoot, $OutFileExt);
 246 
 247     $OutFileRoot = '';
 248     $OutFileExt = "csv";
 249     if ($Options{outdelim} =~ /^tab$/i) {
 250       $OutFileExt = "tsv";
 251     }
 252     if ($Options{root}) {
 253       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 254       if ($RootFileName && $RootFileExt) {
 255 	$OutFileRoot = $RootFileName;
 256       }
 257       else {
 258 	$OutFileRoot = $Options{root};
 259       }
 260     }
 261     else {
 262       $OutFileRoot = 'AminoAcidsInfo';
 263     }
 264     $OutFileName = $OutFileRoot . '.' . $OutFileExt;
 265     if (!$Options{overwrite}) {
 266       if (-e $OutFileName) {
 267 	die "Error: Output file, $OutFileName, already exists.\nUse \-o --overwrite\ option or specify a different name using \"-r --root\" option.\n";
 268       }
 269     }
 270   }
 271 }
 272 
 273 # Setup properties to list...
 274 sub SetupSpecifiedProperties {
 275   # Make sure appropriate properties/category names are specified...
 276   @SpecifiedProperies = ();
 277   if ($Options{properties} && ($Options{propertiesmode} =~ /^All$/i) ) {
 278     warn "Warning: Ignoring values specifed by \"-p --properties\" option: Not valid for All value of \"--propertiesmode\" option...\n";
 279   }
 280   if ($Options{propertiesmode} =~ /^All$/i) {
 281     if ($Options{propertieslisting} =~ /^Alphabetical$/i) {
 282       push @SpecifiedProperies, AminoAcids::GetAminoAcidPropertiesNames('Alphabetical');
 283     }
 284     else {
 285       push @SpecifiedProperies, AminoAcids::GetAminoAcidPropertiesNames();
 286     }
 287   }
 288   else {
 289     if ($Options{properties}) {
 290       if ($Options{propertiesmode} =~ /^Categories$/i) {
 291 	# Check category name...
 292 	if ($Options{properties} !~ /^(Basic|BasicPlus|BasicAndHydrophobicity|BasicAndHydrophobicityPlus)$/i) {
 293 	  die "Error: The value specified, $Options{properties}, for option \"-p --properties\" in conjunction with \"Categories\" value for option \"--propertiesmode\" is not valid. Allowed values: Basic, BasicPlus, BasicAndHydrophobicity, and BasicAndHydrophobicityPlus\n";
 294 	}
 295 	# Set propertynames...
 296 	push @SpecifiedProperies, GetPropertyNamesFromCategories($Options{properties});
 297       }
 298       else {
 299 	# Check property names..
 300 	my($Name, $PropertyName, @Names);
 301 	@Names = split /\,/, $Options{properties};
 302 	NAME: for $Name (@Names) {
 303 	  $PropertyName = RemoveLeadingAndTrailingWhiteSpaces($Name);
 304 	  if (AminoAcids::IsAminoAcidProperty($PropertyName)) {
 305 	    push @SpecifiedProperies, $PropertyName;
 306 	  }
 307 	  else {
 308 	    warn "Warning: Ignoring value, $Name, specifed by \"-p --properties\" option: Unknown property name...\n";
 309 	  }
 310 	}
 311 	if ($Options{propertieslisting} =~ /^Alphabetical$/i) {
 312 	  # ThreeLetterCode, OneLetterCode and AminoAcid are always listed first...
 313 	  # NaturalIsotopeData in the end...
 314 	  my($OneLetterCodePresent, $ThreeLetterCodePresent, $AminoAcidPresent,  @AlphabeticalProperties, %PropertiesMap);
 315 	  %PropertiesMap = ();
 316 	  @AlphabeticalProperties = ();
 317 	  $OneLetterCodePresent = 0; $ThreeLetterCodePresent = 0; $AminoAcidPresent = 0;
 318 	  NAME: for $Name (@SpecifiedProperies) {
 319 	    if ($Name =~ /^OneLetterCode$/i) {
 320 	      $OneLetterCodePresent = 1;
 321 	      next NAME;
 322 	    }
 323 	    if ($Name =~ /^ThreeLetterCode$/i) {
 324 	      $ThreeLetterCodePresent = 1;
 325 	      next NAME;
 326 	    }
 327 	    if ($Name =~ /^AminoAcid$/i) {
 328 	      $AminoAcidPresent = 1;
 329 	      next NAME;
 330 	    }
 331 	    $PropertiesMap{$Name} = $Name;
 332 	  }
 333 	  # Setup the alphabetical list...
 334 	  if ($ThreeLetterCodePresent) {
 335 	    push @AlphabeticalProperties, 'ThreeLetterCode';
 336 	  }
 337 	  if ($OneLetterCodePresent) {
 338 	    push @AlphabeticalProperties, 'OneLetterCode';
 339 	  }
 340 	  if ($AminoAcidPresent) {
 341 	    push @AlphabeticalProperties, 'AminoAcid';
 342 	  }
 343 	  for $Name (sort keys %PropertiesMap) {
 344 	    push @AlphabeticalProperties, $Name;
 345 	  }
 346 	  @SpecifiedProperies = ();
 347 	  push @SpecifiedProperies, @AlphabeticalProperties;
 348 	}
 349       }
 350     }
 351     else {
 352       # Set default value...
 353       push @SpecifiedProperies, GetPropertyNamesFromCategories('Basic');
 354     }
 355   }
 356 }
 357 
 358 # Setup script usage  and retrieve command line arguments specified using various options...
 359 sub SetupScriptUsage {
 360 
 361   # Retrieve all the options...
 362   %Options = ();
 363   $Options{outdelim} = "comma";
 364   $Options{output} = "STDOUT";
 365   $Options{outputstyle} = "AminoAcidBlock";
 366   $Options{precision} = 4;
 367   $Options{propertiesmode} = "Categories";
 368   $Options{propertieslisting} = "ByGroup";
 369   $Options{quote} = "yes";
 370 
 371   if (!GetOptions(\%Options, "help|h", "outdelim=s", "output=s", "outputstyle=s", "overwrite|o", "precision=i", "properties|p=s", "propertieslisting=s", "propertiesmode=s", "quote|q=s", "root|r=s", "workingdir|w=s")) {
 372     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 373   }
 374   if ($Options{workingdir}) {
 375     if (! -d $Options{workingdir}) {
 376       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 377     }
 378     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 379   }
 380   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 381     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 382   }
 383   if ($Options{output} !~ /^(STDOUT|File)$/i) {
 384     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: STDOUT or File\n";
 385   }
 386   if ($Options{outputstyle} !~ /^(AminoAcidBlock|AminoAcidRows)$/i) {
 387     die "Error: The value specified, $Options{outputstyle}, for option \"--outputstyle\" is not valid. Allowed values: AminoAcidBlock or AminoAcidRows\n";
 388   }
 389   if (!IsPositiveInteger($Options{precision})) {
 390     die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";
 391   }
 392   if ($Options{propertiesmode} !~ /^(Categories|Names|All)$/i) {
 393     die "Error: The value specified, $Options{propertiesmode}, for option \"--propertiesmode\" is not valid. Allowed values: Categories, Names, or All\n";
 394   }
 395   if ($Options{propertieslisting} !~ /^(ByGroup|Alphabetical)$/i) {
 396     die "Error: The value specified, $Options{propertieslisting}, for option \"--propertieslisting\" is not valid. Allowed values: ByGroup, or Alphabetical\n";
 397   }
 398   if ($Options{quote} !~ /^(yes|no)$/i) {
 399     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 400   }
 401 }
 402