MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: InfoNucleicAcids.pl,v $
   4 # $Date: 2008/01/30 21:44:46 $
   5 # $Revision: 1.12 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileUtil;
  37 use TextUtil;
  38 use NucleicAcids;
  39 
  # Script-level state. These lexicals are declared at file scope so that the
  # subroutines defined further down in this file can read and assign them.
  my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

  # Autoflush STDOUT
  $| = 1;

  # Starting message...
  $ScriptName = basename($0);
  print "\n$ScriptName: Starting...\n\n";
  # Direct method call instead of the ambiguous indirect object form "new Benchmark"...
  $StartTime = Benchmark->new;

  # Get the options and setup script...
  SetupScriptUsage();
  if ($Options{help}) {
    # The usage text is emitted through die, so the script stops here...
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  }

  # Values derived from the command line options by ProcessOptions()...
  my($OutDelim, $OutQuote, $NucleicAcidRowsOutput, $FileOutput, $Precision, $OutFileName, @SpecifiedNucleicAcidIDs, @SpecifiedProperies,);
  ProcessOptions();

  ListNucleicAcidProperties();
  print "$ScriptName:Done...\n\n";

  # Report the elapsed time...
  $EndTime = Benchmark->new;
  $TotalTime = timediff($EndTime, $StartTime);
  print "Total time: ", timestr($TotalTime), "\n";
  65 
  66 ###############################################################################
  67 
  # Get property names for a category...
  #
  # Arguments:
  #   $CategoryName - category name; "Basic" and "BasicPlus" are recognized,
  #                   compared without regard to case.
  #
  # Returns the list of property names in the category. "BasicPlus" is the
  # "Basic" list followed by ExactMass and ElementalComposition. Any other
  # category name gives an empty list.
  sub GetPropertyNamesFromCategories {
    my($CategoryName) = @_;

    my $IsBasic = ($CategoryName =~ /^Basic$/i) ? 1 : 0;
    my $IsBasicPlus = (!$IsBasic && $CategoryName =~ /^BasicPlus$/i) ? 1 : 0;

    my @CategoryProperties = ();

    # Both categories start with the same six properties...
    if ($IsBasic || $IsBasicPlus) {
      push @CategoryProperties, 'Code', 'OtherCodes', 'Name', 'Type', 'MolecularFormula', 'MolecularWeight';
    }
    if ($IsBasicPlus) {
      push @CategoryProperties, 'ExactMass', 'ElementalComposition';
    }

    return @CategoryProperties;
  }
  82 
  # List data for a nucleic acid...
  #
  # Arguments:
  #   $DataLabelRef - reference to an array of property labels.
  #   $DataValueRef - reference to an array of property values; the value at
  #                   index i belongs to the label at index i.
  #
  # In rows output style the values are written as a single delimited line; the
  # labels are not used. Otherwise one "label value" line is written per label.
  # Lines go to OUTFILE when $FileOutput is set; otherwise to the default output
  # handle.
  #
  # Undefined values are written as empty strings, so an empty or sparse value
  # list doesn't trigger "uninitialized value" warnings under -w.
  sub ListNucleicAcidData {
    my($DataLabelRef, $DataValueRef) = @_;

    # Pick the destination once instead of at every print...
    my $WriteLine = $FileOutput
      ? sub { print OUTFILE "$_[0]\n"; }
      : sub { print "$_[0]\n"; };

    if ($NucleicAcidRowsOutput) {
      my($Line);

      # Format data...
      if ($OutQuote || $Options{outdelim} !~ /^comma$/i) {
        $Line = JoinWords($DataValueRef, $OutDelim, $OutQuote);
      }
      else {
        # Unquoted comma delimited output: always quote values containing commas...
        my(@Fields);
        for my $Value (@{$DataValueRef}) {
          my $Field = defined($Value) ? $Value : '';
          if ($Field =~ /\,/) {
            $Field = qq("$Field");
          }
          push @Fields, $Field;
        }
        $Line = join($OutDelim, @Fields);
      }
      $WriteLine->($Line);
      return;
    }

    # Format and list data...
    for my $Index (0 .. $#{$DataLabelRef}) {
      my $Value = defined($DataValueRef->[$Index]) ? $DataValueRef->[$Index] : '';
      $WriteLine->($DataLabelRef->[$Index] . ' ' . $Value);
    }
    return;
  }
 126 
 # List the header row for rows style output...
 #
 # Arguments:
 #   $DataLabelRef - reference to an array of column labels.
 #
 # The labels are joined by JoinWords using $OutDelim and $OutQuote, every
 # colon is removed from the joined line, and the line is written to OUTFILE
 # when $FileOutput is set; otherwise to the default output handle.
 sub ListHeaderRowData {
   my($DataLabelRef) = @_;

   # Format data...
   my $HeaderLine = JoinWords($DataLabelRef, $OutDelim, $OutQuote);
   # Delete all colons...
   $HeaderLine =~ tr/://d;

   # List data...
   if (!$FileOutput) {
     print "$HeaderLine\n";
   }
   else {
     print OUTFILE "$HeaderLine\n";
   }
 }
 143 
 # List properties for nucleic acids...
 #
 # Takes no arguments. For each ID in @SpecifiedNucleicAcidIDs, retrieves the
 # property data using NucleicAcids::GetNucleicAcidPropertiesData, collects the
 # values of the properties named in @SpecifiedProperies and lists them using
 # ListNucleicAcidData. In rows output style a header row is listed first.
 #
 # When $FileOutput is set, $OutFileName is opened for writing as OUTFILE and
 # closed at the end; the script dies if the file can't be opened or closed.
 sub ListNucleicAcidProperties {
   my(@PropertyLabels, @PropertyValues);

   print "Listing information for nucleic acids(s)...\n";

   if ($FileOutput) {
     print "Generating file $OutFileName...\n";
     # Three argument open: the file name is never interpreted as part of the mode...
     open(OUTFILE, '>', $OutFileName) or die "Couldn't open $OutFileName: $!\n";
   }

   # Setup property labels...
   @PropertyLabels = map("$_:", @SpecifiedProperies);

   if ($NucleicAcidRowsOutput) {
     ListHeaderRowData(\@PropertyLabels);
   }

   # Go over specified nucleic acids...
   for my $NucleicAcidID (@SpecifiedNucleicAcidIDs) {
     my $NucleicAcidDataRef = NucleicAcids::GetNucleicAcidPropertiesData($NucleicAcidID);

     # Block style output gets a heading for each nucleic acid...
     if (!$NucleicAcidRowsOutput) {
       my $Heading = "\nListing properties for nucleic acid $NucleicAcidID...\n\n";
       if ($FileOutput) {
         print OUTFILE $Heading;
       }
       else {
         print $Heading;
       }
     }

     # Collect data..
     @PropertyValues = ();
     for my $PropertyName (@SpecifiedProperies) {
       my $PropertyValue = $NucleicAcidDataRef->{$PropertyName};
       if (IsFloat($PropertyValue)) {
         # Round to the specified precision; adding 0 drops trailing zeros...
         $PropertyValue = sprintf("%.${Precision}f", $PropertyValue) + 0;
       }
       push @PropertyValues, $PropertyValue;
     }

     # List data...
     ListNucleicAcidData(\@PropertyLabels, \@PropertyValues);
   }

   if ($FileOutput) {
     # Buffered write errors surface at close time...
     close(OUTFILE) or die "Couldn't close $OutFileName: $!\n";
   }
   print "\n";
 }
 195 
 # Process option values...
 #
 # Takes no arguments. Reads %Options and @ARGV and sets these file-scoped
 # variables: $OutDelim, $OutQuote, $NucleicAcidRowsOutput, $FileOutput,
 # $Precision, @SpecifiedNucleicAcidIDs and $OutFileName. It also calls
 # SetupSpecifiedProperties().
 #
 # Dies when file output is requested, the output file already exists and the
 # overwrite option was not specified.
 sub ProcessOptions {
   $OutDelim = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
   $OutQuote = ($Options{quote} =~ /^yes$/i) ? 1 : 0;

   $NucleicAcidRowsOutput = ($Options{outputstyle} =~ /^NucleicAcidRows$/i) ? 1 : 0;
   $FileOutput = ($Options{output} =~ /^File$/i) ? 1 : 0;

   $Precision = $Options{precision};

   # The ID test takes priority over the type test...
   my $IDMode = ($Options{mode} =~ /NucleicAcidID/i) ? 1 : 0;
   my $TypeMode = (!$IDMode && $Options{mode} =~ /NucleicAcidType/i) ? 1 : 0;

   # Command line values, or the mode specific default value when none are given...
   my @NucleicAcidIDs = @ARGV ? @ARGV : ($TypeMode ? 'Nucleoside' : 'A');

   # Generate list of nucleic acids...
   @SpecifiedNucleicAcidIDs = ();
   if (@ARGV == 1 && $ARGV[0] =~ /^All$/i) {
     push @SpecifiedNucleicAcidIDs, NucleicAcids::GetNucleicAcids();
   }
   elsif ($IDMode) {
     for my $NucleicAcidID (@NucleicAcidIDs) {
       if (NucleicAcids::IsNucleicAcid($NucleicAcidID)) {
         push @SpecifiedNucleicAcidIDs, $NucleicAcidID;
       }
       else {
         warn "Ignoring nucleic acid ID, $NucleicAcidID, specified using command line parameter option: Unknown nucleic acid ID...\n";
       }
     }
   }
   elsif ($TypeMode) {
     for my $NucleicAcidType (@NucleicAcidIDs) {
       if (!NucleicAcids::IsNucleicAcidType($NucleicAcidType)) {
         warn "Ignoring nucleic acid type, $NucleicAcidType, specified using command line parameter option: Unknown nucleic acid type...\n";
         next;
       }
       # A type expands to all the nucleic acids of that type...
       push @SpecifiedNucleicAcidIDs, NucleicAcids::GetNucleicAcidsByType($NucleicAcidType);
     }
   }
   SetupSpecifiedProperties();

   # Setup output file name...
   $OutFileName = '';
   if (!$FileOutput) {
     return;
   }

   my $OutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";
   my $OutFileRoot = 'NucleicAcidsInfo';
   if ($Options{root}) {
     # NOTE(review): ParseFileName is not defined in this file; it is assumed to
     # return the directory, name and extension of the specified value - confirm.
     my($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
     $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
   }
   $OutFileName = $OutFileRoot . '.' . $OutFileExt;

   if (!$Options{overwrite} && -e $OutFileName) {
     die "Error: Output file, $OutFileName, already exists.\nUse \"-o --overwrite\" option or specify a different name using \"-r --root\" option.\n";
   }
   return;
 }
 287 
 # Setup properties to list...
 #
 # Takes no arguments. Fills the file-scoped @SpecifiedProperies using the
 # values of the properties, propertiesmode and propertieslisting options:
 #
 #   . All mode: all the names returned by
 #     NucleicAcids::GetNucleicAcidPropertiesNames; any value of the properties
 #     option is ignored with a warning.
 #   . No properties specified: the names in the Basic category.
 #   . Categories mode: the names in the specified category; dies for a
 #     category other than Basic or BasicPlus.
 #   . Otherwise: the specified comma delimited names; unknown names are
 #     ignored with a warning. For alphabetical listing, Code, OtherCodes and
 #     Name come first followed by the rest of the names in sorted order.
 sub SetupSpecifiedProperties {
   # Make sure appropriate properties/category names are specified...
   @SpecifiedProperies = ();

   my $AlphabeticalListing = ($Options{propertieslisting} =~ /^Alphabetical$/i) ? 1 : 0;

   if ($Options{propertiesmode} =~ /^All$/i) {
     if ($Options{properties}) {
       warn "Warning: Ignoring values specifed by \"-p --properties\" option: Not valid for All value of \"--propertiesmode\" option...\n";
     }
     if ($AlphabeticalListing) {
       push @SpecifiedProperies, NucleicAcids::GetNucleicAcidPropertiesNames('Alphabetical');
     }
     else {
       push @SpecifiedProperies, NucleicAcids::GetNucleicAcidPropertiesNames();
     }
     return;
   }

   if (!$Options{properties}) {
     # Set default value...
     push @SpecifiedProperies, GetPropertyNamesFromCategories('Basic');
     return;
   }

   if ($Options{propertiesmode} =~ /^Categories$/i) {
     # Check category name...
     if ($Options{properties} !~ /^(Basic|BasicPlus)$/i) {
       die "Error: The value specified, $Options{properties}, for option \"-p --properties\" in conjunction with \"Categories\" value for option \"--propertiesmode\" is not valid. Allowed values: Basic and BasicPlus\n";
     }
     # Set propertynames...
     push @SpecifiedProperies, GetPropertyNamesFromCategories($Options{properties});
     return;
   }

   # Check property names..
   for my $Name (split /\,/, $Options{properties}) {
     my $PropertyName = RemoveLeadingAndTrailingWhiteSpaces($Name);
     if (NucleicAcids::IsNucleicAcidProperty($PropertyName)) {
       push @SpecifiedProperies, $PropertyName;
     }
     else {
       warn "Warning: Ignoring value, $Name, specifed by \"-p --properties\" option: Unknown property name...\n";
     }
   }

   if ($AlphabeticalListing) {
     # Code, OtherCodes and Name are always listed first...
     my @LeadingNames = ('Code', 'OtherCodes', 'Name');
     my(%LeadingNamePresent, %PropertiesMap);

     NAME: for my $Name (@SpecifiedProperies) {
       for my $LeadingName (@LeadingNames) {
         if ($Name =~ /^\Q$LeadingName\E$/i) {
           $LeadingNamePresent{$LeadingName} = 1;
           next NAME;
         }
       }
       # Hash keys also drop any names specified more than once...
       $PropertiesMap{$Name} = $Name;
     }

     # Setup the alphabetical list. The leading names are the property names
     # themselves; the name pushed for OtherCodes used to be 'OtherCodesPresent'...
     @SpecifiedProperies = ();
     push @SpecifiedProperies, grep { $LeadingNamePresent{$_} } @LeadingNames;
     push @SpecifiedProperies, sort keys %PropertiesMap;
   }
   return;
 }
 371 
 # Setup script usage and retrieve command line arguments specified using various options...
 #
 # Takes no arguments. Fills the file-scoped %Options with the default values
 # set below, overridden by the options specified on the command line. When the
 # workingdir option is specified, the current directory is changed to it.
 #
 # Dies when the command line can't be parsed by GetOptions, the working
 # directory is not a directory or can't be entered, or an option value is not
 # one of the allowed values.
 sub SetupScriptUsage {

   # Retrieve all the options...
   %Options = ();
   $Options{mode} = "NucleicAcidID";
   $Options{outdelim} = "comma";
   $Options{output} = "STDOUT";
   $Options{outputstyle} = "NucleicAcidBlock";
   $Options{precision} = 4;
   $Options{propertiesmode} = "Categories";
   $Options{propertieslisting} = "ByGroup";
   $Options{quote} = "yes";

   if (!GetOptions(\%Options, "help|h", "mode|m=s", "outdelim=s", "output=s", "outputstyle=s", "overwrite|o", "precision=i", "properties|p=s", "propertieslisting=s", "propertiesmode=s", "quote|q=s", "root|r=s", "workingdir|w=s")) {
     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
   }

   # Change to the working directory before any output file name is used...
   if ($Options{workingdir}) {
     if (! -d $Options{workingdir}) {
       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
     }
     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
   }

   # Validate option values...
   if ($Options{mode} !~ /^(NucleicAcidID|NucleicAcidType)$/i) {
     die "Error: The value specified, $Options{mode}, for option \"--mode\" is not valid. Allowed values: NucleicAcidID or NucleicAcidType\n";
   }
   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
   }
   if ($Options{output} !~ /^(STDOUT|File)$/i) {
     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: STDOUT or File\n";
   }
   if ($Options{outputstyle} !~ /^(NucleicAcidBlock|NucleicAcidRows)$/i) {
     die "Error: The value specified, $Options{outputstyle}, for option \"--outputstyle\" is not valid. Allowed values: NucleicAcidBlock or NucleicAcidRows\n";
   }
   if (!IsPositiveInteger($Options{precision})) {
     # The precision option has no short form; -p is the short form of --properties...
     die "Error: The value specified, $Options{precision}, for option \"--precision\" is not valid. Allowed values: > 0 \n";
   }
   if ($Options{propertiesmode} !~ /^(Categories|Names|All)$/i) {
     die "Error: The value specified, $Options{propertiesmode}, for option \"--propertiesmode\" is not valid. Allowed values: Categories, Names, or All\n";
   }
   if ($Options{propertieslisting} !~ /^(ByGroup|Alphabetical)$/i) {
     die "Error: The value specified, $Options{propertieslisting}, for option \"--propertieslisting\" is not valid. Allowed values: ByGroup, or Alphabetical\n";
   }
   if ($Options{quote} !~ /^(yes|no)$/i) {
     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
   }
 }
 420