MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: InfoNucleicAcids.pl,v $
   4 # $Date: 2010/01/03 00:59:51 $
   5 # $Revision: 1.16 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2010 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use NucleicAcids;
  38 
  39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  40 
  41 # Autoflush STDOUT
  42 $| = 1;
  43 
  44 # Starting message...
  45 $ScriptName = basename($0);
  46 print "\n$ScriptName: Starting...\n\n";
  47 $StartTime = new Benchmark;
  48 
  49 # Get the options and setup script...
  50 SetupScriptUsage();
  51 if ($Options{help}) {
  52   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  53 }
  54 
  55 my($OutDelim, $OutQuote, $NucleicAcidRowsOutput, $FileOutput, $Precision, $OutFileName, @SpecifiedNucleicAcidIDs, @SpecifiedProperies,);
  56 ProcessOptions();
  57 
  58 ListNucleicAcidProperties();
  59 print "$ScriptName:Done...\n\n";
  60 
  61 $EndTime = new Benchmark;
  62 $TotalTime = timediff ($EndTime, $StartTime);
  63 print "Total time: ", timestr($TotalTime), "\n";
  64 
  65 ###############################################################################
  66 
  67 # Get propery names from categories...
  68 sub GetPropertyNamesFromCategories {
  69   my($CategoryName) = @_;
  70   my(@PropertyNames);
  71 
  72   @PropertyNames = ();
  73   if ($CategoryName =~ /^Basic$/i) {
  74     @PropertyNames = ('Code', 'OtherCodes', 'Name', 'Type', 'MolecularFormula', 'MolecularWeight');
  75   } elsif ($CategoryName =~ /^BasicPlus$/i) {
  76     @PropertyNames = ('Code', 'OtherCodes', 'Name', 'Type', 'MolecularFormula', 'MolecularWeight', 'ExactMass', 'ElementalComposition');
  77   }
  78 
  79   return @PropertyNames;
  80 }
  81 
  82 # List data for an nucleic acid...
  83 sub ListNucleicAcidData {
  84   my($DataLabelRef, $DataValueRef) = @_;
  85   my($Index, $Line, $Value);
  86 
  87   if ($NucleicAcidRowsOutput) {
  88     $Line = '';
  89     # Format data...
  90     if ($OutQuote || $Options{outdelim} !~ /^comma$/i) {
  91       $Line = JoinWords($DataValueRef, $OutDelim, $OutQuote);
  92     }
  93     else {
  94       # Always quote values containing commas...
  95       $Line = ($DataValueRef->[0] =~ /\,/) ? qq("$DataValueRef->[0]") : $DataValueRef->[0];
  96       for $Index (1 .. $#{$DataValueRef} ) {
  97 	$Value = $DataValueRef->[$Index];
  98 	if ($Value =~ /\,/) {
  99 	  $Value = qq("$Value");
 100 	}
 101 	$Line .= $OutDelim . $Value;
 102       }
 103     }
 104     if ($FileOutput) {
 105       print OUTFILE "$Line\n";
 106     }
 107     else {
 108       print "$Line\n";
 109     }
 110   }
 111   else {
 112     # Format and list data...
 113     $Line = '';
 114     for $Index (0 .. $#{$DataLabelRef} ) {
 115       $Line = $DataLabelRef->[$Index] . ' ' . $DataValueRef->[$Index];
 116       if ($FileOutput) {
 117 	print OUTFILE "$Line\n";
 118       }
 119       else {
 120 	print "$Line\n";
 121       }
 122     }
 123   }
 124 }
 125 
 126 # List data for an nucleic acid...
 127 sub ListHeaderRowData {
 128   my($DataLabelRef) = @_;
 129   my($Line);
 130 
 131   # Format data...
 132   $Line = JoinWords($DataLabelRef, $OutDelim, $OutQuote);
 133   $Line =~ s/\://g;
 134   # List data...
 135   if ($FileOutput) {
 136     print OUTFILE "$Line\n";
 137   }
 138   else {
 139     print "$Line\n";
 140   }
 141 }
 142 
 143 # List properties for nucleic acids...
 144 sub ListNucleicAcidProperties {
 145   my($NucleicAcidID, $NucleicAcidDataRef, $PropertyName, $PropertyValue, @PropertyLabels, @PropertyValues);
 146 
 147   print "Listing information for nucleic acids(s)...\n";
 148 
 149   if ($FileOutput) {
 150     print "Generating file $OutFileName...\n";
 151     open OUTFILE, ">$OutFileName" or die "Couldn't open $OutFileName: $!\n";
 152   }
 153 
 154   # Setup property labels...
 155   @PropertyLabels = ();
 156   for $PropertyName (@SpecifiedProperies) {
 157     push @PropertyLabels, ("$PropertyName:");
 158   }
 159 
 160   if ($NucleicAcidRowsOutput) {
 161     ListHeaderRowData(\@PropertyLabels);
 162   }
 163 
 164   # Go over specified properties...
 165   for $NucleicAcidID (@SpecifiedNucleicAcidIDs) {
 166     $NucleicAcidDataRef = NucleicAcids::GetNucleicAcidPropertiesData($NucleicAcidID);
 167 
 168     if (!$NucleicAcidRowsOutput) {
 169       if ($FileOutput) {
 170 	print OUTFILE "\nListing properties for nucleic acid $NucleicAcidID...\n\n";
 171       }
 172       else {
 173 	print "\nListing properties for nucleic acid $NucleicAcidID...\n\n";
 174       }
 175     }
 176 
 177     # Collect data..
 178     @PropertyValues = ();
 179     for $PropertyName (@SpecifiedProperies) {
 180       $PropertyValue = $NucleicAcidDataRef->{$PropertyName};
 181       if (IsFloat($PropertyValue)) {
 182 	$PropertyValue = sprintf("%.${Precision}f", $PropertyValue) + 0;
 183       }
 184       push @PropertyValues, $PropertyValue;
 185     }
 186     # List data...
 187     ListNucleicAcidData(\@PropertyLabels, \@PropertyValues);
 188   }
 189   if ($FileOutput) {
 190     close OUTFILE;
 191   }
 192   print "\n";
 193 }
 194 
 195 # Process option values...
 196 sub ProcessOptions {
 197   $OutDelim = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
 198   $OutQuote = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
 199 
 200   $NucleicAcidRowsOutput = ($Options{outputstyle} =~ /^NucleicAcidRows$/i) ? 1 : 0;
 201   $FileOutput = ($Options{output} =~ /^File$/i) ? 1 : 0;
 202 
 203   $Precision = $Options{precision};
 204 
 205   my($NucleicAcidID, @NucleicAcidIDs);
 206 
 207   @SpecifiedNucleicAcidIDs = ();
 208 
 209   # Set up Nucleic Acids IDs except for All mode...
 210   @NucleicAcidIDs = ();
 211 
 212   if (@ARGV >= 1) {
 213     push @NucleicAcidIDs, @ARGV;
 214   }
 215   else {
 216     # Setup mode specified default values...
 217     if ($Options{mode} =~ /NucleicAcidID/i) {
 218       push @NucleicAcidIDs, 'A';
 219     }
 220     elsif ($Options{mode} =~ /NucleicAcidType/i) {
 221       push @NucleicAcidIDs, 'Nucleoside';
 222     }
 223     else {
 224       push @NucleicAcidIDs, 'A';
 225     }
 226   }
 227 
 228   # Generate list of nucleic acids...
 229   if (@ARGV == 1 && $ARGV[0] =~ /^All$/i) {
 230     push @SpecifiedNucleicAcidIDs, NucleicAcids::GetNucleicAcids();
 231   }
 232   else {
 233     if ($Options{mode} =~ /NucleicAcidID/i) {
 234       ID: for $NucleicAcidID (@NucleicAcidIDs) {
 235 	if (NucleicAcids::IsNucleicAcid($NucleicAcidID)) {
 236 	  push @SpecifiedNucleicAcidIDs, $NucleicAcidID;
 237 	}
 238 	else {
 239 	  warn "Ignoring nucleic acid ID, $NucleicAcidID, specified using command line parameter option: Unknown nucleic acid ID...\n";
 240 	  next ID;
 241 	}
 242       }
 243     }
 244     elsif ($Options{mode} =~ /NucleicAcidType/i) {
 245       ID: for $NucleicAcidID (@NucleicAcidIDs) {
 246 	  if (!NucleicAcids::IsNucleicAcidType($NucleicAcidID)) {
 247 	    warn "Ignoring nucleic acid type, $NucleicAcidID, specified using command line parameter option: Unknown nucleic acid type...\n";
 248 	    next ID;
 249 	  }
 250 	  push @SpecifiedNucleicAcidIDs, NucleicAcids::GetNucleicAcidsByType($NucleicAcidID);
 251 	}
 252       }
 253   }
 254   SetupSpecifiedProperties();
 255 
 256   # Setup output file name...
 257   $OutFileName = '';
 258   if ($FileOutput) {
 259     my($OutFileRoot, $OutFileExt);
 260 
 261     $OutFileRoot = '';
 262     $OutFileExt = "csv";
 263     if ($Options{outdelim} =~ /^tab$/i) {
 264       $OutFileExt = "tsv";
 265     }
 266     if ($Options{root}) {
 267       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 268       if ($RootFileName && $RootFileExt) {
 269 	$OutFileRoot = $RootFileName;
 270       }
 271       else {
 272 	$OutFileRoot = $Options{root};
 273       }
 274     }
 275     else {
 276       $OutFileRoot = 'NucleicAcidsInfo';
 277     }
 278     $OutFileName = $OutFileRoot . '.' . $OutFileExt;
 279     if (!$Options{overwrite}) {
 280       if (-e $OutFileName) {
 281 	die "Error: Output file, $OutFileName, already exists.\nUse \-o --overwrite\ option or specify a different name using \"-r --root\" option.\n";
 282       }
 283     }
 284   }
 285 }
 286 
 287 # Setup properties to list...
 288 sub SetupSpecifiedProperties {
 289   # Make sure appropriate properties/category names are specified...
 290   @SpecifiedProperies = ();
 291   if ($Options{properties} && ($Options{propertiesmode} =~ /^All$/i) ) {
 292     warn "Warning: Ignoring values specifed by \"-p --properties\" option: Not valid for All value of \"--propertiesmode\" option...\n";
 293   }
 294   if ($Options{propertiesmode} =~ /^All$/i) {
 295     if ($Options{propertieslisting} =~ /^Alphabetical$/i) {
 296       push @SpecifiedProperies, NucleicAcids::GetNucleicAcidPropertiesNames('Alphabetical');
 297     }
 298     else {
 299       push @SpecifiedProperies, NucleicAcids::GetNucleicAcidPropertiesNames();
 300     }
 301   }
 302   else {
 303     if ($Options{properties}) {
 304       if ($Options{propertiesmode} =~ /^Categories$/i) {
 305 	# Check category name...
 306 	if ($Options{properties} !~ /^(Basic|BasicPlus)$/i) {
 307 	  die "Error: The value specified, $Options{properties}, for option \"-p --properties\" in conjunction with \"Categories\" value for option \"--propertiesmode\" is not valid. Allowed values: Basic and BasicPlus\n";
 308 	}
 309 	# Set propertynames...
 310 	push @SpecifiedProperies, GetPropertyNamesFromCategories($Options{properties});
 311       }
 312       else {
 313 	# Check property names..
 314 	my($Name, $PropertyName, @Names);
 315 	@Names = split /\,/, $Options{properties};
 316 	NAME: for $Name (@Names) {
 317 	  $PropertyName = RemoveLeadingAndTrailingWhiteSpaces($Name);
 318 	  if (NucleicAcids::IsNucleicAcidProperty($PropertyName)) {
 319 	    push @SpecifiedProperies, $PropertyName;
 320 	  }
 321 	  else {
 322 	    warn "Warning: Ignoring value, $Name, specifed by \"-p --properties\" option: Unknown property name...\n";
 323 	  }
 324 	}
 325 	if ($Options{propertieslisting} =~ /^Alphabetical$/i) {
 326 	  # Code, OtherCodes and Name are always listed first...
 327 	  my($CodePresent, $OtherCodesPresent, $NamePresent,  @AlphabeticalProperties, %PropertiesMap);
 328 	  %PropertiesMap = ();
 329 	  @AlphabeticalProperties = ();
 330 	  $CodePresent = 0; $OtherCodesPresent = 0; $NamePresent = 0;
 331 	  NAME: for $Name (@SpecifiedProperies) {
 332 	    if ($Name =~ /^Code$/i) {
 333 	      $CodePresent = 1;
 334 	      next NAME;
 335 	    }
 336 	    if ($Name =~ /^OtherCodes$/i) {
 337 	      $OtherCodesPresent = 1;
 338 	      next NAME;
 339 	    }
 340 	    if ($Name =~ /^Name$/i) {
 341 	      $NamePresent = 1;
 342 	      next NAME;
 343 	    }
 344 	    $PropertiesMap{$Name} = $Name;
 345 	  }
 346 	  # Setup the alphabetical list...
 347 	  if ($CodePresent) {
 348 	    push @AlphabeticalProperties, 'Code';
 349 	  }
 350 	  if ($OtherCodesPresent) {
 351 	    push @AlphabeticalProperties, 'OtherCodesPresent';
 352 	  }
 353 	  if ($NamePresent) {
 354 	    push @AlphabeticalProperties, 'Name';
 355 	  }
 356 	  for $Name (sort keys %PropertiesMap) {
 357 	    push @AlphabeticalProperties, $Name;
 358 	  }
 359 	  @SpecifiedProperies = ();
 360 	  push @SpecifiedProperies, @AlphabeticalProperties;
 361 	}
 362       }
 363     }
 364     else {
 365       # Set default value...
 366       push @SpecifiedProperies, GetPropertyNamesFromCategories('Basic');
 367     }
 368   }
 369 }
 370 
 371 # Setup script usage  and retrieve command line arguments specified using various options...
 372 sub SetupScriptUsage {
 373 
 374   # Retrieve all the options...
 375   %Options = ();
 376   $Options{mode} = "NucleicAcidID";
 377   $Options{outdelim} = "comma";
 378   $Options{output} = "STDOUT";
 379   $Options{outputstyle} = "NucleicAcidBlock";
 380   $Options{precision} = 4;
 381   $Options{propertiesmode} = "Categories";
 382   $Options{propertieslisting} = "ByGroup";
 383   $Options{quote} = "yes";
 384 
 385   if (!GetOptions(\%Options, "help|h", "mode|m=s", "outdelim=s", "output=s", "outputstyle=s", "overwrite|o", "precision=i", "properties|p=s", "propertieslisting=s", "propertiesmode=s", "quote|q=s", "root|r=s", "workingdir|w=s")) {
 386     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 387   }
 388   if ($Options{workingdir}) {
 389     if (! -d $Options{workingdir}) {
 390       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 391     }
 392     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 393   }
 394   if ($Options{mode} !~ /^(NucleicAcidID|NucleicAcidType)$/i) {
 395     die "Error: The value specified, $Options{mode}, for option \"--mode\" is not valid. Allowed values: NucleicAcidID or NucleicAcidType\n";
 396   }
 397   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 398     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 399   }
 400   if ($Options{output} !~ /^(STDOUT|File)$/i) {
 401     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: STDOUT or File\n";
 402   }
 403   if ($Options{outputstyle} !~ /^(NucleicAcidBlock|NucleicAcidRows)$/i) {
 404     die "Error: The value specified, $Options{outputstyle}, for option \"--outputstyle\" is not valid. Allowed values: NucleicAcidBlock or NucleicAcidRows\n";
 405   }
 406   if (!IsPositiveInteger($Options{precision})) {
 407     die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";
 408   }
 409   if ($Options{propertiesmode} !~ /^(Categories|Names|All)$/i) {
 410     die "Error: The value specified, $Options{propertiesmode}, for option \"--propertiesmode\" is not valid. Allowed values: Categories, Names, or All\n";
 411   }
 412   if ($Options{propertieslisting} !~ /^(ByGroup|Alphabetical)$/i) {
 413     die "Error: The value specified, $Options{propertieslisting}, for option \"--propertieslisting\" is not valid. Allowed values: ByGroup, or Alphabetical\n";
 414   }
 415   if ($Options{quote} !~ /^(yes|no)$/i) {
 416     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 417   }
 418 }
 419