MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: TextFilesToSDFiles.pl,v $
   4 # $Date: 2008/01/30 21:45:04 $
   5 # $Revision: 1.12 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileUtil;
  37 use TextUtil;
  38 use SDFileUtil;
  39 
  40 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  41 
  42 # Autoflush STDOUT
  43 $| = 1;
  44 
  45 # Starting message...
  46 $ScriptName = basename $0;
  47 print "\n$ScriptName:Starting...\n\n";
  48 $StartTime = new Benchmark;
  49 
  50 # Get the options and setup script...
  51 SetupScriptUsage();
  52 if ($Options{help} || @ARGV < 1) {
  53   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  54 }
  55 
  56 my(@TextFilesList);
  57 @TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");
  58 
  59 my($ColLabelsPresent);
  60 ProcessOptions();
  61 
  62 print "Checking input text file(s)...\n";
  63 my(@TextFilesOkay, @TextFilesColCount, @TextFilesColLabels, @TextFilesInDelim, @TextFilesOutSDFile);
  64 RetrieveTextFilesInfo();
  65 
  66 # Generate output files...
  67 my($Index, $TextFile);
  68 if (@TextFilesList > 1) {
  69   print "Processing text files...\n";
  70 }
  71 for $Index (0 .. $#TextFilesList) {
  72   if ($TextFilesOkay[$Index]) {
  73     $TextFile = $TextFilesList[$Index];
  74     if (@TextFilesList > 1) {
  75       print "\nProcessing file $TextFile...\n";
  76     }
  77     else {
  78       print "Processing file $TextFile...\n"
  79     }
  80     ConvertTextFile($Index);
  81   }
  82 }
  83 
  84 print "$ScriptName:Done...\n\n";
  85 
  86 $EndTime = new Benchmark;
  87 $TotalTime = timediff ($EndTime, $StartTime);
  88 print "Total time: ", timestr($TotalTime), "\n";
  89 
  90 ###############################################################################
  91 
  92 # Convert text file to SD file...
  93 sub ConvertTextFile {
  94   my($Index) = @_;
  95   my($TextFile, $SDFile, $Line, $InDelim, $Label, $Value, $ColIndex, $ColCount, @ColLabels, @LineWords);
  96 
  97   $TextFile = $TextFilesList[$Index];
  98   $InDelim = $TextFilesInDelim[$Index];
  99   $SDFile = $TextFilesOutSDFile[$Index];
 100   @ColLabels = @{$TextFilesColLabels[$Index]};
 101   $ColCount = @ColLabels;
 102 
 103   print "Generating SD file $SDFile...\n";
 104   open SDFILE, ">$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 105   open TEXTFILE, "$TextFile" or die "Error: Can't open $TextFile: $! \n";
 106   if ($ColLabelsPresent) {
 107     # Skip over column labels from old file...
 108     $Line = GetTextLine(\*TEXTFILE);
 109   }
 110   my($Date) = GenerateMiscLineDateStamp();
 111   while ($Line = GetTextLine(\*TEXTFILE)) {
 112     @LineWords = quotewords($InDelim, 0, $Line);
 113 
 114     # Write out empty CTAB block...
 115     print SDFILE GenerateEmptyCtabBlockLines($Date), "\n";
 116 
 117     # Write out data fields and values...
 118     for $ColIndex (0 .. $#LineWords) {
 119       if ($ColIndex < $ColCount) {
 120 	$Label = $ColLabels[$ColIndex];
 121 	$Value = $LineWords[$ColIndex];
 122 	print SDFILE "> <$Label>\n$Value\n\n";
 123       }
 124     }
 125     print SDFILE "\$\$\$\$\n";
 126   }
 127   close SDFILE;
 128   close TEXTFILE;
 129 }
 130 
 131 # Process option values...
 132 sub ProcessOptions {
 133   $ColLabelsPresent = ($Options{label} =~ /^yes$/i) ? 1 : 0;
 134 }
 135 
 136 # Retrieve information about input text files...
 137 sub RetrieveTextFilesInfo {
 138   my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @LineWords, @ColLabels, $OutFileRoot,  $OutFile, $ColNum, $ColLabel);
 139 
 140   @TextFilesOkay = ();
 141   @TextFilesColCount = (); @TextFilesColLabels = ();
 142   @TextFilesInDelim = ();
 143   @TextFilesOutSDFile = ();
 144 
 145  FILELIST: for $Index (0 .. $#TextFilesList) {
 146     $TextFile = $TextFilesList[$Index];
 147     $TextFilesOkay[$Index] = 0;
 148     $TextFilesColCount[$Index] = 0;
 149     $TextFilesInDelim[$Index] = "";
 150     $TextFilesOutSDFile[$Index] = "";
 151     @{$TextFilesColLabels[$Index]} = ();
 152     if (!(-e $TextFile)) {
 153       warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
 154       next FILELIST;
 155     }
 156     if (!CheckFileType($TextFile, "csv tsv")) {
 157       warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
 158       next FILELIST;
 159     }
 160     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 161     if ($FileExt =~ /^tsv$/i) {
 162       $InDelim = "\t";
 163     }
 164     else {
 165       $InDelim = "\,";
 166       if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 167 	warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
 168 	next FILELIST;
 169       }
 170       if ($Options{indelim} =~ /^semicolon$/i) {
 171 	$InDelim = "\;";
 172       }
 173     }
 174     if (!open TEXTFILE, "$TextFile") {
 175       warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
 176       next FILELIST;
 177     }
 178     $Line = GetTextLine(\*TEXTFILE);
 179     @LineWords = quotewords($InDelim, 0, $Line);
 180     @ColLabels = ();
 181     if ($ColLabelsPresent) {
 182       push @ColLabels, @LineWords;
 183     }
 184     else {
 185       for $ColNum (1 .. @LineWords) {
 186 	$ColLabel = "Column${ColNum}Data";
 187 	push @ColLabels, $ColLabel;
 188       }
 189     }
 190     close TEXTFILE;
 191 
 192     $FileDir = ""; $FileName = ""; $FileExt = "";
 193     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 194     if ($Options{root} && (@TextFilesList == 1)) {
 195       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 196       if ($RootFileName && $RootFileExt) {
 197 	$FileName = $RootFileName;
 198       }
 199       else {
 200 	$FileName = $Options{root};
 201       }
 202       $OutFileRoot = $FileName;
 203     }
 204     else {
 205       $OutFileRoot = "${FileName}WithNoStrData";
 206     }
 207 
 208     $OutFile = "${OutFileRoot}.sdf";
 209     if (!$Options{overwrite}) {
 210       if (-e $OutFile) {
 211 	warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
 212 	next FILELIST;
 213       }
 214     }
 215     $TextFilesOkay[$Index] = 1;
 216     $TextFilesInDelim[$Index] = $InDelim;
 217     $TextFilesOutSDFile[$Index] = "$OutFile";
 218 
 219     $TextFilesColCount[$Index] = @ColLabels;
 220     push @{$TextFilesColLabels[$Index]}, @ColLabels;
 221   }
 222 }
 223 
 224 # Setup script usage  and retrieve command line arguments specified using various options...
 225 sub SetupScriptUsage {
 226 
 227   # Retrieve all the options...
 228   %Options = ();
 229   $Options{label} = "yes";
 230   $Options{indelim} = "comma";
 231   if (!GetOptions(\%Options, "help|h", "indelim=s", "label|l=s", "overwrite|o", "root|r=s", "workingdir|w=s")) {
 232     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 233   }
 234   if ($Options{workingdir}) {
 235     if (! -d $Options{workingdir}) {
 236       die "Error: The value specified, $Options{workingdir},  for option \"-w --workingdir\" is not a directory name.\n";
 237     }
 238     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 239   }
 240   if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 241     die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
 242   }
 243   if ($Options{label} !~ /^(yes|no)$/i) {
 244     die "Error: The value specified, $Options{label}, for option \"-l --label\" is not valid. Allowed values: yes or no\n";
 245   }
 246 }
 247