MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: ModifySDFilesDataFields.pl,v $
   4 # $Date: 2008/02/02 22:23:45 $
   5 # $Revision: 1.14 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use 5.006;
  30 use strict;
  31 use FindBin; use lib "$FindBin::Bin/../lib";
  32 use Getopt::Long;
  33 use File::Basename;
  34 use Text::ParseWords;
  35 use Benchmark;
  36 use FileUtil;
  37 use SDFileUtil;
  38 use TextUtil;
  39 
  40 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  41 
  42 # Autoflush STDOUT
  43 $| = 1;
  44 
  45 # Starting message...
  46 $ScriptName = basename($0);
  47 print "\n$ScriptName: Starting...\n\n";
  48 $StartTime = new Benchmark;
  49 
  50 # Get the options and setup script...
  51 SetupScriptUsage();
  52 if ($Options{help} || @ARGV < 1) {
  53   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  54 }
  55 
  56 my(@SDFilesList);
  57 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  58 
  59 my($ModifyMolName, $ModifyDataFields, $KeepAllOldDataFields, $KeepUnMappedOldDataFields, $UseDataFieldForMolName, $AlwaysReplaceMolName, $MolNameDataField, $MolNamePrefix, $CreateDataFieldURL, $URLDataFieldLabel, $URLCGIScriptName, $URLParamName, $URLCmpdIdFieldName, %SpecifiedNewToOldSDFieldMap, %SpecifiedOldToNewSDFieldMap, %SpecifiedCommonFieldMap);
  60 ProcessOptions();
  61 
  62 print "Checking input SD file(s)...\n";
  63 my(@SDFilesOkay, @SDFilesOutFile);
  64 RetrieveSDFilesInfo();
  65 
  66 # Generate output files...
  67 my($Index, $SDFile);
  68 if (@SDFilesList > 1) {
  69   print "Processing SD file(s)...\n";
  70 }
  71 for $Index (0 .. $#SDFilesList) {
  72   if ($SDFilesOkay[$Index]) {
  73     $SDFile = $SDFilesList[$Index];
  74     if (@SDFilesList > 1) {
  75       print "\nProcessing file $SDFile...\n";
  76     }
  77     else {
  78       print "Processing file $SDFile...\n"
  79     }
  80     ModifySDFile($Index);
  81   }
  82 }
  83 
  84 print "$ScriptName:Done...\n\n";
  85 
  86 $EndTime = new Benchmark;
  87 $TotalTime = timediff ($EndTime, $StartTime);
  88 print "Total time: ", timestr($TotalTime), "\n";
  89 
  90 ###############################################################################
  91 
  92 # Modify SD file data fields....
  93 sub ModifySDFile {
  94   my($Index) = @_;
  95   my($SDFile, $NewSDFile);
  96 
  97   $SDFile = $SDFilesList[$Index];
  98   $NewSDFile = $SDFilesOutFile[$Index];
  99 
 100   print "Generating new SD file $NewSDFile...\n";
 101   open NEWSDFILE, ">$NewSDFile" or die "Error: Couldn't open $NewSDFile: $! \n";
 102   open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n";
 103 
 104   my($CmpdCount, $CmpdString, $CmpdData, $MolName, $OldSDField, $NewSDField, $CommonSDField, $Label, $Value, $FieldValues, @CmpdLines, %DataFieldAndValues, @DataFieldLabels);
 105   $CmpdCount = 0;
 106 
 107   COMPOUND: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 108       $CmpdCount++;
 109       @CmpdLines = split "\n", $CmpdString;
 110       if ($UseDataFieldForMolName || $ModifyDataFields) {
 111 	%DataFieldAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 112       }
 113       if ($ModifyMolName) {
 114 	if ($AlwaysReplaceMolName || !IsNotEmpty($CmpdLines[0])) {
 115 	  if ($UseDataFieldForMolName && exists($DataFieldAndValues{$MolNameDataField})) {
 116 	    $MolName = $DataFieldAndValues{$MolNameDataField};
 117 	    if (length($MolName) > 80) {
 118 	      $MolName = substr($MolName, 0, 80);
 119 	    }
 120 	  }
 121 	  else {
 122 	    $MolName = "${MolNamePrefix}${CmpdCount}";
 123 	  }
 124 	  $CmpdLines[0] = $MolName;
 125 	  $CmpdString = join "\n", @CmpdLines;
 126 	}
 127       }
 128       if (!$ModifyDataFields) {
 129 	# Just write the data and get the next compound...
 130 	print NEWSDFILE "$CmpdString\n";
 131 	next COMPOUND;
 132       }
 133       # Write out the structure data now and handle the old data fields later...
 134       ($CmpdData) = split /\n>/, $CmpdString;
 135       print NEWSDFILE "$CmpdData\n";
 136 
 137       # Modify specified data fields...
 138       for $NewSDField (sort keys %SpecifiedNewToOldSDFieldMap) {
 139 	$FieldValues = "";
 140 	for $OldSDField (@{$SpecifiedNewToOldSDFieldMap{$NewSDField}}) {
 141 	  if (exists($DataFieldAndValues{$OldSDField}) && length($DataFieldAndValues{$OldSDField})) {
 142 	    $Value = $DataFieldAndValues{$OldSDField};
 143 	    $FieldValues .= ($FieldValues) ? "\n$Value" : $Value;
 144 	  }
 145 	}
 146 	print NEWSDFILE "> <$NewSDField>\n$FieldValues\n\n";
 147       }
 148       # Add specified common fields...
 149       for $CommonSDField (sort keys %SpecifiedCommonFieldMap) {
 150 	$Value = $SpecifiedCommonFieldMap{$CommonSDField};
 151 	print NEWSDFILE "> <$CommonSDField>\n$Value\n\n";
 152       }
 153       if ($CreateDataFieldURL) {
 154 	$Value = "";
 155 	if (exists($DataFieldAndValues{$URLCmpdIdFieldName}) && length($DataFieldAndValues{$URLCmpdIdFieldName})) {
 156 	  $Value = $DataFieldAndValues{$URLCmpdIdFieldName};
 157 	  $Value = "${URLCGIScriptName}?${URLParamName}=${Value}";
 158 	}
 159 	print NEWSDFILE "> <$URLDataFieldLabel>\n$Value\n\n";
 160       }
 161 
 162       # Handle old data fields and write 'em in the same order as they appear in the input
 163       # files...
 164       if ($KeepAllOldDataFields || $KeepUnMappedOldDataFields) {
 165 	my($KeepLabel);
 166 	@DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 167 	LABEL: for $Label (@DataFieldLabels) {
 168 	  $KeepLabel = $KeepAllOldDataFields ? 1 : ( exists($SpecifiedOldToNewSDFieldMap{$Label}) ? 0 : 1  );
 169 	  if (!$KeepLabel) {
 170 	    next LABEL;
 171 	  }
 172 	  $Value = $DataFieldAndValues{$Label};
 173 	  print NEWSDFILE "> <$Label>\n$Value\n\n";
 174 	}
 175       }
 176 
 177       print NEWSDFILE "\$\$\$\$\n";
 178   }
 179   close NEWSDFILE;
 180   close SDFILE;
 181 }
 182 
 183 # Process option values...
 184 sub ProcessOptions {
 185 
 186   $ModifyMolName = 1; $ModifyDataFields = 0;
 187   if ($Options{mode} =~ /^both$/i) {
 188     $ModifyMolName = 1; $ModifyDataFields = 1;
 189   }
 190   elsif ($Options{mode} =~ /^datafields$/i) {
 191     $ModifyMolName = 0; $ModifyDataFields = 1;
 192   }
 193 
 194   $KeepAllOldDataFields = ($Options{keepolddatafields} =~ /^all$/i) ? 1 : 0;
 195   $KeepUnMappedOldDataFields = ($Options{keepolddatafields} =~ /^unmappedonly$/i) ? 1 : 0;
 196 
 197   $UseDataFieldForMolName = ($Options{molnamemode} =~ /^datafield$/i) ? 1 : 0;
 198   $MolNameDataField = ""; $MolNamePrefix = "Cmpd";
 199   if ($Options{molname}) {
 200     if ($UseDataFieldForMolName) {
 201       $MolNameDataField = $Options{molname};
 202     }
 203     else {
 204       $MolNamePrefix = $Options{molname};
 205     }
 206   }
 207   $AlwaysReplaceMolName = ($Options{molnamereplace} =~ /^always$/i) ? 1 : 0;
 208 
 209   if ($Options{datafieldsmap} && $Options{datafieldsmapfile}) {
 210     die "Error: Both \"--datafieldsmap\" and  \"--datafieldsmapfile\" options specified: only one is allowed at a time\n";
 211   }
 212 
 213   %SpecifiedNewToOldSDFieldMap = ();
 214   %SpecifiedOldToNewSDFieldMap = ();
 215 
 216   my($SpecifiedDataFieldMap);
 217   $SpecifiedDataFieldMap = "";
 218   if ($Options{datafieldsmap}) {
 219     $SpecifiedDataFieldMap = $Options{datafieldsmap};
 220   }
 221   elsif ($Options{datafieldsmapfile}) {
 222     my($Line, @LineWords);
 223     open DATAFIELDSFILE, "$Options{datafieldsmapfile}" or die "Couldn't  open $Options{datafieldsmapfile}: $! \n";
 224     while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
 225       @LineWords = quotewords(";", 0, $Line);
 226       $SpecifiedDataFieldMap .= JoinWords(\@LineWords, ";", 0);
 227     }
 228     close DATAFIELDSFILE;
 229   }
 230 
 231   if ($SpecifiedDataFieldMap) {
 232     my($DataFieldMap, $DataField, $NewSDField, @OldSDFields, @DataFieldMapSplit, @DataFieldsSplit, $FirstField);
 233     @DataFieldMapSplit = split ";", $SpecifiedDataFieldMap;
 234     for $DataFieldMap (@DataFieldMapSplit) {
 235       @DataFieldsSplit = split ",", $DataFieldMap;
 236       if (@DataFieldsSplit == 1) {
 237 	die "Error: Invalid number of comma delimited values, ", scalar(@DataFieldsSplit), ", specified,  @DataFieldsSplit, using \"--datafieldsmap or --datafieldsmapfile\" option: it must contain more than one value.\n";
 238       }
 239       $FirstField = 1;
 240       @OldSDFields = ();
 241       for $DataField (@DataFieldsSplit) {
 242 	if (!(defined($DataField) && length($DataField))) {
 243 	  die "Error: One of the comma delimited values, \"", join(",", @DataFieldsSplit), "\", specified using \"--datafieldsmap or --datafieldsmapfile\" option is empty.\n";
 244 	}
 245 	if ($FirstField) {
 246 	  $FirstField = 0;
 247 	  $NewSDField = $DataField;
 248 	}
 249 	else {
 250 	  push @OldSDFields, $DataField;
 251 	}
 252       }
 253       # Make sure a datafield is only specified once...
 254       if (exists $SpecifiedNewToOldSDFieldMap{$NewSDField}) {
 255 	die "Error: New data field, $NewSDField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
 256       }
 257       $SpecifiedNewToOldSDFieldMap{$NewSDField} = ();
 258       push @{$SpecifiedNewToOldSDFieldMap{$NewSDField}}, @OldSDFields;
 259       for $DataField (@OldSDFields) {
 260 	if (exists $SpecifiedOldToNewSDFieldMap{$DataField} ) {
 261 	  die "Error: SD field, $DataField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
 262 	}
 263 	else {
 264 	  $SpecifiedOldToNewSDFieldMap{$DataField} = $NewSDField;
 265 	}
 266       }
 267 
 268     }
 269   }
 270 
 271   %SpecifiedCommonFieldMap = ();
 272   if ($Options{datafieldscommon}) {
 273     my($DataFieldName, $DataFieldValue, $Index, @CommonDataFieldsSplit);
 274     @CommonDataFieldsSplit = split ",", $Options{datafieldscommon};
 275     if (@CommonDataFieldsSplit % 2) {
 276 	die "Error: Invalid number of comma delimited values, ", scalar(@CommonDataFieldsSplit), ", specified \"",  join(",", @CommonDataFieldsSplit), "\" using \"--datafieldscommon\" option: it must contain even number of values.\n";
 277     }
 278     for ($Index = 0; $Index < @CommonDataFieldsSplit; $Index += 2) {
 279       $DataFieldName = $CommonDataFieldsSplit[$Index];
 280       $DataFieldValue = $CommonDataFieldsSplit[$Index + 1];
 281       if (exists $SpecifiedCommonFieldMap{$DataFieldName}) {
 282 	die "Error: Common data field, $DataFieldName, specified more than once using \"--datafieldscommon\" option.\n";
 283       }
 284       if (exists($SpecifiedNewToOldSDFieldMap{$DataFieldName}) || exists($SpecifiedOldToNewSDFieldMap{$DataFieldName})) {
 285 	die "Error: Common data field, $DataFieldName, specified using \"--datafieldscommon\" option cannot be specified in \"--datafieldsmap or --datafieldsmapfile\" option.\n";
 286       }
 287       $SpecifiedCommonFieldMap{$DataFieldName} = $DataFieldValue;
 288     }
 289   }
 290 
 291   $CreateDataFieldURL = (exists($Options{datafieldurl}) && length($Options{datafieldurl}) ) ? 1 : 0;
 292   $URLDataFieldLabel = ""; $URLCGIScriptName = ""; $URLParamName = ""; $URLCmpdIdFieldName = "";
 293   if ($CreateDataFieldURL) {
 294     my(@DataFieldURLSplit, $Value);
 295     @DataFieldURLSplit = split ",", $Options{datafieldurl};
 296     if (@DataFieldURLSplit != 4) {
 297       die "Error: Invalid number of values, ", scalar(@DataFieldURLSplit), ", specified using \"--datafieldURL\" option: it must contain 4 values.\n";
 298     }
 299     for $Value (@DataFieldURLSplit) {
 300       if (!IsNotEmpty($Value)) {
 301 	die "Error: One of the values, $Options{datafieldurl}, specified using \"--datafieldURL\" option is empty.\n";
 302       }
 303     }
 304     $URLDataFieldLabel = $DataFieldURLSplit[0];
 305     $URLCGIScriptName = $DataFieldURLSplit[1];
 306     $URLParamName  = $DataFieldURLSplit[2];
 307     $URLCmpdIdFieldName = $DataFieldURLSplit[3];
 308   }
 309 
 310 }
 311 
 312 # Retrieve information about input SD files...
 313 sub RetrieveSDFilesInfo {
 314   my($Index, $SDFile, $FileDir, $FileName, $FileExt, $OutFileRoot,  $OutFile, $DataFieldName);
 315 
 316   @SDFilesOkay = ();
 317   @SDFilesOutFile = ();
 318 
 319  FILELIST: for $Index (0 .. $#SDFilesList) {
 320     $SDFile = $SDFilesList[$Index];
 321     $SDFilesOkay[$Index] = 0;
 322     $SDFilesOutFile[$Index] = "";
 323     if (!(-e $SDFile)) {
 324       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 325       next FILELIST;
 326     }
 327     if (!CheckFileType($SDFile, "sd sdf")) {
 328       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 329       next FILELIST;
 330     }
 331     $FileDir = ""; $FileName = ""; $FileExt = "";
 332     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 333     if ($Options{root} && (@SDFilesList == 1)) {
 334       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 335       if ($RootFileName && $RootFileExt) {
 336 	$FileName = $RootFileName;
 337       }
 338       else {
 339 	$FileName = $Options{root};
 340       }
 341       $OutFileRoot = $FileName;
 342     }
 343     else {
 344       $OutFileRoot = $FileName . "ModifiedDataFields";
 345     }
 346 
 347     $OutFile = $OutFileRoot . ".$FileExt";
 348     if (lc($OutFile) eq lc($SDFile)) {
 349       warn "Warning: Ignoring file $SDFile:Output file name, $OutFile, is same as input SD file name, $SDFile\n";
 350       next FILELIST;
 351     }
 352     if (!$Options{overwrite}) {
 353       if (-e $OutFile) {
 354 	warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
 355 	next FILELIST;
 356       }
 357     }
 358 
 359     $SDFilesOkay[$Index] = 1;
 360     $SDFilesOutFile[$Index] = "$OutFile";
 361   }
 362 }
 363 
 364 # Setup script usage  and retrieve command line arguments specified using various options...
 365 sub SetupScriptUsage {
 366 
 367   # Retrieve all the options...
 368   %Options = ();
 369   $Options{detail} = 1;
 370   $Options{keepolddatafields} = "none";
 371   $Options{mode} = "molname";
 372   $Options{molnamemode} = "labelprefix";
 373   $Options{molnamereplace} = "empty";
 374 
 375   if (!GetOptions(\%Options, "detail|d=i", "datafieldscommon=s", "datafieldsmap=s", "datafieldsmapfile=s", "datafieldurl=s", "help|h", "keepolddatafields|k=s", "mode|m=s", "molname=s", "molnamemode=s", "molnamereplace=s", "overwrite|o", "root|r=s", "workingdir|w=s")) {
 376     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 377   }
 378   if ($Options{workingdir}) {
 379     if (! -d $Options{workingdir}) {
 380       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 381     }
 382     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 383   }
 384   if ($Options{keepolddatafields} !~ /^(all|unmappedonly|none)$/i) {
 385     die "Error: The value specified, $Options{keepolddatafields}, for option \"-k --keepolddatafields\" is not valid. Allowed values: all, unmappedonly, or none\n";
 386   }
 387   if ($Options{mode} !~ /^(molname|datafields|both)$/i) {
 388     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molname, datafields, or both\n";
 389   }
 390   if ($Options{molnamemode} !~ /^(datafield|labelprefix)$/i) {
 391     die "Error: The value specified, $Options{molnamemode}, for option \"--molnamemode\" is not valid. Allowed values: datafield or labelprefix\n";
 392   }
 393   if ($Options{molnamereplace} !~ /^(always|empty)$/i) {
 394     die "Error: The value specified, $Options{molnamereplace}, for option \"--molnamereplace\" is not valid. Allowed values: always or empty\n";
 395   }
 396   if (!IsPositiveInteger($Options{detail})) {
 397     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
 398   }
 399 }
 400