MayaChemTools

   1 package PathLengthFingerprints;
   2 #
   3 # $RCSfile: PathLengthFingerprints.pm,v $
   4 # $Date: 2008/04/19 16:11:36 $
   5 # $Revision: 1.8 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 use 5.006;
  29 use strict;
  30 use Carp;
  31 use Exporter;
  32 use Fingerprints::Fingerprints;
  33 use TextUtil ();
  34 use BitVector;
  35 use Molecule;
  36 
  37 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  38 
  39 $VERSION = '1.00';
  40 @ISA = qw(Fingerprints Exporter);
  41 @EXPORT = qw();
  42 @EXPORT_OK = qw();
  43 
  44 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  45 
  46 # Setup class variables...
  47 my($ClassName);
  48 _InitializeClass();
  49 
  50 # Overload Perl functions...
  51 use overload '""' => 'StringifyPathLengthFingerprints';
  52 
  53 # Class constructor...
  54 sub new {
  55   my($Class, %NamesAndValues) = @_;
  56 
  57   # Initialize object...
  58   my $This = $Class->SUPER::new();
  59   bless $This, ref($Class) || $Class;
  60   $This->_InitializePathLengthFingerprints();
  61 
  62   $This->_InitializePathLengthFingerprintsProperties(%NamesAndValues);
  63 
  64   return $This;
  65 }
  66 
  67 # Initialize object data...
  68 #
  69 sub _InitializePathLengthFingerprints {
  70   my($This) = @_;
  71 
  72   # Type of fingerprint...
  73   $This->{Type} = 'PathLength';
  74 
  75   # Set default mininum, maximum, and default size. Although any arbitrary size can
  76   # be specified, bit vector used to store bits work on a vector size which is
  77   # power of 2 and additonal bits are automatically added and cleared.
  78   #
  79   $This->{Size} = 1024;
  80 
  81   $This->{MinSize} = 32;
  82   $This->{MaxSize} = 2**32;
  83 
  84 
  85   # Minimum and maximum path lengths to use for fingerprints generation...
  86   $This->{MinLength} = 1;
  87   $This->{MaxLength} = 8;
  88 
  89   # For molecules containing rings, atom paths starting from each atom can be traversed in four
  90   # different ways:
  91   #
  92   # . Atom paths without any rings and sharing of bonds in traversed paths.
  93   # . Atom paths containing rings and without any sharing of bonds in traversed paths
  94   # . All possible atom paths without any rings and sharing of bonds in traversed paths
  95   # . All possible atom paths containing rings and with sharing of bonds in traversed paths.
  96   #
  97   # Atom path traversal is terminated at the last ring atom. For molecules containing no rings,
  98   # first two and last two types described above are equivalent.
  99   #
 100   # AllowSharedBonds and AllowRings variables allow generation of differen types of paths
 101   # to be used for fingerprints generation.
 102   #
 103   # In addition to atom symbols, bond symbols are also used to generate a string
 104   # for atom paths. These atom paths strings are hased to a 32 bit integer key which
 105   # in turn is used as a seed for a random number generation in range of 1 to fingerprint
 106   # size for setting corresponding bit in bit vector.
 107   #
 108   # UseBondSymbol variable allow generation of atom path strings and consequently fingerprints.
 109   #
 110   # Combination of AllowSharedBonds, AllowRings, and UseBondSymbols allow generation of
 111   # 8 different types of path length fingerprints:
 112   #
 113   # AllowSharedBonds    AllowRings    UseBondSymbols    PathLengthFingerprintsType
 114   #
 115   # No                  No            Yes                AtomPathsNoCyclesWithBondSymbols
 116   # No                  Yes           Yes                AtomPathsWithCyclesWithBondSymbols
 117   #
 118   # Yes                 No            Yes                AllAtomPathsNoCyclesWithBondSymbols
 119   # Yes                 Yes           Yes                AllAtomPathsWithCyclesWithBondSymbols [ DEFAULT ]
 120   #
 121   # No                  No            No                 AtomPathsNoCyclesNoBondSymbols
 122   # No                  Yes           No                 AtomPathsWithCyclesNoBondSymbols
 123   #
 124   # Yes                 No            No                 AllAtomPathsNoCyclesNoBondSymbols
 125   # Yes                 Yes           No                 AllAtomPathsWithCyclesNoWithBondSymbols
 126   #
 127   #
 128 
 129   # By default, atom paths starting from atoms are allowed to share bonds already traversed...
 130   $This->{AllowSharedBonds} = 1;
 131 
 132   # By default rings are included in paths...
 133   $This->{AllowRings} = 1;
 134 
 135   # By default bond symbols are included in atom path strings...
 136   $This->{UseBondSymbols} = 1;
 137 
 138   # Bond symbols to use during generation of atom path strings...
 139   %{$This->{BondOrderToSymbol}} = ();
 140   %{$This->{BondOrderToSymbol}} = ('1' => '', '1.5' => ':', '2' => '=', '3' => '#');
 141 }
 142 
 143 # Initialize class ...
 144 sub _InitializeClass {
 145   #Class name...
 146   $ClassName = __PACKAGE__;
 147 }
 148 
 149 # Initialize object properties....
 150 sub _InitializePathLengthFingerprintsProperties {
 151   my($This, %NamesAndValues) = @_;
 152 
 153   my($Name, $Value, $MethodName);
 154   while (($Name, $Value) = each  %NamesAndValues) {
 155     $MethodName = "Set${Name}";
 156     $This->$MethodName($Value);
 157   }
 158 
 159   # Make sure molecule object was specified...
 160   if (!exists $NamesAndValues{Molecule}) {
 161     croak "Error: ${ClassName}->New: Object can't be instantiated without specifying molecule...";
 162   }
 163 
 164   # Make sure it's power of 2...
 165   if (exists $NamesAndValues{Size}) {
 166     if (!TextUtil::IsNumberPowerOfNumber($NamesAndValues{Size}, 2)) {
 167       croak "Error: ${ClassName}->New: Specified size value, $NamesAndValues{Size}, must be power of 2...";
 168     }
 169   }
 170   $This->_InitializeFingerprintsBitVector();
 171 
 172   return $This;
 173 }
 174 
 175 # Set minimum path length...
 176 #
 177 sub SetMinLength {
 178   my($This, $Value) = @_;
 179 
 180   if (!TextUtil::IsPositiveInteger($Value)) {
 181     croak "Error: ${ClassName}->SetMinLength: MinLength value, $Value, is not valid:  It must be a positive integer...";
 182   }
 183   $This->{MinLength} = $Value;
 184 
 185   return $This;
 186 }
 187 
 188 # Set maximum path length...
 189 #
 190 sub SetMaxLength {
 191   my($This, $Value) = @_;
 192 
 193   if (!TextUtil::IsPositiveInteger($Value)) {
 194     croak "Error: ${ClassName}->SetMaxLength: MaxLength value, $Value, is not valid:  It must be a positive integer...";
 195   }
 196   $This->{MaxLength} = $Value;
 197 
 198   return $This;
 199 }
 200 
 201 # Generate path length fingerprints...
 202 #
 203 sub GenerateFingerprints {
 204   my($This) = @_;
 205 
 206   if ($This->{MinLength} >= $This->{MaxLength}) {
 207     croak "Error: ${ClassName}->GenerateFingerprints: No fingerpritns generated: MinLength, $This->{MinLength}, must be less than MaxLength, $This->{MaxLength}...";
 208   }
 209 
 210   # Get appropriate atom paths...
 211   my($AtomPathsRef);
 212   $AtomPathsRef = $This->_GetAtomPathsUpToMaxLength();
 213 
 214   # Generate appropriate atom path strings for unique atom paths...
 215   my($AtomPathsStringRef);
 216   $AtomPathsStringRef = $This->_GenerateAtomPathsStrings($AtomPathsRef);
 217 
 218   # Generate fingerprints using atom path strings...
 219   $This->_GenerateFingerprintsUsingAtomPathsStrings($AtomPathsStringRef);
 220 
 221   return $This;
 222 }
 223 
 224 # Get appropriate atom paths with length up to MaxLength...
 225 #
 226 sub _GetAtomPathsUpToMaxLength {
 227   my($This) = @_;
 228   my($PathLength, $AllowRings, $Molecule, $AtomPathsRef);
 229 
 230   $PathLength = $This->{MaxLength};
 231   $AllowRings = $This->{AllowRings};
 232   $Molecule = $This->{Molecule};
 233 
 234   if ($This->{AllowSharedBonds}) {
 235     $AtomPathsRef =  $Molecule->GetAllAtomPathsWithLengthUpto($PathLength, $AllowRings);
 236   }
 237   else {
 238     $AtomPathsRef = $Molecule->GetAtomPathsWithLengthUpto($PathLength, $AllowRings);
 239   }
 240   return $AtomPathsRef;
 241 }
 242 
 243 # Generate appropriate atom path strings for unique atom paths...
 244 #
 245 sub _GenerateAtomPathsStrings {
 246   my($This, $AtomPathsRef) = @_;
 247   my($MinPathLength, $AtomPathString, $ReverseAtomPathString, $PathAtomsRef, %AtomPathsStrings);
 248 
 249   $MinPathLength = $This->{MinLength};
 250   %AtomPathsStrings = ();
 251 
 252   PATHATOMS: for $PathAtomsRef (@{$AtomPathsRef}) {
 253     if (scalar @{$PathAtomsRef} < $MinPathLength) {
 254       next PATHATOMS;
 255     }
 256 
 257     $AtomPathString = $This->_GenerateAtomPathString(@{$PathAtomsRef});
 258     if (exists $AtomPathsStrings{$AtomPathString}) {
 259       $AtomPathsStrings{$AtomPathString} += 1;
 260       next PATHATOMS;
 261     }
 262 
 263     $ReverseAtomPathString = $This->_GenerateAtomPathString(reverse @{$PathAtomsRef});
 264     if (exists $AtomPathsStrings{$ReverseAtomPathString}) {
 265       $AtomPathsStrings{$ReverseAtomPathString} += 1;
 266       next PATHATOMS;
 267     }
 268 
 269     if ($AtomPathString le $ReverseAtomPathString) {
 270       $AtomPathsStrings{$AtomPathString} = 1;
 271     }
 272     else {
 273       $AtomPathsStrings{$ReverseAtomPathString} = 1;
 274     }
 275   }
 276   return \%AtomPathsStrings;
 277 }
 278 
 279 # Generate an approptiate atom path string...
 280 #
 281 sub _GenerateAtomPathString {
 282   my($This, @PathAtoms) = @_;
 283   my($Atom, $UseBondSymbols, $AtomPathString);
 284 
 285   $AtomPathString = '';
 286   $UseBondSymbols = $This->{UseBondSymbols};
 287 
 288   if (@PathAtoms == 1) {
 289     $Atom = $PathAtoms[0];
 290     $AtomPathString = $Atom->GetAtomSymbol();
 291     return $AtomPathString;
 292   }
 293 
 294   # Ignore bond information...
 295   if (!$UseBondSymbols) {
 296     for $Atom (@PathAtoms) {
 297       $AtomPathString .= $Atom->GetAtomSymbol();
 298     }
 299     return $AtomPathString;
 300   }
 301 
 302   # Use atoms and bonds to generate atom path string...
 303   my($Index, $Bond, $BondOrder, $BondSymbol, $Molecule, $BondedAtom, @PathBonds);
 304 
 305   @PathBonds = ();
 306   $Molecule = $This->{Molecule};
 307   @PathBonds = $Molecule->GetAtomPathBonds(@PathAtoms);
 308 
 309   # Assign atom path string to first atom...
 310   $Atom = $PathAtoms[0];
 311   $AtomPathString = $Atom->GetAtomSymbol();
 312 
 313   for $Index (0 .. ($#PathAtoms - 1)) {
 314     $Atom = $PathAtoms[$Index];
 315     $BondedAtom = $PathAtoms[$Index + 1];
 316 
 317     $Bond = $PathBonds[$Index];
 318     $BondOrder = $Bond->GetBondOrder();
 319 
 320     # Append next atom path string to first atom...
 321     $BondSymbol = $Bond->IsAromatic() ? ':' : (exists($This->{BondOrderToSymbol}{$BondOrder}) ? $This->{BondOrderToSymbol}{$BondOrder} : $BondOrder);
 322     $AtomPathString .= $BondSymbol . $BondedAtom->GetAtomSymbol();
 323   }
 324   return $AtomPathString;
 325 }
 326 
 327 # Generate fingerprints using atom path strings...
 328 #
 329 sub _GenerateFingerprintsUsingAtomPathsStrings {
 330   my($This, $AtomPathsStringRef) = @_;
 331   my($Size, $AtomPathString, $AtomPathHashCode, $AtomPathBitPos, $FingerprintsBitVector, $SkipBitPosCheck);
 332 
 333   $Size = $This->{Size};
 334   $SkipBitPosCheck = 1;
 335   $FingerprintsBitVector = $This->{FingerprintsBitVector};
 336 
 337   for $AtomPathString (keys %{$AtomPathsStringRef}) {
 338     $AtomPathHashCode = TextUtil::HashCode($AtomPathString);
 339 
 340     srand($AtomPathHashCode);
 341     $AtomPathBitPos = int(rand($Size));
 342 
 343     $FingerprintsBitVector->SetBit($AtomPathBitPos, $SkipBitPosCheck);
 344   }
 345   return $This;
 346 }
 347 
 348 # Return a string containg data for PathLengthFingerprints object...
 349 sub StringifyPathLengthFingerprints {
 350   my($This) = @_;
 351   my($PathLengthsFingerprintsString);
 352 
 353   # Type of fingerprint...
 354   $PathLengthsFingerprintsString = "Fingerprint type: $This->{Type}";
 355 
 356   # Size...
 357   $PathLengthsFingerprintsString .= "; Size: $This->{Size}; MinSize: $This->{MinSize}; MaxSize: $This->{MaxSize}";
 358 
 359   # Path length...
 360   $PathLengthsFingerprintsString .= "; MinPathLength: $This->{MinLength}; MaxPathLength: $This->{MaxLength}";
 361 
 362   # Fingerprint generation control...
 363   my($AllowSharedBonds, $AllowRings, $UseBondSymbols);
 364 
 365   $AllowSharedBonds = $This->{AllowSharedBonds} ? "Yes" : "No";
 366   $AllowRings = $This->{AllowRings} ? "Yes" : "No";
 367   $UseBondSymbols = $This->{UseBondSymbols} ? "Yes" : "No";
 368   $PathLengthsFingerprintsString .= "; AllowSharedBonds: $AllowSharedBonds; AllowRings: $AllowRings; UseBondSymbols: $UseBondSymbols";
 369 
 370   # Fingerprint bit density and num of bits set...
 371   my($NumOfSetBits, $BitDensity);
 372   $NumOfSetBits = $This->{FingerprintsBitVector}->GetNumOfSetBits();
 373   $BitDensity = $This->{FingerprintsBitVector}->GetFingerprintsBitDensity();
 374   $PathLengthsFingerprintsString .= "; NumOfOnBits: $NumOfSetBits; BitDensity: $BitDensity";
 375 
 376   return $PathLengthsFingerprintsString;
 377 }
 378