package PathLengthFingerprints;
#
# $RCSfile: PathLengthFingerprints.pm,v $
# $Date: 2008/04/19 16:11:36 $
# $Revision: 1.8 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2004-2008 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#
use 5.006;
use strict;
use warnings;
use Carp;
use Exporter;
use Fingerprints::Fingerprints;
use TextUtil ();
use BitVector;
use Molecule;

our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);

$VERSION = '1.00';
@ISA = qw(Fingerprints Exporter);
@EXPORT = qw();
@EXPORT_OK = qw();

%EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);

# Setup class variables...
my($ClassName);
_InitializeClass();

# Overload Perl functions...
use overload '""' => 'StringifyPathLengthFingerprints';

# Class constructor...
#
# Takes a list of name and value pairs. Each pair is applied through the
# corresponding Set<Name> method; a Molecule pair is required and a Size
# value, when given, must be a power of 2. Returns the new object.
#
sub new {
  my($Class, %NamesAndValues) = @_;

  # Initialize object...
  my $This = $Class->SUPER::new();
  bless $This, ref($Class) || $Class;
  $This->_InitializePathLengthFingerprints();

  $This->_InitializePathLengthFingerprintsProperties(%NamesAndValues);

  return $This;
}

# Initialize object data...
#
# Assigns the default values for fingerprint type, size limits, path length
# limits, path traversal options, and bond symbols.
#
sub _InitializePathLengthFingerprints {
  my($This) = @_;

  # Type of fingerprint...
  $This->{Type} = 'PathLength';

  # Set default minimum, maximum, and default size. Although any arbitrary size can
  # be specified, bit vector used to store bits work on a vector size which is
  # power of 2 and additional bits are automatically added and cleared.
  #
  $This->{Size} = 1024;

  $This->{MinSize} = 32;
  $This->{MaxSize} = 2**32;

  # Minimum and maximum path lengths to use for fingerprints generation...
  $This->{MinLength} = 1;
  $This->{MaxLength} = 8;

  # For molecules containing rings, atom paths starting from each atom can be traversed in four
  # different ways:
  #
  # . Atom paths without any rings and sharing of bonds in traversed paths.
  # . Atom paths containing rings and without any sharing of bonds in traversed paths
  # . All possible atom paths without any rings and sharing of bonds in traversed paths
  # . All possible atom paths containing rings and with sharing of bonds in traversed paths.
  #
  # Atom path traversal is terminated at the last ring atom. For molecules containing no rings,
  # first two and last two types described above are equivalent.
  #
  # AllowSharedBonds and AllowRings variables allow generation of different types of paths
  # to be used for fingerprints generation.
  #
  # In addition to atom symbols, bond symbols are also used to generate a string
  # for atom paths. These atom paths strings are hashed to a 32 bit integer key which
  # in turn is used as a seed for a random number generation in range of 0 to fingerprint
  # size - 1 for setting corresponding bit in bit vector.
  #
  # UseBondSymbols variable controls whether bond symbols are included in atom path
  # strings and consequently in fingerprints.
  #
  # Combination of AllowSharedBonds, AllowRings, and UseBondSymbols allow generation of
  # 8 different types of path length fingerprints:
  #
  # AllowSharedBonds  AllowRings  UseBondSymbols  PathLengthFingerprintsType
  #
  # No                No          Yes             AtomPathsNoCyclesWithBondSymbols
  # No                Yes         Yes             AtomPathsWithCyclesWithBondSymbols
  #
  # Yes               No          Yes             AllAtomPathsNoCyclesWithBondSymbols
  # Yes               Yes         Yes             AllAtomPathsWithCyclesWithBondSymbols [ DEFAULT ]
  #
  # No                No          No              AtomPathsNoCyclesNoBondSymbols
  # No                Yes         No              AtomPathsWithCyclesNoBondSymbols
  #
  # Yes               No          No              AllAtomPathsNoCyclesNoBondSymbols
  # Yes               Yes         No              AllAtomPathsWithCyclesNoBondSymbols
  #
  #

  # By default, atom paths starting from atoms are allowed to share bonds already traversed...
  $This->{AllowSharedBonds} = 1;

  # By default rings are included in paths...
  $This->{AllowRings} = 1;

  # By default bond symbols are included in atom path strings...
  $This->{UseBondSymbols} = 1;

  # Bond symbols to use during generation of atom path strings...
  %{$This->{BondOrderToSymbol}} = ('1' => '', '1.5' => ':', '2' => '=', '3' => '#');
}

# Initialize class ...
sub _InitializeClass {
  #Class name...
  $ClassName = __PACKAGE__;
}

# Initialize object properties....
#
# Applies each name and value pair by calling Set<Name>(Value) on the object,
# then validates that a Molecule was supplied and that any explicitly
# specified Size is a power of 2, and finally sets up the fingerprints bit
# vector through _InitializeFingerprintsBitVector (not defined in this file;
# presumably inherited from the Fingerprints base class).
#
sub _InitializePathLengthFingerprintsProperties {
  my($This, %NamesAndValues) = @_;

  my($Name, $Value, $MethodName);
  while (($Name, $Value) = each %NamesAndValues) {
    $MethodName = "Set${Name}";
    $This->$MethodName($Value);
  }

  # Make sure molecule object was specified...
  if (!exists $NamesAndValues{Molecule}) {
    croak "Error: ${ClassName}->New: Object can't be instantiated without specifying molecule...";
  }

  # Make sure it's power of 2...
  if (exists $NamesAndValues{Size}) {
    if (!TextUtil::IsNumberPowerOfNumber($NamesAndValues{Size}, 2)) {
      croak "Error: ${ClassName}->New: Specified size value, $NamesAndValues{Size}, must be power of 2...";
    }
  }
  $This->_InitializeFingerprintsBitVector();

  return $This;
}

# Set minimum path length...
#
# Croaks unless the value is a positive integer. Returns the object.
#
sub SetMinLength {
  my($This, $Value) = @_;

  if (!TextUtil::IsPositiveInteger($Value)) {
    croak "Error: ${ClassName}->SetMinLength: MinLength value, $Value, is not valid: It must be a positive integer...";
  }
  $This->{MinLength} = $Value;

  return $This;
}

# Set maximum path length...
#
# Croaks unless the value is a positive integer. Returns the object.
#
sub SetMaxLength {
  my($This, $Value) = @_;

  if (!TextUtil::IsPositiveInteger($Value)) {
    croak "Error: ${ClassName}->SetMaxLength: MaxLength value, $Value, is not valid: It must be a positive integer...";
  }
  $This->{MaxLength} = $Value;

  return $This;
}

# Generate path length fingerprints...
#
# Collects atom paths up to MaxLength, converts the ones which are at least
# MinLength long into atom path strings, and sets a bit in the fingerprints
# bit vector for each unique string. Returns the object.
#
# MinLength equal to MaxLength is a valid request: it restricts fingerprints
# to paths of exactly that length. Only MinLength greater than MaxLength
# leaves nothing to generate and is reported as an error.
#
sub GenerateFingerprints {
  my($This) = @_;

  if ($This->{MinLength} > $This->{MaxLength}) {
    croak "Error: ${ClassName}->GenerateFingerprints: No fingerprints generated: MinLength, $This->{MinLength}, must be less than or equal to MaxLength, $This->{MaxLength}...";
  }

  # Get appropriate atom paths...
  my($AtomPathsRef);
  $AtomPathsRef = $This->_GetAtomPathsUpToMaxLength();

  # Generate appropriate atom path strings for unique atom paths...
  my($AtomPathsStringRef);
  $AtomPathsStringRef = $This->_GenerateAtomPathsStrings($AtomPathsRef);

  # Generate fingerprints using atom path strings...
  $This->_GenerateFingerprintsUsingAtomPathsStrings($AtomPathsStringRef);

  return $This;
}

# Get appropriate atom paths with length up to MaxLength...
#
# The molecule method used depends on AllowSharedBonds; AllowRings is passed
# through to it unchanged. Returns whatever reference the molecule method
# returns.
#
sub _GetAtomPathsUpToMaxLength {
  my($This) = @_;

  my $Molecule = $This->{Molecule};
  my @PathArgs = ($This->{MaxLength}, $This->{AllowRings});

  # Paths which may share already traversed bonds...
  return $Molecule->GetAllAtomPathsWithLengthUpto(@PathArgs) if $This->{AllowSharedBonds};

  # Paths without any sharing of bonds...
  return $Molecule->GetAtomPathsWithLengthUpto(@PathArgs);
}

# Generate appropriate atom path strings for unique atom paths...
#
# Paths containing fewer atoms than MinLength are ignored. A path and its
# reverse are counted under one key: an already recorded forward string is
# preferred, then an already recorded reverse string; otherwise the
# alphabetically smaller of the two becomes the new key.
#
# Returns a reference to a hash mapping atom path strings to their counts.
#
sub _GenerateAtomPathsStrings {
  my($This, $AtomPathsRef) = @_;

  my $MinAtomCount = $This->{MinLength};
  my %PathStringCount = ();

  PATH: foreach my $PathRef (@{$AtomPathsRef}) {
    next PATH if (scalar @{$PathRef} < $MinAtomCount);

    # Already seen in this direction?
    my $Forward = $This->_GenerateAtomPathString(@{$PathRef});
    if (exists $PathStringCount{$Forward}) {
      $PathStringCount{$Forward}++;
      next PATH;
    }

    # Already seen in the opposite direction?
    my $Backward = $This->_GenerateAtomPathString(reverse @{$PathRef});
    if (exists $PathStringCount{$Backward}) {
      $PathStringCount{$Backward}++;
      next PATH;
    }

    # First occurrence: record it under the smaller string...
    my $Key = ($Forward le $Backward) ? $Forward : $Backward;
    $PathStringCount{$Key} = 1;
  }
  return \%PathStringCount;
}

# Generate an appropriate atom path string...
#
# Takes the atoms of a path, in traversal order, and returns a string made of
# their atom symbols. When UseBondSymbols is set and the path has more than
# one atom, a bond symbol is placed between consecutive atom symbols: ':' for
# aromatic bonds, otherwise the BondOrderToSymbol entry for the bond order,
# falling back to the bond order itself when no entry exists.
#
# An empty path always yields an empty string, with or without bond symbols.
#
sub _GenerateAtomPathString {
  my($This, @PathAtoms) = @_;

  # Nothing to describe; also avoids calling a method on an undefined atom...
  return '' if !@PathAtoms;

  # Single atom, or bond information is to be ignored...
  if (@PathAtoms == 1 || !$This->{UseBondSymbols}) {
    return join '', map { $_->GetAtomSymbol() } @PathAtoms;
  }

  # Use atoms and bonds to generate atom path string...
  my @PathBonds = $This->{Molecule}->GetAtomPathBonds(@PathAtoms);

  # Assign atom path string to first atom...
  my $AtomPathString = $PathAtoms[0]->GetAtomSymbol();

  for my $Index (0 .. ($#PathAtoms - 1)) {
    # Bond at position Index connects atoms at positions Index and Index + 1...
    my $Bond = $PathBonds[$Index];
    my $BondOrder = $Bond->GetBondOrder();

    my $BondSymbol;
    if ($Bond->IsAromatic()) {
      $BondSymbol = ':';
    }
    elsif (exists $This->{BondOrderToSymbol}{$BondOrder}) {
      $BondSymbol = $This->{BondOrderToSymbol}{$BondOrder};
    }
    else {
      $BondSymbol = $BondOrder;
    }

    # Append bond symbol and next atom symbol...
    $AtomPathString .= $BondSymbol . $PathAtoms[$Index + 1]->GetAtomSymbol();
  }
  return $AtomPathString;
}

# Generate fingerprints using atom path strings...
#
# For each key of the supplied hash, the hash code of the atom path string
# seeds Perl's random number generator and the first random integer in the
# range 0 to Size - 1 picks the bit to set. Note that this reseeds the
# process-wide generator through srand. Returns the object.
#
sub _GenerateFingerprintsUsingAtomPathsStrings {
  my($This, $AtomPathsStringRef) = @_;

  my $NumOfBits = $This->{Size};
  my $BitVector = $This->{FingerprintsBitVector};
  my $SkipPosCheck = 1;

  foreach my $PathString (keys %{$AtomPathsStringRef}) {
    # Same string always leads to the same seed and hence the same bit...
    my $Seed = TextUtil::HashCode($PathString);
    srand($Seed);

    my $BitPos = int(rand($NumOfBits));
    $BitVector->SetBit($BitPos, $SkipPosCheck);
  }
  return $This;
}

# Return a string containing data for PathLengthFingerprints object...
#
# The string is a '; ' separated list of 'Name: value' entries covering
# fingerprint type, sizes, path lengths, path generation options, number of
# bits set, and bit density.
#
sub StringifyPathLengthFingerprints {
  my($This) = @_;

  # Type, size, and path length...
  my @Entries = (
    "Fingerprint type: $This->{Type}",
    "Size: $This->{Size}",
    "MinSize: $This->{MinSize}",
    "MaxSize: $This->{MaxSize}",
    "MinPathLength: $This->{MinLength}",
    "MaxPathLength: $This->{MaxLength}",
  );

  # Fingerprint generation control...
  foreach my $Option (qw(AllowSharedBonds AllowRings UseBondSymbols)) {
    my $YesOrNo = $This->{$Option} ? "Yes" : "No";
    push @Entries, "$Option: $YesOrNo";
  }

  # Num of bits set and fingerprint bit density...
  my $BitVector = $This->{FingerprintsBitVector};
  my $OnBitsCount = $BitVector->GetNumOfSetBits();
  my $Density = $BitVector->GetFingerprintsBitDensity();
  push @Entries, "NumOfOnBits: $OnBitsCount", "BitDensity: $Density";

  return join '; ', @Entries;
}