1 package FingerprintsBitVector; 2 # 3 # $RCSfile: FingerprintsBitVector.pm,v $ 4 # $Date: 2008/04/19 16:11:36 $ 5 # $Revision: 1.5 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2004-2008 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 
#
use 5.006;
use strict;
use Carp;
use Exporter;
use ObjectProperty;
use BitVector;
use MathUtil;
use TextUtil ();

our($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);

$VERSION = '1.00';
@ISA = qw(BitVector Exporter);

# Names of the similarity coefficient functions. The order of this list is the
# order returned by GetSupportedSimilarityCoefficients...
my(@SimilarityCoefficients) = qw(
  BaroniUrbaniSimilarityCoefficient
  BuserSimilarityCoefficient
  CosineSimilarityCoefficient
  DiceSimilarityCoefficient
  DennisSimilarityCoefficient
  EuclidSimilarityCoefficient
  ForbesSimilarityCoefficient
  FossumSimilarityCoefficient
  HamannSimilarityCoefficient
  JacardSimilarityCoefficient
  Kulczynski1SimilarityCoefficient
  Kulczynski2SimilarityCoefficient
  ManhattanSimilarityCoefficient
  MatchingSimilarityCoefficient
  McConnaugheySimilarityCoefficient
  OchiaiSimilarityCoefficient
  PearsonSimilarityCoefficient
  RogersTanimotoSimilarityCoefficient
  RussellRaoSimilarityCoefficient
  SimpsonSimilarityCoefficient
  SkoalSneath1SimilarityCoefficient
  SkoalSneath2SimilarityCoefficient
  SkoalSneath3SimilarityCoefficient
  TanimotoSimilarityCoefficient
  TverskySimilarityCoefficient
  YuleSimilarityCoefficient
  WeightedTanimotoSimilarityCoefficient
  WeightedTverskySimilarityCoefficient
);

# Names of the functions which create a bit vector from a string...
my(@NewFromString) = qw(
  NewFromBinaryString
  NewFromDecimalString
  NewFromHexadecimalString
  NewFromOctalString
  NewFromRawBinaryString
);

# Exported by default, exported on request, and export groups...
@EXPORT = qw(IsFingerprintsBitVector GetSupportedSimilarityCoefficients);
@EXPORT_OK = (@NewFromString, @SimilarityCoefficients);

%EXPORT_TAGS = (
  new => [@NewFromString],
  coefficients => [@SimilarityCoefficients],
  all => [@EXPORT, @EXPORT_OK]
);

# Setup class variables...
my($ClassName);
_InitializeClass();

# Objects interpolated into strings are rendered by StringifyFingerprintsBitVector...
use overload '""' => 'StringifyFingerprintsBitVector';

# Class constructor...
#
# Parameters: class name (or an existing object) and the size in bits.
# Returns: the new object, built by the parent constructor, blessed into the
# requested class and carrying the specified size.
#
sub new {
  my($Class, $Size) = @_;
  my $Package = ref($Class) || $Class;

  # Let the parent class build the object, then re-bless and finish it here...
  my $This = $Class->SUPER::new($Size);
  bless $This, $Package;
  $This->_InitializeFingerprintsBitVector($Size);

  return $This;
}

# Initialize object data...
#
# Croaks when the size is undefined or is less than or equal to zero; otherwise
# records it under the SpecifiedSize key of the object.
#
# Note:
#  . The class, BitVector, used to derive this class provides all the functionality to
#    manipulate bits.
#  . Irrespective of specified size, Perl functions used to handle bit data in
#    BitVector class automatically set the size to the next nearest power of 2.
#    SpecifiedSize is used by this class to process any arbitrary size during similarity
#    coefficient calculations.
#
sub _InitializeFingerprintsBitVector {
  my($This, $Size) = @_;

  defined $Size
    or croak "Error: ${ClassName}->new: FingerprintsBitVector object instantiated without specifying its size ...";

  croak "Error: ${ClassName}->new: Fingerprints bit vector size, $Size, must be a positive integer..."
    if $Size <= 0;

  # Specified size of fingerprints...
  $This->{SpecifiedSize} = $Size;
}

# Initialize class: remember the package name used in error messages...
sub _InitializeClass {
  $ClassName = __PACKAGE__;
}

# Create a new fingerprints bit vector using binary string. This functionality can be
# either invoked as a class function or an object method.
#
# The arguments are forwarded, with the format name 'Binary' appended, to
# _NewFingerptinsBitVectorFromString.
#
sub NewFromBinaryString ($;$) {
  my $Format = 'Binary';

  return _NewFingerptinsBitVectorFromString(@_, $Format);
}

# Create a new fingerprints bit vector using hexadecimal string. This functionality can be
# either invoked as a class function or an object method.
#
# The arguments are forwarded, with the format name 'Hexadecimal' appended, to
# _NewFingerptinsBitVectorFromString.
#
sub NewFromHexadecimalString ($;$) {
  my $Format = 'Hexadecimal';

  return _NewFingerptinsBitVectorFromString(@_, $Format);
}

# Create a new fingerprints bit vector using octal string. This functionality can be
# either invoked as a class function or an object method.
#
# Note: octal strings are not supported; this function always croaks.
#
sub NewFromOctalString ($;$) {
  croak "Error: ${ClassName}->NewFromOctalString: Creation of fingerprits bit vector from an octal string is not supported ...";
}

# Create a new fingerprints bit vector using decimal string. This functionality can be
# either invoked as a class function or an object method.
#
# Note: decimal strings are not supported; this function always croaks.
#
sub NewFromDecimalString ($;$) {
  croak "Error: ${ClassName}->NewFromDecimalString: Creation of fingerprits bit vector from a decimal string is not supported ...";
}

# Create a new fingerprints bit vector using raw binary string. This functionality can be
# either invoked as a class function or an object method.
#
# The arguments are forwarded, with the format name 'RawBinary' appended, to
# _NewFingerptinsBitVectorFromString.
#
sub NewFromRawBinaryString ($;$) {

  # This call is compiled before the prototyped definition below has been seen,
  # so @_ is flattened into the argument list as usual. Keep the definition
  # order: with the ($$;$) prototype in effect, @_ would be taken as a scalar.
  return _NewFingerptinsBitVectorFromString(@_, 'RawBinary');
}

# Create a new fingerprints bit vector from a string...
#
# Arguments are either:
#   (String, Format)                  - invoked through a plain function call
#   (ObjectOrClass, String, Format)   - invoked through a method call
#
# Returns a new bit vector whose size is calculated from the string and whose
# bits are set from the string. For a method call, the new object is created
# through the class of the invocant; a class name invocant is now accepted as
# well as an object (previously a class name led to a method call on an empty
# package name, which is a fatal error).
#
sub _NewFingerptinsBitVectorFromString ($$;$) {
  my($This, $String, $Format);

  if (@_ == 3) {
    ($This, $String, $Format) = @_;
  }
  else {
    ($String, $Format) = @_;
  }

  my $Size = BitVector::_CalculateStringSizeInBits($Format, $String);

  # Use the class of the invocant when there is one, otherwise this package...
  my $Class = defined($This) ? (ref($This) || $This) : __PACKAGE__;

  my $FingerprintsBitVector = $Class->new($Size);
  $FingerprintsBitVector->_SetBitsAsString($Format, $String);

  return $FingerprintsBitVector;
}

# Get fingerprint bits as a hexadecimal string...
#
sub GetBitsAsHexadecimalString {
  my($This) = @_;

  return $This->_GetFingerprintBitsAsString('Hexadecimal');
}

# Get fingerprint bits as an octal string...
#
# Note: octal strings are not supported; this method always croaks.
#
sub GetBitsAsOctalString {
  my($This) = @_;

  croak "Error: ${ClassName}->GetBitsAsOctalString: Retrieval of fingerprits bits as an octal string is not supported ...";
}

# Get fingerprint bits as a decimal string...
#
# Note: decimal strings are not supported; this method always croaks. The error
# message names this method (it used to name GetBitsAsOctalString).
#
sub GetBitsAsDecimalString {
  my($This) = @_;

  croak "Error: ${ClassName}->GetBitsAsDecimalString: Retrieval of fingerprits bits as a decimal string is not supported ...";
}

# Get fingerprint bits as a binary string containing 1s and 0s...
#
sub GetBitsAsBinaryString {
  my($This) = @_;

  return $This->_GetFingerprintBitsAsString('Binary');
}

# Get fingerprint bits as a string in 'RawBinary' format...
#
sub GetBitsAsRawBinaryString {
  my($This) = @_;

  return $This->_GetFingerprintBitsAsString('RawBinary');
}

# Return fingerprint bits as a string in the specified format; the bits are
# always requested in 'Descending' order...
#
sub _GetFingerprintBitsAsString {
  my($This, $Format) = @_;

  return $This->_GetBitsAsString($Format, 'Descending');
}

# Is it a fingerprints bit vector object?
#
# Returns 1 or 0.
#
sub IsFingerprintsBitVector ($) {
  my($Object) = @_;

  return _IsFingerprintsBitVector($Object);
}

# Is it a fingerprints bit vector object?
#
# Returns 1 when the argument is a blessed reference which is an instance of
# this class (or of a class derived from it); 0 otherwise.
#
sub _IsFingerprintsBitVector {
  my($Object) = @_;

  # Load Scalar::Util here so that the fully qualified call below does not
  # depend on some other module having loaded it first...
  require Scalar::Util;

  return (Scalar::Util::blessed($Object) && $Object->isa($ClassName)) ? 1 : 0;
}

# Return a list of supported similarity coefficients...
sub GetSupportedSimilarityCoefficients () {

  return @SimilarityCoefficients;
}

# Get bit density for fingerprints bit vector corresponding to on bits...
#
# Returns the density of set bits passed through round() with 2 as its second
# argument.
#
sub GetFingerprintsBitDensity {
  my($This) = @_;

  my $BitDensity = $This->GetDensityOfSetBits();

  return round($BitDensity, 2);
}

# Fold fingerprints bit vector by recursively reducing its size by half until size is less than or equal to
# specified size...
#
# Croaks unless the specified size is greater than zero and no larger than the
# current size. Returns the object itself when no folding is needed.
#
sub FoldFingerprintsBitVectorBySize {
  my($This, $Size) = @_;

  unless ($Size > 0 && $Size <= $This->GetSize()) {
    croak "Error: ${ClassName}->FoldFingerprintsBitVectorBySize: Specified size, $Size, is not valid:  It must be > 0 && <= ", $This->GetSize()," ...";
  }

  return $This if $This->GetSize() <= $Size;

  return $This->_FoldFingerprintsBitVector('BySize', $Size);
}

# Fold fingerprints bit vector by recursively reducing its size by half until bit density of set bits is greater than
# or equal to specified density...
#
# Croaks unless the specified density is greater than zero and no larger than
# one. Returns the object itself when no folding is needed.
#
sub FoldFingerprintsBitVectorByDensity {
  my($This, $Density) = @_;

  unless ($Density > 0 && $Density <= 1) {
    croak "Error: ${ClassName}->FoldFingerprintsBitVectorByDensity: Specified bit density, $Density, is not valid:  It must be > 0 && <= 1 ...";
  }

  return $This if $This->GetDensityOfSetBits() >= $Density;

  return $This->_FoldFingerprintsBitVector('ByDensity', $Density);
}

# Fold fingerprints bit vector using size or density and return folded fingerprint bit vector...
#
# Mode is 'BySize' or 'ByDensity' (matched case insensitively) and Value is the
# corresponding size or density. Any other mode returns the object unchanged.
#
sub _FoldFingerprintsBitVector {
  my($This, $Mode, $Value) = @_;

  # Never fold a vector of 8 bits or fewer...
  return $This if $This->GetSize() <= 8;

  # Stop as soon as the requested size or density has been reached...
  if ($Mode =~ /^BySize$/i) {
    return $This if $This->GetSize() <= $Value;
  }
  elsif ($Mode =~ /^ByDensity$/i) {
    return $This if $This->GetDensityOfSetBits() >= $Value;
  }
  else {
    return $This;
  }

  # Split the binary string into two halves, turn each half into a bit vector
  # and combine the two with the | operator...
  my $BinaryString = $This->GetBitsAsBinaryString();
  my $HalfLength = length($BinaryString)/2;

  my $FirstHalfBinaryString = substr($BinaryString, 0, $HalfLength);
  my $SecondHalfBinaryString = substr($BinaryString, $HalfLength);

  my $FirstHalf = NewFromBinaryString($FirstHalfBinaryString);
  my $SecondHalf = NewFromBinaryString($SecondHalfBinaryString);

  my $Folded = $FirstHalf | $SecondHalf;

  # Keep halving until one of the checks above ends the recursion...
  return $Folded->_FoldFingerprintsBitVector($Mode, $Value);
}

# Is first bit vector subset of second bit vector?
#
# For a bit vector to be a subset of another bit vector, both vectors must be of
# the same size and the bit positions set in first vector must also be set in the
# second bit vector.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 1 or 0.
#
sub IsSubSet ($$) {
  my($VectorA, $VectorB) = @_;

  return 0 if $VectorA->GetSize() != $VectorB->GetSize();

  # A is a subset of B when A & B has as many set bits as A itself...
  my $CommonBitsVector = $VectorA & $VectorB;

  return ($VectorA->GetNumOfSetBits() == $CommonBitsVector->GetNumOfSetBits()) ? 1 : 0;
}

# Return a string containing vector values...
#
# The specified size is included only when it differs from the size reported
# by GetSize.
#
sub StringifyFingerprintsBitVector {
  my($This) = @_;

  my $Prefix = ($This->{SpecifiedSize} != $This->GetSize())
    ? "SpecifiedSize: " . $This->{SpecifiedSize} . "; "
    : "";
  my $Text = $Prefix . "BitVector: " . $This->StringifyBitVector();

  my $NumOfSetBits = $This->GetNumOfSetBits();
  my $BitDensity = $This->GetFingerprintsBitDensity();

  return $Text . "; NumOfOnBits: $NumOfSetBits; BitDensity: $BitDensity";
}

# For two fingerprints bit vectors A and B of same size, let:
#
#  Na = Number of bits set to "1" in A
#  Nb = Number of bits set to "1" in B
#  Nc = Number of bits set to "1" in both A and B
#  Nd = Number of bits set to "0" in both A and B
#
#  Nt = Number of bits set to "1" or "0" in A or B = Size of A or B = Na + Nb - Nc + Nd
#
#  Na - Nc = Number of bits set to "1" in A but not in B
#  Nb - Nc = Number of bits set to "1" in B but not in A
#
# Various similarity coefficients [ Ref. 40 - 42 ] for a pair of bit vectors A and B are
# defined as follows:
#
# . BaroniUrbani: ( SQRT( Nc * Nd ) + Nc ) / ( SQRT ( Nc * Nd ) + Nc + ( Na - Nc ) + ( Nb - Nc ) ) ( same as Buser )
#
# . Buser: ( SQRT ( Nc * Nd ) + Nc ) / ( SQRT ( Nc * Nd ) + Nc + ( Na - Nc ) + ( Nb - Nc ) ) ( same as BaroniUrbani )
#
# . Cosine: Nc / SQRT ( Na * Nb ) (same as Ochiai)
#
# . Dice: (2 * Nc) / ( Na + Nb )
#
# . Dennis: ( Nc * Nd - ( ( Na - Nc ) * ( Nb - Nc ) ) ) / SQRT ( Nt * Na * Nb)
#
# . Euclid: SQRT ( ( Nc + Nd ) / Nt )
#
# . Forbes: ( Nt * Nc ) / ( Na * Nb )
#
# . Fossum: ( Nt * ( ( Nc - 1/2 ) ** 2 ) ) / ( Na * Nb )
#
# . Hamann: ( ( Nc + Nd ) - ( Na - Nc ) - ( Nb - Nc ) ) / Nt
#
# . Jaccard: Nc /  ( ( Na - Nc) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc ) (same as Tanimoto)
#
# . Kulczynski1: Nc / ( ( Na - Nc ) + ( Nb - Nc) ) = Nc / ( Na + Nb - 2Nc )
#
# . Kulczynski2: ( ( Nc / 2 ) * ( 2 * Nc + ( Na - Nc ) + ( Nb - Nc) ) ) / ( ( Nc + ( Na - Nc ) ) * ( Nc + ( Nb - Nc ) ) ) = 0.5 * ( Nc / Na + Nc / Nb )
#
# . Manhattan: ( ( Na - Nc ) + (Nb - Nc) ) / Nt = ( Na + Nb - 2Nc ) / Nt
#
# . Matching: ( Nc + Nd ) / Nt
#
# . McConnaughey: ( Nc ** 2 - ( Na - Nc ) * ( Nb - Nc) ) / ( Na * Nb )
#
# . Ochiai: Nc / SQRT ( Na * Nb ) (same as Cosine)
#
# . Pearson: ( ( Nc * Nd ) - ( ( Na - Nc ) * ( Nb - Nc ) ) ) / SQRT ( Na * Nb * ( Na - Nc + Nd ) * ( Nb - Nc + Nd ) )
#
# . RogersTanimoto: ( Nc + Nd ) / ( ( Na - Nc)  + ( Nb  - Nc) + Nt) = ( Nc + Nd ) / ( Na + Nb - 2Nc + Nt)
#
# . RussellRao: Nc / Nt
#
# . Simpson: Nc / MIN ( Na, Nb)
#
# . SkoalSneath1: Nc / ( Nc + 2 * ( Na - Nc)  + 2 * ( Nb - Nc) ) = Nc / ( 2 * Na + 2 * Nb - 3 * Nc )
#
# . SkoalSneath2: ( 2 * Nc + 2 * Nd ) / ( Nc + Nd + Nt )
#
# . SkoalSneath3: ( Nc + Nd ) / ( ( Na - Nc ) + ( Nb - Nc ) ) = ( Nc + Nd ) / ( Na + Nb - 2 * Nc )
#
# . Tanimoto: Nc /  ( ( Na - Nc) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc ) (same as Jaccard)
#
# . Tversky: Nc / ( alpha * ( Na - Nc ) + ( 1 - alpha) * ( Nb - Nc) + Nc ) = Nc / ( alpha * ( Na - Nb ) + Nb)
#
# . Yule: ( ( Nc * Nd ) - ( ( Na - Nc ) * ( Nb - Nc ) ) ) / ( ( Nc * Nd ) + ( ( Na - Nc ) * ( Nb - Nc ) ) )
#
#
# Values of Tanimoto/Jaccard and Tversky coefficients are dependent on only those bits which
# are set to "1" in both A and B. In order to take into account all bit positions, modified versions
# of Tanimoto [ Ref. 42 ] and Tversky [ Ref. 43 ] have been developed.
#
# Let:
#
#  Na' = Number of bits set to "0" in A
#  Nb' = Number of bits set to "0" in B
#  Nc' = Number of bits set to "0" in both A and B
#
# . Tanimoto': Nc' /  ( ( Na' - Nc') + ( Nb' - Nc' ) + Nc' ) = Nc' / ( Na' + Nb' - Nc' )
#
# . Tversky': Nc' / ( alpha * ( Na' - Nc' ) + ( 1 - alpha) * ( Nb' - Nc' ) + Nc' ) = Nc' / ( alpha * ( Na' - Nb' ) + Nb')
#
# Then:
#
# . WeightedTanimoto = beta * Tanimoto + (1 - beta) * Tanimoto'
#
# . WeightedTversky = beta * Tversky + (1 - beta) * Tversky'
#
#

# Calculate BaroniUrbani similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Same value as the Buser coefficient, to which the calculation is delegated.
#
sub BaroniUrbaniSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  return BuserSimilarityCoefficient($VectorA, $VectorB);
}

# Calculate Buser similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub BuserSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);
  my $Nd = _GetNumOfCommonClearBits($VectorA, $VectorB);

  my $Root = sqrt($Nc*$Nd);
  my $Numerator = $Root + $Nc;
  my $Denominator = $Root + ($Na - $Nc) + ($Nb - $Nc) + $Nc;

  return 0 unless $Denominator;
  return $Numerator/$Denominator;
}

# Calculate Cosine similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub CosineSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Denominator = sqrt($Na*$Nb);

  return 0 unless $Denominator;
  return $Nc/$Denominator;
}

# Calculate Dice similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub DiceSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Numerator = 2*$Nc;
  my $Denominator = $Na + $Nb;

  return 0 unless $Denominator;
  return $Numerator/$Denominator;
}

# Calculate Dennis similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub DennisSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);
  my $Nd = _GetNumOfCommonClearBits($VectorA, $VectorB);
  my $Nt = $Na + $Nb - $Nc + $Nd;

  my $Numerator = $Nc*$Nd - (($Na - $Nc)*($Nb - $Nc));
  my $Denominator = sqrt($Nt*$Na*$Nb);

  return 0 unless $Denominator;
  return $Numerator/$Denominator;
}

# Calculate Euclid similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub EuclidSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);
  my $Nd = _GetNumOfCommonClearBits($VectorA, $VectorB);
  my $Nt = $Na + $Nb - $Nc + $Nd;

  my $Numerator = $Nc + $Nd;

  return 0 unless $Nt;
  return sqrt($Numerator/$Nt);
}

# Calculate Forbes similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub ForbesSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);
  my $Nd = _GetNumOfCommonClearBits($VectorA, $VectorB);
  my $Nt = $Na + $Nb - $Nc + $Nd;

  my $Numerator = $Nt*$Nc;
  my $Denominator = $Na*$Nb;

  return 0 unless $Denominator;
  return $Numerator/$Denominator;
}

# Calculate Fossum similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub FossumSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);
  my $Nd = _GetNumOfCommonClearBits($VectorA, $VectorB);
  my $Nt = $Na + $Nb - $Nc + $Nd;

  my $Numerator = $Nt*(($Nc - 0.5)** 2);
  my $Denominator = $Na*$Nb;

  return 0 unless $Denominator;
  return $Numerator/$Denominator;
}

# Calculate Hamann similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub HamannSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);
  my $Nd = _GetNumOfCommonClearBits($VectorA, $VectorB);
  my $Nt = $Na + $Nb - $Nc + $Nd;

  my $Numerator = ($Nc + $Nd) - ($Na - $Nc) - ($Nb - $Nc);

  return 0 unless $Nt;
  return $Numerator/$Nt;
}

# Calculate Jacard similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Same value as the Tanimoto coefficient, to which the calculation is delegated.
#
sub JacardSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  return TanimotoSimilarityCoefficient($VectorA, $VectorB);
}

# Calculate Kulczynski1 similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub Kulczynski1SimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Denominator = $Na + $Nb - 2*$Nc;

  return 0 unless $Denominator;
  return $Nc/$Denominator;
}

# Calculate Kulczynski2 similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub Kulczynski2SimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Numerator = 0.5*($Na*$Nc + $Nb*$Nc);
  my $Denominator = $Na*$Nb;

  return 0 unless $Denominator;
  return $Numerator/$Denominator;
}

# Calculate Manhattan similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub ManhattanSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);
  my $Nd = _GetNumOfCommonClearBits($VectorA, $VectorB);
  my $Nt = $Na + $Nb - $Nc + $Nd;

  my $Numerator = ($Na - $Nc) + ($Nb - $Nc);

  return 0 unless $Nt;
  return $Numerator/$Nt;
}

# Calculate Matching similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub MatchingSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);
  my $Nd = _GetNumOfCommonClearBits($VectorA, $VectorB);
  my $Nt = $Na + $Nb - $Nc + $Nd;

  my $Numerator = $Nc + $Nd;

  return 0 unless $Nt;
  return $Numerator/$Nt;
}

# Calculate McConnaughey similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub McConnaugheySimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Numerator = $Nc**2 - (($Na - $Nc)*($Nb - $Nc));
  my $Denominator = $Na*$Nb;

  return 0 unless $Denominator;
  return $Numerator/$Denominator;
}

# Calculate Ochiai similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Same value as the Cosine coefficient, to which the calculation is delegated.
#
sub OchiaiSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  return CosineSimilarityCoefficient($VectorA, $VectorB);
}

# Calculate Pearson similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub PearsonSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);
  my $Nd = _GetNumOfCommonClearBits($VectorA, $VectorB);

  my $Numerator = ($Nc*$Nd) - (($Na - $Nc)*($Nb - $Nc));
  my $Denominator = sqrt($Na*$Nb*($Na - $Nc + $Nd)*($Nb - $Nc + $Nd));

  return 0 unless $Denominator;
  return $Numerator/$Denominator;
}

# Calculate RogersTanimoto similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub RogersTanimotoSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);
  my $Nd = _GetNumOfCommonClearBits($VectorA, $VectorB);
  my $Nt = $Na + $Nb - $Nc + $Nd;

  my $Numerator = $Nc + $Nd;
  my $Denominator = ($Na - $Nc) + ($Nb - $Nc) + $Nt;

  return 0 unless $Denominator;
  return $Numerator/$Denominator;
}

# Calculate RussellRao similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub RussellRaoSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);
  my $Nd = _GetNumOfCommonClearBits($VectorA, $VectorB);
  my $Nt = $Na + $Nb - $Nc + $Nd;

  return 0 unless $Nt;
  return $Nc/$Nt;
}

# Calculate Simpson similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub SimpsonSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Denominator = min($Na, $Nb);

  return 0 unless $Denominator;
  return $Nc/$Denominator;
}

# Calculate SkoalSneath1 similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub SkoalSneath1SimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Denominator = $Nc + 2*($Na - $Nc) + 2*($Nb - $Nc);

  return 0 unless $Denominator;
  return $Nc/$Denominator;
}

# Calculate SkoalSneath2 similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub SkoalSneath2SimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);
  my $Nd = _GetNumOfCommonClearBits($VectorA, $VectorB);
  my $Nt = $Na + $Nb - $Nc + $Nd;

  my $Numerator = 2*$Nc + 2*$Nd;
  my $Denominator = $Nc + $Nd + $Nt;

  return 0 unless $Denominator;
  return $Numerator/$Denominator;
}

# Calculate SkoalSneath3 similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub SkoalSneath3SimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);
  my $Nd = _GetNumOfCommonClearBits($VectorA, $VectorB);

  my $Numerator = $Nc + $Nd;
  my $Denominator = ($Na - $Nc) + ($Nb - $Nc);

  return 0 unless $Denominator;
  return $Numerator/$Denominator;
}

# Calculate Tanimoto similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Returns 0 when the denominator is zero.
#
sub TanimotoSimilarityCoefficient ($$) {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Denominator = $Na + $Nb - $Nc;

  return 0 unless $Denominator;
  return $Nc/$Denominator;
}

# Calculate Tversky similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Alpha must be defined and lie between 0 and 1 inclusive, otherwise the
# function croaks. Returns 0 when the denominator is zero.
#
sub TverskySimilarityCoefficient ($$$) {
  my($VectorA, $VectorB, $Alpha) = @_;

  unless (defined($Alpha) && $Alpha >= 0 && $Alpha <= 1) {
    croak "Error: ${ClassName}->TverskySimilarityCoefficient: Alpha parameters must be defined and its value must be >=0 and <=1 ...";
  }

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Denominator = $Alpha*($Na - $Nb) + $Nb;

  return 0 unless $Denominator;
  return $Nc/$Denominator;
}

# Calculate Yule similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Notation: Na and Nb are the set bit counts of A and B, Nc is the count of bits
# set in both, and Nd is the count of bits clear in both.
#
# Value returned: (Nc*Nd - (Na - Nc)*(Nb - Nc)) / (Nc*Nd + (Na - Nc)*(Nb - Nc)), or 0
# for a zero denominator.
#
sub YuleSimilarityCoefficient ($$) {
  my($FingerprintsBitVectorA, $FingerprintsBitVectorB) = @_;
  my($Na, $Nb, $Nc, $Nd, $Numerator, $Denominator);

  ($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($FingerprintsBitVectorA, $FingerprintsBitVectorB);
  $Nd = _GetNumOfCommonClearBits($FingerprintsBitVectorA, $FingerprintsBitVectorB);

  # (Na - Nc) and (Nb - Nc) are the bits set only in A and only in B...
  $Numerator = ($Nc*$Nd) - (($Na - $Nc)*($Nb - $Nc));
  $Denominator = ($Nc*$Nd) + (($Na - $Nc)*($Nb - $Nc));

  return $Denominator ? ($Numerator/$Denominator) : 0;
}

# Calculate WeightedTanimoto similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Beta must be defined and lie in the range 0 to 1 inclusive; otherwise the call
# croaks. Value returned:
#
#   Beta*TanimotoForSetBits + (1 - Beta)*TanimotoForClearBits
#
# where each Tanimoto term is Nc / (Na + Nb - Nc), calculated once from set bit
# counts and once from clear bit counts; a term with a zero denominator counts as 0.
#
sub WeightedTanimotoSimilarityCoefficient ($$$) {
  my($FingerprintsBitVectorA, $FingerprintsBitVectorB, $Beta) = @_;
  my($Na, $Nb, $Nc, $TanimotoForSetBits, $TanimotoForClearBits, $Numerator, $Denominator, $WeightedTanimoto);

  if (!(defined($Beta) && ($Beta >= 0 && $Beta <= 1))) {
    croak "Error: ${ClassName}->WeightedTanimotoSimilarityCoefficient: Beta parameters must be defined and its value must be >=0 and <=1 ...";
  }

  # Get Tanimoto for set bits...
  ($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($FingerprintsBitVectorA, $FingerprintsBitVectorB);

  $Numerator = $Nc;
  $Denominator = $Na + $Nb - $Nc;
  $TanimotoForSetBits = $Denominator ? ($Numerator/$Denominator) : 0;

  # Get Tanimoto for clear bits; Na, Nb and Nc now hold clear bit counts...
  ($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonClearBits($FingerprintsBitVectorA, $FingerprintsBitVectorB);

  $Numerator = $Nc;
  $Denominator = $Na + $Nb - $Nc;
  $TanimotoForClearBits = $Denominator ? ($Numerator/$Denominator) : 0;

  $WeightedTanimoto = $Beta*$TanimotoForSetBits + (1 - $Beta)*$TanimotoForClearBits;

  return $WeightedTanimoto;
}

# Calculate WeightedTversky similarity coefficient for two same size bit vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Alpha and Beta must both be defined and lie in the range 0 to 1 inclusive;
# otherwise the call croaks. Value returned:
#
#   Beta*TverskyForSetBits + (1 - Beta)*TverskyForClearBits
#
# where each Tversky term is Nc / (Alpha*(Na - Nb) + Nb), calculated once from set
# bit counts and once from clear bit counts; a term with a zero denominator counts
# as 0.
#
# The prototype lists four scalars to match the four required arguments. With the
# earlier three scalar prototype, a function style call passing both Alpha and
# Beta was rejected at compile time. Method calls ignore prototypes.
#
sub WeightedTverskySimilarityCoefficient ($$$$) {
  my($FingerprintsBitVectorA, $FingerprintsBitVectorB, $Alpha, $Beta) = @_;
  my($Na, $Nb, $Nc, $TverskyForSetBits, $TverskyForClearBits, $Numerator, $Denominator, $WeightedTversky);

  if (!(defined($Alpha) && ($Alpha >= 0 && $Alpha <= 1))) {
    croak "Error: ${ClassName}->WeightedTverskySimilarityCoefficient: Alpha parameters must be defined and its value must be >=0 and <=1 ...";
  }
  if (!(defined($Beta) && ($Beta >= 0 && $Beta <= 1))) {
    croak "Error: ${ClassName}->WeightedTverskySimilarityCoefficient: Beta parameters must be defined and its value must be >=0 and <=1 ...";
  }

  # Get Tversky for set bits...
  ($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($FingerprintsBitVectorA, $FingerprintsBitVectorB);

  $Numerator = $Nc;
  $Denominator = $Alpha*($Na - $Nb) + $Nb;
  $TverskyForSetBits = $Denominator ? ($Numerator/$Denominator) : 0;

  # Get Tversky for clear bits; Na, Nb and Nc now hold clear bit counts...
  ($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonClearBits($FingerprintsBitVectorA, $FingerprintsBitVectorB);

  $Numerator = $Nc;
  $Denominator = $Alpha*($Na - $Nb) + $Nb;
  $TverskyForClearBits = $Denominator ? ($Numerator/$Denominator) : 0;

  $WeightedTversky = $Beta*$TverskyForSetBits + (1 - $Beta)*$TverskyForClearBits;

  return $WeightedTversky;
}

# Get number of Na, Nb and Nc bits in bit vector A and B to be used for similarity coefficient calculations...
#
# Returns the list (Na, Nb, Nc): set bit count of A, set bit count of B, and set
# bit count of the bit vector produced by A & B.
#
sub _GetNumOfIndividualAndCommonSetBits ($$) {
  my($FingerprintsBitVectorA, $FingerprintsBitVectorB) = @_;

  # Bits set to "1" in A and in B, counted separately...
  my $SetBitsInA = $FingerprintsBitVectorA->GetNumOfSetBits();
  my $SetBitsInB = $FingerprintsBitVectorB->GetNumOfSetBits();

  # Bits set to "1" in both A and B...
  my $CommonSetBitsVector = $FingerprintsBitVectorA & $FingerprintsBitVectorB;
  my $CommonSetBits = $CommonSetBitsVector->GetNumOfSetBits();

  return ($SetBitsInA, $SetBitsInB, $CommonSetBits);
}

# Get number of Nd bits, the bits set to "0" in both bit vector A and B, to be used
# for similarity coefficient calculations...
#
# Each bit vector is complemented with ~ and the two complements are combined with &;
# the set bit count of the result is returned.
#
sub _GetNumOfCommonClearBits ($$) {
  my($FingerprintsBitVectorA, $FingerprintsBitVectorB) = @_;

  my $ComplementOfA = ~$FingerprintsBitVectorA;
  my $ComplementOfB = ~$FingerprintsBitVectorB;

  # A set bit here marks a bit which is clear in both A and B...
  my $CommonClearBitsVector = $ComplementOfA & $ComplementOfB;
  my $CommonClearBits = $CommonClearBitsVector->GetNumOfSetBits();

  return $CommonClearBits;
}


# Get clear bit counterparts of Na, Nb and Nc for bit vector A and B to be used for
# similarity coefficient calculations...
#
# Returns the list (Na, Nb, Nc): clear bit count of A, clear bit count of B, and
# count of bits clear in both A and B.
#
sub _GetNumOfIndividualAndCommonClearBits ($$) {
  my($FingerprintsBitVectorA, $FingerprintsBitVectorB) = @_;

  # Bits set to "0" in A and in B, counted separately...
  my $ClearBitsInA = $FingerprintsBitVectorA->GetNumOfClearBits();
  my $ClearBitsInB = $FingerprintsBitVectorB->GetNumOfClearBits();

  # Bits set to "0" in both A and B...
  my $CommonClearBits = _GetNumOfCommonClearBits($FingerprintsBitVectorA, $FingerprintsBitVectorB);

  return ($ClearBitsInA, $ClearBitsInB, $CommonClearBits);
}