Created
January 31, 2016 20:43
-
-
Save sverhoeven/08b9369ac164ec7d7885 to your computer and use it in GitHub Desktop.
bitvector tanimoto python benchmarks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import modifiedtanimoto.db as db | |
import modifiedtanimoto.algorithm as algorithm | |
# Open the fragments database and load its bitsets.
frags = db.FragmentsDb('data/fragments12.db')
bss = frags.bitsets()
z = bss['5fl6_Y0R_frag10']
# Single-entry query mapping holding only the '5fl6_Y0R_frag1' bitset.
bitsets2 = {'5fl6_Y0R_frag1': bss['5fl6_Y0R_frag1']}
# NOTE(review): 574331 is used as the number of bits elsewhere in this
# notebook; the meaning of the remaining positional arguments is not visible
# here -- confirm against algorithm.distances.
myiter = algorithm.distances(
    bss,
    bitsets2,
    574331,
    0.6633333333333333,
    0.33666666666666667,
    0.55,
    True,
)
# Materialize the results once. If `distances` hands back a one-shot iterator,
# printing every row would exhaust it and the sort below would get nothing.
rows = list(myiter)
for r in rows:
    print(r)
sorted_distances = sorted(rows, key=lambda row: row[1], reverse=True)
# Copy all bitsets into a plain in-memory dict.
# NOTE(review): `iteritems` is the Python 2 mapping API -- confirm the
# interpreter / the type of `bss` before porting to `items()`.
z = dict(bss.iteritems())
4d1c_B5H_frag1 | |
# The two fragment bitsets that the benchmark cells compare.
bs1, bs2 = bss['5fl6_Y0R_frag1'], bss['4d1c_B5H_frag1']
def mine(bs1, bs2):
    """Return the Tanimoto similarity of two set-like objects.

    The score is ``|bs1 & bs2| / (|bs1| + |bs2| - |bs1 & bs2|)``.

    Both arguments must support ``len()`` and the ``&`` operator.
    Raises ZeroDivisionError when both inputs are empty.
    """
    shared = len(bs1 & bs2)
    union = len(bs1) + len(bs2) - shared
    return float(shared) / union
%timeit mine(bs1,bs2) | |
The slowest run took 7.76 times longer than the fastest. This could mean that an intermediate result is being cached | |
10000 loops, best of 3: 47.6 µs per loop | |
0.1278268230741323 | |
import numpy as np | |
from sklearn.metrics import jaccard_similarity_score | |
# Feed the on-bit index lists of both bitsets to scikit-learn's scorer.
# NOTE(review): `jaccard_similarity_score` appears to have been removed from
# recent scikit-learn releases, and it scores label sequences position by
# position rather than as sets -- confirm this call is still meaningful.
on_bits_1 = bs1.extract_finite_list()
on_bits_2 = bs2.extract_finite_list()
jaccard_similarity_score(on_bits_1, on_bits_2)
def bynp(bs1, bs2):
    """Return the Tanimoto similarity of two bitsets using NumPy.

    Each argument must provide ``extract_finite_list()``; the returned lists
    are intersected with ``np.intersect1d``. The list extraction happens on
    every call, so it is part of whatever this function is timed at.

    Raises ZeroDivisionError when both lists are empty.
    """
    on_bits_1 = bs1.extract_finite_list()
    on_bits_2 = bs2.extract_finite_list()
    shared = len(np.intersect1d(on_bits_1, on_bits_2))
    union = len(on_bits_1) + len(on_bits_2) - shared
    return float(shared) / union
%timeit bynp(bs1,bs2) | |
1000 loops, best of 3: 869 µs per loop | |
def bynp2(l1, l2):
    """Return the Tanimoto similarity of two sequences of on-bit indices.

    The overlap is the length of ``np.intersect1d(l1, l2)``; the sizes of the
    inputs are taken with ``len()``.

    Raises ZeroDivisionError when both inputs are empty.
    """
    shared = len(np.intersect1d(l1, l2))
    union = len(l1) + len(l2) - shared
    return float(shared) / union
# Extract the on-bit lists once, so the next timing excludes the extraction.
l1, l2 = (bs.extract_finite_list() for bs in (bs1, bs2))
%timeit bynp2(l1,l2) | |
1000 loops, best of 3: 594 µs per loop | |
# Convert the on-bit lists to NumPy arrays up front, outside the timed call.
l1, l2 = (np.array(bs.extract_finite_list()) for bs in (bs1, bs2))
%timeit bynp2(l1,l2) | |
1000 loops, best of 3: 274 µs per loop | |
def bynp2(l1, l2):
    """Return the Tanimoto similarity of two sequences of unique indices.

    Same computation as a plain ``np.intersect1d`` overlap, but the call is
    told the inputs are already unique (``assume_unique=True``).

    Raises ZeroDivisionError when both inputs are empty.
    """
    shared = len(np.intersect1d(l1, l2, assume_unique=True))
    union = len(l1) + len(l2) - shared
    return float(shared) / union
%timeit bynp2(l1,l2) | |
10000 loops, best of 3: 177 µs per loop | |
def bynp2(l1, l2):
    """Return the Tanimoto similarity of two sequences of unique indices.

    The overlap is the number of elements of ``l1`` that also occur in
    ``l2``. The membership test yields a boolean mask with one entry per
    element of ``l1``, so the True entries must be counted; taking ``len()``
    of the mask would always give ``len(l1)``.

    ``np.isin`` is used for the membership test; it is the replacement for
    the deprecated ``np.in1d``. The inputs are assumed to hold unique values
    (``assume_unique=True``).

    Raises ZeroDivisionError when both inputs are empty.
    """
    a = len(l1)
    b = len(l2)
    c = int(np.count_nonzero(np.isin(l1, l2, assume_unique=True)))
    return float(c) / (a + b - c)
%timeit bynp2(l1,l2) | |
1000 loops, best of 3: 163 µs per loop | |
# Plain Python sets built from the on-bit lists of both bitsets.
s1, s2 = (set(bs.extract_finite_list()) for bs in (bs1, bs2))
%timeit mine(s1,s2) | |
10000 loops, best of 3: 109 µs per loop | |
def bynp2(l1, l2):
    """Return the Tanimoto similarity of two sequences of on-bit indices.

    The overlap is the number of elements of ``l1`` that also occur in
    ``l2``. The membership test yields a boolean mask with one entry per
    element of ``l1``, so the True entries must be counted; taking ``len()``
    of the mask would always give ``len(l1)``.

    ``np.isin`` is used for the membership test; it is the replacement for
    the deprecated ``np.in1d``.

    NOTE(review): the formula presumes each input holds unique values;
    duplicates in ``l1`` would be counted more than once.

    Raises ZeroDivisionError when both inputs are empty.
    """
    a = len(l1)
    b = len(l2)
    c = int(np.count_nonzero(np.isin(l1, l2)))
    return float(c) / (a + b - c)
%timeit bynp2(l1,l2) | |
from scipy.spatial import distance | |
# Pairwise 'jaccard' distances between the two index arrays via SciPy.
# NOTE(review): l1 and l2 hold on-bit indices, not equal-length boolean
# vectors -- confirm cdist accepts them and measures what is intended.
index_arrays = [l1, l2]
distance.cdist(index_arrays, index_arrays, 'jaccard')
def bs2np(bs, nr_of_bits):
    """Convert an iterable of on-bit indices into a dense boolean array.

    Args:
        bs: Iterable yielding the integer positions of the bits that are set.
        nr_of_bits: Length of the resulting array.

    Returns:
        A NumPy boolean array of length ``nr_of_bits`` that is True exactly
        at the positions yielded by ``bs``.

    Raises:
        IndexError: If an index falls outside the array.
    """
    mask = np.zeros(nr_of_bits, dtype=bool)
    # One vectorized index assignment instead of a Python-level loop per bit.
    mask[np.fromiter(bs, dtype=np.intp)] = True
    return mask
# Dense boolean vectors (574331 bits each) for the two bitsets.
b1, b2 = (bs2np(bs, 574331) for bs in (bs1, bs2))
def bynp3(b1, b2):
    """Return the Tanimoto similarity of two dense bit arrays.

    Set bits are counted with ``np.count_nonzero``; the overlap is the
    non-zero count of ``b1 & b2``.

    Raises ZeroDivisionError when neither array has a set bit.
    """
    shared = np.count_nonzero(b1 & b2)
    union = np.count_nonzero(b1) + np.count_nonzero(b2) - shared
    return float(shared) / union
%timeit bynp3(b1,b2) | |
10000 loops, best of 3: 124 µs per loop | |
from rdkit.DataStructs.cDataStructs import SparseBitVect | |
def bs2rd(bs, nr_of_bits):
    """Build an RDKit ``SparseBitVect`` from a bitset.

    A vector is created with ``nr_of_bits`` and the indices returned by
    ``bs.extract_finite_list()`` are switched on via ``SetBitsFromList``.
    """
    vect = SparseBitVect(nr_of_bits)
    on_bits = bs.extract_finite_list()
    vect.SetBitsFromList(on_bits)
    return vect
# Sparse RDKit bit vectors (574331 bits each) for the two bitsets.
r1, r2 = (bs2rd(bs, 574331) for bs in (bs1, bs2))
def byrd(r1, r2):
    """Return the Tanimoto similarity of two RDKit-style bit vectors.

    Both arguments must support ``GetNumOnBits()`` and the ``&`` operator;
    the overlap is the on-bit count of ``r1 & r2``.

    Raises ZeroDivisionError when neither vector has a set bit.
    """
    overlap = r1 & r2
    shared = overlap.GetNumOnBits()
    union = r1.GetNumOnBits() + r2.GetNumOnBits() - shared
    return float(shared) / union
%timeit byrd(r1,r2) | |
1000 loops, best of 3: 258 µs per loop | |
from rdkit.DataStructs.cDataStructs import ExplicitBitVect | |
def bs2rd2(bs, nr_of_bits):
    """Build an RDKit ``ExplicitBitVect`` from a bitset.

    A vector is created with ``nr_of_bits`` and the indices returned by
    ``bs.extract_finite_list()`` are switched on via ``SetBitsFromList``.
    """
    vect = ExplicitBitVect(nr_of_bits)
    on_bits = bs.extract_finite_list()
    vect.SetBitsFromList(on_bits)
    return vect
# Rebind r1/r2 to explicit (dense) RDKit bit vectors of 574331 bits each.
r1, r2 = (bs2rd2(bs, 574331) for bs in (bs1, bs2))
%timeit byrd(r1,r2) | |
10000 loops, best of 3: 66.7 µs per loop | |
from rdkit.DataStructs.cDataStructs import OnBitsInCommon | |
def byrd2(r1, r2):
    """Return the Tanimoto similarity of two RDKit bit vectors.

    The overlap is the length of whatever ``OnBitsInCommon(r1, r2)``
    returns (presumably the collection of shared on-bit indices -- confirm
    against the RDKit documentation).

    Raises ZeroDivisionError when neither vector has a set bit.
    """
    common = OnBitsInCommon(r1, r2)
    shared = len(common)
    union = r1.GetNumOnBits() + r2.GetNumOnBits() - shared
    return float(shared) / union
%timeit byrd2(r1,r2) | |
1000 loops, best of 3: 742 µs per loop | |
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity | |
%timeit TanimotoSimilarity(r1,r2) | |
10000 loops, best of 3: 55.6 µs per loop | |
pypy using sets | |
from timeit import timeit | |
# Time 1000 calls of mine() on the plain Python sets; the result is the total
# elapsed time in seconds for all calls.
stmt = 'mine(s1,s2)'
setup = "from __main__ import mine,s1,s2"
timeit(stmt, setup=setup, number=1000)
0.3285820484161377 s in total for 1000 calls == 328 µs per loop
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment