Skip to content

Instantly share code, notes, and snippets.

@sverhoeven
Created January 31, 2016 20:43
Show Gist options
  • Save sverhoeven/08b9369ac164ec7d7885 to your computer and use it in GitHub Desktop.
Save sverhoeven/08b9369ac164ec7d7885 to your computer and use it in GitHub Desktop.
bitvector tanimoto python benchmarks
import modifiedtanimoto.db as db
import modifiedtanimoto.algorithm as algorithm
# Open the fragments database file and fetch its bitsets; the result is
# indexed by string identifiers below.
frags = db.FragmentsDb('data/fragments12.db')
bss = frags.bitsets()
z = bss['5fl6_Y0R_frag10']
# Second collection holding a single entry, compared against everything in bss.
bitsets2 = {'5fl6_Y0R_frag1': bss['5fl6_Y0R_frag1']}
# Materialise the results once. The name suggests algorithm.distances returns
# a one-shot iterator (NOTE(review): confirm); if so, the print loop below
# would exhaust it and the later sorted() call would see no rows at all.
myiter = list(algorithm.distances(bss, bitsets2, 574331, 0.6633333333333333, 0.33666666666666667, 0.55, True))
for r in myiter:
    print(r)
# Order the rows by their second element, largest first.
sorted_distances = sorted(myiter, key=lambda row: row[1], reverse=True)
# Copy every key/value pair of bss into a plain dict.
z = {k: v for k, v in bss.iteritems()}
4d1c_B5H_frag1
bs1 = bss['5fl6_Y0R_frag1']
bs2 = bss['4d1c_B5H_frag1']
def mine(bs1, bs2):
    """Return the Tanimoto coefficient of two set-like objects.

    Both arguments must support ``len()`` and the ``&`` operator. The
    coefficient is the size of the intersection divided by the size of the
    union (``len(bs1) + len(bs2) - len(bs1 & bs2)``).

    Raises ZeroDivisionError when both arguments are empty.
    """
    size1, size2 = len(bs1), len(bs2)
    shared = len(bs1 & bs2)
    # float() keeps the division true division under Python 2 as well.
    return float(shared) / (size1 + size2 - shared)
%timeit mine(bs1,bs2)
The slowest run took 7.76 times longer than the fastest. This could mean that an intermediate result is being cached
10000 loops, best of 3: 47.6 µs per loop
0.1278268230741323
import numpy as np
from sklearn.metrics import jaccard_similarity_score
jaccard_similarity_score(bs1.extract_finite_list(), bs2.extract_finite_list())
def bynp(bs1, bs2):
    """Return the Tanimoto coefficient of two bitsets using NumPy.

    Each argument must provide ``extract_finite_list()``; the lists it
    returns are intersected with ``np.intersect1d`` and the coefficient is
    the intersection size over ``len(list1) + len(list2) - intersection``.
    """
    members1 = bs1.extract_finite_list()
    members2 = bs2.extract_finite_list()
    combined = len(members1) + len(members2)
    shared = len(np.intersect1d(members1, members2))
    # float() keeps the division true division under Python 2 as well.
    return float(shared) / (combined - shared)
%timeit bynp(bs1,bs2)
1000 loops, best of 3: 869 µs per loop
def bynp2(l1, l2):
    """Return the Tanimoto coefficient of two sequences of on-bit indices.

    The sequences are intersected with ``np.intersect1d``; the result is the
    intersection size divided by ``len(l1) + len(l2) - intersection``.
    """
    combined = len(l1) + len(l2)
    shared = len(np.intersect1d(l1, l2))
    # float() keeps the division true division under Python 2 as well.
    return float(shared) / (combined - shared)
l1 = bs1.extract_finite_list()
l2 = bs2.extract_finite_list()
%timeit bynp2(l1,l2)
1000 loops, best of 3: 594 µs per loop
l1 = np.array(bs1.extract_finite_list())
l2 = np.array(bs2.extract_finite_list())
%timeit bynp2(l1,l2)
1000 loops, best of 3: 274 µs per loop
def bynp2(l1, l2):
    """Return the Tanimoto coefficient of two sequences of on-bit indices.

    Same computation as a plain ``np.intersect1d`` call, but NumPy is told
    that both inputs are already free of duplicates (``assume_unique``).
    """
    combined = len(l1) + len(l2)
    common = np.intersect1d(l1, l2, assume_unique=True)
    shared = len(common)
    # float() keeps the division true division under Python 2 as well.
    return float(shared) / (combined - shared)
%timeit bynp2(l1,l2)
10000 loops, best of 3: 177 µs per loop
def bynp2(l1, l2):
    """Return the Tanimoto coefficient of two sequences of on-bit indices.

    Membership of each element of ``l1`` in ``l2`` is tested with
    ``np.isin`` (the successor of ``np.in1d``), telling NumPy that both
    inputs are already free of duplicates. The number of shared indices is
    the number of True entries in the resulting mask.

    The mask itself always has ``len(l1)`` entries, so taking its ``len()``
    would report every element of ``l1`` as shared; the True entries must be
    counted instead.

    Raises ZeroDivisionError when both inputs are empty.
    """
    a = len(l1)
    b = len(l2)
    in_both = np.isin(l1, l2, assume_unique=True)
    c = int(np.count_nonzero(in_both))
    # float() keeps the division true division under Python 2 as well.
    return float(c) / (a + b - c)
%timeit bynp2(l1,l2)
1000 loops, best of 3: 163 µs per loop
s1 = set(bs1.extract_finite_list())
s2 = set(bs2.extract_finite_list())
%timeit mine(s1,s2)
10000 loops, best of 3: 109 µs per loop
def bynp2(l1, l2):
    """Return the Tanimoto coefficient of two sequences of on-bit indices.

    Membership of each element of ``l1`` in ``l2`` is tested with
    ``np.isin`` (the successor of ``np.in1d``). The number of shared indices
    is the number of True entries in the resulting mask.

    The mask itself always has ``len(l1)`` entries, so taking its ``len()``
    would report every element of ``l1`` as shared; the True entries must be
    counted instead. Inputs are expected to hold each index at most once.

    Raises ZeroDivisionError when both inputs are empty.
    """
    a = len(l1)
    b = len(l2)
    in_both = np.isin(l1, l2)
    c = int(np.count_nonzero(in_both))
    # float() keeps the division true division under Python 2 as well.
    return float(c) / (a + b - c)
%timeit bynp2(l1,l2)
from scipy.spatial import distance
distance.cdist([l1, l2], [l1, l2], 'jaccard')
def bs2np(bs, nr_of_bits):
    """Convert an iterable of bit positions into a dense boolean array.

    Returns a NumPy bool array of length ``nr_of_bits`` in which every
    position yielded by iterating ``bs`` is True and all others are False.
    """
    dense = np.zeros(nr_of_bits, dtype=bool)
    for position in bs:
        dense[position] = True
    return dense
b1 = bs2np(bs1, 574331)
b2 = bs2np(bs2, 574331)
def bynp3(b1, b2):
    """Return the Tanimoto coefficient of two dense bit arrays.

    On-bits are counted with ``np.count_nonzero``; the shared on-bits are
    those that survive ``b1 & b2``. The result is the shared count divided
    by ``count(b1) + count(b2) - shared``.
    """
    on1 = np.count_nonzero(b1)
    on2 = np.count_nonzero(b2)
    shared = np.count_nonzero(b1 & b2)
    # float() keeps the division true division under Python 2 as well.
    return float(shared) / (on1 + on2 - shared)
%timeit bynp3(b1,b2)
10000 loops, best of 3: 124 µs per loop
from rdkit.DataStructs.cDataStructs import SparseBitVect
def bs2rd(bs, nr_of_bits):
    """Convert a bitset into an RDKit ``SparseBitVect``.

    Creates a ``SparseBitVect`` sized ``nr_of_bits`` and switches on the
    positions returned by ``bs.extract_finite_list()``.
    """
    vect = SparseBitVect(nr_of_bits)
    on_positions = bs.extract_finite_list()
    vect.SetBitsFromList(on_positions)
    return vect
r1 = bs2rd(bs1, 574331)
r2 = bs2rd(bs2, 574331)
def byrd(r1, r2):
    """Return the Tanimoto coefficient of two bit vectors.

    Both arguments must provide ``GetNumOnBits()`` and support ``&``; the
    shared on-bits are counted on the result of ``r1 & r2``.
    """
    on1 = r1.GetNumOnBits()
    on2 = r2.GetNumOnBits()
    shared = (r1 & r2).GetNumOnBits()
    # float() keeps the division true division under Python 2 as well.
    return float(shared) / (on1 + on2 - shared)
%timeit byrd(r1,r2)
1000 loops, best of 3: 258 µs per loop
from rdkit.DataStructs.cDataStructs import ExplicitBitVect
def bs2rd2(bs, nr_of_bits):
    """Convert a bitset into an RDKit ``ExplicitBitVect``.

    Creates an ``ExplicitBitVect`` sized ``nr_of_bits`` and switches on the
    positions returned by ``bs.extract_finite_list()``.
    """
    vect = ExplicitBitVect(nr_of_bits)
    on_positions = bs.extract_finite_list()
    vect.SetBitsFromList(on_positions)
    return vect
r1 = bs2rd2(bs1, 574331)
r2 = bs2rd2(bs2, 574331)
%timeit byrd(r1,r2)
10000 loops, best of 3: 66.7 µs per loop
from rdkit.DataStructs.cDataStructs import OnBitsInCommon
def byrd2(r1, r2):
    """Return the Tanimoto coefficient of two bit vectors.

    Per-vector on-bit counts come from ``GetNumOnBits()``; the shared count
    is the length of whatever ``OnBitsInCommon(r1, r2)`` returns.
    """
    on1 = r1.GetNumOnBits()
    on2 = r2.GetNumOnBits()
    shared = len(OnBitsInCommon(r1, r2))
    # float() keeps the division true division under Python 2 as well.
    return float(shared) / (on1 + on2 - shared)
%timeit byrd2(r1,r2)
1000 loops, best of 3: 742 µs per loop
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
%timeit TanimotoSimilarity(r1,r2)
10000 loops, best of 3: 55.6 µs per loop
pypy using sets
from timeit import timeit
timeit('mine(s1,s2)', setup="from __main__ import mine,s1,s2", number=1000)
0.3285820484161377 s in total for 1000 calls, i.e. about 328 µs per loop
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment