Skip to content

Instantly share code, notes, and snippets.

@rileypeterson
Created October 28, 2019 16:30
Show Gist options
  • Save rileypeterson/0e787926572e59fa982d3263c14a5fbd to your computer and use it in GitHub Desktop.
Save rileypeterson/0e787926572e59fa982d3263c14a5fbd to your computer and use it in GitHub Desktop.
KD Tree Demo
from scipy.spatial import cKDTree
import time
import numpy as np
np.random.seed(42)
st = time.time()
a = np.random.rand(40000, 25)
b = np.random.rand(40000, 25)
t0 = cKDTree(a)
t1 = cKDTree(b)
# res = np.empty((len(b), 10))
for i in range(0, len(b), 5000):
print(i)
dists0, inds0 = t0.query(b[i:i+5000], k=1000, n_jobs=-1)
dists1, inds1 = t1.query(a[i:i+5000], k=1000, n_jobs=-1)
# res[i:i+1000] = t0.query(b[i:i+1000], k=10, n_jobs=-1)[1]
# res = np.vstack((res, t0.query(b[i:i+1000], k=10, n_jobs=-1)[1]))
print(time.time() - st)
@rileypeterson
Copy link
Author

import numpy as np
from scipy.spatial.distance import pdist, squareform, cdist
import time
import random
import numba


np.random.seed(42)

@numba.jit(nopython=True)
def calc_dist(A, B, sqrt=False):
    dist = np.dot(A, B.T)

    TMP_A = np.empty(A.shape[0], dtype=A.dtype)
    for i in range(A.shape[0]):
        sum = 0.
        for j in range(A.shape[1]):
            sum += A[i, j] ** 2
        TMP_A[i] = sum

    TMP_B = np.empty(B.shape[0], dtype=A.dtype)
    for i in range(B.shape[0]):
        sum = 0.
        for j in range(B.shape[1]):
            sum += B[i, j] ** 2
        TMP_B[i] = sum

    if sqrt == True:
        for i in range(A.shape[0]):
            for j in range(B.shape[0]):
                dist[i, j] = np.sqrt(-2. * dist[i, j] + TMP_A[i] + TMP_B[j])
    else:
        for i in range(A.shape[0]):
            for j in range(B.shape[0]):
                dist[i, j] = -2. * dist[i, j] + TMP_A[i] + TMP_B[j]
    return dist






def chunk_euclidean_distances(a1, a2, chunk_size=5000, sort_keep=25, w0=1, w1=1, keep_num=500):
    arg_sorts = np.empty((0, sort_keep))
    for i in range(0, len(a1), chunk_size):
        print(i)
        intermediary = np.empty((len(a1[i:i+chunk_size]), 0))

        for j in range(0, len(a1), chunk_size):
            v0 = calc_dist(a1[i:i+chunk_size], a1[j:j+chunk_size], sqrt=True)
            v1 = calc_dist(a2[i:i+chunk_size], a2[j:j+chunk_size], sqrt=True)
            v = w0*v0 + w1*v1
            intermediary = np.hstack((intermediary, v))

        v1 = np.argpartition(intermediary, sort_keep, axis=1)[:, :sort_keep]
        v2 = intermediary[np.arange(intermediary.shape[0])[:, None], v1]
        final = v1[np.arange(v1.shape[0])[:, None], np.argsort(v2, axis=1)]
        arg_sorts = np.vstack((arg_sorts, final))
    return arg_sorts[:, 1:].astype(int)

st = time.time() 
a = np.random.rand(40000, 25)
b = np.random.rand(40000, 25)
w0, w1 = 1, 1
res1 = chunk_euclidean_distances(b, a, w0=w0, w1=w1)
print(res1)
print(time.time() - st)

# 
# 0
# 5000
# 10000
# 15000
# 20000
# 25000
# 30000
# 35000
# 154.92434120178223
# 
# np.sort(res1, axis=1)
# Out[516]: 
# array([[ 1044,  2019,  4198, ..., 34875, 35806, 37105],
#        [ 2220,  2235,  2623, ..., 35427, 36347, 36990],
#        [  304,   363,   781, ..., 31728, 34914, 35839],
#        ...,
#        [  128,  1759,  3348, ..., 35754, 37879, 39596],
#        [  511,  2109,  3382, ..., 33630, 35250, 39915],
#        [ 8018,  8536, 11702, ..., 38547, 38766, 39710]])


from scipy.spatial import cKDTree
import time
import numpy as np

np.random.seed(42)

st = time.time() 
# a = np.random.rand(40000, 25)
# b = np.random.rand(40000, 25)

def chunk_euclidean_distances2(a1, a2, chunk_size=5000, sort_keep=25, w0=1, w1=1, keep_num=500):
    t0 = cKDTree(a)
    t1 = cKDTree(b)
    # res = np.empty((len(b), 10))
    for i in range(0, len(b), chunk_size):
        print(i)
        dists0, inds0 = t0.query(b[i:i+chunk_size], k=1000, n_jobs=-1)
        dists1, inds1 = t1.query(a[i:i+chunk_size], k=1000, n_jobs=-1)
        v = w0*dists0 + w1*dists1
        print(v)
        break
        
        # res[i:i+1000] = t0.query(b[i:i+1000], k=10, n_jobs=-1)[1]
        # res = np.vstack((res, t0.query(b[i:i+1000], k=10, n_jobs=-1)[1]))
res2 = chunk_euclidean_distances2(b, a, w0=w0, w1=w1)

print(time.time() - st)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment