Skip to content

Instantly share code, notes, and snippets.

@jdthorpe
Last active March 30, 2022 16:50
Show Gist options
  • Save jdthorpe/dbac6b48105ae7e543f939c0229e5096 to your computer and use it in GitHub Desktop.
Save jdthorpe/dbac6b48105ae7e543f939c0229e5096 to your computer and use it in GitHub Desktop.
Fast Manhattan distance between a vector and each row in a matrix.

Fast sparse, exact KNN Manhattan Distance

Given a 1xN (row) vector and an MxN matrix, the code below calculates the Manhattan distance between the row vector and each row in the matrix. It uses Numba for a significant speedup.

Currently I'm using this to calculate distances between a 1 x 60K row vector and a 64K x 60K matrix (with about 500K non-empty values) in ~10 ms on a single processor.

Example usage: K nearest neighbors

import numpy as np
import sparse_manhattan_distance from sparse_manhattan_distance

def sparse_manhattan_KNN(mat,vec,N,sorted=True):
    """
    @param mat: A sparse M x N matrix 
    @param vec: A sparse N vector to be compared to each row of `mat`
    @param N: The number of nearest neighbors to 
    @param sorted: If `True`, sort the row indexes before returing 

    @return a vector of row indices of the nearest N rows in `mat` to `vec`
    """

    # cant return more observations than data...
    N = min(N, mat.data.shape[0])  

    distances = sparse_manhattan_distance(mat, vec)

    # note that argpartition and then sorting the top N indexes is *much*
    # faster than sorting the full distance vector and then choosing the top N
    nearest_N_rows = np.argpartition(distances, -N)[-N:]
    if sorted:
        return nearest_N_rows[np.argsort(-distances[nearest_N_rows])]

    return  nearest_N_rows
""" Adapted from this awesome solution was on SO:
https://stackoverflow.com/a/64920528/1519199
"""
import numba as nb
import numpy as np
from scipy import sparse
def sparse_manhattan_distance(mat, vec):
    """Manhattan (L1) distance between `vec` and each row of `mat`.

    @param mat: a sparse M x N matrix (any scipy.sparse format)
    @param vec: a sparse 1 x N row vector (any scipy.sparse format)

    @return a dense 1-D numpy array of length M whose i-th entry is the
        Manhattan distance between `vec` and row i of `mat`
    """
    mat_csr = mat.tocsr()
    vec_csr = vec.tocsr()
    # hand the raw CSR components to the numba kernel
    res = sparse_manhattan_distance_nb(
        mat_csr.indices,
        mat_csr.indptr,
        mat_csr.data,
        mat_csr.shape,
        vec_csr.indices,
        vec_csr.data,
    )
    # BUG FIX: `res` is a dense length-M distance vector; the original wrapped
    # it in `sparse.csr_matrix(res, shape=shape_mat)` (an M x N shape), which is
    # a shape mismatch — and callers (e.g. the KNN example) need the plain
    # 1-D array for np.argpartition anyway.
    return res
@nb.njit(cache=True)
def sparse_manhattan_distance_nb(
    indices_mat, indptr_mat, data_mat, shape_mat, vec_row_ind, vec_row_data,
):
    """Numba kernel: L1 distance between a sparse vector and each CSR row.

    @param indices_mat / indptr_mat / data_mat: CSR components of the matrix
    @param shape_mat: the (M, N) shape tuple of the matrix
    @param vec_row_ind / vec_row_data: indices/data of the sparse row vector
    @return a dense length-M array of distances
    """
    out = np.zeros(shape_mat[0])
    # for each row in the matrix
    for row_idx in range(shape_mat[0]):
        # BUG FIX: the accumulator was never initialized in the original;
        # it must be reset to zero at the start of every row
        d = 0.0
        mat_row_ind = indices_mat[indptr_mat[row_idx] : indptr_mat[row_idx + 1]]
        mat_row_data = data_mat[indptr_mat[row_idx] : indptr_mat[row_idx + 1]]
        # merge-walk the two sorted column-index lists
        # (the usual lazy pointer incrementation trick)
        mat_ptr = 0
        vec_ptr = 0
        while mat_ptr < mat_row_ind.shape[0] and vec_ptr < vec_row_ind.shape[0]:
            ind_mat = mat_row_ind[mat_ptr]
            ind_vec = vec_row_ind[vec_ptr]
            if ind_mat == ind_vec:
                # value for both matrix and vector is present in this column
                d += abs(mat_row_data[mat_ptr] - vec_row_data[vec_ptr])
                mat_ptr += 1  # increment the matrix row pointer
                vec_ptr += 1  # increment the vector row pointer
            elif ind_mat < ind_vec:
                # only the matrix value is present; vector is 0 by definition
                d += abs(mat_row_data[mat_ptr])
                # BUG FIX: the original also incremented an undefined name
                # (`indptr_mat_`), copy-paste residue from the minimum kernel
                mat_ptr += 1  # increment the matrix row pointer
            else:
                # only the vector value is present; matrix is 0 by definition
                d += abs(vec_row_data[vec_ptr])
                vec_ptr += 1  # increment the vector row pointer
        # At this point mat_ptr or vec_ptr may have reached the end of the row,
        # but the *other* one may not have, so finish up with the other one if
        # necessary.
        for i in range(mat_ptr, mat_row_ind.shape[0]):
            d += abs(mat_row_data[i])
        for i in range(vec_ptr, vec_row_ind.shape[0]):
            d += abs(vec_row_data[i])
        out[row_idx] = d
    return out
"""
Sparse matrices don't broadcast; this is adapted from an awesome solution on SO:
https://stackoverflow.com/a/64920528/1519199
I added commentary and switched the logic from pairwise max to pairwise min
"""
import numba as nb
import numpy as np
from scipy import sparse
def sparse_elementwise_minimum(mat, vec, shrink_to_fit=True):
    """Element-wise minimum between `vec` and each row of `mat`.

    @param mat: a sparse M x N matrix (any scipy.sparse format)
    @param vec: a sparse 1 x N row vector (any scipy.sparse format)
    @param shrink_to_fit: if True, copy the output buffers down to their used
        size instead of keeping views into the over-allocated arrays

    @return a sparse M x N CSR matrix whose row i is minimum(mat[i, :], vec)
    """
    mat_csr = mat.tocsr()
    vec_csr = vec.tocsr()
    shape_mat = mat_csr.shape
    # the kernel returns the raw CSR triple of the result
    data, indices, indptr = sparse_elementwise_minimum_nb(
        mat_csr.indices,
        mat_csr.indptr,
        mat_csr.data,
        shape_mat,
        vec_csr.indices,
        vec_csr.data,
        shrink_to_fit,
    )
    # BUG FIX: the original assigned the kernel result to `res` but had no
    # return statement, so it always returned None. Assemble the CSR triple
    # into the result matrix and return it.
    return sparse.csr_matrix((data, indices, indptr), shape=shape_mat)
@nb.njit(cache=True)
def sparse_elementwise_minimum_nb(
    indices_mat,
    indptr_mat,
    data_mat,
    shape_mat,
    vec_row_ind,
    vec_row_data,
    shrink_to_fit,
):
    """Numba kernel: element-wise minimum of a sparse row vector against each
    CSR row, returned as the CSR triple (data, indices, indptr).

    Where exactly one operand has a stored value, the other is an implicit
    zero, so the minimum is stored only when that value is negative.
    """
    # upper bound on the number of non-zero output elements
    max_non_zero = shape_mat[0] * vec_row_data.shape[0] + data_mat.shape[0]
    # output data
    data_res = np.empty(max_non_zero, dtype=data_mat.dtype)
    # output (column) indexes
    indices_res = np.empty(max_non_zero, dtype=np.int32)
    # pointer to the current data / column-index slot
    data_res_p = 0
    # output row-slice pointers (always has length of nrow + 1)
    indptr_mat_res = np.empty((shape_mat[0] + 1), dtype=np.int32)
    indptr_mat_res[0] = 0
    indptr_mat_res_p = 1
    # running total of stored output values (cumulative, per CSR convention)
    indptr_mat_ = 0
    # for each row in the matrix
    for row_idx in range(shape_mat[0]):
        mat_row_ind = indices_mat[indptr_mat[row_idx] : indptr_mat[row_idx + 1]]
        mat_row_data = data_mat[indptr_mat[row_idx] : indptr_mat[row_idx + 1]]
        # merge-walk the row / vector index lists
        # (the usual lazy pointer incrementation trick)
        mat_ptr = 0
        vec_ptr = 0
        while mat_ptr < mat_row_ind.shape[0] and vec_ptr < vec_row_ind.shape[0]:
            ind_mat = mat_row_ind[mat_ptr]
            ind_vec = vec_row_ind[vec_ptr]
            if ind_mat == ind_vec:
                # value for both matrix and vector is present
                data_res[data_res_p] = min(
                    mat_row_data[mat_ptr], vec_row_data[vec_ptr]
                )  # do the math
                indices_res[data_res_p] = ind_mat  # set the output column index
                data_res_p += 1  # increment the output data pointer
                mat_ptr += 1  # increment the matrix row pointer
                vec_ptr += 1  # increment the vector row pointer
                indptr_mat_ += 1  # increment the output slice counter
            elif ind_mat < ind_vec:
                # only the matrix value is present; vector is 0 by definition,
                # so min(x, 0) is stored only when x is negative
                if mat_row_data[mat_ptr] < 0:
                    data_res[data_res_p] = mat_row_data[mat_ptr]
                    indices_res[data_res_p] = ind_mat
                    data_res_p += 1  # increment the output data pointer
                    indptr_mat_ += 1  # increment the output slice counter
                mat_ptr += 1  # increment the matrix row pointer
            else:
                # only the vector value is present; matrix is 0 by definition
                if vec_row_data[vec_ptr] < 0:
                    data_res[data_res_p] = vec_row_data[vec_ptr]
                    indices_res[data_res_p] = ind_vec
                    data_res_p += 1  # increment the output data pointer
                    indptr_mat_ += 1  # increment the output slice counter
                vec_ptr += 1  # increment the vector row pointer
        # At this point mat_ptr or vec_ptr may have reached the end of the row,
        # but the *other* one may not have, so finish up with the other one if
        # necessary.
        for i in range(mat_ptr, mat_row_ind.shape[0]):
            if mat_row_data[i] < 0:
                data_res[data_res_p] = mat_row_data[i]
                indices_res[data_res_p] = mat_row_ind[i]
                data_res_p += 1  # increment the output data pointer
                indptr_mat_ += 1  # increment the output slice counter
        for i in range(vec_ptr, vec_row_ind.shape[0]):
            if vec_row_data[i] < 0:
                data_res[data_res_p] = vec_row_data[i]
                indices_res[data_res_p] = vec_row_ind[i]
                data_res_p += 1  # increment the output data pointer
                indptr_mat_ += 1  # increment the output slice counter
        # close out this row's slice in the indptr array
        indptr_mat_res[indptr_mat_res_p] = indptr_mat_
        indptr_mat_res_p += 1
    # idiom fix: plain truth test instead of `== True`
    if shrink_to_fit:
        # copy so the oversized backing buffers can be freed
        data_res = np.copy(data_res[:data_res_p])
        indices_res = np.copy(indices_res[:data_res_p])
    else:
        data_res = data_res[:data_res_p]
        indices_res = indices_res[:data_res_p]
    return data_res, indices_res, indptr_mat_res
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment