Skip to content

Instantly share code, notes, and snippets.

@jdthorpe
Last active March 30, 2022 16:50
Show Gist options
  • Save jdthorpe/dbac6b48105ae7e543f939c0229e5096 to your computer and use it in GitHub Desktop.
Save jdthorpe/dbac6b48105ae7e543f939c0229e5096 to your computer and use it in GitHub Desktop.
Fast Manhattan distance between a vector and each row in a matrix.

Fast sparse, exact KNN Manhattan Distance

Given a 1xN (row) vector and an MxN matrix, the code below calculates the Manhattan distance between the row vector and each row in the matrix. It uses Numba for a significant speedup.

Currently I'm using this to calculate distances between a 1 x 60K row vector and a 64K x 60K matrix (with about 500K non-empty values) in ~10 ms on a single processor.

Example usage: K nearest neighbors

import numpy as np
import sparse_manhattan_distance from sparse_manhattan_distance

def sparse_manhattan_KNN(mat,vec,N,sorted=True):
    """
    @param mat: A sparse M x N matrix 
    @param vec: A sparse N vector to be compared to each row of `mat`
    @param N: The number of nearest neighbors to 
    @param sorted: If `True`, sort the row indexes before returing 

    @return a vector of row indices of the nearest N rows in `mat` to `vec`
    """

    # cant return more observations than data...
    N = min(N, mat.data.shape[0])  

    distances = sparse_manhattan_distance(mat, vec)

    # note that argpartition and then sorting the top N indexes is *much*
    # faster than sorting the full distance vector and then choosing the top N
    nearest_N_rows = np.argpartition(distances, -N)[-N:]
    if sorted:
        return nearest_N_rows[np.argsort(-distances[nearest_N_rows])]

    return  nearest_N_rows
""" Adapted from this awesome solution was on SO:
https://stackoverflow.com/a/64920528/1519199
"""
import numba as nb
import numpy as np
from scipy import sparse
def sparse_manhattan_distance(mat, vec):
    """Manhattan (L1) distance between `vec` and each row of `mat`.

    @param mat: a sparse M x N matrix (any scipy.sparse format)
    @param vec: a sparse 1 x N row vector (any scipy.sparse format)

    @return a dense 1-D numpy array of length M whose i-th entry is the
        Manhattan distance between `vec` and row i of `mat`
    """
    mat_csr = mat.tocsr()
    vec_csr = vec.tocsr()
    # hand the raw CSR components to the numba kernel
    res = sparse_manhattan_distance_nb(
        mat_csr.indices,
        mat_csr.indptr,
        mat_csr.data,
        mat_csr.shape,
        vec_csr.indices,
        vec_csr.data,
    )
    # BUG FIX: `res` is a dense length-M distance vector; the original wrapped
    # it in `sparse.csr_matrix(res, shape=shape_mat)` (an M x N shape), which is
    # a shape mismatch — and callers (e.g. the KNN example) need the plain
    # 1-D array for np.argpartition anyway.
    return res
@nb.njit(cache=True)
def sparse_manhattan_distance_nb(
    indices_mat, indptr_mat, data_mat, shape_mat, vec_row_ind, vec_row_data,
):
    """Numba kernel: L1 distance between a sparse vector and each CSR row.

    @param indices_mat / indptr_mat / data_mat: CSR components of the matrix
    @param shape_mat: the (M, N) shape tuple of the matrix
    @param vec_row_ind / vec_row_data: indices/data of the sparse row vector
    @return a dense length-M array of distances
    """
    out = np.zeros(shape_mat[0])
    # for each row in the matrix
    for row_idx in range(shape_mat[0]):
        # BUG FIX: the accumulator was never initialized in the original;
        # it must be reset to zero at the start of every row
        d = 0.0
        mat_row_ind = indices_mat[indptr_mat[row_idx] : indptr_mat[row_idx + 1]]
        mat_row_data = data_mat[indptr_mat[row_idx] : indptr_mat[row_idx + 1]]
        # merge-walk the two sorted column-index lists
        # (the usual lazy pointer incrementation trick)
        mat_ptr = 0
        vec_ptr = 0
        while mat_ptr < mat_row_ind.shape[0] and vec_ptr < vec_row_ind.shape[0]:
            ind_mat = mat_row_ind[mat_ptr]
            ind_vec = vec_row_ind[vec_ptr]
            if ind_mat == ind_vec:
                # value for both matrix and vector is present in this column
                d += abs(mat_row_data[mat_ptr] - vec_row_data[vec_ptr])
                mat_ptr += 1  # increment the matrix row pointer
                vec_ptr += 1  # increment the vector row pointer
            elif ind_mat < ind_vec:
                # only the matrix value is present; vector is 0 by definition
                d += abs(mat_row_data[mat_ptr])
                # BUG FIX: the original also incremented an undefined name
                # (`indptr_mat_`), copy-paste residue from the minimum kernel
                mat_ptr += 1  # increment the matrix row pointer
            else:
                # only the vector value is present; matrix is 0 by definition
                d += abs(vec_row_data[vec_ptr])
                vec_ptr += 1  # increment the vector row pointer
        # At this point mat_ptr or vec_ptr may have reached the end of the row,
        # but the *other* one may not have, so finish up with the other one if
        # necessary.
        for i in range(mat_ptr, mat_row_ind.shape[0]):
            d += abs(mat_row_data[i])
        for i in range(vec_ptr, vec_row_ind.shape[0]):
            d += abs(vec_row_data[i])
        out[row_idx] = d
    return out
"""
Sparse matrices don't broadcast; this is adapted from an awesome solution on SO:
https://stackoverflow.com/a/64920528/1519199
I added commentary and switched the logic from pairwise max to pairwise min
"""
import numba as nb
import numpy as np
from scipy import sparse
def sparse_elementwise_minimum(mat, vec, shrink_to_fit=True):
    """Element-wise minimum between `vec` and each row of `mat`.

    @param mat: a sparse M x N matrix (any scipy.sparse format)
    @param vec: a sparse 1 x N row vector (any scipy.sparse format)
    @param shrink_to_fit: if True, copy the output buffers down to their used
        size instead of keeping views into the over-allocated arrays

    @return a sparse M x N CSR matrix whose row i is minimum(mat[i, :], vec)
    """
    mat_csr = mat.tocsr()
    vec_csr = vec.tocsr()
    shape_mat = mat_csr.shape
    # the kernel returns the raw CSR triple of the result
    data, indices, indptr = sparse_elementwise_minimum_nb(
        mat_csr.indices,
        mat_csr.indptr,
        mat_csr.data,
        shape_mat,
        vec_csr.indices,
        vec_csr.data,
        shrink_to_fit,
    )
    # BUG FIX: the original assigned the kernel result to `res` but had no
    # return statement, so it always returned None. Assemble the CSR triple
    # into the result matrix and return it.
    return sparse.csr_matrix((data, indices, indptr), shape=shape_mat)
@nb.njit(cache=True)
def sparse_elementwise_minimum_nb(
    indices_mat,
    indptr_mat,
    data_mat,
    shape_mat,
    vec_row_ind,
    vec_row_data,
    shrink_to_fit,
):
    """Numba kernel: element-wise minimum of a sparse row vector against each
    CSR row, returned as the CSR triple (data, indices, indptr).

    Where exactly one operand has a stored value, the other is an implicit
    zero, so the minimum is stored only when that value is negative.
    """
    # upper bound on the number of non-zero output elements
    max_non_zero = shape_mat[0] * vec_row_data.shape[0] + data_mat.shape[0]
    # output data
    data_res = np.empty(max_non_zero, dtype=data_mat.dtype)
    # output (column) indexes
    indices_res = np.empty(max_non_zero, dtype=np.int32)
    # pointer to the current data / column-index slot
    data_res_p = 0
    # output row-slice pointers (always has length of nrow + 1)
    indptr_mat_res = np.empty((shape_mat[0] + 1), dtype=np.int32)
    indptr_mat_res[0] = 0
    indptr_mat_res_p = 1
    # running total of stored output values (cumulative, per CSR convention)
    indptr_mat_ = 0
    # for each row in the matrix
    for row_idx in range(shape_mat[0]):
        mat_row_ind = indices_mat[indptr_mat[row_idx] : indptr_mat[row_idx + 1]]
        mat_row_data = data_mat[indptr_mat[row_idx] : indptr_mat[row_idx + 1]]
        # merge-walk the row / vector index lists
        # (the usual lazy pointer incrementation trick)
        mat_ptr = 0
        vec_ptr = 0
        while mat_ptr < mat_row_ind.shape[0] and vec_ptr < vec_row_ind.shape[0]:
            ind_mat = mat_row_ind[mat_ptr]
            ind_vec = vec_row_ind[vec_ptr]
            if ind_mat == ind_vec:
                # value for both matrix and vector is present
                data_res[data_res_p] = min(
                    mat_row_data[mat_ptr], vec_row_data[vec_ptr]
                )  # do the math
                indices_res[data_res_p] = ind_mat  # set the output column index
                data_res_p += 1  # increment the output data pointer
                mat_ptr += 1  # increment the matrix row pointer
                vec_ptr += 1  # increment the vector row pointer
                indptr_mat_ += 1  # increment the output slice counter
            elif ind_mat < ind_vec:
                # only the matrix value is present; vector is 0 by definition,
                # so min(x, 0) is stored only when x is negative
                if mat_row_data[mat_ptr] < 0:
                    data_res[data_res_p] = mat_row_data[mat_ptr]
                    indices_res[data_res_p] = ind_mat
                    data_res_p += 1  # increment the output data pointer
                    indptr_mat_ += 1  # increment the output slice counter
                mat_ptr += 1  # increment the matrix row pointer
            else:
                # only the vector value is present; matrix is 0 by definition
                if vec_row_data[vec_ptr] < 0:
                    data_res[data_res_p] = vec_row_data[vec_ptr]
                    indices_res[data_res_p] = ind_vec
                    data_res_p += 1  # increment the output data pointer
                    indptr_mat_ += 1  # increment the output slice counter
                vec_ptr += 1  # increment the vector row pointer
        # At this point mat_ptr or vec_ptr may have reached the end of the row,
        # but the *other* one may not have, so finish up with the other one if
        # necessary.
        for i in range(mat_ptr, mat_row_ind.shape[0]):
            if mat_row_data[i] < 0:
                data_res[data_res_p] = mat_row_data[i]
                indices_res[data_res_p] = mat_row_ind[i]
                data_res_p += 1  # increment the output data pointer
                indptr_mat_ += 1  # increment the output slice counter
        for i in range(vec_ptr, vec_row_ind.shape[0]):
            if vec_row_data[i] < 0:
                data_res[data_res_p] = vec_row_data[i]
                indices_res[data_res_p] = vec_row_ind[i]
                data_res_p += 1  # increment the output data pointer
                indptr_mat_ += 1  # increment the output slice counter
        # close out this row's slice in the indptr array
        indptr_mat_res[indptr_mat_res_p] = indptr_mat_
        indptr_mat_res_p += 1
    # idiom fix: plain truth test instead of `== True`
    if shrink_to_fit:
        # copy so the oversized backing buffers can be freed
        data_res = np.copy(data_res[:data_res_p])
        indices_res = np.copy(indices_res[:data_res_p])
    else:
        data_res = data_res[:data_res_p]
        indices_res = indices_res[:data_res_p]
    return data_res, indices_res, indptr_mat_res
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment