Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save spencebeecher/fc53a12dd9a6b313b5aace697aa1e356 to your computer and use it in GitHub Desktop.
Save spencebeecher/fc53a12dd9a6b313b5aace697aa1e356 to your computer and use it in GitHub Desktop.
# coding: utf-8
# In[1]:
import pysparnn as snn
# In[2]:
import pysparnn.matrix_distance
# In[3]:
import sklearn
import sklearn.metrics.pairwise
import scipy.sparse as sparse
import numpy as np
class UserCustomDistance(pysparnn.matrix_distance.MatrixMetricSearch):
    """Overlap-based distance evaluated pair by pair through scikit-learn.

    The distance between two rows ``u`` and ``v`` is
    ``max_overlap - sum(minimum(u, v))``. Every pair of rows goes through a
    Python callback, so this is only practical for small matrices.
    """

    def __init__(self, features, records_data):
        """Store the features and the constant the overlap is subtracted from.

        Args:
            features: Matrix with one row per record; forwarded unchanged to
                the base class, which exposes it as ``self.matrix``.
            records_data: Forwarded unchanged to the base class.
        """
        super(UserCustomDistance, self).__init__(features, records_data)
        # Placeholder bound kept from the original ("for testing purpose"):
        # the number of indexed rows.
        self.max_overlap = self.matrix.shape[0]

    @staticmethod
    def features_to_matrix(features):
        """Return ``features`` unchanged; no conversion is applied."""
        return features

    @staticmethod
    def vstack(matrix_list):
        """Stack matrices row-wise, keeping sparse input sparse.

        ``np.vstack`` does not understand SciPy sparse matrices, so any sparse
        block routes the call through ``scipy.sparse.vstack``; purely dense
        input is still stacked with ``np.vstack``.
        """
        blocks = list(matrix_list)
        if any(sparse.issparse(block) for block in blocks):
            return sparse.vstack(blocks, format="csr")
        return np.vstack(blocks)

    def _transform_value(self, v):
        """Return ``v`` unchanged."""
        return v

    @staticmethod
    def _to_dense(row):
        """Return ``row`` as a dense ndarray, whether it is sparse or dense."""
        # ``.toarray()`` replaces the ``.A`` shortcut, which newer SciPy
        # releases removed from sparse matrices.
        return row.toarray() if sparse.issparse(row) else np.asarray(row)

    def user_distance_metric(self, u, v):
        """Return ``max_overlap`` minus the element-wise-minimum overlap.

        Args:
            u: One row (sparse or dense).
            v: Another row with the same number of columns.

        Returns:
            ``self.max_overlap - sum(minimum(u, v))``.
        """
        overlap = np.minimum(self._to_dense(u), self._to_dense(v)).sum()
        return self.max_overlap - overlap

    def _distance(self, a_matrix):
        """Return the distances from every row of ``a_matrix`` to every indexed row.

        Returns:
            Array of shape ``(a_matrix.shape[0], self.matrix.shape[0])``.
        """
        return sklearn.metrics.pairwise.pairwise_distances(
            a_matrix, self.matrix, metric=self.user_distance_metric)
# In[4]:
class SpencerUserCustomDistance(pysparnn.matrix_distance.MatrixMetricSearch):
    """Vectorised overlap distance computed with a single matrix product.

    ``_distance`` uses the dot product between a query row and an indexed row
    as the overlap. That equals ``sum(minimum(u, v))`` only when both rows
    hold 0/1 values; for other data the two measures differ.
    """

    def __init__(self, features, records_data):
        """Store the features and the constant used by ``user_distance_metric``.

        Args:
            features: Matrix with one row per record; forwarded unchanged to
                the base class, which exposes it as ``self.matrix``.
            records_data: Forwarded unchanged to the base class.
        """
        super(SpencerUserCustomDistance, self).__init__(features, records_data)
        # Placeholder bound kept from the original ("for testing purpose"):
        # the number of indexed rows. Only ``user_distance_metric`` reads it.
        self.max_overlap = self.matrix.shape[0]

    @staticmethod
    def features_to_matrix(features):
        """Return ``features`` unchanged; no conversion is applied."""
        return features

    @staticmethod
    def vstack(matrix_list):
        """Stack matrices row-wise, keeping sparse input sparse.

        ``np.vstack`` does not understand SciPy sparse matrices, so any sparse
        block routes the call through ``scipy.sparse.vstack``; purely dense
        input is still stacked with ``np.vstack``.
        """
        blocks = list(matrix_list)
        if any(sparse.issparse(block) for block in blocks):
            return sparse.vstack(blocks, format="csr")
        return np.vstack(blocks)

    def _transform_value(self, v):
        """Map a positive value to ``1.0`` and anything else to ``0.0``."""
        return 1.0 if v > 0 else 0.0

    @staticmethod
    def _to_dense(row):
        """Return ``row`` as a dense ndarray, whether it is sparse or dense."""
        # ``.toarray()`` replaces the ``.A`` shortcut, which newer SciPy
        # releases removed from sparse matrices.
        return row.toarray() if sparse.issparse(row) else np.asarray(row)

    def user_distance_metric(self, u, v):
        """Return ``max_overlap`` minus the element-wise-minimum overlap.

        Pairwise reference form of the metric; ``_distance`` does not call it.
        """
        overlap = np.minimum(self._to_dense(u), self._to_dense(v)).sum()
        return self.max_overlap - overlap

    def _distance(self, a_matrix):
        """Return ``total(query_i) - query_i . record_j`` for every pair.

        Args:
            a_matrix: Query matrix with one row per query and the same number
                of columns as ``self.matrix``.

        Returns:
            Dense ndarray of shape ``(a_matrix.shape[0], self.matrix.shape[0])``.
        """
        # Rows of the result correspond to queries, columns to indexed records.
        overlap = self.matrix.dot(a_matrix.transpose()).transpose()
        if sparse.issparse(overlap):
            overlap = overlap.toarray()
        # The per-row offset has to come from the query rows so that it lines
        # up with the result rows for any number of queries. The previous code
        # used the indexed matrix's row totals as a column vector, which only
        # broadcast correctly when the query had exactly as many rows as the
        # index, and gives the same numbers as this form when the index is
        # queried with itself.
        # NOTE(review): confirm the offset is meant to be the query total and
        # not the indexed record's total.
        query_totals = np.asarray(a_matrix.sum(axis=1)).reshape(-1, 1)
        return query_totals - np.asarray(overlap)
# In[5]:
# Small fixtures for the demo cells below: B is a 3x3 all-ones matrix, A is a
# 3x3 matrix with non-binary entries and one zero.
B = sparse.csr_matrix([[1, 1, 1]] * 3)
A = sparse.csr_matrix([[2, 2, 1], [1, 0, 1], [1, 1, 1]])
# In[6]:
# Index A with the pairwise-callback implementation (record ids 0..n-1) and
# compute the distance from every row of A to every indexed row. The bare
# expression is a notebook cell output; run as a script it is discarded.
u_dist = UserCustomDistance(A, range(A.shape[0]))
u_dist._distance(A)
# In[7]:
# Same self-query against the vectorised implementation, for comparison.
su_dist = SpencerUserCustomDistance(A, range(A.shape[0]))
su_dist._distance(A)
# In[8]:
import scipy
# In[10]:
from scipy.sparse import rand
# In[11]:
# One million records in a one-million-column space; at this density each row
# holds about 20 non-zero entries on average (1e6 * 2e-5).
num_records = 1000000
matrix = rand(num_records, num_records, density=0.00002, format='csr')
# In[12]:
# Notebook cell output: mean of the per-row sums of the random matrix.
matrix.sum(axis=1).mean()
# In[13]:
import pysparnn.cluster_index as ci
# In[ ]:
import time
# In[ ]:
# Time the index construction. perf_counter() is a monotonic, high-resolution
# clock, so the interval is not affected by system clock adjustments.
t0 = time.perf_counter()
cp2 = ci.MultiClusterIndex(
    matrix,
    np.arange(num_records),  # record ids 0..num_records-1, built without a Python-level loop
    distance_type=SpencerUserCustomDistance,
)
t1 = time.perf_counter()
# In[ ]:
# Notebook cell output: elapsed build time in seconds.
t1 - t0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment