Skip to content

Instantly share code, notes, and snippets.

@alinazhanguwo
Created December 20, 2020 04:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alinazhanguwo/3b6c11cabbbd74c365d2f23efede2ec2 to your computer and use it in GitHub Desktop.
Save alinazhanguwo/3b6c11cabbbd74c365d2f23efede2ec2 to your computer and use it in GitHub Desktop.
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Turn the non-zero entries of a similarity matrix into a DataFrame.

    Each non-zero entry ``(row, col)`` of ``sparse_matrix`` becomes one output
    row holding the name at position ``row``, the name at position ``col`` and
    the stored value.

    Args:
        sparse_matrix: SciPy sparse matrix (anything providing ``tocoo()``).
        name_vector: Sequence of names (list, array or pandas Series). It is
            indexed by *position*, so a Series with a non-default index is
            handled correctly.
        top: Maximum number of pairs to return. The pairs are taken in the
            matrix's storage order; they are NOT ranked by score. A falsy
            value (``None`` or ``0``) returns every non-zero pair.

    Returns:
        pandas.DataFrame with columns ``TITLE``, ``SIMILAR_TITLE`` and
        ``similairity_score`` (spelling kept for backward compatibility).

    Raises:
        ValueError: If ``top`` is negative.
    """
    if top and top < 0:
        raise ValueError("top must be non-negative")

    # Take rows, columns and values from one COO view so they stay aligned.
    # Reading ``sparse_matrix.data`` next to ``nonzero()`` misaligns scores
    # whenever the matrix stores explicit zeros, because ``nonzero()`` drops
    # those entries while ``data`` still contains them.
    coo = sparse_matrix.tocoo()
    keep = coo.data != 0
    rows = coo.row[keep]
    cols = coo.col[keep]
    scores = coo.data[keep]

    # Clamp so that asking for more pairs than exist does not raise.
    nr_matches = min(top, scores.size) if top else scores.size

    # Positional lookup: matrix indices are positions, not Series labels.
    names = np.asarray(name_vector, dtype=object)

    return pd.DataFrame({
        'TITLE': names[rows[:nr_matches]],
        'SIMILAR_TITLE': names[cols[:nr_matches]],
        'similairity_score': scores[:nr_matches].astype(float),
    })
# `top` caps how many (row, column) pairs of the similarity matrix are turned
# into DataFrame rows; here at most 10000 pairs are requested.
matches_df = get_matches_df(matches, df['TITLE'], top=10000)
# Remove all exact matches
matches_df = matches_df[matches_df['similairity_score'] < 0.99999]
# Preview up to 10 random rows; the size is clamped because `sample` raises
# when asked for more rows than remain after filtering.
matches_df.sample(min(10, len(matches_df)))
@AbhilashaYadav14
Copy link

In the call `matches_df = get_matches_df(matches, df['TITLE'], top=10000)`, what does `top=10000` mean?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment