# alinazhanguwo/get_matches_df.py

## get_matches_df.py
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Build a DataFrame of name pairs from a sparse similarity matrix.

    Every non-zero entry ``(row, col)`` of ``sparse_matrix`` yields one
    output row holding ``name_vector[row]``, ``name_vector[col]`` and the
    value stored at that position.

    Args:
        sparse_matrix: SciPy sparse matrix whose non-zero entries are the
            scores to report.
        name_vector: One-dimensional sequence of names (list, array or
            pandas Series). It is indexed by position with the matrix
            row/column indices.
        top: Maximum number of entries to return, taken in the order the
            matrix yields its non-zero entries (they are not sorted by
            score). A falsy value (``0`` or ``None``) returns every
            non-zero entry. Values larger than the number of non-zero
            entries are clamped.

    Returns:
        pandas.DataFrame with the columns ``TITLE``, ``SIMILAR_TITLE`` and
        ``similairity_score``.

    Raises:
        ValueError: If ``top`` is negative.
    """
    if top and top < 0:
        raise ValueError("top must be non-negative")

    # Take indices and values from one COO view so they stay aligned, and
    # drop explicitly stored zeros exactly as ``nonzero()`` does. Indexing
    # ``sparse_matrix.data`` directly would be misaligned in that case.
    coo = sparse_matrix.tocoo()
    keep = coo.data != 0
    rows = coo.row[keep]
    cols = coo.col[keep]
    scores = coo.data[keep]

    # Clamp so that asking for more matches than exist cannot index past
    # the end of the arrays.
    nr_matches = min(top, scores.size) if top else scores.size

    # Positional lookup: matrix indices are positions, not Series labels.
    names = np.asarray(name_vector, dtype=object)

    return pd.DataFrame({
        'TITLE': names[rows[:nr_matches]],
        'SIMILAR_TITLE': names[cols[:nr_matches]],
        'similairity_score': scores[:nr_matches].astype(np.float64),
    })


# Build the match table from at most the first 10000 non-zero entries of
# `matches`, naming each side with the corresponding entry of df['TITLE'].
matches_df = get_matches_df(matches, df['TITLE'], top=10000)
# Remove all exact matches (scores of 0.99999 and above).
matches_df = matches_df[matches_df['similairity_score'] < 0.99999]
# Preview up to 10 random rows; the size is clamped because sample() raises
# when asked for more rows than the frame holds.
matches_df.sample(min(10, len(matches_df)))
	def get_matches_df(sparse_matrix, name_vector, top=100):
		"""Build a DataFrame of name pairs from a sparse similarity matrix.

		Every non-zero entry ``(row, col)`` of ``sparse_matrix`` yields one
		output row holding ``name_vector[row]``, ``name_vector[col]`` and the
		value stored at that position.

		Args:
			sparse_matrix: SciPy sparse matrix whose non-zero entries are the
				scores to report.
			name_vector: One-dimensional sequence of names (list, array or
				pandas Series). It is indexed by position with the matrix
				row/column indices.
			top: Maximum number of entries to return, taken in the order the
				matrix yields its non-zero entries (they are not sorted by
				score). A falsy value (``0`` or ``None``) returns every
				non-zero entry. Values larger than the number of non-zero
				entries are clamped.

		Returns:
			pandas.DataFrame with the columns ``TITLE``, ``SIMILAR_TITLE`` and
			``similairity_score``.

		Raises:
			ValueError: If ``top`` is negative.
		"""
		if top and top < 0:
			raise ValueError("top must be non-negative")

		# Take indices and values from one COO view so they stay aligned, and
		# drop explicitly stored zeros exactly as ``nonzero()`` does. Indexing
		# ``sparse_matrix.data`` directly would be misaligned in that case.
		coo = sparse_matrix.tocoo()
		keep = coo.data != 0
		rows = coo.row[keep]
		cols = coo.col[keep]
		scores = coo.data[keep]

		# Clamp so that asking for more matches than exist cannot index past
		# the end of the arrays.
		nr_matches = min(top, scores.size) if top else scores.size

		# Positional lookup: matrix indices are positions, not Series labels.
		names = np.asarray(name_vector, dtype=object)

		return pd.DataFrame({
			'TITLE': names[rows[:nr_matches]],
			'SIMILAR_TITLE': names[cols[:nr_matches]],
			'similairity_score': scores[:nr_matches].astype(np.float64),
		})



	# Build the match table from at most the first 10000 non-zero entries of
	# `matches`, naming each side with the corresponding entry of df['TITLE'].
	matches_df = get_matches_df(matches, df['TITLE'], top=10000)
	# Remove all exact matches (scores of 0.99999 and above).
	matches_df = matches_df[matches_df['similairity_score'] < 0.99999]
	# Preview up to 10 random rows; the size is clamped because sample() raises
	# when asked for more rows than the frame holds.
	matches_df.sample(min(10, len(matches_df)))