# hamletbatista/get_similarity_suggestions.py

## get_similarity_suggestions.py
import heapq

# Maximum number of highest-scoring candidates selected for each query phrase.
TOP_N = 5
# When True, selected candidates whose similarity score is not strictly above
# THRESHOLD_PROBABILITY are dropped from the result.
BEST_ONLY = False
# Cut-off used by the BEST_ONLY filter; it is compared against the raw
# inner-product score, not against the percentage shown in the output.
THRESHOLD_PROBABILITY = 0.65

def get_similarity_suggestion(phrase, no_percentage=False):
    """Return the stored phrases most similar to ``phrase``.

    The phrase is embedded with the TF-Hub module at ``module_url`` and scored
    against ``message_embeddings`` with an inner product. The ``TOP_N``
    best-scoring positions are looked up in ``df_404s`` and reported in
    descending score order. When ``BEST_ONLY`` is true, positions whose score
    is not above ``THRESHOLD_PROBABILITY`` are discarded.

    Assumes row ``i`` of ``message_embeddings`` corresponds to row ``i`` of
    ``df_404s`` -- TODO confirm against the code that builds the embeddings.

    Args:
        phrase: Text to find suggestions for.
        no_percentage: If true, return only the matched ``phrase`` values.

    Returns:
        A list of matched phrase values when ``no_percentage`` is true.
        Otherwise a list of ``[phrase, percentage, position]`` lists, where
        ``percentage`` is the score times 100 formatted with two decimals
        (a string) and ``position`` is the positional index into ``df_404s``.
    """
    # NOTE: a new graph and session are created, and the hub module is
    # instantiated and initialized, on every call.
    graph = tf.Graph()
    with tf.compat.v1.Session(graph=graph) as session:
        embed = hub.Module(module_url)
        similarity_input_placeholder = tf.compat.v1.placeholder(tf.string, shape=(None))
        similarity_message_encodings = embed(similarity_input_placeholder)
        session.run(tf.compat.v1.global_variables_initializer())
        session.run(tf.compat.v1.tables_initializer())
        to_find_embeddings = session.run(
            similarity_message_encodings,
            feed_dict={similarity_input_placeholder: [phrase]},
        )
        # A one-phrase batch gives one score per stored embedding, presumably
        # as an (N, 1) array. Flatten it so that indexing yields scalars:
        # calling float() on a 1-element array is deprecated since NumPy 1.25.
        result = np.inner(message_embeddings, to_find_embeddings).ravel()

    # heapq.nlargest returns the positions ordered from best to worst score.
    top_n_indexes = heapq.nlargest(TOP_N, range(len(result)), key=result.take)
    if BEST_ONLY:
        top_n_indexes = [i for i in top_n_indexes if result[i] > THRESHOLD_PROBABILITY]

    if no_percentage:
        return [df_404s.iloc[i]['phrase'] for i in top_n_indexes]

    return [
        [str(df_404s.iloc[i]['phrase']), '%.2f' % float(result[i] * 100), i]
        for i in top_n_indexes
    ]

# Try the lookup with the first 404 phrase in the data frame.
test_phrase = df_404s["phrase"].iloc[0]  # e.g. ' shop by collection wonderland rainbow'

# no_percentage defaults to False, so each match comes back as
# [phrase, percentage, position].
results = get_similarity_suggestion(test_phrase)
print(results)

# This is what the suggested matches look like.

#[[' shop by collection wonderland rainbow', '21.87', 0],
# [' catalog gold earrings gold cascade earrings p 200 ', '16.33', 1],
# [' shop by collection silver rain silver jewelry 1 ', '1.48', 2]]

# You can run the call below for every 404 URL to get the top matching suggestions for each one.
# Please try this as a homework exercise:
#
#results = get_similarity_suggestion(test_phrase, no_percentage=False)
	import heapq

	# Maximum number of highest-scoring candidates selected for each query phrase.
	TOP_N = 5
	# When True, selected candidates whose similarity score is not strictly above
	# THRESHOLD_PROBABILITY are dropped from the result.
	BEST_ONLY = False
	# Cut-off used by the BEST_ONLY filter; it is compared against the raw
	# inner-product score, not against the percentage shown in the output.
	THRESHOLD_PROBABILITY = 0.65

	def get_similarity_suggestion(phrase, no_percentage=False):
		"""Return the stored phrases most similar to ``phrase``.

		The phrase is embedded with the TF-Hub module at ``module_url`` and scored
		against ``message_embeddings`` with an inner product. The ``TOP_N``
		best-scoring positions are looked up in ``df_404s`` and reported in
		descending score order. When ``BEST_ONLY`` is true, positions whose score
		is not above ``THRESHOLD_PROBABILITY`` are discarded.

		Assumes row ``i`` of ``message_embeddings`` corresponds to row ``i`` of
		``df_404s`` -- TODO confirm against the code that builds the embeddings.

		Args:
			phrase: Text to find suggestions for.
			no_percentage: If true, return only the matched ``phrase`` values.

		Returns:
			A list of matched phrase values when ``no_percentage`` is true.
			Otherwise a list of ``[phrase, percentage, position]`` lists, where
			``percentage`` is the score times 100 formatted with two decimals
			(a string) and ``position`` is the positional index into ``df_404s``.
		"""
		# NOTE: a new graph and session are created, and the hub module is
		# instantiated and initialized, on every call.
		graph = tf.Graph()
		with tf.compat.v1.Session(graph=graph) as session:
			embed = hub.Module(module_url)
			similarity_input_placeholder = tf.compat.v1.placeholder(tf.string, shape=(None))
			similarity_message_encodings = embed(similarity_input_placeholder)
			session.run(tf.compat.v1.global_variables_initializer())
			session.run(tf.compat.v1.tables_initializer())
			to_find_embeddings = session.run(
				similarity_message_encodings,
				feed_dict={similarity_input_placeholder: [phrase]},
			)
			# A one-phrase batch gives one score per stored embedding, presumably
			# as an (N, 1) array. Flatten it so that indexing yields scalars:
			# calling float() on a 1-element array is deprecated since NumPy 1.25.
			result = np.inner(message_embeddings, to_find_embeddings).ravel()

		# heapq.nlargest returns the positions ordered from best to worst score.
		top_n_indexes = heapq.nlargest(TOP_N, range(len(result)), key=result.take)
		if BEST_ONLY:
			top_n_indexes = [i for i in top_n_indexes if result[i] > THRESHOLD_PROBABILITY]

		if no_percentage:
			return [df_404s.iloc[i]['phrase'] for i in top_n_indexes]

		return [
			[str(df_404s.iloc[i]['phrase']), '%.2f' % float(result[i] * 100), i]
			for i in top_n_indexes
		]

	# Try the lookup with the first 404 phrase in the data frame.
	test_phrase = df_404s["phrase"].iloc[0]  # e.g. ' shop by collection wonderland rainbow'

	# no_percentage defaults to False, so each match comes back as
	# [phrase, percentage, position].
	results = get_similarity_suggestion(test_phrase)
	print(results)

	# This is what the suggested matches look like.

	#[[' shop by collection wonderland rainbow', '21.87', 0],
	# [' catalog gold earrings gold cascade earrings p 200 ', '16.33', 1],
	# [' shop by collection silver rain silver jewelry 1 ', '1.48', 2]]

	# You can run the call below for every 404 URL to get the top matching suggestions for each one.
	# Please try this as a homework exercise:
	#
	#results = get_similarity_suggestion(test_phrase, no_percentage=False)