import heapq
# Module-level constant; it is not referenced anywhere in the visible source.
# NOTE(review): presumably the number of top-scoring suggestions to keep
# (e.g. with heapq.nlargest) — confirm against the code that consumes it.
TOP_N = 5
# NOTE(review): this definition has lost its indentation and appears to be
# truncated: `phrase` and `no_percentage` are never used and nothing is
# returned in the visible lines. As written it does not parse; restore the
# original nesting and the missing tail before relying on it.
# `tf`, `hub` and `module_url` are not imported/defined in the visible source.
def get_similarity_suggestion(phrase, no_percentage=False):
# Create a dedicated graph and open a tf.compat.v1 session bound to it.
graph = tf.Graph()
with tf.compat.v1.Session(graph = graph) as session:
# Instantiate the TF Hub module identified by `module_url`.
embed = hub.Module(module_url)
# Here we combine both lists into a single set of unique phrases
# NOTE(review): no visible line creates df_canonicals["phrase"] — confirm it
# is populated before this runs.
messages = set(df_404s["phrase"].to_list() + df_canonicals["phrase"].to_list())
# NOTE(review): a set has no defined order, so `[:-1]` drops an arbitrary
# phrase — confirm this is intended.
messages = list(messages)[:-1]
# String placeholder that is passed through the module to get the encodings.
# NOTE(review): the session above is opened via tf.compat.v1, but this line and
# the `with` below use the top-level tf.placeholder / tf.Session names, which
# are TF1-style — confirm which TensorFlow version is targeted.
similarity_input_placeholder = tf.placeholder(tf.string, shape=(None))
similarity_message_encodings = embed(similarity_input_placeholder)
# NOTE(review): this `with` has no body in the visible source and rebinds
# `session`; the statements that ran inside it appear to be missing.
with tf.Session() as session:
import tensorflow_hub as hub

# Load a TF Hub model and embed two example sentences.
# NOTE(review): the model handle passed to hub.load is an empty string in this
# file; the real URL/path appears to have been stripped and must be restored
# before this can run.
embed = hub.load("")
embeddings = embed([
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding"])["outputs"]
# Use the print() function: the Python 2 print statement is a syntax error on
# Python 3.
print(embeddings)
# The following is example embedding output of 512 dimensions per sentence
import re

import pandas as pd

# Read both URL exports into data frames (404s first, then canonicals).
# NOTE(review): other lines in this file copy/download a file named
# "canonicals-urls.csv" (with an "s") — confirm which spelling is correct.
df_404s = pd.read_csv("404-urls.csv")
df_canonicals = pd.read_csv("canonical-urls.csv")


def _url_to_phrase(url):
    """Return *url* with each '/', '_', '-' and '.html' replaced by a space."""
    return re.sub(r"[/_-]|\.html", " ", url)


# Derive a space-separated phrase from every 404 URL.
df_404s["phrase"] = df_404s["404 url"].apply(_url_to_phrase)
# Colab-only setup. The `%cd` and `!cp` lines are IPython/notebook syntax (a
# magic command and a shell escape), not plain Python, so they only work when
# run inside a notebook cell.
from google.colab import drive
# NOTE(review): `drive` is imported but no drive.mount(...) call is visible —
# confirm the Drive is mounted under /drive before the next line runs.
# Change the working directory to the Drive folder.
%cd '/drive/My Drive/'
# Copy both CSV files into /content.
!cp canonicals-urls.csv 404-urls.csv /content
#!pip install gdown
import gdown

# Download the two URL sets by share link.
# Replace the placeholder strings below with the real Google Drive links.
canonical_urls = "Google Drive link to the canonicals URLs set"
error_urls = "Google Drive link to the 404 URLs set"

# The original line here was a garbled fragment (an assignment fused with the
# tail of a call, leaving an unmatched ")"), which is a syntax error.
# NOTE(review): reconstructed as gdown.download calls from the surviving
# `output=..., quiet=False)` arguments; the "404-urls.csv" target for
# error_urls is inferred from the file names used elsewhere in this file —
# confirm both calls against the intended workflow.
gdown.download(canonical_urls, output="canonicals-urls.csv", quiet=False)
gdown.download(error_urls, output="404-urls.csv", quiet=False)
#Please type: !pip install requests-html
from requests_html import HTMLSession
# Create an HTML session used to issue the request below.
session = HTMLSession()
# NOTE(review): the URL is an empty string here; the real address appears to
# have been stripped and must be filled in.
url = ""
# NOTE(review): the body of this `with` has lost its indentation and appears
# truncated — `r` is never used and `selector` is never applied to the
# response in the visible lines.
with session.get(url) as r:
# CSS selector string; how it is used is not visible in this source.
selector="#post-328471 > div:nth-child(2) > div > div > div.sej-article-content.gototop-pos"
