Skip to content

Instantly share code, notes, and snippets.

Hamlet Batista hamletbatista

Block or report user

Report or block hamletbatista

Hide content and notifications from this user.

Learn more about blocking users

Contact Support about this user’s behavior.

Learn more about reporting abuse

Report abuse
View GitHub Profile
View predictive_prefetching.html
<!DOCTYPE html>
<html lang="en">
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<script src="index.js"></script>
import heapq
TOP_N = 5
def get_similarity_suggestion(phrase, no_percentage=False):
graph = tf.Graph()
with tf.compat.v1.Session(graph = graph) as session:
embed = hub.Module(module_url)
# Here we combine both lists into a single set of unique phrases
messages = set(df_404s["phrase"].to_list() + df_canonicals["phrase"].to_list())
messages = list(messages)[:-1]
similarity_input_placeholder = tf.placeholder(tf.string, shape=(None))
similarity_message_encodings = embed(similarity_input_placeholder)
with tf.Session() as session:
import tensorflow_hub as hub
embed = hub.load("")
embeddings = embed([
"The quick brown fox jumps over the lazy dog.",
"I am a sentence for which I would like to get its embedding"])["outputs"]
print embeddings
# The following are example embedding output of 512 dimensions per sentence
import pandas as pd
#load URL sets to data frames
df_404s = pd.read_csv("404-urls.csv")
df_canonicals = pd.read_csv("canonical-urls.csv")
import re
#replace / - _ and .html with spaces
df_404s["phrase"] = df_404s["404 url"].apply(lambda x: re.sub(r"[/_-]|\.html", " ", x))
from google.colab import drive
%cd '/drive/My Drive/'
!cp canonicals-urls.csv 404-urls.csv /content
#!pip install gdown
import gdown
canonical_urls="Google Drive link to the canonicals URLs set"
error_urls="Google Drive link to the 404 URLs set", output="canonicals-urls.csv", quiet=False)
total 12
-rw-r--r-- 1 root root 355 Oct 29 03:03 abs_bert_cnndm_sample.148000.candidate
-rw-r--r-- 1 root root 1319 Oct 29 03:03
-rw-r--r-- 1 root root 2695 Oct 29 03:03 abs_bert_cnndm_sample.148000.raw_src
abs test
[2019-10-29 04:11:27,312 INFO] Loading checkpoint from /content/PreSumm/models/CNN_DailyMail_Abstractive/
Namespace(accum_count=1, alpha=0.95, batch_size=32, beam_size=5, bert_data_path='../bert_data/cnndm', beta1=0.9, beta2=0.999, block_trigram=True, dec_dropout=0.2, dec_ff_size=2048, dec_heads=8, dec_hidden_size=768, dec_layers=6, enc_dropout=0.2, enc_ff_size=512, enc_hidden_size=512, enc_layers=6, encoder='bert', ext_dropout=0.2, ext_ff_size=2048, ext_heads=8, ext_hidden_size=768, ext_layers=2, finetune_bert=True, generator_shard_size=32, gpu_ranks=[0], label_smoothing=0.1, large=False, load_from_extractive='', log_file='../logs/val_abs_bert_cnndm', lower=True, lr=1, lr_bert=0.002, lr_dec=0.002, max_grad_norm=0, max_length=200, max_pos=512, max_src_nsents=100, max_src_ntokens_per_sent=200, max_tgt_len=140, max_tgt_ntokens=500, min_length=50, min_src_nsents=3, min_src_ntokens_per_sent=5, min_tgt_ntokens=5, mode='test', model_path='../../models/', optim='adam
#Please type: !pip install requests-html
from requests_html import HTMLSession
session = HTMLSession()
url = ""
with session.get(url) as r:
selector="#post-328471 > div:nth-child(2) > div > div > div.sej-article-content.gototop-pos"
You can’t perform that action at this time.