Skip to content

Instantly share code, notes, and snippets.

hamletbatista

Block or report user

Report or block hamletbatista

Hide content and notifications from this user.

Learn more about blocking users

Contact Support about this user’s behavior.

Learn more about reporting abuse

Report abuse
View GitHub Profile
View get_similarity_suggestions.py
import heapq
# Module-level settings for the similarity-suggestion code that follows.
# NOTE(review): the code that reads these constants is not visible in this
# excerpt, so the descriptions below are assumptions to confirm against the
# full gist.
# Presumably the number of highest-scoring matches to keep.
TOP_N = 5
# Presumably switches between returning only the single best match and the top N.
BEST_ONLY = False
# Presumably the minimum similarity score a match needs before it is suggested.
THRESHOLD_PROBABILITY = 0.65
def get_similarity_suggestion(phrase, no_percentage=False):
graph = tf.Graph()
with tf.compat.v1.Session(graph = graph) as session:
embed = hub.Module(module_url)
View prepare_phrase_corpus.py
# Combine the phrases from both data frames into one collection of unique phrases.
messages = set(df_404s["phrase"].to_list() + df_canonicals["phrase"].to_list())
# NOTE(review): a set has no defined order, so this slice discards one
# arbitrary phrase — confirm that dropping an element here is intentional.
messages = list(messages)[:-1]
# String placeholder that is passed to `embed` (defined elsewhere) to build
# the encoding op.
similarity_input_placeholder = tf.placeholder(tf.string, shape=(None))
similarity_message_encodings = embed(similarity_input_placeholder)
with tf.Session() as session:
    # The statements of a `with` block must be indented; run the variable
    # initializer first so later session calls see initialized variables.
    session.run(tf.global_variables_initializer())
View universal_sentence_encoder.py
import tensorflow_hub as hub
# Load the Universal Sentence Encoder module from TensorFlow Hub.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/3")
# Encode two example sentences and keep the "outputs" entry of the result.
embeddings = embed([
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding"])["outputs"]
# print() is called as a function: the bare `print x` statement is a
# SyntaxError on Python 3, while this form also works on Python 2.
print(embeddings)
# The following is example embedding output, 512 dimensions per sentence
View convert_urls_to_phrases.py
import pandas as pd
# Read the 404 URL list and the canonical URL list into data frames.
# NOTE(review): other snippets in this listing write the canonical set as
# "canonicals-urls.csv" (with an "s") — confirm which file name exists on disk.
df_404s = pd.read_csv("404-urls.csv")
df_canonicals = pd.read_csv("canonical-urls.csv")
import re


def _url_to_phrase(url):
    """Return *url* with every "/", "_", "-" and ".html" replaced by a space."""
    return re.sub(r"[/_-]|\.html", " ", url)


# Derive a space-separated phrase from each 404 URL.
df_404s["phrase"] = df_404s["404 url"].apply(_url_to_phrase)
View mount_google_drive.py
from google.colab import drive
# Mount Google Drive at /drive.
drive.mount("/drive")
# Switch the working directory to the "My Drive" folder of the mounted drive.
%cd '/drive/My Drive/'
# Copy both URL exports from the current directory into /content.
!cp canonicals-urls.csv 404-urls.csv /content
View download_drive_files.py
#!pip install gdown
#https://pypi.org/project/gdown/
import gdown
# Placeholders: replace each string with the Google Drive link of the
# corresponding URL set before running.
canonical_urls = "Google Drive link to the canonicals URLs set"
error_urls = "Google Drive link to the 404 URLs set"

# Download the canonical URL set and save it as canonicals-urls.csv,
# with progress output enabled (quiet=False).
gdown.download(
    canonical_urls,
    output="canonicals-urls.csv",
    quiet=False,
)
View bert_listing.sh
total 12
-rw-r--r-- 1 root root 355 Oct 29 03:03 abs_bert_cnndm_sample.148000.candidate
-rw-r--r-- 1 root root 1319 Oct 29 03:03 abs_bert_cnndm_sample.148000.gold
-rw-r--r-- 1 root root 2695 Oct 29 03:03 abs_bert_cnndm_sample.148000.raw_src
View presum_output.sh
/content/PreSumm/src
abs test
[2019-10-29 04:11:27,312 INFO] Loading checkpoint from /content/PreSumm/models/CNN_DailyMail_Abstractive/model_step_148000.pt
Namespace(accum_count=1, alpha=0.95, batch_size=32, beam_size=5, bert_data_path='../bert_data/cnndm', beta1=0.9, beta2=0.999, block_trigram=True, dec_dropout=0.2, dec_ff_size=2048, dec_heads=8, dec_hidden_size=768, dec_layers=6, enc_dropout=0.2, enc_ff_size=512, enc_hidden_size=512, enc_layers=6, encoder='bert', ext_dropout=0.2, ext_ff_size=2048, ext_heads=8, ext_hidden_size=768, ext_layers=2, finetune_bert=True, generator_shard_size=32, gpu_ranks=[0], label_smoothing=0.1, large=False, load_from_extractive='', log_file='../logs/val_abs_bert_cnndm', lower=True, lr=1, lr_bert=0.002, lr_dec=0.002, max_grad_norm=0, max_length=200, max_pos=512, max_src_nsents=100, max_src_ntokens_per_sent=200, max_tgt_len=140, max_tgt_ntokens=500, min_length=50, min_src_nsents=3, min_src_ntokens_per_sent=5, min_tgt_ntokens=5, mode='test', model_path='../../models/', optim='adam
View download_sej_article.py
#Please type: !pip install requests-html
from requests_html import HTMLSession
# Open a requests-html session used to fetch the article page.
session = HTMLSession()
url = "https://www.searchenginejournal.com/uncover-powerful-data-stories-python/328471/"
with session.get(url) as r:
    # The statements of a `with` block must be indented.
    # CSS selector, presumably targeting the article body container — confirm
    # against the page markup.
    selector = "#post-328471 > div:nth-child(2) > div > div > div.sej-article-content.gototop-pos"
View uncompress_and_move_models.sh
# Unpack the three downloaded model archives.
# NOTE(review): unzip extracts into the current working directory, while the
# mv commands below expect the .pt files under /content/PreSumm/models —
# confirm the notebook's working directory at this point.
!unzip /content/PreSumm/models/bertext_cnndm_transformer.zip
!unzip /content/PreSumm/models/bertsumextabs_cnndm_final_model.zip
!unzip /content/PreSumm/models/bertsumextabs_xsum_final_model.zip
# Create one folder per model; -p keeps the cell re-runnable when the folders
# already exist (plain mkdir fails with "File exists").
!mkdir -p /content/PreSumm/models/CNN_DailyMail_Extractive
!mkdir -p /content/PreSumm/models/CNN_DailyMail_Abstractive
!mkdir -p /content/PreSumm/models/XSUM_OneSentence
# Move each extracted checkpoint into its model folder.
!mv /content/PreSumm/models/bertext_cnndm_transformer.pt /content/PreSumm/models/CNN_DailyMail_Extractive
!mv /content/PreSumm/models/model_step_148000.pt /content/PreSumm/models/CNN_DailyMail_Abstractive
You can’t perform that action at this time.