Skip to content

Instantly share code, notes, and snippets.

View hamletbatista's full-sized avatar

Hamlet Batista hamletbatista

View GitHub Profile
import tensorflow_hub as hub

# Load the Universal Sentence Encoder (version 3) from TF Hub.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/3")

# Encode two example sentences; the embeddings are read from the "outputs"
# entry of the value returned by the encoder.
embeddings = embed([
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding",
])["outputs"]

# Use the print() call form: the bare Python 2 print statement is a
# SyntaxError on Python 3, while print(x) with one argument works on both.
print(embeddings)
# The following is example embedding output of 512 dimensions per sentence
import re

import pandas as pd

# Read the two URL sets into data frames.
# NOTE(review): other snippets in this file write "canonicals-urls.csv"
# (with an extra "s"); confirm which filename is the intended one.
df_404s = pd.read_csv("404-urls.csv")
df_canonicals = pd.read_csv("canonical-urls.csv")

# Matches the URL separators "/", "_", "-" and the literal ".html" suffix.
_separator_pattern = re.compile(r"[/_-]|\.html")


def _url_to_phrase(url):
    """Return *url* with every separator match replaced by a single space."""
    return _separator_pattern.sub(" ", url)


# Derive a space-separated phrase from each 404 URL.
df_404s["phrase"] = df_404s["404 url"].apply(_url_to_phrase)
# Colab notebook cell: make the URL CSVs stored on Google Drive available
# under /content.
from google.colab import drive
# Mount Google Drive at /drive (Colab prompts for authorization).
drive.mount("/drive")
# IPython magic: change the working directory to the Drive root folder.
%cd '/drive/My Drive/'
# Shell escape: copy both CSV files from the current directory into /content.
!cp canonicals-urls.csv 404-urls.csv /content
#!pip install gdown
#https://pypi.org/project/gdown/
import gdown

# Placeholders: replace each string with the shareable Google Drive link
# for the corresponding URL set before running.
canonical_urls="Google Drive link to the canonicals URLs set"
error_urls="Google Drive link to the 404 URLs set"

# Download the canonical URL set.
gdown.download(canonical_urls, output="canonicals-urls.csv", quiet=False)
# error_urls was assigned but never used, so the 404 set was never fetched;
# download it as well, under the filename the other snippets in this file use.
gdown.download(error_urls, output="404-urls.csv", quiet=False)
total 12
-rw-r--r-- 1 root root 355 Oct 29 03:03 abs_bert_cnndm_sample.148000.candidate
-rw-r--r-- 1 root root 1319 Oct 29 03:03 abs_bert_cnndm_sample.148000.gold
-rw-r--r-- 1 root root 2695 Oct 29 03:03 abs_bert_cnndm_sample.148000.raw_src
/content/PreSumm/src
abs test
[2019-10-29 04:11:27,312 INFO] Loading checkpoint from /content/PreSumm/models/CNN_DailyMail_Abstractive/model_step_148000.pt
Namespace(accum_count=1, alpha=0.95, batch_size=32, beam_size=5, bert_data_path='../bert_data/cnndm', beta1=0.9, beta2=0.999, block_trigram=True, dec_dropout=0.2, dec_ff_size=2048, dec_heads=8, dec_hidden_size=768, dec_layers=6, enc_dropout=0.2, enc_ff_size=512, enc_hidden_size=512, enc_layers=6, encoder='bert', ext_dropout=0.2, ext_ff_size=2048, ext_heads=8, ext_hidden_size=768, ext_layers=2, finetune_bert=True, generator_shard_size=32, gpu_ranks=[0], label_smoothing=0.1, large=False, load_from_extractive='', log_file='../logs/val_abs_bert_cnndm', lower=True, lr=1, lr_bert=0.002, lr_dec=0.002, max_grad_norm=0, max_length=200, max_pos=512, max_src_nsents=100, max_src_ntokens_per_sent=200, max_tgt_len=140, max_tgt_ntokens=500, min_length=50, min_src_nsents=3, min_src_ntokens_per_sent=5, min_tgt_ntokens=5, mode='test', model_path='../../models/', optim='adam
#Please type: !pip install requests-html
from requests_html import HTMLSession

session = HTMLSession()
url = "https://www.searchenginejournal.com/uncover-powerful-data-stories-python/328471/"

# Fetch the page; the response is used as a context manager so it is
# released when the block exits.
with session.get(url) as r:
    # The body of a `with` statement must be indented; the flattened
    # original was an IndentationError.
    # CSS selector, presumably targeting the article content container
    # (going by the "sej-article-content" class) - confirm against the page.
    selector="#post-328471 > div:nth-child(2) > div > div > div.sej-article-content.gototop-pos"
# Notebook shell escapes: unpack the three downloaded PreSumm model archives
# and sort the checkpoints into one directory per model.

# Extract each archive (unzip writes into the current working directory).
!unzip /content/PreSumm/models/bertext_cnndm_transformer.zip
!unzip /content/PreSumm/models/bertsumextabs_cnndm_final_model.zip
!unzip /content/PreSumm/models/bertsumextabs_xsum_final_model.zip
# Create one target directory per model.
!mkdir /content/PreSumm/models/CNN_DailyMail_Extractive
!mkdir /content/PreSumm/models/CNN_DailyMail_Abstractive
!mkdir /content/PreSumm/models/XSUM_OneSentence
# Move the extracted checkpoints into their directories.
# NOTE(review): no matching mv for the XSUM checkpoint is visible in this
# snippet - confirm whether it was cut off.
!mv /content/PreSumm/models/bertext_cnndm_transformer.pt /content/PreSumm/models/CNN_DailyMail_Extractive
!mv /content/PreSumm/models/model_step_148000.pt /content/PreSumm/models/CNN_DailyMail_Abstractive
# Download the three pre-trained PreSumm checkpoints from Google Drive into
# the models directory.
%cd /content/PreSumm/models
# The URLs are quoted because they contain "&": unquoted, the shell would
# run gdown in the background with the URL cut off at the "&" and treat
# "export=download" as a separate variable assignment.
#CNN/DM Extractive bertext_cnndm_transformer.pt
!gdown "https://drive.google.com/uc?id=1kKWoV0QCbeIuFt85beQgJ4v0lujaXobJ&export=download"
#CNN/DM Abstractive model_step_148000.pt
!gdown "https://drive.google.com/uc?id=1-IKVCtc4Q-BdZpjXc4s70_fRsWnjtYLr&export=download"
#XSUM (One Sentence Summary) model_step_30000.pt
!gdown "https://drive.google.com/uc?id=1H50fClyTkNprWJNh10HWdGEdDdQIkzsI&export=download"
--- /content/PreSumm/src/summarizer.py 2019-10-29 02:12:01.951535276 +0000
+++ /content/PreSumm/src/summarizer2.py 2019-10-29 03:47:19.168619951 +0000
@@ -1,6 +1,6 @@
#!/usr/bin/env python
"""
- Inference entrance
+ Main training workflow
"""
from __future__ import division