This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the Universal Sentence Encoder from TensorFlow Hub and embed two
# sample sentences.
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/3")
embeddings = embed([
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding"])["outputs"]
# print is a function in Python 3; the statement form is a SyntaxError there.
print(embeddings)
# The following is example embedding output, 512 dimensions per sentence
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import pandas as pd

# Load the URL sets into data frames.
df_404s = pd.read_csv("404-urls.csv")
# The file name matches the "canonicals-urls.csv" written by the download
# and copy cells; the previous "canonical-urls.csv" spelling did not exist.
df_canonicals = pd.read_csv("canonicals-urls.csv")

# Replace "/", "-", "_" and ".html" with spaces so each 404 URL becomes a
# plain phrase.
df_404s["phrase"] = df_404s["404 url"].apply(lambda x: re.sub(r"[/_-]|\.html", " ", x))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from google.colab import drive | |
drive.mount("/drive") | |
%cd '/drive/My Drive/' | |
!cp canonicals-urls.csv 404-urls.csv /content | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!pip install gdown
#https://pypi.org/project/gdown/
import gdown

# Placeholders: replace each string with the shareable Google Drive link.
canonical_urls = "Google Drive link to the canonicals URLs set"
error_urls = "Google Drive link to the 404 URLs set"

# Download both URL sets; error_urls was previously defined but never fetched.
gdown.download(canonical_urls, output="canonicals-urls.csv", quiet=False)
gdown.download(error_urls, output="404-urls.csv", quiet=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
total 12
-rw-r--r-- 1 root root 355 Oct 29 03:03 abs_bert_cnndm_sample.148000.candidate
-rw-r--r-- 1 root root 1319 Oct 29 03:03 abs_bert_cnndm_sample.148000.gold
-rw-r--r-- 1 root root 2695 Oct 29 03:03 abs_bert_cnndm_sample.148000.raw_src
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/content/PreSumm/src | |
abs test | |
[2019-10-29 04:11:27,312 INFO] Loading checkpoint from /content/PreSumm/models/CNN_DailyMail_Abstractive/model_step_148000.pt | |
Namespace(accum_count=1, alpha=0.95, batch_size=32, beam_size=5, bert_data_path='../bert_data/cnndm', beta1=0.9, beta2=0.999, block_trigram=True, dec_dropout=0.2, dec_ff_size=2048, dec_heads=8, dec_hidden_size=768, dec_layers=6, enc_dropout=0.2, enc_ff_size=512, enc_hidden_size=512, enc_layers=6, encoder='bert', ext_dropout=0.2, ext_ff_size=2048, ext_heads=8, ext_hidden_size=768, ext_layers=2, finetune_bert=True, generator_shard_size=32, gpu_ranks=[0], label_smoothing=0.1, large=False, load_from_extractive='', log_file='../logs/val_abs_bert_cnndm', lower=True, lr=1, lr_bert=0.002, lr_dec=0.002, max_grad_norm=0, max_length=200, max_pos=512, max_src_nsents=100, max_src_ntokens_per_sent=200, max_tgt_len=140, max_tgt_ntokens=500, min_length=50, min_src_nsents=3, min_src_ntokens_per_sent=5, min_tgt_ntokens=5, mode='test', model_path='../../models/', optim='adam |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Please type: !pip install requests-html
from requests_html import HTMLSession

session = HTMLSession()
url = "https://www.searchenginejournal.com/uncover-powerful-data-stories-python/328471/"

# Fetch the page; the response is released when the with-block exits.
with session.get(url) as r:
    # CSS selector targeting the `sej-article-content` element of post 328471.
    selector = "#post-328471 > div:nth-child(2) > div > div > div.sej-article-content.gototop-pos"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!unzip /content/PreSumm/models/bertext_cnndm_transformer.zip | |
!unzip /content/PreSumm/models/bertsumextabs_cnndm_final_model.zip | |
!unzip /content/PreSumm/models/bertsumextabs_xsum_final_model.zip | |
!mkdir /content/PreSumm/models/CNN_DailyMail_Extractive | |
!mkdir /content/PreSumm/models/CNN_DailyMail_Abstractive | |
!mkdir /content/PreSumm/models/XSUM_OneSentence | |
!mv /content/PreSumm/models/bertext_cnndm_transformer.pt /content/PreSumm/models/CNN_DailyMail_Extractive | |
!mv /content/PreSumm/models/model_step_148000.pt /content/PreSumm/models/CNN_DailyMail_Abstractive |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%cd /content/PreSumm/models | |
#CNN/DM Extractive bertext_cnndm_transformer.pt | |
!gdown https://drive.google.com/uc?id=1kKWoV0QCbeIuFt85beQgJ4v0lujaXobJ&export=download | |
#CNN/DM Abstractive model_step_148000.pt | |
!gdown https://drive.google.com/uc?id=1-IKVCtc4Q-BdZpjXc4s70_fRsWnjtYLr&export=download | |
#XSUM (One Sentence Summary) model_step_30000.pt | |
!gdown https://drive.google.com/uc?id=1H50fClyTkNprWJNh10HWdGEdDdQIkzsI&export=download |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- /content/PreSumm/src/summarizer.py 2019-10-29 02:12:01.951535276 +0000 | |
+++ /content/PreSumm/src/summarizer2.py 2019-10-29 03:47:19.168619951 +0000 | |
@@ -1,6 +1,6 @@ | |
#!/usr/bin/env python | |
""" | |
- Inference entrance | |
+ Main training workflow | |
""" | |
from __future__ import division | |