This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_link(soup):
    """Return the canonical URL declared in the page's ``og:url`` meta tag.

    Args:
        soup: Parsed document exposing ``.head.find(name, attrs)``, e.g. a
            ``BeautifulSoup`` object.

    Returns:
        The ``content`` attribute of ``<meta property="og:url">``, or ``None``
        when the document has no head, no such tag, or no ``content``.
    """
    try:
        meta = soup.head.find('meta', {"property": "og:url"})
        return meta.attrs['content']
    except (AttributeError, KeyError, TypeError):
        # Missing head/tag surfaces as AttributeError on None, a missing
        # attribute as KeyError; anything else is a real bug and propagates.
        return None
| def get_publication(soup): | |
| try: | |
| card = soup.main.div.div.nextSibling |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| class MediumPage: | |
| """ | |
| ## DIVS = ['animation', 'embed', 'images', 'banner', 'video', 'pull-quote'] + error | |
| ## TAGS = ['pre', 'h2', None, 'ul', 'div', 'h5', 'h3', 'ol', 'p', 'h1', 'h6', 'h4', 'blockquote'] | |
| FEATURES = ["author", "author_fw", "author_de", "images", "headings", "content"] | |
| """ | |
| def __init__(self, id, session, features=FEATURES, download=False, soup=None): | |
| self.id = id | |
| self.html_path = f"{FOLDER}/html/{id}.html" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Publication domains accepted by default when validating article URLs.
VALID = [
    'towardsdatascience.com',
    'uxdesign.cc',
    'writingcooperative.com',
    'bettermarketing.pub',
    'proandroiddev.com',
    'chatbotslife.com',
    'chatbotsmagazine.com',
    'artplusmarketing.com',
]
| def domain_validation(url, valid=VALID): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| CSV = "links_csv.csv" | |
| if __name__ == "__main__": | |
| # Initiate Session, maximum connects, retries and backoff factor | |
| options = webdriver.ChromeOptions() | |
| for arg in ARGS: | |
| options.add_argument(arg) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import logging | |
| import os | |
| import csv | |
| from bs4 import BeautifulSoup | |
| from selenium import webdriver | |
| logging.basicConfig( | |
| filename="./logs/web-scraper.log", | |
| filemode="a+", | |
| format="%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s", |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| class Brain(nn.Module): | |
| def __init__(self,flags,n_vocab,): | |
| super(Brain,self).__init__() | |
| self.flags=flags | |
| self.embedding= nn.Embedding(n_vocab, flags.embedding_size) | |
| self.lstm=nn.LSTM(flags.embedding_size, flags.lstm_size, batch_first=True, num_layers=flags.num_layers, dropout=flags.dropout) | |
| self.dense = nn.Linear(flags.lstm_size, n_vocab) | |
| self.criterion = nn.CrossEntropyLoss() | |
| self.optimizer = torch.optim.Adam(self.parameters(), lr=self.flags.lr) | |
| self.loss_value=0 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| read_path="{}.txt" | |
| write_path="{}_.txt" | |
| file="cano" | |
| flags = Namespace( | |
| data_dir=read_path.format(file), #Directory to load your cleaned data | |
| batch_size=256, #Batch size for input (set it according to memory available to you | |
| seq_size=64, #Length of the sequence, a very important parameter | |
| embedding_size=256, #Size of embeddings | |
| num_layers=1, #Number of lstm layers | |
| is_bidirectional=False, #Bidirectionality |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| class Data : | |
| def __init__(self,flags): | |
| self.flags=flags #Set the flags to maintain consistency throughout the model | |
| # Pre Process Data | |
| self.pre_process() | |
| # Prepare Data For Training | |
| @calculate_time | |
| def pre_process(self): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import string

# Read the whole text; the context manager closes the file handle promptly
# instead of leaving it to the garbage collector.
with open('data.txt') as fh:
    data = fh.read()

chars_set = set(data)  # Every distinct character that occurs in the text
print('There are {} characters in the text file'.format(len(chars_set)))

# Characters treated as "English": ASCII letters, digits, punctuation,
# space and newline.
english = set(string.ascii_letters + string.digits + string.punctuation + ' \n')

# Anything outside that set is reported as foreign.
print('foreign characters are : ', chars_set - english)
#There are 97 characters in the text file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib.request

# This text is available for research; it is saved to the current working
# directory as data.txt.
# The message names urllib.request, the module actually used (urllib2 is the
# Python 2 module).
print('Beginning file download with urllib.request...')
url = 'https://sherlock-holm.es/stories/plain-text/cnus.txt'
urllib.request.urlretrieve(url, 'data.txt')
Newer | Older