def get_link(soup):
    try:
        # Medium pages expose the canonical story URL in the og:url meta tag.
        meta = soup.head.find('meta', {"property": "og:url"})
        return meta.attrs['content']
    except (AttributeError, KeyError):
        return None

def get_publication(soup):
    try:
        card = soup.main.div.div.nextSibling
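A quick way to exercise get_link is to hand it a tiny, hand-written document; the markup below is made up purely for illustration:

from bs4 import BeautifulSoup

html = '<html><head><meta property="og:url" content="https://medium.com/p/example"></head><body></body></html>'
soup = BeautifulSoup(html, "html.parser")
print(get_link(soup))  # -> https://medium.com/p/example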
someshsingh22 / webpage_parser.py
Created July 2, 2022 22:50
Parse webpage content
class MediumPage:
    # DIVS = ['animation', 'embed', 'images', 'banner', 'video', 'pull-quote'] + error
    # TAGS = ['pre', 'h2', None, 'ul', 'div', 'h5', 'h3', 'ol', 'p', 'h1', 'h6', 'h4', 'blockquote']
    FEATURES = ["author", "author_fw", "author_de", "images", "headings", "content"]

    def __init__(self, id, session, features=FEATURES, download=False, soup=None):
        self.id = id
        self.html_path = f"{FOLDER}/html/{id}.html"  # FOLDER is defined elsewhere in the gist
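The preview stops inside __init__, so how the session and download flag are actually used is not visible; a plausible caching helper in the same spirit (assumed, not taken from the gist, with an illustrative name and folder) would be:

import os
import requests

def fetch_and_cache(id, session, url, folder="data"):
    # Hypothetical helper: download a story's HTML once and reuse the cached copy afterwards.
    html_path = f"{folder}/html/{id}.html"
    if not os.path.exists(html_path):
        os.makedirs(os.path.dirname(html_path), exist_ok=True)
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(session.get(url).text)
    with open(html_path, encoding="utf-8") as f:
        return f.read()

# e.g. fetch_and_cache("abc123", requests.Session(), "https://medium.com/p/abc123")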
someshsingh22 / domain_validation.py
Created July 2, 2022 22:45
Validate that a link belongs to the set of publications you want and fits your DOM structure
VALID = ['towardsdatascience.com',
         'uxdesign.cc',
         'writingcooperative.com',
         'bettermarketing.pub',
         'proandroiddev.com',
         'chatbotslife.com',
         'chatbotsmagazine.com',
         'artplusmarketing.com']

def domain_validation(url, valid=VALID):
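The function body is cut off in the preview; a minimal sketch of the check the description implies, assuming validation simply means the link's host must be one of the whitelisted publications, is:

from urllib.parse import urlparse

def domain_validation(url, valid=VALID):
    # Accept a link only when its host (without a leading "www.") is a whitelisted publication.
    host = urlparse(url).netloc.lower()
    if host.startswith("www."):
        host = host[4:]
    return host in valid

# e.g. domain_validation("https://towardsdatascience.com/some-post") -> True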
someshsingh22 / scrape_main.py
Last active July 2, 2022 18:40
Scrape HTMLs and save them in a CSV
import pandas as pd

CSV = "links_csv.csv"

if __name__ == "__main__":
    # Initiate the session: Chrome options, maximum connections, retries and backoff factor
    options = webdriver.ChromeOptions()
    for arg in ARGS:
        options.add_argument(arg)
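ARGS and the driver construction sit outside the preview; a typical headless setup that fits this loop, with the flag list assumed rather than taken from the gist, looks like:

from selenium import webdriver

ARGS = ["--headless", "--disable-gpu", "--no-sandbox", "--window-size=1920,1080"]  # assumed flags

options = webdriver.ChromeOptions()
for arg in ARGS:
    options.add_argument(arg)
driver = webdriver.Chrome(options=options)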
someshsingh22 / scrape.py
Last active July 2, 2022 18:39
Scraping a Medium webpage
import logging
import os
import csv

from bs4 import BeautifulSoup
from selenium import webdriver

logging.basicConfig(
    filename="./logs/web-scraper.log",
    filemode="a+",
    format="%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s",
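With logging configured, the script presumably fetches each link and parses the rendered HTML; a minimal sketch of that step, assuming a driver built as in scrape_main.py and an illustrative function name, is:

def fetch_soup(driver, url):
    # Load the page in the browser and hand the rendered HTML to BeautifulSoup.
    driver.get(url)
    logging.info("fetched %s", url)
    return BeautifulSoup(driver.page_source, "html.parser")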
import torch
import torch.nn as nn

class Brain(nn.Module):
    def __init__(self, flags, n_vocab):
        super(Brain, self).__init__()
        self.flags = flags
        self.embedding = nn.Embedding(n_vocab, flags.embedding_size)
        self.lstm = nn.LSTM(flags.embedding_size, flags.lstm_size, batch_first=True,
                            num_layers=flags.num_layers, dropout=flags.dropout)
        self.dense = nn.Linear(flags.lstm_size, n_vocab)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.flags.lr)
        self.loss_value = 0
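The preview ends at the optimizer; a forward pass consistent with these layers (the methods below are an assumption about how the pieces fit together, not copied from the gist) would be:

    def forward(self, x, prev_state):
        # (batch, seq_len) token ids -> embeddings -> LSTM -> per-step logits over the vocabulary
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        return self.dense(output), state

    def zero_state(self, batch_size):
        # Fresh (h, c) tensors for the start of a sequence.
        shape = (self.flags.num_layers, batch_size, self.flags.lstm_size)
        return torch.zeros(shape), torch.zeros(shape)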
from argparse import Namespace

read_path = "{}.txt"
write_path = "{}_.txt"
file = "cano"

flags = Namespace(
    data_dir=read_path.format(file),  # Path to your cleaned data
    batch_size=256,                   # Batch size for input (set it according to the memory available to you)
    seq_size=64,                      # Length of the sequence, a very important parameter
    embedding_size=256,               # Size of the embeddings
    num_layers=1,                     # Number of LSTM layers
    is_bidirectional=False,           # Bidirectionality
class Data:
    def __init__(self, flags):
        self.flags = flags  # Set the flags to maintain consistency throughout the model
        # Pre-process the data
        self.pre_process()
        # Prepare the data for training
    @calculate_time
    def pre_process(self):
        import re
        import string

        data = open('data.txt').read()  # Read the text
        chars_set = set(data)  # Get the vocabulary of characters in the text
        print('There are {} characters in the text file'.format(len(chars_set)))
        english = set(string.ascii_letters + string.digits + string.punctuation + ' \n')  # English character vocabulary
        print('foreign characters are : ', chars_set - english)  # Collect the foreign characters
        # There are 97 characters in the text file
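After the character audit, the text still has to become integer sequences the LSTM can consume; a minimal sketch of that encoding step (the attribute names are illustrative, not from the gist) is:

        # Map each character to an integer id and back, then encode the whole corpus.
        chars = sorted(chars_set)
        self.char_to_int = {c: i for i, c in enumerate(chars)}
        self.int_to_char = {i: c for i, c in enumerate(chars)}
        self.encoded = [self.char_to_int[c] for c in data]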
import urllib.request

# This text is available for research; it will be downloaded to your directory as data.txt
print('Beginning file download with urllib.request...')
url = 'https://sherlock-holm.es/stories/plain-text/cnus.txt'
urllib.request.urlretrieve(url, 'data.txt')
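Once data.txt is in place, the snippets above plug together roughly like this (variable names follow the code above; the n_vocab line is an assumption about how the vocabulary size is obtained, and flags must also define lstm_size, dropout and lr for Brain):

n_vocab = len(set(open('data.txt').read()))  # size of the character vocabulary
dataset = Data(flags)                        # reads and pre-processes data.txt
model = Brain(flags, n_vocab)                # embedding -> LSTM -> dense over the vocabulary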