def get_link(soup):
    try:
        # Medium pages expose the canonical story URL in the og:url meta tag.
        meta = soup.head.find('meta', {"property": "og:url"})
        return meta.attrs['content']
    except (AttributeError, KeyError):
        return None

def get_publication(soup):
    try:
        card = soup.main.div.div.nextSibling
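A quick way to exercise get_link is to hand it a tiny, hand-written document; the markup below is made up purely for illustration:

from bs4 import BeautifulSoup

html = '<html><head><meta property="og:url" content="https://medium.com/p/example"></head><body></body></html>'
soup = BeautifulSoup(html, "html.parser")
print(get_link(soup))  # -> https://medium.com/p/example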
someshsingh22 / webpage_parser.py
Created July 2, 2022 22:50
Parse webpage content
class MediumPage:
    # DIVS = ['animation', 'embed', 'images', 'banner', 'video', 'pull-quote'] + error
    # TAGS = ['pre', 'h2', None, 'ul', 'div', 'h5', 'h3', 'ol', 'p', 'h1', 'h6', 'h4', 'blockquote']
    FEATURES = ["author", "author_fw", "author_de", "images", "headings", "content"]

    def __init__(self, id, session, features=FEATURES, download=False, soup=None):
        self.id = id
        self.html_path = f"{FOLDER}/html/{id}.html"  # FOLDER is defined elsewhere in the gist
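The preview stops inside __init__, so how the session and download flag are actually used is not visible; a plausible caching helper in the same spirit (assumed, not taken from the gist, with an illustrative name and folder) would be:

import os
import requests

def fetch_and_cache(id, session, url, folder="data"):
    # Hypothetical helper: download a story's HTML once and reuse the cached copy afterwards.
    html_path = f"{folder}/html/{id}.html"
    if not os.path.exists(html_path):
        os.makedirs(os.path.dirname(html_path), exist_ok=True)
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(session.get(url).text)
    with open(html_path, encoding="utf-8") as f:
        return f.read()

# e.g. fetch_and_cache("abc123", requests.Session(), "https://medium.com/p/abc123")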
someshsingh22 / domain_validation.py
Created July 2, 2022 22:45
Validate that a link belongs to the set of publications you want and fits your DOM structure
VALID = ['towardsdatascience.com',
         'uxdesign.cc',
         'writingcooperative.com',
         'bettermarketing.pub',
         'proandroiddev.com',
         'chatbotslife.com',
         'chatbotsmagazine.com',
         'artplusmarketing.com']

def domain_validation(url, valid=VALID):
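The function body is cut off in the preview; a minimal sketch of the check the description implies, assuming validation simply means the link's host must be one of the whitelisted publications, is:

from urllib.parse import urlparse

def domain_validation(url, valid=VALID):
    # Accept a link only when its host (without a leading "www.") is a whitelisted publication.
    host = urlparse(url).netloc.lower()
    if host.startswith("www."):
        host = host[4:]
    return host in valid

# e.g. domain_validation("https://towardsdatascience.com/some-post") -> True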
someshsingh22 / scrape_main.py
Last active July 2, 2022 18:40
Scrape HTMLs and save them in a CSV
import pandas as pd

CSV = "links_csv.csv"

if __name__ == "__main__":
    # Initiate the session: Chrome options, maximum connections, retries and backoff factor
    options = webdriver.ChromeOptions()
    for arg in ARGS:
        options.add_argument(arg)
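ARGS and the driver construction sit outside the preview; a typical headless setup that fits this loop, with the flag list assumed rather than taken from the gist, looks like:

from selenium import webdriver

ARGS = ["--headless", "--disable-gpu", "--no-sandbox", "--window-size=1920,1080"]  # assumed flags

options = webdriver.ChromeOptions()
for arg in ARGS:
    options.add_argument(arg)
driver = webdriver.Chrome(options=options)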
someshsingh22 / scrape.py
Last active July 2, 2022 18:39
Scraping a Medium webpage
import logging
import os
import csv

from bs4 import BeautifulSoup
from selenium import webdriver

logging.basicConfig(
    filename="./logs/web-scraper.log",
    filemode="a+",
    format="%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s",
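With logging configured, the script presumably fetches each link and parses the rendered HTML; a minimal sketch of that step, assuming a driver built as in scrape_main.py and an illustrative function name, is:

def fetch_soup(driver, url):
    # Load the page in the browser and hand the rendered HTML to BeautifulSoup.
    driver.get(url)
    logging.info("fetched %s", url)
    return BeautifulSoup(driver.page_source, "html.parser")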
import torch
import torch.nn as nn

class Brain(nn.Module):
    def __init__(self, flags, n_vocab):
        super(Brain, self).__init__()
        self.flags = flags
        self.embedding = nn.Embedding(n_vocab, flags.embedding_size)
        self.lstm = nn.LSTM(flags.embedding_size, flags.lstm_size, batch_first=True,
                            num_layers=flags.num_layers, dropout=flags.dropout)
        self.dense = nn.Linear(flags.lstm_size, n_vocab)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.flags.lr)
        self.loss_value = 0
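The preview ends at the optimizer; a forward pass consistent with these layers (the methods below are an assumption about how the pieces fit together, not copied from the gist) would be:

    def forward(self, x, prev_state):
        # (batch, seq_len) token ids -> embeddings -> LSTM -> per-step logits over the vocabulary
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        return self.dense(output), state

    def zero_state(self, batch_size):
        # Fresh (h, c) tensors for the start of a sequence.
        shape = (self.flags.num_layers, batch_size, self.flags.lstm_size)
        return torch.zeros(shape), torch.zeros(shape)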
from argparse import Namespace

read_path = "{}.txt"
write_path = "{}_.txt"
file = "cano"

flags = Namespace(
    data_dir=read_path.format(file),  # Path to your cleaned data
    batch_size=256,                   # Batch size for input (set it according to the memory available to you)
    seq_size=64,                      # Length of the sequence, a very important parameter
    embedding_size=256,               # Size of the embeddings
    num_layers=1,                     # Number of LSTM layers
    is_bidirectional=False,           # Bidirectionality
class Data:
    def __init__(self, flags):
        self.flags = flags  # Set the flags to maintain consistency throughout the model
        # Pre-process the data
        self.pre_process()
        # Prepare the data for training
    @calculate_time
    def pre_process(self):
        import re
        import string

        data = open('data.txt').read()  # Read the text
        chars_set = set(data)  # Get the vocabulary of characters in the text
        print('There are {} characters in the text file'.format(len(chars_set)))
        english = set(string.ascii_letters + string.digits + string.punctuation + ' \n')  # English character vocabulary
        print('foreign characters are : ', chars_set - english)  # Collect the foreign characters
        # There are 97 characters in the text file
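After the character audit, the text still has to become integer sequences the LSTM can consume; a minimal sketch of that encoding step (the attribute names are illustrative, not from the gist) is:

        # Map each character to an integer id and back, then encode the whole corpus.
        chars = sorted(chars_set)
        self.char_to_int = {c: i for i, c in enumerate(chars)}
        self.int_to_char = {i: c for i, c in enumerate(chars)}
        self.encoded = [self.char_to_int[c] for c in data]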
import urllib.request

# This text is available for research; it will be downloaded to your directory as data.txt
print('Beginning file download with urllib.request...')
url = 'https://sherlock-holm.es/stories/plain-text/cnus.txt'
urllib.request.urlretrieve(url, 'data.txt')
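Once data.txt is in place, the snippets above plug together roughly like this (variable names follow the code above; the n_vocab line is an assumption about how the vocabulary size is obtained, and flags must also define lstm_size, dropout and lr for Brain):

n_vocab = len(set(open('data.txt').read()))  # size of the character vocabulary
dataset = Data(flags)                        # reads and pre-processes data.txt
model = Brain(flags, n_vocab)                # embedding -> LSTM -> dense over the vocabulary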