import mysql.connector
all_sql_tables = [...]
# An example of creating a single table called "articles" with an SQL command
sql_articles = """CREATE TABLE IF NOT EXISTS articles (ID int AUTO_INCREMENT,
doi_link varchar(255) NOT NULL,
title varchar(255),
abstract TEXT,
publication_date varchar(255),
citations int,
PRIMARY KEY (ID),
UNIQUE (doi_link));"""
import grequests

urls = ["www.google.com", "..."]  # all links to scrape
# Create pools of proxies and headers and get the first ones
proxy_pool, header_pool = create_pools()
current_proxy = next(proxy_pool)
current_headers = next(header_pool)
# Create a generator of requests that is consumed by the grequests.map() call below - this way, several requests (4 at a time here) are sent concurrently.
# Note that the same current proxy and headers are used for all the requests below; which urls to pass is up to you.
# (The proxies argument assumes each proxy taken from the pool is an "ip:port" string.)
rs = (grequests.get(u, headers=current_headers, proxies={"http": current_proxy, "https": current_proxy}) for u in urls)
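
# A minimal sketch of dispatching the generator above; the size value and the response handling are
# illustrative only. grequests.map() fires the requests concurrently and size=4 caps how many run at once.
responses = grequests.map(rs, size=4)
for response in responses:
    if response is not None and response.status_code == 200:
        # parse response.content here (e.g. with BeautifulSoup) and rotate proxy/headers with next() when needed
        pass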
# Create a dict of accept headers for each user-agent.
accepts = {"Firefox": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Safari, Chrome": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5"}
# Get a random user-agent. We used Chrome and Firefox user agents.
# Getting a user agent using the fake_useragent package
ua = UserAgent()
if random.random() > 0.5:
random_user_agent = ua.chrome
else:
    random_user_agent = ua.firefox
def update_num_citations(my_db, cursor):
# Get all articles from DB (doi_link is a unique key)
sql_get_articles = """SELECT doi_link, citations FROM articles;"""
cursor.execute(sql_get_articles)
articles = cursor.fetchall()
# Get the updated number of citations (using requests and BeautifulSoup, open the URL behind each doi_link and extract the number of citations)
articles_dict = [{'doi_link': article[0], 'citations': article[1]} for article in articles]
articles_dict = get_citations(articles_dict)
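
# A possible continuation of update_num_citations(), assuming get_citations() returns the same list of
# dicts with the 'citations' values refreshed: write the new counts back and commit.
for article in articles_dict:
    cursor.execute("""UPDATE articles SET citations = %s WHERE doi_link = %s;""",
                   (article['citations'], article['doi_link']))
my_db.commit()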
try:
    # Insert all articles into the articles table.
    # Note that we use INSERT IGNORE, which means that duplicates will not be inserted into the DB (checked against doi_link).
    # Parameterized queries (%s placeholders) are used here so that quotes inside titles or abstracts do not break the SQL.
    for key, item in all_articles.items():
        cursor.execute("""INSERT IGNORE INTO articles (doi_link, title, abstract, publication_date, citations)
                          VALUES (%s, %s, %s, %s, %s);""",
                       (key, item[0], item[1], item[2], int(item[3])))
    # The for loop above can be replaced with a single executemany() call over a list of parameter tuples:
    # cursor.executemany("""INSERT IGNORE INTO articles (doi_link, title, abstract, publication_date, citations)
    #                       VALUES (%s, %s, %s, %s, %s);""",
    #                    [(k, v[0], v[1], v[2], int(v[3])) for k, v in all_articles.items()])
    my_db.commit()
except mysql.connector.Error as err:
    # Roll back and report the failure
    my_db.rollback()
    print("Inserting articles failed: {}".format(err))
import mysql.connector
def build_database(db_name, host_name, user_name, password, all_sql_tables):
# Define the connection and the cursor that is used for executing the SQL commands
my_db = mysql.connector.connect(host=host_name, user=user_name, passwd=password, database=db_name)
cursor = my_db.cursor()
# Execute all the SQL commands and commit them to the DB
for sql_q in all_sql_tables:
    cursor.execute(sql_q)
my_db.commit()
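
# Example usage - a sketch only; the host, credentials and database name below are placeholders, not values from the original:
# build_database("articles_db", "localhost", "root", "my_password", all_sql_tables)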
from itertools import cycle
import requests
# Generate the pools
def create_pools():
proxies = proxies_pool()
headers = [random_header() for ind in range(len(proxies))] # list of headers, same length as the proxies list
# cycle() turns each list into an itertools.cycle object - an endless iterator that can be
# stepped through with next(), as done right after create_pools() is called above.
return cycle(proxies), cycle(headers)
import logging
class Logger:
def __init__(self):
# Initiating the logger object
self.logger = logging.getLogger(__name__)
# Set the level of the logger. This is SUPER USEFUL since it enables you to decide what to save in the logs file.
# Explanation regarding the logger levels can be found here - https://docs.python.org/3/howto/logging.html
self.logger.setLevel(logging.DEBUG)
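
# A minimal sketch of wiring up and using the Logger class above; the file handler, the format string
# and the log file name ("scraper.log") are assumptions, not taken from the original gist.
file_handler = logging.FileHandler("scraper.log")
file_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
logger = Logger().logger
logger.addHandler(file_handler)
logger.info("Logger is ready")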
import random
from fake_useragent import UserAgent

def random_header(logger):
# Create a dict of accept headers for each user-agent.
accepts = {"Firefox": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Safari, Chrome": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5"}
# Get a random user-agent. We used Chrome and Firefox user agents.
# Take a look at fake-useragent project's page to see all other options - https://pypi.org/project/fake-useragent/
try:
# Getting a user agent using the fake_useragent package
ua = UserAgent()
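
    # A possible completion of random_header(), sketched under two assumptions: the Accept header is
    # chosen to match the browser family, and the fallback user-agent string below is just a placeholder.
    if random.random() > 0.5:
        random_user_agent = ua.chrome
        accept = accepts["Safari, Chrome"]
    else:
        random_user_agent = ua.firefox
        accept = accepts["Firefox"]
except Exception as err:
    # If fake_useragent fails (e.g. its online cache is unreachable), log it and fall back to a fixed user-agent
    logger.warning("fake_useragent failed ({}), falling back to a fixed user-agent".format(err))
    random_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0"
    accept = accepts["Firefox"]
headers = {"User-Agent": random_user_agent, "Accept": accept}
return headers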
from bs4 import BeautifulSoup

def proxies_pool():
url = 'https://www.sslproxies.org/'
# Retrieve the site's page. The 'with' statement (a context manager) is used so the session is automatically closed when done
with requests.Session() as res:
proxies_page = res.get(url)
# Create a BeautifulSoup object and find the table element which contains all the proxies
soup = BeautifulSoup(proxies_page.content, 'html.parser')
proxies_table = soup.find(id='proxylisttable')
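
# A possible continuation - a sketch that assumes the sslproxies.org table keeps the IP in the first
# column and the port in the second: collect "ip:port" strings into the list that create_pools() cycles over.
proxies = []
for row in proxies_table.tbody.find_all('tr'):
    cells = row.find_all('td')
    proxies.append('{}:{}'.format(cells[0].string, cells[1].string))
return proxies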