# Narrowing down the search space to the article element in the page
# (since there are many other irrelevant elements in the page)
article = soup.find(class_="article-wrapper grid row")
# Getting the keywords section
keyword_section = soup.find(class_="keywords-section")
# Same as: soup.select_one("div.article-wrapper.grid.row div.keywords-section")
# Getting a list of all keyword elements inside the keywords section
keywords_raw = keyword_section.find_all(class_="keyword")
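
The keyword elements above are bs4 Tag objects; a small sketch of reducing them to plain strings (the `keywords` name is an assumption, not part of the original snippet):

# Extract the visible text of each keyword element, stripping surrounding whitespace
keywords = [kw.get_text(strip=True) for kw in keywords_raw]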

def proxies_pool():
    url = 'https://www.sslproxies.org/'
    # Retrieve the site's page. The 'with' statement (a context manager) is used here in order to automatically close the session when done
    with requests.Session() as res:
        proxies_page = res.get(url)
    # Create a BeautifulSoup object and find the table element which contains all proxies
    soup = BeautifulSoup(proxies_page.content, 'html.parser')
    proxies_table = soup.find(id='proxylisttable')
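    # The snippet cuts off here; a minimal sketch of parsing the table into 'ip:port'
    # strings (the column positions are an assumption about the sslproxies.org layout):
    proxies = []
    for row in proxies_table.tbody.find_all('tr'):
        cells = row.find_all('td')
        proxies.append('{}:{}'.format(cells[0].text, cells[1].text))
    return proxies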

from itertools import cycle
import requests

# Generate the pools
def create_pools():
    proxies = proxies_pool()
    headers = [random_header() for ind in range(len(proxies))]  # list of headers, same length as the proxies list
    # This transforms each list into an itertools.cycle object (an iterator) that we can
    # step through using the next() function.
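    # The snippet cuts off here; a minimal sketch of the transformation the comment above
    # describes (the local names are assumptions; the return shape matches how
    # create_pools() is unpacked later on):
    proxy_pool = cycle(proxies)
    header_pool = cycle(headers)
    return proxy_pool, header_pool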

import logging

class Logger:
    def __init__(self):
        # Initiating the logger object
        self.logger = logging.getLogger(__name__)
        # Set the level of the logger. This is SUPER USEFUL since it enables you to decide what to save in the log file.
        # An explanation of the logger levels can be found here - https://docs.python.org/3/howto/logging.html
        self.logger.setLevel(logging.DEBUG)
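        # Setting a level alone does not write anything to disk; a handler is needed.
        # A minimal sketch of attaching one (the file name and format string are assumptions):
        file_handler = logging.FileHandler('scraper.log')
        file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        self.logger.addHandler(file_handler)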

try:
    # Insert all articles into the articles table.
    # Note that we use INSERT IGNORE, which means that duplicates will not be inserted into the DB (checked against doi_link).
    for key, item in all_articles.items():
        cursor.execute("""INSERT IGNORE INTO articles (doi_link, title, abstract, publication_date, citations)
                          VALUES ('{}', '{}', '{}', '{}', {});""".format(key, item[0], item[1], item[2], int(item[3])))
    # The for loop above can be replaced with a single call:
    # cursor.executemany("""INSERT IGNORE INTO articles (doi_link, title, abstract, publication_date, citations)
    #                       VALUES (%s, %s, %s, %s, %s);""",
    #                    [(key, item[0], item[1], item[2], int(item[3])) for key, item in all_articles.items()])
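    # Building the statement with str.format breaks as soon as a value contains a quote and is
    # open to SQL injection; a safer sketch of the same insert using the connector's %s placeholders:
    for key, item in all_articles.items():
        cursor.execute("""INSERT IGNORE INTO articles (doi_link, title, abstract, publication_date, citations)
                          VALUES (%s, %s, %s, %s, %s);""",
                       (key, item[0], item[1], item[2], int(item[3])))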

def random_header(logger):
    # Create a dict of accept headers for each user-agent.
    accepts = {"Firefox": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
               "Safari, Chrome": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5"}
    # Get a random user-agent. We used Chrome and Firefox user agents.
    # Take a look at the fake-useragent project's page to see all other options - https://pypi.org/project/fake-useragent/
    try:
        # Getting a user agent using the fake_useragent package
        ua = UserAgent()
        if random.random() > 0.5:
            random_user_agent = ua.chrome
        else:
            random_user_agent = ua.firefox
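    # The snippet ends here in the original; what follows is a minimal sketch of the rest of
    # the function (an assumption): a fallback on failure, then assembling the headers dict.
    except Exception as error:
        # fake_useragent raises its own error type when its data source is unreachable;
        # the fallback user-agent string below is an assumption
        random_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
        logger.error("Error occurred while getting a random user-agent: %s", error)
    # Pick the Accept header that matches the chosen user-agent (the exact fields returned are assumptions)
    valid_accept = accepts["Firefox"] if "Firefox" in random_user_agent else accepts["Safari, Chrome"]
    headers = {"User-Agent": random_user_agent, "Accept": valid_accept}
    return headers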

urls = ["https://www.google.com", "..."]  # all links to scrape
# Create pools of proxies and headers and get the first ones
proxies_pool, headers_pool = create_pools()
current_proxy = next(proxies_pool)
current_headers = next(headers_pool)
# Create a generator of all links that is consumed by the grequests.map() function. This way, 4 requests are sent concurrently.
# Note that the current proxy and headers are the same for all the requests below; it is up to you to decide when to rotate them.
rs = (grequests.get(u, proxies={"http": current_proxy}, headers=current_headers) for u in urls)
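
A sketch of consuming that generator (the size=4 value mirrors the concurrency mentioned in the comment; filtering out the None responses is an assumption, not part of the original snippet):

# Send the requests, at most 4 at a time; failed requests come back as None by default
responses = grequests.map(rs, size=4)
valid_responses = [r for r in responses if r is not None]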

import mysql.connector

all_sql_tables = [...]
# An example of creating a single table called "articles" with an SQL command
sql_articles = """CREATE TABLE IF NOT EXISTS articles (
    ID int AUTO_INCREMENT,
    doi_link varchar(255) NOT NULL,
    title varchar(255),
    abstract TEXT,
    publication_date varchar(255),
    citations int,
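    PRIMARY KEY (ID),
    UNIQUE (doi_link));"""

The closing of the statement above is not part of the original snippet; the PRIMARY KEY and the UNIQUE constraint on doi_link are assumptions, the latter being what lets the INSERT IGNORE from earlier skip duplicates checked against doi_link.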

import mysql.connector

def build_database(db_name, host_name, user_name, password, all_sql_tables):
    # Define the connection and the cursor that is used for executing the SQL commands
    my_db = mysql.connector.connect(host=host_name, user=user_name, passwd=password, database=db_name)
    cursor = my_db.cursor()
    # Execute all SQL commands and commit them to the DB
    for sql_q in all_sql_tables:
        cursor.execute(sql_q)
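    # The snippet ends here; a minimal completion (an assumption) that persists the
    # executed statements and releases the connection:
    my_db.commit()
    cursor.close()
    my_db.close()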