# Narrowing down the search space to the article element in the page
# (since there are many other irrelevant elements in the page)
article = soup.find(class_="article-wrapper grid row")
# Getting the keywords section
keyword_section = soup.find(class_="keywords-section")
# Same as: soup.select_one("div.article-wrapper.grid.row div.keywords-section")
# Getting a list of all keyword elements inside the keywords section
keywords_raw = keyword_section.find_all(class_="keyword")
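
The keyword elements above are bs4 Tag objects; a small sketch of reducing them to plain strings (the `keywords` name is an assumption, not part of the original snippet):

# Extract the visible text of each keyword element, stripping surrounding whitespace
keywords = [kw.get_text(strip=True) for kw in keywords_raw]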

def proxies_pool():
    url = 'https://www.sslproxies.org/'
    # Retrieve the site's page. The 'with' statement (a context manager) is used here in order to automatically close the session when done
    with requests.Session() as res:
        proxies_page = res.get(url)
    # Create a BeautifulSoup object and find the table element which contains all proxies
    soup = BeautifulSoup(proxies_page.content, 'html.parser')
    proxies_table = soup.find(id='proxylisttable')
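    # The snippet cuts off here; a minimal sketch of parsing the table into 'ip:port'
    # strings (the column positions are an assumption about the sslproxies.org layout):
    proxies = []
    for row in proxies_table.tbody.find_all('tr'):
        cells = row.find_all('td')
        proxies.append('{}:{}'.format(cells[0].text, cells[1].text))
    return proxies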

from itertools import cycle
import requests

# Generate the pools
def create_pools():
    proxies = proxies_pool()
    headers = [random_header() for ind in range(len(proxies))]  # list of headers, same length as the proxies list
    # This transforms each list into an itertools.cycle object (an iterator) that we can
    # step through using the next() function.
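    # The snippet cuts off here; a minimal sketch of the transformation the comment above
    # describes (the local names are assumptions; the return shape matches how
    # create_pools() is unpacked later on):
    proxy_pool = cycle(proxies)
    header_pool = cycle(headers)
    return proxy_pool, header_pool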

import logging

class Logger:
    def __init__(self):
        # Initiating the logger object
        self.logger = logging.getLogger(__name__)
        # Set the level of the logger. This is SUPER USEFUL since it enables you to decide what to save in the log file.
        # An explanation of the logger levels can be found here - https://docs.python.org/3/howto/logging.html
        self.logger.setLevel(logging.DEBUG)
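        # Setting a level alone does not write anything to disk; a handler is needed.
        # A minimal sketch of attaching one (the file name and format string are assumptions):
        file_handler = logging.FileHandler('scraper.log')
        file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        self.logger.addHandler(file_handler)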

try:
    # Insert all articles into the articles table.
    # Note that we use INSERT IGNORE, which means that duplicates will not be inserted into the DB (checked against doi_link).
    for key, item in all_articles.items():
        cursor.execute("""INSERT IGNORE INTO articles (doi_link, title, abstract, publication_date, citations)
                          VALUES ('{}', '{}', '{}', '{}', {});""".format(key, item[0], item[1], item[2], int(item[3])))
    # The for loop above can be replaced with a single call:
    # cursor.executemany("""INSERT IGNORE INTO articles (doi_link, title, abstract, publication_date, citations)
    #                       VALUES (%s, %s, %s, %s, %s);""",
    #                    [(key, item[0], item[1], item[2], int(item[3])) for key, item in all_articles.items()])
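    # Building the statement with str.format breaks as soon as a value contains a quote and is
    # open to SQL injection; a safer sketch of the same insert using the connector's %s placeholders:
    for key, item in all_articles.items():
        cursor.execute("""INSERT IGNORE INTO articles (doi_link, title, abstract, publication_date, citations)
                          VALUES (%s, %s, %s, %s, %s);""",
                       (key, item[0], item[1], item[2], int(item[3])))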

def random_header(logger):
    # Create a dict of accept headers for each user-agent.
    accepts = {"Firefox": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
               "Safari, Chrome": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5"}
    # Get a random user-agent. We used Chrome and Firefox user agents.
    # Take a look at the fake-useragent project's page to see all other options - https://pypi.org/project/fake-useragent/
    try:
        # Getting a user agent using the fake_useragent package
        ua = UserAgent()
        if random.random() > 0.5:
            random_user_agent = ua.chrome
        else:
            random_user_agent = ua.firefox
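    # The snippet ends here in the original; what follows is a minimal sketch of the rest of
    # the function (an assumption): a fallback on failure, then assembling the headers dict.
    except Exception as error:
        # fake_useragent raises its own error type when its data source is unreachable;
        # the fallback user-agent string below is an assumption
        random_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0"
        logger.error("Error occurred while getting a random user-agent: %s", error)
    # Pick the Accept header that matches the chosen user-agent (the exact fields returned are assumptions)
    valid_accept = accepts["Firefox"] if "Firefox" in random_user_agent else accepts["Safari, Chrome"]
    headers = {"User-Agent": random_user_agent, "Accept": valid_accept}
    return headers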

urls = ["https://www.google.com", "..."]  # all links to scrape
# Create pools of proxies and headers and get the first ones
proxies_pool, headers_pool = create_pools()
current_proxy = next(proxies_pool)
current_headers = next(headers_pool)
# Create a generator of all links that is consumed by the grequests.map() function. This way, 4 requests are sent concurrently.
# Note that the current proxy and headers are the same for all the requests below; it is up to you to decide when to rotate them.
rs = (grequests.get(u, proxies={"http": current_proxy}, headers=current_headers) for u in urls)
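
A sketch of consuming that generator (the size=4 value mirrors the concurrency mentioned in the comment; filtering out the None responses is an assumption, not part of the original snippet):

# Send the requests, at most 4 at a time; failed requests come back as None by default
responses = grequests.map(rs, size=4)
valid_responses = [r for r in responses if r is not None]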

import mysql.connector

all_sql_tables = [...]
# An example of creating a single table called "articles" with an SQL command
sql_articles = """CREATE TABLE IF NOT EXISTS articles (
    ID int AUTO_INCREMENT,
    doi_link varchar(255) NOT NULL,
    title varchar(255),
    abstract TEXT,
    publication_date varchar(255),
    citations int,
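    PRIMARY KEY (ID),
    UNIQUE (doi_link));"""

The closing of the statement above is not part of the original snippet; the PRIMARY KEY and the UNIQUE constraint on doi_link are assumptions, the latter being what lets the INSERT IGNORE from earlier skip duplicates checked against doi_link.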

import mysql.connector

def build_database(db_name, host_name, user_name, password, all_sql_tables):
    # Define the connection and the cursor that is used for executing the SQL commands
    my_db = mysql.connector.connect(host=host_name, user=user_name, passwd=password, database=db_name)
    cursor = my_db.cursor()
    # Execute all SQL commands and commit them to the DB
    for sql_q in all_sql_tables:
        cursor.execute(sql_q)
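    # The snippet ends here; a minimal completion (an assumption) that persists the
    # executed statements and releases the connection:
    my_db.commit()
    cursor.close()
    my_db.close()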