Created
October 2, 2019 06:52
-
-
Save exhuma/a1fc740b34b404e832be9627ec59b66d to your computer and use it in GitHub Desktop.
Scrape Prices from a Web-Page
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This module shows an alternative implementation of the code shown in the SO question | |
https://stackoverflow.com/questions/58188342/looping-through-web-pages | |
Comments marked with a ♫ symbol are possible improvements to this code which were | |
left out to keep concepts out of the code which could make it more difficult to | |
understand for beginners. | |
""" | |
import csv
from os.path import exists
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
def read_page_offline(page_num):
    """
    A dummy function which reads the HTML content from the file under
    "pages/<n>.html" where "<n>" is the page number.

    If it returns an empty string the file could not be found.

    >>> content = read_page_offline(1)  # Reads "pages/1.html"
    """
    # EAFP instead of the original exists()+open() pair: checking first and
    # opening second builds the filename twice and is race-prone (the file
    # could disappear between the two calls). Trying the open and handling
    # the failure is the idiomatic, race-free form.
    try:
        with open(f'pages/{page_num}.html') as fptr:
            return fptr.read()
    except FileNotFoundError:
        # Same contract as before: an empty string signals "no such page".
        return ''
def read_page(page_num):
    """
    Read the page *page_num* from the Internet.

    >>> content = read_page(1)  # Reads page #1 from the Internet
    """
    # Build the URL for the requested page number.
    # ♫ This could be improved with an f-string
    target_url = 'https://website/section/food-drink?page=%d/' % page_num
    # A browser-like User-Agent avoids being rejected as a bot by some sites.
    request = Request(target_url, headers={'User-Agent': 'Chrome'})
    # ♫ This could fail and can be improved with a try/except block
    return urlopen(request).read()
def get_prices(web_page):
    """
    Given a HTML document in *web_page* this function parses the content and
    returns a list of prices on that page. Each price is a tuple with (title,
    price).

    >>> document = read_page(1)
    >>> get_prices(document)
    [('title1', '£2.19'), ...]
    """
    soup = BeautifulSoup(web_page, "html.parser")
    # Collect one (title, price) tuple per product on the page.
    # ♫ this could be improved by using a Generator (using yield)
    prices = []
    for item in soup.find_all('div', class_="product-wrapper"):
        # The product name lives in the title paragraph.
        title = item.find('p', class_='h4 product__title').text
        # The price is the last child node of the price paragraph.
        price_node = item.find('p', class_='product__price')
        *_, raw_price = price_node.children
        # Strip surrounding spaces, newlines and quotes from the price text.
        prices.append((title, raw_price.strip(' \n"')))
    return prices
# We will start at page 1
page_num = 1
# Initialise the accumulated list of prices with an empty list
all_prices = []
# ♫ this could be replaced with a "for page_num in range(1, 15)" loop
while page_num < 15:
    # Use one of our functions to fetch the page content
    web_page = read_page_offline(page_num)
    # Our functions to read the pages may return an empty string. If this is
    # the case, we break out of the while loop
    if not web_page:
        break
    # Use our function to extract the prices from the page
    prices = get_prices(web_page)
    # Using "extend", we can append all items from one list to another. We use
    # this to keep a record of all our prices
    all_prices.extend(prices)
    # ♫ When using a "for" loop, this is not needed
    page_num += 1

# Finally, write all items out to a CSV file.
# BUG FIX: the original code used csv_writer/csv_file without ever creating
# them, so it crashed with a NameError. We now open the output file explicitly
# and create the writer. newline='' is required by the csv module so it can
# control line endings itself; the "with" block closes the file for us.
# ♫ This loop could also be moved into a function
with open('prices.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    for product_title, product_price in all_prices:
        csv_writer.writerow([product_title, product_price])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment