Created
October 2, 2019 06:52
-
-
Save exhuma/a1fc740b34b404e832be9627ec59b66d to your computer and use it in GitHub Desktop.
Scrape Prices from a Web-Page
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This module shows an alternative implementation of the code shown in the SO question | |
https://stackoverflow.com/questions/58188342/looping-through-web-pages | |
Comments marked with a ♫ symbol are possible improvements to this code which were | |
left out to keep concepts out of the code which could make it more difficult to | |
understand for beginners. | |
""" | |
import csv
from os.path import exists
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
def read_page_offline(page_num):
    """
    A dummy function which reads the HTML content from the file under
    "pages/<n>.html" where "<n>" is the page number.

    If it returns an empty string the file could not be found.

    >>> content = read_page_offline(1)  # Reads "pages/1.html"
    """
    # EAFP instead of the original exists()+open() pair: checking first and
    # opening second builds the filename twice and is race-prone (the file
    # could disappear between the two calls). Trying the open and handling
    # the failure is the idiomatic, race-free form.
    try:
        with open(f'pages/{page_num}.html') as fptr:
            return fptr.read()
    except FileNotFoundError:
        # Same contract as before: an empty string signals "no such page".
        return ''
def read_page(page_num):
    """
    Read the page *page_num* from the Internet.

    >>> content = read_page(1)  # Reads page #1 from the Internet
    """
    # Build the URL for the requested page number.
    # ♫ This could be improved with an f-string
    target_url = 'https://website/section/food-drink?page=%d/' % page_num
    # A browser-like User-Agent avoids being rejected as a bot by some sites.
    request = Request(target_url, headers={'User-Agent': 'Chrome'})
    # ♫ This could fail and can be improved with a try/except block
    return urlopen(request).read()
def get_prices(web_page):
    """
    Given a HTML document in *web_page* this function parses the content and
    returns a list of prices on that page. Each price is a tuple with (title,
    price).

    >>> document = read_page(1)
    >>> get_prices(document)
    [('title1', '£2.19'), ...]
    """
    soup = BeautifulSoup(web_page, "html.parser")
    # Collect one (title, price) tuple per product on the page.
    # ♫ this could be improved by using a Generator (using yield)
    prices = []
    for item in soup.find_all('div', class_="product-wrapper"):
        # The product name lives in the title paragraph.
        title = item.find('p', class_='h4 product__title').text
        # The price is the last child node of the price paragraph.
        price_node = item.find('p', class_='product__price')
        *_, raw_price = price_node.children
        # Strip surrounding spaces, newlines and quotes from the price text.
        prices.append((title, raw_price.strip(' \n"')))
    return prices
# We will start at page 1
page_num = 1
# Initialise the accumulated list of prices with an empty list
all_prices = []
# ♫ this could be replaced with a "for page_num in range(1, 15)" loop
while page_num < 15:
    # Use one of our functions to fetch the page content
    web_page = read_page_offline(page_num)
    # Our functions to read the pages may return an empty string. If this is
    # the case, we break out of the while loop
    if not web_page:
        break
    # Use our function to extract the prices from the page
    prices = get_prices(web_page)
    # Using "extend", we can append all items from one list to another. We use
    # this to keep a record of all our prices
    all_prices.extend(prices)
    # ♫ When using a "for" loop, this is not needed
    page_num += 1

# Finally, write all items out to a CSV file.
# BUG FIX: the original code used csv_writer/csv_file without ever creating
# them, so it crashed with a NameError. We now open the output file explicitly
# and create the writer. newline='' is required by the csv module so it can
# control line endings itself; the "with" block closes the file for us.
# ♫ This loop could also be moved into a function
with open('prices.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    for product_title, product_price in all_prices:
        csv_writer.writerow([product_title, product_price])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment