@exhuma
Created October 2, 2019 06:52
Scrape Prices from a Web-Page
"""
This module shows an alternative implementation of the code shown in the SO question
https://stackoverflow.com/questions/58188342/looping-through-web-pages
Comments marked with a ♫ symbol are possible improvements to this code which were
left out to keep concepts out of the code which could make it more difficult to
understand for beginners.
"""
import csv
from os.path import exists
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup

def read_page_offline(page_num):
    """
    A dummy function which reads the HTML content from the file under
    "pages/<n>.html" where "<n>" is the page number.

    If it returns an empty string, the file could not be found.

    >>> content = read_page_offline(1)  # Reads "pages/1.html"
    """
    if not exists('pages/%d.html' % page_num):
        return ''
    with open('pages/%d.html' % page_num) as fptr:
        data = fptr.read()
    return data
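
# ♫ Possible improvement: nothing in this script actually creates the
# "pages/<n>.html" files that read_page_offline expects. A small helper like
# the sketch below could fill that gap (the name "save_page_offline" is only
# an example, and it is not called by the script further down). It assumes
# the "pages" folder already exists.
def save_page_offline(page_num):
    """
    Download page *page_num* with read_page and store it under
    "pages/<n>.html" so that read_page_offline can find it later.

    >>> save_page_offline(1)  # Writes "pages/1.html"
    """
    web_page = read_page(page_num)
    with open('pages/%d.html' % page_num, 'wb') as fptr:
        fptr.write(web_page)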

def read_page(page_num):
    """
    Read the page *page_num* from the Internet.

    >>> content = read_page(1)  # Reads page #1 from the Internet
    """
    # Create the URL with the page number
    # ♫ This could be improved with an f-string
    url = 'https://website/section/food-drink?page=%d/' % page_num
    req = Request(url, headers={'User-Agent': 'Chrome'})
    # ♫ This could fail and can be improved with a try/except block
    web_page = urlopen(req).read()
    return web_page

def get_prices(web_page):
    """
    Given an HTML document in *web_page*, this function parses the content
    and returns a list of prices on that page. Each price is a tuple of
    (title, price).

    >>> document = read_page(1)
    >>> get_prices(document)
    [('title1', '£2.19'), ...]
    """
    # First we initialise the output.
    # ♫ this could be improved by using a generator (using yield)
    prices = []
    soup = BeautifulSoup(web_page, "html.parser")
    for product in soup.find_all('div', class_="product-wrapper"):
        # Get the product name
        product_title = product.find('p', class_='h4 product__title').text
        # Get the product price
        product_price = product.find('p', class_='product__price')
        raw_data = list(product_price.children)[-1]
        # Remove spaces, newlines and quotes from the price
        clean_price = raw_data.strip(' \n"')
        prices.append((product_title, clean_price))
    return prices

# We will start at page 1
page_num = 1

# Initialise the accumulated list of prices with an empty list
all_prices = []

# ♫ this could be replaced with a "for page_num in range(1, 15)" loop
while page_num < 15:
    # Use one of our functions to fetch the page content
    web_page = read_page_offline(page_num)
    # Our functions to read the pages may return an empty string. If this is
    # the case, we break out of the while loop.
    if not web_page:
        break
    # Use our function to extract the prices from the page
    prices = get_prices(web_page)
    # Using "extend", we can append all items from one list to another. We use
    # this to keep a record of all our prices.
    all_prices.extend(prices)
    # ♫ When using a "for" loop, this is not needed
    page_num += 1

# Finally, write all items out to a CSV file. The file name "prices.csv" is an
# arbitrary choice.
# ♫ This loop could also be moved into a function
csv_file = open('prices.csv', 'w', newline='')
csv_writer = csv.writer(csv_file)
for product_title, product_price in all_prices:
    csv_writer.writerow([product_title, product_price])
csv_file.close()
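
For readers who want to see how the ♫ suggestions could look in practice, here is one possible sketch that combines them: an f-string for the URL, a try/except around the download, a generator instead of building a list, a "for" loop instead of the "while" loop, and the CSV writing moved into a function. The function names (read_page_improved, iter_prices, write_csv) are only illustrative.

import csv
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup


def read_page_improved(page_num):
    # ♫ f-string instead of "%" formatting
    url = f'https://website/section/food-drink?page={page_num}/'
    req = Request(url, headers={'User-Agent': 'Chrome'})
    # ♫ try/except so a failing download does not crash the whole script
    try:
        return urlopen(req).read()
    except OSError:
        return ''


def iter_prices(web_page):
    # ♫ generator version of get_prices: it "yields" each price instead of
    # building a list in memory
    soup = BeautifulSoup(web_page, "html.parser")
    for product in soup.find_all('div', class_="product-wrapper"):
        product_title = product.find('p', class_='h4 product__title').text
        product_price = product.find('p', class_='product__price')
        raw_data = list(product_price.children)[-1]
        yield product_title, raw_data.strip(' \n"')


def write_csv(filename, rows):
    # ♫ the CSV-writing loop moved into its own function
    with open(filename, 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        for product_title, product_price in rows:
            csv_writer.writerow([product_title, product_price])


all_prices = []
# ♫ a "for" loop replaces the "while" loop and the manual "page_num += 1"
for page_num in range(1, 15):
    web_page = read_page_improved(page_num)
    if not web_page:
        break
    all_prices.extend(iter_prices(web_page))
write_csv('prices.csv', all_prices)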