Skip to content

Instantly share code, notes, and snippets.

@tomncooper
Last active January 20, 2019 12:15
Show Gist options
  • Save tomncooper/002df62bf359480aae80849f933db1af to your computer and use it in GitHub Desktop.
Save tomncooper/002df62bf359480aae80849f933db1af to your computer and use it in GitHub Desktop.
Methods for extracting nutritional information from the Tesco website
""" Methods for scraping product information from the tesco website.
You will need to install requests, beautifulsoup4, lxml and pandas libraries
$ pip install requests beautifulsoup4 lxml pandas
"""
from io import StringIO
from typing import List, Optional

import requests
import pandas as pd
from bs4 import BeautifulSoup
from bs4.element import Tag
def fetch_page(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Download *url* and return its HTML parsed with the lxml parser.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely when the server
            stops responding.

    Returns:
        The parsed page.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx or
            5xx status code.
        requests.exceptions.Timeout: If the server does not respond within
            *timeout* seconds.
    """
    response: requests.Response = requests.get(url, timeout=timeout)
    # raise_for_status raises for 4xx/5xx answers (not for every non-200 one)
    response.raise_for_status()
    return BeautifulSoup(response.text, "lxml")
def extract_catagories(page: BeautifulSoup) -> List[str]:
    """Return every text string nested inside the first ``<ol>`` of *page*.

    NOTE(review): presumably that list is the category breadcrumb of a
    product page -- confirm against the site markup.
    """
    ordered_list = page.find("ol")
    return [text for text in ordered_list.strings]
def extract_net_weight(page: BeautifulSoup) -> str:
    """Return the first whitespace-separated token of the net contents text.

    The text is read from the first ``<p>`` inside the element whose id is
    ``net-contents``.
    """
    contents_element = page.find(id="net-contents")
    paragraph_text = contents_element.p.text
    tokens = paragraph_text.split()
    return tokens[0]
def extract_nutrition_table(
    page: BeautifulSoup, url: Optional[str] = None
) -> pd.DataFrame:
    """Return the product information table of *page* as a DataFrame.

    Args:
        page: Parsed product page.
        url: Optional address the page came from. It is only used to make
            the error message more helpful.

    Returns:
        The table carrying the ``product__info-table`` class, without a
        trailing "Reference intake" footnote row if one is present.

    Raises:
        RuntimeError: If the page contains no table with the
            ``product__info-table`` class.
    """
    product_table_tag: Optional[Tag] = None
    for table in page.find_all("table"):
        # Tables without a class attribute are skipped rather than raising
        # KeyError. If several tables match, the last one wins.
        if "product__info-table" in (table.get("class") or []):
            product_table_tag = table

    if product_table_tag is None:
        location = f" for URL: {url}" if url is not None else ""
        raise RuntimeError(
            f"No product information table found in html{location}"
        )

    # read_html returns one DataFrame per table found in the supplied html;
    # only a single table is passed in, so the first entry is the one we want.
    # The markup is wrapped in StringIO because passing literal html strings
    # is deprecated in recent pandas versions.
    product_table: pd.DataFrame = pd.read_html(
        StringIO(str(product_table_tag))
    )[0]

    # Sometimes the last row is just text about reference values, so drop it
    # when that is the case. Converting the cell with str() keeps the check
    # safe for empty tables aside, missing columns and non-text cells.
    if not product_table.empty and "Typical Values" in product_table.columns:
        last_value = str(product_table["Typical Values"].iloc[-1])
        if "Reference intake" in last_value:
            product_table = product_table.drop(product_table.index[-1])

    return product_table
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment