Last active
January 20, 2019 12:15
-
-
Save tomncooper/002df62bf359480aae80849f933db1af to your computer and use it in GitHub Desktop.
Methods for extracting nutritional information from the Tesco website
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Methods for scraping product information from the tesco website. | |
You will need to install the requests, beautifulsoup4, lxml and pandas libraries:
    $ pip install requests beautifulsoup4 lxml pandas
""" | |
from typing import List, Optional | |
import requests | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
from bs4.element import Tag | |
def fetch_page(url: str, timeout: float = 10.0) -> BeautifulSoup:
    """Download *url* and return its HTML parsed with the lxml parser.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The parsed page.

    Raises:
        requests.HTTPError: If the response carries a 4xx or 5xx status code.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    response: requests.Response = requests.get(url, timeout=timeout)

    # raise_for_status only raises for client (4xx) and server (5xx) errors
    response.raise_for_status()

    # Parse the html page
    return BeautifulSoup(response.text, "lxml")
def extract_catagories(page: BeautifulSoup) -> List[str]:
    """Return every text string inside the first ``<ol>`` element of *page*.

    The strings are returned in document order exactly as BeautifulSoup yields
    them (whitespace-only strings included).
    NOTE(review): presumably the first ``<ol>`` is the category breadcrumb
    trail of a product page - confirm against the live markup.

    Raises:
        AttributeError: If the page contains no ``<ol>`` element.
    """
    first_ordered_list = page.find("ol")
    return [text for text in first_ordered_list.strings]
def extract_net_weight(page: BeautifulSoup) -> str:
    """Return the first whitespace-separated token of the net contents text.

    The text is read from the first ``<p>`` inside the element whose id is
    ``net-contents``.

    Raises:
        AttributeError: If the ``net-contents`` element or its ``<p>`` child
            is missing.
        IndexError: If the paragraph holds no non-whitespace text.
    """
    container = page.find(id="net-contents")
    paragraph_text = container.p.text
    tokens = paragraph_text.split()
    return tokens[0]
def extract_nutrition_table(page: BeautifulSoup) -> pd.DataFrame:
    """Extract the product information table of *page* as a DataFrame.

    The table is the ``<table>`` element whose first CSS class is
    ``product__info-table``. If the final row's "Typical Values" cell contains
    the text "Reference intake", that row is dropped.

    Args:
        page: Parsed html of a product page.

    Returns:
        The product information table, without the trailing reference-intake
        footnote row when one is present.

    Raises:
        RuntimeError: If the page has no product information table.
    """
    # Imported here so the function does not rely on a file-level import.
    from io import StringIO

    # Get all the html tables within the page
    tables: List[Tag] = page.find_all("table")

    # Check that we have the product information table
    product_table_tag: Optional[Tag] = None
    for table in tables:
        # A table without a class attribute is skipped instead of raising KeyError
        classes = table.get("class") or []
        if classes and classes[0] == "product__info-table":
            # No break: if several tables match, the last one is used
            product_table_tag = table

    if product_table_tag is None:
        # The URL is not known inside this function, so it cannot be reported here
        raise RuntimeError("No product information table found in html")

    # read_html returns a list of every DataFrame it finds in the supplied html;
    # only one table is passed in, so the first entry is the one we want. The
    # markup is wrapped in StringIO because passing a literal html string to
    # read_html is deprecated in recent pandas versions.
    product_table: pd.DataFrame = pd.read_html(StringIO(str(product_table_tag)))[0]

    # Sometimes the last row is just text about reference values so we check if
    # that is the case and drop that last row if so
    if "Typical Values" in product_table.columns:
        last_row = product_table.tail(1)
        # astype(str) keeps the .str accessor usable when the column is not text
        if last_row["Typical Values"].astype(str).str.contains("Reference intake").any():
            product_table.drop(last_row.index, inplace=True)

    return product_table
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment