tomncooper/tesco.py

## tesco.py
""" Methods for scraping product information from the tesco website.
You will need to install the requests, beautifulsoup4, lxml and pandas libraries:

$ pip install requests beautifulsoup4 lxml pandas
"""
from typing import List, Optional

import requests

import pandas as pd

from bs4 import BeautifulSoup
from bs4.element import Tag


def fetch_page(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Download a web page and parse it into a BeautifulSoup document.

    Arguments:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body parsed with the "lxml" parser.

    Raises:
        requests.HTTPError: If the server answers with a 4xx or 5xx status.
        requests.Timeout: If the server does not answer within ``timeout``
            seconds.
    """
    response: requests.Response = requests.get(url, timeout=timeout)

    # raise_for_status() raises for 4xx/5xx error statuses; other non-200
    # success codes pass through
    response.raise_for_status()

    # Parse the html page
    return BeautifulSoup(response.text, "lxml")


def extract_catagories(page: BeautifulSoup) -> List[str]:
    """Return every text string contained in the first ``<ol>`` of the page.

    NOTE(review): presumably the first ordered list is the category
    breadcrumb trail -- confirm against the page markup.
    """
    first_ordered_list = page.find("ol")
    return [*first_ordered_list.strings]


def extract_net_weight(page: BeautifulSoup) -> str:
    """Return the first whitespace-separated token of the net contents text.

    The text is read from the first ``<p>`` inside the element whose id is
    "net-contents".
    """
    net_contents = page.find(id="net-contents")
    paragraph_text = net_contents.p.text
    tokens = paragraph_text.split()
    return tokens[0]


def extract_nutrition_table(
    page: BeautifulSoup, url: Optional[str] = None
) -> pd.DataFrame:
    """Extract the product information table from a parsed page.

    Arguments:
        page: Parsed html document to search.
        url: Optional address the page was fetched from. It is only used to
            make the error message more helpful.

    Returns:
        The table whose first css class is "product__info-table", converted
        to a DataFrame. A trailing row that only holds the "Reference intake"
        footnote is removed.

    Raises:
        RuntimeError: If the page holds no table with that class.
    """
    # Imported here so this function does not rely on a new module level import
    from io import StringIO

    # Look through all the html tables within the page
    product_table_tag: Optional[Tag] = None
    for table in page.find_all("table"):
        # Tables without a class attribute are skipped rather than raising KeyError
        classes = table.get("class")
        if classes and classes[0] == "product__info-table":
            # No break here: if several tables match, the last one is used
            product_table_tag = table

    if product_table_tag is None:
        message = "No product information table found in html"
        if url:
            message += f" for URL: {url}"
        raise RuntimeError(message)

    # The read html method returns a list of all the DataFrames it finds in the
    # supplied html; only one table is passed in, so the first entry is taken.
    # The markup is wrapped in StringIO because passing literal html strings to
    # read_html is deprecated in recent pandas versions.
    product_table: pd.DataFrame = pd.read_html(StringIO(str(product_table_tag)))[0]

    # Sometimes the last row is just text about reference values so we check if that is
    # the case and drop that last row if so
    if "Typical Values" in product_table.columns:
        # astype(str) keeps the check working when the last cell is not a string
        last_cell = product_table.tail(1)["Typical Values"].astype(str)
        if last_cell.str.contains("Reference intake").any():
            product_table.drop(product_table.tail(1).index, inplace=True)

    return product_table
	""" Methods for scraping product information from the tesco website.
	You will need to install requests, beautifulsoup4, lxml and pandas libraries

	$ pip install requests beautifulsoup4 lxml pandas
	"""
	from typing import List, Optional

	import requests

	import pandas as pd

	from bs4 import BeautifulSoup
	from bs4.element import Tag


	def fetch_page(url: str, timeout: float = 30.0) -> BeautifulSoup:
	    """Download a web page and parse it into a BeautifulSoup document.

	    Arguments:
	        url: Address of the page to download.
	        timeout: Seconds to wait for the server before giving up. Without a
	            timeout ``requests.get`` may block indefinitely on a stalled
	            connection.

	    Returns:
	        The response body parsed with the "lxml" parser.

	    Raises:
	        requests.HTTPError: If the server answers with a 4xx or 5xx status.
	        requests.Timeout: If the server does not answer within ``timeout``
	            seconds.
	    """
	    response: requests.Response = requests.get(url, timeout=timeout)

	    # raise_for_status() raises for 4xx/5xx error statuses; other non-200
	    # success codes pass through
	    response.raise_for_status()

	    # Parse the html page
	    return BeautifulSoup(response.text, "lxml")


	def extract_catagories(page: BeautifulSoup) -> List[str]:
	    """Return every text string contained in the first ``<ol>`` of the page.

	    NOTE(review): presumably the first ordered list is the category
	    breadcrumb trail -- confirm against the page markup.
	    """
	    return list(page.find("ol").strings)


	def extract_net_weight(page: BeautifulSoup) -> str:
	    """Return the first whitespace-separated token of the net contents text.

	    The text is read from the first ``<p>`` inside the element whose id is
	    "net-contents".
	    """
	    return page.find(id="net-contents").p.text.split()[0]


	def extract_nutrition_table(
	    page: BeautifulSoup, url: Optional[str] = None
	) -> pd.DataFrame:
	    """Extract the product information table from a parsed page.

	    Arguments:
	        page: Parsed html document to search.
	        url: Optional address the page was fetched from. It is only used to
	            make the error message more helpful.

	    Returns:
	        The table whose first css class is "product__info-table", converted
	        to a DataFrame. A trailing row that only holds the "Reference intake"
	        footnote is removed.

	    Raises:
	        RuntimeError: If the page holds no table with that class.
	    """
	    # Imported here so this function does not rely on a new module level import
	    from io import StringIO

	    # Look through all the html tables within the page
	    product_table_tag: Optional[Tag] = None
	    for table in page.find_all("table"):
	        # Tables without a class attribute are skipped rather than raising KeyError
	        classes = table.get("class")
	        if classes and classes[0] == "product__info-table":
	            # No break here: if several tables match, the last one is used
	            product_table_tag = table

	    if product_table_tag is None:
	        message = "No product information table found in html"
	        if url:
	            message += f" for URL: {url}"
	        raise RuntimeError(message)

	    # The read html method returns a list of all the DataFrames it finds in the
	    # supplied html; only one table is passed in, so the first entry is taken.
	    # The markup is wrapped in StringIO because passing literal html strings to
	    # read_html is deprecated in recent pandas versions.
	    product_table: pd.DataFrame = pd.read_html(StringIO(str(product_table_tag)))[0]

	    # Sometimes the last row is just text about reference values so we check if that is
	    # the case and drop that last row if so
	    if "Typical Values" in product_table.columns:
	        # astype(str) keeps the check working when the last cell is not a string
	        last_cell = product_table.tail(1)["Typical Values"].astype(str)
	        if last_cell.str.contains("Reference intake").any():
	            product_table.drop(product_table.tail(1).index, inplace=True)

	    return product_table