#!/usr/bin/env python3
# Given a load of HTML pages from the BBC Food website, this script will
# scrape the recipes from the HTML and output them in structured JSON format.
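#
# The emitted JSON mirrors the dict built in FoodParser.parse() below. A rough
# sketch of one record (field values here are illustrative, not taken from a
# real page):
#
#   {
#     "title": "...", "author": "...", "from": "...",
#     "prep_time": "...", "cook_time": "...", "servings": 4,
#     "recommendations": 0, "description": "...",
#     "ingredients": ["..."], "method": ["..."],
#     "tips": "...", "dietary": "...",
#     "url": "http://www.bbc.co.uk/food/recipes/<page-name>",
#     "image": "<base64-encoded image data, or null>"
#   }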
import argparse
import sys
import os
import json
import base64
from bs4 import BeautifulSoup


def log(msg):
    sys.stderr.write(msg + "\n")


def fatal(msg):
    log(msg)
    sys.exit(1)


class FoodParser:
    def __init__(self, path):
        log("Parsing {}".format(path))
        self.soup = self.soup_from_file(path)
        self.path = path

    def parse(self):
        jout = {
            "title": self.get_title(),
            "author": self.get_author(),
            "from": self.get_from(),
            "prep_time": self.get_prep_time(),
            "cook_time": self.get_cooking_time(),
            "servings": self.get_servings(),
            "recommendations": self.get_recommendations(),
            "description": self.get_description(),
            "ingredients": self.get_ingredients(),
            "method": self.get_method(),
            "tips": self.get_tips(),
            "dietary": self.get_dietary(),
            "url": self.get_original_url(),
            "image": self.get_image(),
        }
        return jout
    def get_title(self):
        try:
            return self.soup.find("h1", class_='content-title__text').text
        except AttributeError:
            return None

    def get_prep_time(self):
        try:
            return self.soup.find("p", class_='recipe-metadata__prep-time').text
        except AttributeError:
            return None

    def get_cooking_time(self):
        try:
            return self.soup.find("p", class_='recipe-metadata__cook-time').text
        except AttributeError:
            return None

    def get_servings(self):
        try:
            servings = self.soup.find("p", class_="recipe-metadata__serving").text
        except AttributeError:
            return None
        # "Serves 4" -> 4; keep the raw text if the number can't be parsed.
        if servings.startswith("Serves"):
            try:
                servings = int(servings.split(" ")[1])
            except ValueError:
                pass
        return servings
    def get_recommendations(self):
        try:
            recommendations = self.soup.find("p", class_="recipe-metadata__recommendations").text
        except AttributeError:
            return -1
        # The text is usually of the form "123 recommendations"; fall back to
        # the raw string if the leading token is not an integer.
        try:
            return int(recommendations.split(" ")[0])
        except ValueError:
            return recommendations
    def get_author(self):
        try:
            return self.soup.find(class_="chef").find(class_="chef__name").find(class_="chef__preposition", string="By").find_next_sibling("a").text
        except AttributeError:
            return "Unknown"

    def get_from(self):
        try:
            return self.soup.find(class_="chef").find(class_="chef__programme-name").find(class_="chef__preposition", string="From").find_next_sibling("a").text
        except AttributeError:
            return None

    def get_description(self):
        try:
            return self.soup.find("p", class_="recipe-description__text").text.strip()
        except AttributeError:
            return None
    def get_ingredients(self):
        ingredients = []
        try:
            for ingredient in self.soup.find("ul", class_="recipe-ingredients__list").find_all("li"):
                ingredients.append(ingredient.text.strip())
        except AttributeError:
            # Page had no ingredients list; keep the output well-formed.
            pass
        return ingredients

    def get_method(self):
        method = []
        try:
            for step in self.soup.find("ol", class_="recipe-method__list").find_all("li"):
                method.append(step.find("p").text.strip())
        except AttributeError:
            # Page had no method list; keep the output well-formed.
            pass
        return method
    def get_tips(self):
        try:
            return self.soup.find("p", class_="recipe-tips__text").text.strip()
        except AttributeError:
            return None

    def get_dietary(self):
        try:
            return self.soup.find("div", class_="recipe-metadata__dietary").text.strip()
        except AttributeError:
            return None
    def get_original_url(self):
        # May not work all the time, but was fine for all the ones I tested on.
        fname = os.path.basename(self.path)
        fname = fname.replace('.html', '')
        return "http://www.bbc.co.uk/food/recipes/{}".format(fname)

    def get_image(self):
        try:
            imgsrc = self.soup.find("div", class_="recipe-media")
            if not imgsrc:
                return None
            fname = imgsrc.find("img")['src']
            fpath = os.path.join(os.path.dirname(self.path), fname)
            img = None
            try:
                with open(fpath, 'rb') as fh:
                    img = fh.read()
            except (OSError, IOError) as e:
                log("Could not open image at path {}: {}".format(fpath, e))
                return None
            return base64.b64encode(img).decode('utf-8')
        except AttributeError:
            return None

    def soup_from_file(self, path):
        with open(path, 'rb') as fh:
            fc = fh.read()
        return BeautifulSoup(fc, 'html.parser')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Parses recipes from the BBC Food website")
    parser.add_argument("-o", "--parse-one", type=str, help="Parse single HTML file")
    args = parser.parse_args()
    if args.parse_one:
        try:
            fp = FoodParser(args.parse_one)
            out = fp.parse()
            print(json.dumps(out))
        except (OSError, IOError) as e:
            fatal("Failed to open file: {}".format(e))