#!/usr/bin/env python3
# Given a load of HTML pages from the BBC Food website, this script will
# scrape the recipes from the HTML and output them in structured JSON format.
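#
# The emitted JSON mirrors the dict built in FoodParser.parse() below. A rough
# sketch of one record (field values here are illustrative, not taken from a
# real page):
#
#   {
#     "title": "...", "author": "...", "from": "...",
#     "prep_time": "...", "cook_time": "...", "servings": 4,
#     "recommendations": 0, "description": "...",
#     "ingredients": ["..."], "method": ["..."],
#     "tips": "...", "dietary": "...",
#     "url": "http://www.bbc.co.uk/food/recipes/<page-name>",
#     "image": "<base64-encoded image data, or null>"
#   }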
import argparse
import sys
import os
import json
import base64
from bs4 import BeautifulSoup


def log(msg):
    sys.stderr.write(msg + "\n")


def fatal(msg):
    log(msg)
    sys.exit(1)


class FoodParser:
    def __init__(self, path):
        log("Parsing {}".format(path))
        self.soup = self.soup_from_file(path)
        self.path = path

    def parse(self):
        jout = {
            "title": self.get_title(),
            "author": self.get_author(),
            "from": self.get_from(),
            "prep_time": self.get_prep_time(),
            "cook_time": self.get_cooking_time(),
            "servings": self.get_servings(),
            "recommendations": self.get_recommendations(),
            "description": self.get_description(),
            "ingredients": self.get_ingredients(),
            "method": self.get_method(),
            "tips": self.get_tips(),
            "dietary": self.get_dietary(),
            "url": self.get_original_url(),
            "image": self.get_image(),
        }
        return jout
    def get_title(self):
        try:
            return self.soup.find("h1", class_='content-title__text').text
        except AttributeError:
            return None

    def get_prep_time(self):
        try:
            return self.soup.find("p", class_='recipe-metadata__prep-time').text
        except AttributeError:
            return None

    def get_cooking_time(self):
        try:
            return self.soup.find("p", class_='recipe-metadata__cook-time').text
        except AttributeError:
            return None

    def get_servings(self):
        try:
            servings = self.soup.find("p", class_="recipe-metadata__serving").text
        except AttributeError:
            return None
        # "Serves 4" -> 4; keep the raw text if the number can't be parsed.
        if servings.startswith("Serves"):
            try:
                servings = int(servings.split(" ")[1])
            except ValueError:
                pass
        return servings
    def get_recommendations(self):
        try:
            recommendations = self.soup.find("p", class_="recipe-metadata__recommendations").text
        except AttributeError:
            return -1
        # The text is usually of the form "123 recommendations"; fall back to
        # the raw string if the leading token is not an integer.
        try:
            return int(recommendations.split(" ")[0])
        except ValueError:
            return recommendations
    def get_author(self):
        try:
            return self.soup.find(class_="chef").find(class_="chef__name").find(class_="chef__preposition", string="By").find_next_sibling("a").text
        except AttributeError:
            return "Unknown"

    def get_from(self):
        try:
            return self.soup.find(class_="chef").find(class_="chef__programme-name").find(class_="chef__preposition", string="From").find_next_sibling("a").text
        except AttributeError:
            return None

    def get_description(self):
        try:
            return self.soup.find("p", class_="recipe-description__text").text.strip()
        except AttributeError:
            return None
    def get_ingredients(self):
        ingredients = []
        try:
            for ingredient in self.soup.find("ul", class_="recipe-ingredients__list").find_all("li"):
                ingredients.append(ingredient.text.strip())
        except AttributeError:
            # Page had no ingredients list; keep the output well-formed.
            pass
        return ingredients

    def get_method(self):
        method = []
        try:
            for step in self.soup.find("ol", class_="recipe-method__list").find_all("li"):
                method.append(step.find("p").text.strip())
        except AttributeError:
            # Page had no method list; keep the output well-formed.
            pass
        return method
    def get_tips(self):
        try:
            return self.soup.find("p", class_="recipe-tips__text").text.strip()
        except AttributeError:
            return None

    def get_dietary(self):
        try:
            return self.soup.find("div", class_="recipe-metadata__dietary").text.strip()
        except AttributeError:
            return None
    def get_original_url(self):
        # May not work all the time, but was fine for all the ones I tested on.
        fname = os.path.basename(self.path)
        fname = fname.replace('.html', '')
        return "http://www.bbc.co.uk/food/recipes/{}".format(fname)

    def get_image(self):
        try:
            imgsrc = self.soup.find("div", class_="recipe-media")
            if not imgsrc:
                return None
            fname = imgsrc.find("img")['src']
            fpath = os.path.join(os.path.dirname(self.path), fname)
            img = None
            try:
                with open(fpath, 'rb') as fh:
                    img = fh.read()
            except (OSError, IOError) as e:
                log("Could not open image at path {}: {}".format(fpath, e))
                return None
            return base64.b64encode(img).decode('utf-8')
        except AttributeError:
            return None

    def soup_from_file(self, path):
        with open(path, 'rb') as fh:
            fc = fh.read()
        return BeautifulSoup(fc, 'html.parser')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Parses recipes from the BBC Food website")
    parser.add_argument("-o", "--parse-one", type=str, help="Parse single HTML file")
    args = parser.parse_args()
    if args.parse_one:
        try:
            fp = FoodParser(args.parse_one)
            out = fp.parse()
            print(json.dumps(out))
        except (OSError, IOError) as e:
            fatal("Failed to open file: {}".format(e))