Skip to content

Instantly share code, notes, and snippets.

@tvwerkhoven
Last active December 10, 2015 13:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tvwerkhoven/4443142 to your computer and use it in GitHub Desktop.
Save tvwerkhoven/4443142 to your computer and use it in GitHub Desktop.
Produce energy label for Albert Heijn recipes
{"Framboos": [["C", "Spanje"], ["E", "Mexico"]], "Nectarine": [["B", "Chili"]], "Limoen": [["A", "Brazili\u00eb"]], "Avocado": [["B", "Isra\u00ebl"], ["C", "Chili"], ["C", "Spanje"]], "Wortelen (breekpeen/waspeen)": [["A", "Nederland"]], "Carambola": [["D", "Isra\u00ebl"], ["E", "Maleisi\u00eb"]], "Radijs (los)": [["B", "Nederland"]], "Sperzieboon": [["B", "Marokko"], ["B", "Spanje"], ["C", "Senegal"], ["D", "Egypte"], ["E", "Kenia"]], "Kokosnoot": [["C", "Ivoorkust"], ["C", "Sri Lanka"], ["D", "Algerije"]], "Broccoli": [["B", "Frankrijk"], ["B", "Spanje"]], "Witte kool": [["A", "Nederland"]], "Lychee": [["C", "Madagaskar"], ["C", "Zuid-Afrika"]], "Rabarber": [["C", "Nederland"]], "Asperge (groen)": [["E", "Peru"]], "Courgette": [["A", "Spanje"]], "Rode biet": [["A", "Nederland"]], "Andijvie (gewoon)": [["A", "Spanje"], ["E", "Nederland"]], "Spitskool": [["A", "Nederland"], ["A", "Spanje"], ["B", "Portugal"]], "Ananas": [["B", "Costa Rica"], ["B", "Ecuador"]], "Kersen": [["E", "Chili"]], "Artisjok": [["B", "Spanje"]], "Dadel (vers)": [["C", "Isra\u00ebl"], ["D", "Tunesi\u00eb"]], "Rucola": [["A", "Itali\u00eb"]], "Rode bes": [["D", "Nederland"]], "Tuinboon": [["B", "Groot-Brittanni\u00eb"]], "Grapefruit": [["A", "Verenigde Staten"], ["B", "China"]], "Kiwi": [["B", "Itali\u00eb"]], "Veldsla": [["A", "Itali\u00eb"], ["A", "Nederland"]], "Oesterzwam": [["C", "Nederland"]], "Asperge (wit)": [["E", "Peru"]], "Aardappelen": [["A", "Belgi\u00eb"], ["A", "Duitsland"], ["A", "Frankrijk"], ["A", "Nederland"]], "Radijs": [["B", "Isra\u00ebl"]], "Venkelknol": [["A", "Itali\u00eb"], ["A", "Spanje"]], "Knolselderij": [["A", "Nederland"]], "Winterpostelein": [["A", "Nederland"]], "Bleekselderij": [["A", "Spanje"]], "Pastinaak": [["A", "Nederland"]], "Ui": [["A", "Belgi\u00eb"], ["A", "Nederland"], ["A", "Polen"]], "Trostomaat": [["B", "Isra\u00ebl"], ["B", "Spanje"]], "Sharonfruit (Kaki)": [["C", "Isra\u00ebl"]], "Cherrytomaat": [["B", "Isra\u00ebl"], ["B", "Marokko"], ["B", 
"Senegal"], ["B", "Spanje"]], "Mango": [["C", "Peru"]], "Dadel (gekonfijt/gedroogd)": [["C", "Californi\u00eb"], ["C", "Iran"], ["C", "Isra\u00ebl"], ["D", "Algerije"], ["D", "Tunesi\u00eb"]], "Rammenas": [["A", "Nederland"]], "Banaan": [["A", "Colombia"], ["A", "Costa Rica"], ["A", "Ecuador"]], "Aardbeien": [["C", "Spanje"], ["D", "Egypte"], ["D", "Isra\u00ebl"]], "Cherrytrostomaat": [["B", "Isra\u00ebl"], ["B", "Itali\u00eb"], ["B", "Spanje"]], "Bloemkool": [["A", "Nederland"], ["B", "Frankrijk"], ["B", "Spanje"]], "Rode kool": [["A", "Nederland"]], "IJsbergsla": [["A", "Spanje"]], "Aubergine": [["B", "Spanje"]], "Romatomaat": [["B", "Spanje"]], "Appel": [["A", "Frankrijk"], ["A", "Nederland"]], "Abrikoos": [["B", "Zuid-Afrika"]], "Pruim": [["B", "Zuid-Afrika"]], "Aardpeer": [["A", "Nederland"], ["B", "China"], ["B", "Costa Rica"], ["B", "Ghana"]], "Boerenkool": [["A", "Nederland"], ["A", "Spanje"]], "Passievrucht": [["E", "Colombia"], ["E", "Kenia"], ["E", "Maleisi\u00eb"], ["E", "Zimbabwe"]], "Blauwe bes": [["E", "Chili"], ["E", "Nederland"], ["E", "Nieuw-Zeeland"]], "Paprika": [["A", "Spanje"], ["B", "Isra\u00ebl"]], "Snijboon": [["B", "Marokko"], ["B", "Spanje"], ["C", "Senegal"], ["D", "Egypte"], ["E", "Kenia"]], "Mandarijn": [["A", "Marokko"], ["B", "Spanje"]], "Zoete Aardappel": [["A", "Verenigde Staten"], ["B", "China"]], "Pompoen": [["A", "Frankrijk"], ["A", "Spanje"], ["B", "Nederland"], ["B", "Portugal"]], "Vleestomaat": [["B", "Spanje"]], "Peultjes": [["B", "Marokko"], ["D", "Egypte"], ["E", "Guatamala"], ["E", "Kenia"]], "Doperwt (vers)": [["E", "Guatamala"], ["E", "Kenia"]], "Koolraap": [["A", "Nederland"]], "Prei": [["A", "Belgi\u00eb"], ["A", "Nederland"]], "Mais": [["B", "Verenigde Staten"]], "Ronde tomaat": [["B", "Marokko"], ["B", "Spanje"]], "Spruiten": [["A", "Belgi\u00eb"], ["A", "Duitsland"], ["A", "Nederland"]], "Miniromatomaat": [["B", "Spanje"], ["C", "Nederland"]], "Perzik": [["B", "Chili"], ["B", "Zuid-Afrika"]], "Peer (stoofpeer)": 
[["A", "Nederland"]], "Schorseneren": [["A", "Nederland"]], "Granaatappel": [["B", "Itali\u00eb"], ["B", "Spanje"]], "Peer (handpeer)": [["A", "Belgi\u00eb"], ["A", "Nederland"], ["A", "Zuid-Afrika"]], "Meloen": [["B", "Brazili\u00eb"], ["B", "Honduras"]], "Bosui": [["A", "Frankrijk"]], "Druif": [["B", "Zuid-Afrika"]], "Witlof": [["A", "Belgi\u00eb"], ["A", "Itali\u00eb"], ["A", "Nederland"], ["A", "Spanje"]], "Zeekraal": [["D", "Isra\u00ebl"], ["E", "Mexico"]], "Koolrabi": [["A", "Itali\u00eb"]], "Chinese kool": [["A", "Nederland"], ["A", "Oostenrijk"]], "Savooie kool": [["A", "Nederland"]], "Wortelen (bospeen)": [["A", "Spanje"]], "Papaja": [["E", "Brazili\u00eb"], ["E", "Ecuador"]], "Wortelen (winterpeen)": [["A", "Nederland"]], "Rettich": [["A", "Itali\u00eb"], ["A", "Nederland"]], "Citroen": [["B", "Spanje"]], "Radicchio": [["B", "Itali\u00eb"]], "Radijs (bos)": [["A", "Nederland"]], "Andijvie (krul)": [["A", "Frankrijk"]], "Vijg": [["E", "Brazili\u00eb"]], "Raapjes": [["A", "Frankrijk"], ["A", "Itali\u00eb"]], "Braam": [["E", "Mexico"]], "Komkommer": [["A", "Spanje"]], "Kropsla": [["A", "Spanje"], ["D", "Nederland"]], "Sinaasappel": [["A", "Spanje"], ["B", "Marokko"]], "Spinazie": [["A", "Spanje"]], "Champignon": [["B", "Nederland"], ["D", "Belgi\u00eb"], ["D", "Polen"]], "Babymais": [["E", "Thailand"]]}
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
"""
@file recept_label.py -- produce energy label for Albert Heijn recipes
@author Tim van Werkhoven
@date 20130103
@copyright Copyright (c) 2013 Tim van Werkhoven <timvanwerkhoven@gmail.com>
This file is licensed under the Creative Commons Attribution-Share Alike
license versions 3.0 or higher, see
http://creativecommons.org/licenses/by-sa/3.0/
"""
#############################################################################
### PREAMBLE
#############################################################################
import urllib2, urllib
from cookielib import CookieJar
from bs4 import BeautifulSoup
import re
import difflib
import bz2
import json, os
def unit_per_kg(inunit):
    """
    Return the factor that converts a quantity given in **inunit** to
    kilograms.

    Supported units are 'kg', 'g', 'l' and 'ml'. Litres and millilitres map
    to 1.0 and 1e-3 respectively, i.e. one litre is counted as one kilogram.
    Letter case and surrounding whitespace are ignored, so 'Kg' and ' ml '
    are accepted as well.

    @param inunit Unit abbreviation (string)
    @return Conversion factor as float, or None if the unit is unknown or
            **inunit** is not a string
    """
    factors = {'kg': 1.0, 'g': 1e-3, 'l': 1.0, 'ml': 1e-3}
    try:
        unit = inunit.strip().lower()
    except AttributeError:
        # Not a string (e.g. None): treat it like any other unknown unit
        return None
    return factors.get(unit)
### Get recipe, find ingredients
INURL="http://www.ah.nl/allerhande/recepten/882692/boerenkool-venkelstamppot-met-kip?latestAllerhande=on"
# Fetch the recipe page; close the HTTP response even if reading or parsing
# fails
recipe_response = urllib2.urlopen(INURL)
try:
    ah_recipe = BeautifulSoup(recipe_response.read())
finally:
    recipe_response.close()
# Build a list of (displayed ingredient text, search term) tuples from every
# <span class="ingredient"> element
ing_list = []
for ing in ah_recipe('span', {'class' : 'ingredient'}):
    # .string is None when the span contains nested tags; fall back to the
    # combined text of the element instead of failing on None.strip()
    ing_text = ing.string if ing.string is not None else ing.get_text()
    ing_list.append((ing_text.strip(), ing['data-search-term']))
### Compute normalized ingredient quantities
# For each ingredient, find the quantity, look for ([0-9]+)[ ]([kg|g|l|ml])
qty_re = re.compile('([0-9]+) (kg|g|ml|l)')
# Loop over ingredient, find quantity, convert
ing_list2 = []
for ing, ingname in ing_list:
qty = qty_re.search(ing)
if (not qty):
print "not parsed:", ing
ing_list2.append((ing, ingname, None))
continue
print ing, qty.groups()
normqty = float(qty.groups()[0]) * unit_per_kg(qty.groups()[1])
ing_list2.append((ing, ingname, normqty))
### Get milieucentraal data from the web -- DISABLED: this request does not work (possibly because the site requires cookies)
# cj = CookieJar()
# opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj), urllib2.HTTPRedirectHandler())
# milieu_url = 'http://groentefruit.milieucentraal.nl/Groente-en-Fruit-Kalender/groente-en-fruit-kalender/zoeken/formulier'
# milieu_parms = urllib.urlencode({'questionid': "5010cd30", 'Maand': 'januari', 'Energieklasse': 'A', 'Energieklasse': 'B', 'Energieklasse': 'C', 'Energieklasse': 'D', 'Energieklasse': 'E', 'next' : u'Toon+resultaat+%C2%BB'})
# milieu_parms = "focusfield=&scrollx=&scrolly=&dummyFieldForIEEnterSubmitSupport=&questionid=5010cd30&Producttype=&Maand=januari&Product=&Energieklasse=A&Energieklasse=B&Energieklasse=C&Energieklasse=D&Energieklasse=E&finish=Toon+resultaat+%C2%BB"
# #POSTDATA=focusfield=&scrollx=&scrolly=&dummyFieldForIEEnterSubmitSupport=&questionid=5010cd30&Producttype=&Maand=januari&Product=&Energieklasse=A&Energieklasse=B&Energieklasse=C&Energieklasse=D&Energieklasse=E&finish=Toon+resultaat+%C2%BB
# milieu_open = opener.open(milieu_url, milieu_parms)
# milieu_data = BeautifulSoup(milieu_open.read())
### Get milieucentraal data
JSONFILE = './data/milieudata_januari.json'
if (os.path.isfile(JSONFILE)):
# Load data from previously parsed file
print "Loading previously parsed JSON file"
fd = open(JSONFILE, 'r')
milieu_labels = json.load(fd)
fd.close()
else:
# Get milieucentraal data from local file
print "Parsing local data file"
fd = bz2.BZ2File("./data/milieudata_januari.html.bz2")
milieu_data = BeautifulSoup(fd.read())
fd.close()
### Format energy labels
# Loop over products, extract name, energy labels and countries
milieu_labels = {}
for prod in milieu_data('td', {'class' : 'product'}):
prod_name = prod.contents[0]
# Find energy labels for each country where they are produced
milieu_labels[prod_name] = [(c.a['class'][-1], c.a.contents[-1]) for c in prod.nextSibling('div', {'class': 'tooltiparent'})]
# The above code is equivalent to:
# prod_country = prod.nextSibling('div', {'class': 'tooltiparent'})
# this_c = prod_country[0].a.contents[-1]
# this_e1 = prod_country[0].a['class'][-1]
# Energy labels are also here:
# prod_energy = prod.nextSibling('td', {'class' : 'energy'})
# this_e0 = prod_energy[0].div['class'][-1]
# but this is less robust, there are multiple countries per label
# Store labels as JSON
fd = open(JSONFILE, 'w')
json.dump(milieu_labels, fd)
fd.close()
### Match ingredient names to energy labels product names
for ingfull, ing, norm_qty in ing_list2:
bestprod = difflib.get_close_matches(ing, milieu_labels.keys()) or None
if (bestprod):
print ingfull, ing, norm_qty, bestprod[0], milieu_labels[bestprod[0]]
else:
print ingfull, ing, norm_qty, "no energy data"
# EOF
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment