Skip to content

Instantly share code, notes, and snippets.

@jeremyorme
Last active January 28, 2020 17:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jeremyorme/34504e4966763f1170474fc978f44ddf to your computer and use it in GitHub Desktop.
Save jeremyorme/34504e4966763f1170474fc978f44ddf to your computer and use it in GitHub Desktop.
Ingredient Classification & Aggregation
import json
import csv
import re
# Input recipe, ingredient per line, ignore blanks
recipe = '''
2 courgettes (zucchini)
1 carrot
1 avocado
1 bunch basil
1 tbsp lemon juice
2 tbsp nutritional yeast
10 olives, sliced
4 garlic cloves, roasted
2 tomatoes, roasted
Pinch of chilli powder or smoked paprika
'''

# The data files are opened with an explicit UTF-8 encoding so that loading
# does not depend on the platform's default locale encoding.

# Dict of matchable ingredients indexed by name
with open('ingredients.json', encoding='utf-8') as f:
    ingredients = json.load(f)

# Dict of nutrients indexed by nutrient id
with open('nutrients.json', encoding='utf-8') as f:
    nutrients = json.load(f)

# Dict of units indexed by name
with open('units.json', encoding='utf-8') as f:
    units = json.load(f)

# Dict of nutrient values indexed by fdc_id:nutrient_id
nutrient_values = {}
# newline='' lets the csv module do its own newline handling, as the csv
# documentation requires for file objects passed to a reader.
with open('ingredient_nutrients.csv', newline='', encoding='utf-8') as f:
    rows = csv.DictReader(f)
    for row in rows:
        # Only the fdc_id, nutrient_id and amount columns are read.
        k = row['fdc_id'] + ':' + row['nutrient_id']
        nutrient_values[k] = float(row['amount'])
# Substitution rule class
class rule:
    """Regex substitution rule written in a small typed-markup shorthand.

    The shorthand is expanded into ordinary ``re`` syntax once, at
    construction time:

    * ``{(?<type>REGEX)}`` in a pattern captures text matching REGEX into the
      named group ``T_type``. The match must be preceded by a space, ``>`` or
      the start of the string and followed by a space, ``<`` or the end of
      the string; one adjacent space on each side is consumed.
    * ``<<type>>`` in a pattern matches an existing ``<type>...</type>``
      element (plus one optional space on each side) and captures its inner
      text as ``T_type``. Trailing digits (``<<type1>>``) only change the
      group name, so the same tag can be matched more than once per pattern.
    * ``<<!type>>`` in a pattern is a negative lookahead: a ``<type>`` tag,
      optionally preceded by one space, must not follow.
    * ``<<type>>`` in a substitution emits ``<type>``, the text captured as
      ``T_type``, then ``</type>``, padded with one space on each side.

    Attributes:
        p: the expanded regular expression (a string).
        s: the expanded replacement template (a string).
    """

    def __init__(self, pattern, substitution):
        """Expand the shorthand in *pattern* and *substitution*."""
        # Tag matches are expanded first so that the outer {(?<type>...)}
        # capture wraps their already-expanded form.
        self.p = rule._translate_type_captures(rule._translate_type_matches(pattern))
        self.s = rule._translate_type_substitutions(substitution)

    def sub(self, s):
        """Return *s* with every match of the rule replaced."""
        return re.sub(self.p, self.s, s)

    # The helpers below take no instance, so they are declared static; they
    # can then be called through either the class or an instance.
    @staticmethod
    def _translate_type_captures(s):
        """Expand each ``{(?<type>REGEX)}`` into a delimited named group."""
        pat = r'\{\(\?\<(?P<type_and_index>[a-z_]+[0-9]*)\>(?P<content>.*?)\)\}'
        rep = r' ?(?<![^\> ])(?P<T_\g<type_and_index>>\g<content>)(?![^\< ]) ?'
        return re.sub(pat, rep, s)

    @staticmethod
    def _translate_type_matches(s):
        """Expand ``<<!type>>`` and ``<<type>>`` pattern shorthands."""
        # Negated form first: it also contains '<<', but the '!' keeps the
        # positive pattern below from matching it.
        pat = r'\<\<!(?P<type_and_index>(?P<type>[a-z_]+)[0-9]*)\>\>'
        rep = r'(?! ?\<\g<type>\>)'
        s2 = re.sub(pat, rep, s)
        pat = r'\<\<(?P<type_and_index>(?P<type>[a-z_]+)[0-9]*)\>\>'
        rep = r' ?\<\g<type>\>(?P<T_\g<type_and_index>>(?:(?!\<\/\g<type>\>).)*)\<\/\g<type>\> ?'
        return re.sub(pat, rep, s2)

    @staticmethod
    def _translate_type_substitutions(s):
        """Expand ``<<type>>`` in a replacement into a tagged back-reference."""
        pat = r'\<\<(?P<type_and_index>(?P<type>[a-z_]+)[0-9]*)\>\>'
        rep = r' <\g<type>>\\g<T_\g<type_and_index>></\g<type>> '
        return re.sub(pat, rep, s)
# Amount substitution rules
# Applied in list order. Unit and number rules tag individual tokens first;
# the final rule then wraps tagged numbers and/or units in an <amount>
# element, so it has to stay last.
amount_rules = [
    # imprecise cooking units
    rule(r'{(?<pinch>[pP]inch(?:es)?)}', ' <unit><<pinch>></unit> '),
    rule(r'{(?<dash>[dD]ash)}', ' <unit><<dash>></unit> '),
    # general units of volume
    rule(r'{(?<ml>mls?|mL|cc|millilitres?|milliliters?)}', ' <unit><<ml>></unit> '),
    rule(r'{(?<tsp>tsps?|t|teaspoons?)}', ' <unit><<tsp>></unit> '),
    rule(r'{(?<tbsp>[tT]bsps?|T|tbl|tbs|[tT]ablespoons?)}', ' <unit><<tbsp>></unit> '),
    rule(r'{(?<floz>fl ?oz|fluid ounces?)}', ' <unit><<floz>></unit> '),
    rule(r'{(?<cup>cups?)}', ' <unit><<cup>></unit> '),
    rule(r'{(?<pt>p|pts?|pints?)}', ' <unit><<pt>></unit> '),
    rule(r'{(?<l>ls?|L|litres?|liters?)}', ' <unit><<l>></unit> '),
    # No trailing '/' after 'gallons?': a token is always followed by a
    # space, '<' or the end of the string, so a literal slash never matches.
    rule(r'{(?<gal>gals?|gallons?)}', ' <unit><<gal>></unit> '),
    rule(r'{(?<dl>dls?|dL|decilitres?|deciliters?)}', ' <unit><<dl>></unit> '),
    # general units of mass
    rule(r'{(?<kg>kgs?|kilos?|kilograms?)}', ' <unit><<kg>></unit> '),
    rule(r'{(?<g>gs?|grams?|grammes?)}', ' <unit><<g>></unit> '),
    rule(r'{(?<oz>oz|ounces?)}', ' <unit><<oz>></unit> '),
    rule(r'{(?<lb>lbs?|#|pounds?)}', ' <unit><<lb>></unit> '),
    # numbers
    rule(r'{(?<number>(?:\d* )?\d+ ?\/ ?\d+|\d*\s?[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞]|\d+(\.\d+)?)}', '<<number>>'),
    rule(r'{(?<number>an?)}', '<<number>>'),
    # imprecise amounts
    rule(r'{(?<amount>to taste)}', '<<amount>>'),
    rule(r'{(?<amount>to serve)}', '<<amount>>'),
    rule(r'{(?<amount>for \w+ing)}', '<<amount>>'),
    # general amounts
    rule(r'{(?<amount><<number1>>[\-–]?<<unit1>>|<<number2>>|<<unit2>>)}', '<<amount>>'),
]
# Text helper functions
def tokenise(s):
    """Split *s* into tokens and return them joined by single spaces.

    A token is a word (letters and ñ, with hyphens allowed after the first
    character), a decimal number, a run of digits and vulgar-fraction
    characters, or any single character that is neither a word character
    nor a space. Text that matches none of these is kept as its own piece;
    pieces that are empty or only whitespace are dropped.
    """
    token_pattern = r'([a-zA-Zñ][a-zA-Zñ\-]*|\d+\.\d+|[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞\d]+|[^\w ])'
    # The capturing group makes re.split return the tokens themselves along
    # with the text between them.
    pieces = re.split(token_pattern, s)
    kept = (piece for piece in pieces if piece.strip())
    return ' '.join(kept)
def extract_typed(s, t):
    """Return the inner text of every ``<t>...</t>`` element in *s*.

    Args:
        s: marked-up text to search.
        t: tag name; it is matched literally.

    Returns:
        A list with the text between each opening ``<t>`` and the nearest
        following ``</t>``, in order of appearance. It is empty when there is
        no such element. Inner text never spans a newline, because ``.``
        does not match one.
    """
    # Escape the tag name so characters such as '.' or '-' are not treated
    # as regex syntax, and build the pattern from raw strings only so it
    # holds no invalid string escapes.
    tag = re.escape(t)
    pattern = r'<' + tag + r'>((?:(?!</' + tag + r'>).)*)</' + tag + r'>'
    return re.findall(pattern, s)
def first_tag(s):
    """Return the name inside the first ``<...>`` in *s*, or None if none.

    The name is everything between the first ``<`` that is later closed by
    a ``>`` and that ``>``; it may be empty.
    """
    found = re.search(r'\<([^\>]*)\>', s)
    if found is None:
        return None
    return found.group(1)
# Substitution helpers
def classify_ingredients(s):
    """Wrap every ingredient keyword found in *s* in ingredient markup.

    Each occurrence of a keyword becomes
    ``<ingredient><NAME>keyword</NAME></ingredient>``, where NAME is the key
    of the owning entry in the module-level ``ingredients`` dict. Keywords
    are matched as plain substrings, so "olive" also matches inside "olives".
    They are tried in dict order, then in the order of each entry's
    ``keywords`` list; earlier keywords take priority.

    Markup that has already been inserted is never searched again. A later
    keyword therefore cannot match inside an earlier tag name or inside text
    that is already tagged.

    Args:
        s: tokenised ingredient line.

    Returns:
        *s* with ingredient keywords tagged.
    """
    # The text is kept as (text, is_markup) pieces; only plain pieces are
    # searched for later keywords.
    pieces = [(s, False)]
    for name, ingredient in ingredients.items():
        for keyword in ingredient['keywords']:
            if not keyword:
                # An empty keyword would match between every character.
                continue
            tagged = '<ingredient><' + name + '>' + keyword + '</' + name + '></ingredient>'
            next_pieces = []
            for text, is_markup in pieces:
                if is_markup or keyword not in text:
                    next_pieces.append((text, is_markup))
                    continue
                for index, plain in enumerate(text.split(keyword)):
                    if index:
                        # One tagged keyword sits between consecutive
                        # fragments of the split.
                        next_pieces.append((tagged, True))
                    if plain:
                        next_pieces.append((plain, False))
            pieces = next_pieces
    return ''.join(text for text, _ in pieces)
def classify_amounts(s):
    """Return *s* after applying every rule in ``amount_rules`` in order.

    Each rule receives the output of the previous one.
    """
    tagged = s
    for amount_rule in amount_rules:
        tagged = amount_rule.sub(tagged)
    return tagged
# Structured ingredient quantity
class ingredient_quantity:
    """A parsed recipe line: a number, a unit and the ingredients it names.

    Attributes:
        number: numeric quantity as a float.
        unit: unit name, or 'default' when the line gives no unit.
        names: keys into the module-level ``ingredients`` dict. When a line
            offers alternatives ("x or y") there are several, and the totals
            are averaged over them.
    """

    # Values of the vulgar-fraction characters accepted by the number rule.
    _FRACTIONS = {
        '½': 1 / 2, '⅓': 1 / 3, '⅔': 2 / 3, '¼': 1 / 4, '¾': 3 / 4,
        '⅕': 1 / 5, '⅖': 2 / 5, '⅗': 3 / 5, '⅘': 4 / 5, '⅙': 1 / 6,
        '⅚': 5 / 6, '⅛': 1 / 8, '⅜': 3 / 8, '⅝': 5 / 8, '⅞': 7 / 8,
    }

    def __init__(self, number, unit, names):
        self.number = number
        self.unit = unit
        self.names = names

    def __repr__(self):
        return 'number: ' + str(self.number) + ', unit: ' + self.unit + ', names: ' + str(self.names)

    @staticmethod
    def _parse_number(text):
        """Convert the text of a tagged number into a float.

        Accepts every form the number rules in ``amount_rules`` produce:
        "a"/"an" (one), decimals ("2", "1.5"), slash fractions with an
        optional whole part ("1 / 2", "1 1 / 2") and vulgar-fraction
        characters with an optional whole part ("½", "1½", "1 ½").

        Raises:
            ValueError: if *text* is not one of these forms or a fraction
                has a zero denominator.
        """
        text = text.strip()
        if text in ('a', 'an'):
            return 1.0
        vulgar = re.fullmatch(r'(\d*)\s?([½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞])', text)
        if vulgar:
            whole = float(vulgar.group(1) or 0)
            return whole + ingredient_quantity._FRACTIONS[vulgar.group(2)]
        slash = re.fullmatch(r'(?:(\d*) )?(\d+) ?/ ?(\d+)', text)
        if slash:
            denominator = int(slash.group(3))
            if denominator == 0:
                raise ValueError('zero denominator in quantity: ' + repr(text))
            whole = float(slash.group(1) or 0)
            return whole + int(slash.group(2)) / denominator
        return float(text)

    @classmethod
    def from_string(cls, iq_str):
        """Parse one recipe line into an ingredient_quantity.

        The line is tokenised, ingredient keywords are tagged, then amounts
        are tagged. The number defaults to 1 and the unit to 'default'.
        Amounts are scanned in order and later values overwrite earlier
        ones; scanning stops at the first amount that has both a number and
        a unit.
        """
        s = tokenise(iq_str)
        s = classify_ingredients(s)
        s = classify_amounts(s)
        amounts = extract_typed(s, 'amount')
        number = '1'
        unit = 'default'
        for amount in amounts:
            numbers = extract_typed(amount, 'number')
            # Named so the module-level ``units`` dict is not shadowed.
            unit_markups = extract_typed(amount, 'unit')
            if numbers:
                number = numbers[0]
            if unit_markups:
                # The first tag inside <unit> is the canonical unit name.
                unit = first_tag(unit_markups[0])
            if numbers and unit_markups:
                break
        names = [first_tag(i) for i in extract_typed(s, 'ingredient')]
        return cls(cls._parse_number(number), unit, names)

    def total_grams_of(self, name):
        """Return the mass in grams of this quantity of ingredient *name*.

        With the 'default' unit the number is multiplied by the ingredient's
        ``unit_mass``. Otherwise the number is converted to the unit's base
        unit with its ``factor``. A base unit of 'g' is returned as is; any
        other base unit is multiplied by the ingredient's ``density``
        (presumably grams per base unit, i.e. g/ml for the bundled data —
        confirm against the data files).
        """
        if self.unit == 'default':
            unit_mass = ingredients[name]['unit_mass']
            return self.number * unit_mass
        factor = units[self.unit]['factor']
        base_number = self.number * factor
        base_unit = units[self.unit]['base_unit']
        if base_unit == 'g':
            return base_number
        density = ingredients[name]['density']
        return base_number * density

    def total_grams(self):
        """Return the mass in grams, averaged over the named ingredients.

        Returns 0.0 when no ingredient was recognised on the line.
        """
        if not self.names:
            return 0.0
        g = 0
        for name in self.names:
            g += self.total_grams_of(name)
        return g / len(self.names)

    def nutrient_grams(self, nutrient_id):
        """Return the mass of nutrient *nutrient_id*, averaged over names.

        The stored nutrient value is scaled by mass / 100, i.e. it is treated
        as an amount per 100 g of ingredient. Returns 0.0 when no ingredient
        was recognised on the line.
        """
        if not self.names:
            return 0.0
        g = 0
        for name in self.names:
            fdc_id = ingredients[name]['fdc_id']
            g += nutrient_values[fdc_id + ':' + nutrient_id] * self.total_grams_of(name) / 100
        return g / len(self.names)
# Parse ingredient quantities
iqs = [
    ingredient_quantity.from_string(line)
    for line in recipe.splitlines()
    if line.strip() != ''
]

# Print nutrient masses per ingredient
for iq in iqs:
    report = f'{iq}, grams: {iq.total_grams()}, '
    report += ', '.join(
        f'{nutrient_name}: {iq.nutrient_grams(nutrient_id)}'
        for nutrient_id, nutrient_name in nutrients.items()
    )
    print(report)

# Aggregate nutrient masses
# totals[i] accumulates the nutrient whose id is nutrient_ids[i].
nutrient_ids = list(nutrients)
totals = [0.0] * len(nutrient_ids)
total_mass = 0.0
for iq in iqs:
    for index, nutrient_id in enumerate(nutrient_ids):
        totals[index] += iq.nutrient_grams(nutrient_id)
    total_mass += iq.total_grams()

# Print aggregated nutrient masses
print()
print(f'Total mass: {total_mass}')
for nutrient_id, total in zip(nutrient_ids, totals):
    print(f'{nutrients[nutrient_id]}: {total}')
id fdc_id nutrient_id amount data_points derivation_id min max median footnote min_year_acquired
1526524 170471 1003 3.33 36
1526500 170471 1004 0.52 36
1526496 170471 1005 13.47 0 49
1430596 169291 1003 1.21 10 43 0.91 1.5
1430544 169291 1004 0.32 7 42 0.1 0.45
1430545 169291 1005 3.11 0 49
2696692 342354 1003 0.93
2696693 342354 1004 0.24
2696694 342354 1005 9.58
2643002 341528 1003 2
2643003 341528 1004 14.66
2643004 341528 1005 8.53
2713202 342608 1003 3.15
2713203 342608 1004 0.64
2713204 342608 1005 2.65
2636827 341433 1003 1.1
2636828 341433 1004 0.3
2636829 341433 1005 9.32
2764097 343391 1003 40.44
2764098 343391 1004 7.61
2764099 343391 1005 41.22
2782232 343670 1003 0.88
2782233 343670 1004 9.54
2782234 343670 1005 6.06
2713592 342614 1003 6.36
2713593 342614 1004 0.5
2713594 342614 1005 33.06
2706312 342502 1003 0.88
2706313 342502 1004 0.2
2706314 342502 1005 3.89
1600381 171319 1003 13.46 1 1
1600336 171319 1004 14.28 1 1
1600337 171319 1005 49.7 0 49
1601325 171329 1003 14.14 3 1 13.96 14.47
1601334 171329 1004 12.89 3 1 11.48 14.8
1601368 171329 1005 53.99 0 49
{
"mixed-vegetables": {
"keywords": [
"different types of veg"
],
"fdc_id": "170471",
"density": 0.59,
"unit_mass": null,
"unit_name": null
},
"courgette": {
"keywords": [
"courgette",
"zucchini"
],
"fdc_id": "169291",
"density": null,
"unit_mass": 217.0,
"unit_name": "courgette"
},
"carrot": {
"keywords": [
"carrot"
],
"fdc_id": "342354",
"density": null,
"unit_mass": 46.0,
"unit_name": "carrot"
},
"avocado": {
"keywords": [
"avocado"
],
"fdc_id": "341528",
"density": null,
"unit_mass": 201.0,
"unit_name": "avocado"
},
"basil": {
"keywords": [
"basil"
],
"fdc_id": "342608",
"density": null,
"unit_mass": 10.0,
"unit_name": "bunch"
},
"lemon": {
"keywords": [
"lemon"
],
"fdc_id": "341433",
"density": 1.0,
"unit_mass": 10.0,
"unit_name": "lemon (juice of)"
},
"yeast": {
"keywords": [
"yeast"
],
"fdc_id": "343391",
"density": 0.81,
"unit_mass": null,
"unit_name": null
},
"black-olives": {
"keywords": [
"olive"
],
"fdc_id": "343670",
"density": null,
"unit_mass": 3.8,
"unit_name": "olive"
},
"garlic": {
"keywords": [
"garlic"
],
"fdc_id": "342614",
"density": null,
"unit_mass": 3.0,
"unit_name": "clove"
},
"tomato": {
"keywords": [
"tomato"
],
"fdc_id": "342502",
"density": null,
"unit_mass": 123.0,
"unit_name": "tomato"
},
"chilli-powder": {
"keywords": [
"chilli powder"
],
"fdc_id": "171319",
"density": 0.55,
"unit_mass": null,
"unit_name": null
},
"paprika": {
"keywords": [
"paprika"
],
"fdc_id": "171329",
"density": 0.47,
"unit_mass": null,
"unit_name": null
}
}
{
"1003": "Protein",
"1004": "Fat",
"1005": "Carbohydrate"
}
{
"g": {
"factor": 1.0,
"base_unit": "g"
},
"oz": {
"factor": 28.3495,
"base_unit": "g"
},
"lb": {
"factor": 453.592,
"base_unit": "g"
},
"kg": {
"factor": 1000.0,
"base_unit": "g"
},
"ml": {
"factor": 1.0,
"base_unit": "ml"
},
"cc": {
"factor": 1.0,
"base_unit": "ml"
},
"pinch": {
"factor": 0.73992,
"base_unit": "ml"
},
"tsp": {
"factor": 5.91939,
"base_unit": "ml"
},
"tbsp": {
"factor": 17.7582,
"base_unit": "ml"
},
"floz": {
"factor": 28.4131,
"base_unit": "ml"
},
"cup": {
"factor": 284.131,
"base_unit": "ml"
},
"pt": {
"factor": 568.261,
"base_unit": "ml"
},
"dl": {
"factor": 100.0,
"base_unit": "ml"
},
"l": {
"factor": 1000.0,
"base_unit": "ml"
},
"gal": {
"factor": 4546.09,
"base_unit": "ml"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment