Skip to content

Instantly share code, notes, and snippets.

@jeremyorme
Last active April 29, 2020 07:28
Show Gist options
  • Save jeremyorme/d2a6622494a7cf058974ea5571012657 to your computer and use it in GitHub Desktop.
Save jeremyorme/d2a6622494a7cf058974ea5571012657 to your computer and use it in GitHub Desktop.
Classify free text ingredients and aggregate nutrient values
import json
import csv
import re
# Input recipe: free-text ingredients, one per line.
# parse_recipe() splits this text on line boundaries and skips blank lines, so
# the newline right after the opening quotes does not produce an entry.
recipe = '''
2 courgettes (zucchini)
1 carrot
1 avocado
1 bunch basil
1 tbsp lemon juice
2 tbsp nutritional yeast
10 olives, sliced
4 garlic cloves, roasted
2 tomatoes, roasted
Pinch of chilli powder or smoked paprika
'''
#
# --- Classification ---
#
# Substitution rule class
class rule:
    """Regex substitution rule written in a small typed-markup shorthand.

    The shorthand is expanded into ordinary ``re`` syntax once, at
    construction time:

    * ``{(?<name>REGEX)}`` in the pattern captures REGEX as group ``T_name``.
      The capture must be delimited on the left by the start of the string,
      a space or ``>``, and on the right by the end of the string, a space or
      ``<``; one optional space on each side is consumed by the match.
    * ``<<name>>`` in the pattern matches an existing ``<type>...</type>``
      element and captures its content as ``T_name``. ``type`` is ``name``
      with any trailing digits removed, so ``<<number1>>`` and ``<<number2>>``
      both match ``<number>`` elements but capture into different groups.
    * ``<<!name>>`` in the pattern is a negative lookahead: the position must
      not be followed by a ``<type>`` opening tag.
    * ``<<name>>`` in the substitution emits `` <type>CAPTURE</type> `` where
      CAPTURE is whatever group ``T_name`` matched.

    Attributes:
        p: the expanded regular-expression pattern (a string).
        s: the expanded replacement template (a string).
    """

    def __init__(self, pattern, substitution):
        """Expand the shorthand in *pattern* and *substitution*."""
        # Element matches are expanded first; the capture expansion then
        # wraps the already-expanded content.
        self.p = rule._translate_type_captures(rule._translate_type_matches(pattern))
        self.s = rule._translate_type_substitutions(substitution)

    def sub(self, s):
        """Return *s* with every match of this rule replaced."""
        return re.sub(self.p, self.s, s)

    # The helpers below take no instance; marking them static keeps them
    # callable both as rule._translate_x(...) and through an instance.
    @staticmethod
    def _translate_type_captures(s):
        """Expand ``{(?<name>REGEX)}`` into a delimited named group ``T_name``."""
        pat = r'\{\(\?\<(?P<type_and_index>[a-z_]+[0-9]*)\>(?P<content>.*?)\)\}'
        rep = r' ?(?<![^\> ])(?P<T_\g<type_and_index>>\g<content>)(?![^\< ]) ?'
        return re.sub(pat, rep, s)

    @staticmethod
    def _translate_type_matches(s):
        """Expand ``<<!name>>`` and ``<<name>>`` element matchers in a pattern."""
        # Negated form: assert that no <type> tag follows.
        pat = r'\<\<!(?P<type_and_index>(?P<type>[a-z_]+)[0-9]*)\>\>'
        rep = r'(?! ?\<\g<type>\>)'
        s2 = re.sub(pat, rep, s)
        # Positive form: match a whole <type>...</type> element, capturing
        # everything up to the first closing tag of the same type.
        pat = r'\<\<(?P<type_and_index>(?P<type>[a-z_]+)[0-9]*)\>\>'
        rep = r' ?\<\g<type>\>(?P<T_\g<type_and_index>>(?:(?!\<\/\g<type>\>).)*)\<\/\g<type>\> ?'
        return re.sub(pat, rep, s2)

    @staticmethod
    def _translate_type_substitutions(s):
        """Expand ``<<name>>`` in a replacement into `` <type>\\g<T_name></type> ``."""
        pat = r'\<\<(?P<type_and_index>(?P<type>[a-z_]+)[0-9]*)\>\>'
        rep = r' <\g<type>>\\g<T_\g<type_and_index>></\g<type>> '
        return re.sub(pat, rep, s)
# Amount substitution rules
# Rules are applied in list order (see classify_amounts): units first, then
# numbers, then amounts, because the amount rule matches the <number> and
# <unit> elements produced by the earlier rules.
amount_rules = [
    # imprecise cooking units
    rule(r'{(?<pinch>[pP]inch(?:es)?)}', ' <unit><<pinch>></unit> '),
    rule(r'{(?<dash>[dD]ash)}', ' <unit><<dash>></unit> '),
    # general units of volume
    rule(r'{(?<ml>mls?|mL|cc|millilitres?|milliliters?)}', ' <unit><<ml>></unit> '),
    rule(r'{(?<tsp>tsps?|t|teaspoons?)}', ' <unit><<tsp>></unit> '),
    rule(r'{(?<tbsp>[tT]bsps?|T|tbl|tbs|[tT]ablespoons?)}', ' <unit><<tbsp>></unit> '),
    rule(r'{(?<floz>fl ?oz|fluid ounces?)}', ' <unit><<floz>></unit> '),
    rule(r'{(?<cup>cups?)}', ' <unit><<cup>></unit> '),
    rule(r'{(?<pt>p|pts?|pints?)}', ' <unit><<pt>></unit> '),
    rule(r'{(?<l>ls?|L|litres?|liters?)}', ' <unit><<l>></unit> '),
    # A stray '/' after 'gallons?' previously required a literal slash, so
    # the spelled-out unit could never match on its own.
    rule(r'{(?<gal>gals?|gallons?)}', ' <unit><<gal>></unit> '),
    # Plural forms accepted, in line with the other spelled-out units.
    rule(r'{(?<dl>dls?|dL|decilitres?|deciliters?)}', ' <unit><<dl>></unit> '),
    # general units of mass
    rule(r'{(?<kg>kgs?|kilos?|kilograms?)}', ' <unit><<kg>></unit> '),
    rule(r'{(?<g>gs?|grams?|grammes?)}', ' <unit><<g>></unit> '),
    rule(r'{(?<oz>oz|ounces?)}', ' <unit><<oz>></unit> '),
    rule(r'{(?<lb>lbs?|#|pounds?)}', ' <unit><<lb>></unit> '),
    # numbers: "1 1/2" style fractions, vulgar-fraction characters, decimals
    rule(r'{(?<number>(?:\d* )?\d+ ?\/ ?\d+|\d*\s?[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞]|\d+(\.\d+)?)}', '<<number>>'),
    rule(r'{(?<number>an?)}', '<<number>>'),
    # imprecise amounts
    rule(r'{(?<amount>to taste)}', '<<amount>>'),
    rule(r'{(?<amount>to serve)}', '<<amount>>'),
    rule(r'{(?<amount>for \w+ing)}', '<<amount>>'),
    # general amounts: number + unit, a bare number, or a bare unit
    rule(r'{(?<amount><<number1>>[\-–]?<<unit1>>|<<number2>>|<<unit2>>)}', '<<amount>>'),
]
# Text helper functions
def tokenise(s):
    """Return *s* split into tokens that are re-joined with single spaces.

    A token is a word (letters, ``ñ`` and inner hyphens), a decimal number,
    a run of digits and vulgar-fraction characters, or a single character
    that is neither a word character nor a space. Whitespace-only pieces are
    dropped; any other text between tokens is kept as its own token.
    """
    # The capturing group makes re.split return the tokens themselves along
    # with whatever lies between them.
    pieces = re.split(
        r'([a-zA-Zñ][a-zA-Zñ\-]*|\d+\.\d+|[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞\d]+|[^\w ])', s)
    return ' '.join(piece for piece in pieces if piece.strip())
def extract_typed(s, t):
    """Return the content of every ``<t>...</t>`` element found in *s*.

    Each element's content runs up to the first closing ``</t>`` tag that
    follows its opening tag. *t* is inserted into the regular expression
    as-is, so it should be a plain tag name.
    """
    # Every fragment is a raw string: a non-raw '\>' is an invalid escape
    # sequence that only works by accident and triggers a warning.
    pattern = r'\<' + t + r'\>((?:(?!\</' + t + r'\>).)*)\</' + t + r'\>'
    return re.findall(pattern, s)
def first_tag(s):
    """Return the text inside the first ``<...>`` in *s*, or None if there is none."""
    match = re.search(r'\<([^\>]*)\>', s)
    return match.group(1) if match else None
# Substitution helpers
def classify_ingredients(s, ingredients):
    """Tag the longest ingredient keyword that occurs in *s*.

    Args:
        s: the (tokenised) ingredient line.
        ingredients: dict mapping an ingredient name to a dict that has a
            ``'keywords'`` list of strings.

    Returns:
        *s* with every occurrence of the chosen keyword replaced by
        ``<ingredient><NAME>keyword</NAME></ingredient>``, or *s* unchanged
        when no keyword occurs in it. Keywords are matched as plain
        substrings. When several matching keywords share the greatest
        length, the one encountered last wins.
    """
    candidates = []
    names = {}
    for name, ingredient in ingredients.items():
        for keyword in ingredient['keywords']:
            # An empty keyword is a substring of everything, and replacing
            # '' would inject tags between every character; ignore it.
            if keyword and keyword in s:
                candidates.append(keyword)
                names[keyword] = name
    if not candidates:
        return s
    # max() keeps the first maximal item, so scan in reverse to prefer the
    # last-encountered keyword among equally long ones.
    keyword = max(reversed(candidates), key=len)
    name = names[keyword]
    return s.replace(keyword, '<ingredient><' + name + '>' + keyword + '</' + name + '></ingredient>')
def classify_amounts(s):
    """Return *s* after applying every rule in ``amount_rules``, in list order.

    Each rule operates on the output of the previous one.
    """
    for amount_rule in amount_rules:
        s = amount_rule.sub(s)
    return s
# Structured ingredient quantity
class ingredient_quantity:
    """A parsed recipe line: an amount plus the ingredients it refers to.

    Attributes:
        source: the original free-text line.
        number: the numeric quantity (a float when built by from_string).
        unit: the unit tag name, or ``'default'`` when the line names no unit.
        ings: dict mapping each matched ingredient name to a dict with its
            ``component_ids``, ``unit_mass`` and ``density``.
    """

    # Vulgar-fraction characters accepted by the number rule.
    _FRACTION_CHARS = '½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞'

    def __init__(self, source, number, unit, ings):
        self.source = source
        self.number = number
        self.unit = unit
        self.ings = ings

    def __repr__(self):
        # str() on the unit keeps repr usable even if no unit tag was found.
        return 'number: ' + str(self.number) + ', unit: ' + str(self.unit) + ', ingredients: ' + str(self.ings)

    @staticmethod
    def _parse_number(text):
        """Convert a classified number string to a float.

        Handles every form the number rules can capture: plain integers and
        decimals ("2", "2.5"), fractions with an optional whole part
        ("1 / 2", "1 1 / 2"), vulgar-fraction characters with an optional
        whole part ("½", "1½", "1 ½"), and the articles "a" / "an", which
        are read as 1.

        Raises:
            ValueError: if *text* is not in one of these forms, or a
                fraction has a zero denominator.
        """
        import unicodedata

        text = text.strip()
        if text in ('a', 'an'):
            return 1.0
        fraction = re.fullmatch(r'(?:(\d*) )?(\d+) ?/ ?(\d+)', text)
        if fraction:
            whole, numerator, denominator = fraction.groups()
            if int(denominator) == 0:
                raise ValueError('zero denominator in number: ' + repr(text))
            return float(whole or 0) + float(numerator) / float(denominator)
        if text and text[-1] in ingredient_quantity._FRACTION_CHARS:
            whole = text[:-1].strip()
            return float(whole or 0) + unicodedata.numeric(text[-1])
        return float(text)

    @classmethod
    def from_string(cls, iq_str, ingredients):
        """Build an ingredient_quantity from one free-text recipe line.

        The line is tokenised, then ingredient keywords and amounts are
        tagged. The number and unit are taken from the tagged amounts: later
        amounts overwrite earlier values until one amount supplies both a
        number and a unit. Defaults are number 1 and unit ``'default'``.

        Args:
            iq_str: the recipe line.
            ingredients: dict of matchable ingredients indexed by name; each
                entry must provide ``keywords``, ``component_ids``,
                ``unit_mass`` and ``density``.
        """
        s = classify_amounts(classify_ingredients(tokenise(iq_str), ingredients))
        number = '1'
        unit = 'default'
        for amount in extract_typed(s, 'amount'):
            numbers = extract_typed(amount, 'number')
            units = extract_typed(amount, 'unit')
            if numbers:
                number = numbers[0]
            if units:
                # The unit element wraps a specific tag such as <tbsp>.
                unit = first_tag(units[0])
            if numbers and units:
                break
        names = [first_tag(i) for i in extract_typed(s, 'ingredient')]
        ings = {
            name: {
                'component_ids': ingredients[name]['component_ids'],
                'unit_mass': ingredients[name]['unit_mass'],
                'density': ingredients[name]['density'],
            }
            for name in names
        }
        # float() alone rejects fractions and "a"/"an", which the number
        # rules do capture, so use the dedicated parser.
        return cls(iq_str, cls._parse_number(number), unit, ings)
def parse_recipe(recipe, ingredients):
    """Parse each non-blank line of *recipe* into an ingredient_quantity.

    Args:
        recipe: free-text recipe, one ingredient per line.
        ingredients: dict of matchable ingredients indexed by name.

    Returns:
        A list of ingredient_quantity objects in line order.
    """
    return [
        ingredient_quantity.from_string(iq_str, ingredients)
        for iq_str in recipe.splitlines()
        if iq_str.strip()
    ]
# Dict of matchable ingredients indexed by name.
# JSON is read as UTF-8 explicitly rather than with the platform's default
# encoding, so non-ASCII keywords decode the same way everywhere.
with open('ingredients.json', encoding='utf-8') as f:
    ing_map = json.load(f)

# Structured quantities, one per non-blank recipe line
iqs = parse_recipe(recipe, ing_map)
#
# --- Aggregation ---
#
# Dict of nutrients indexed by nutrient id
with open('nutrients.json', encoding='utf-8') as f:
    nutrients = json.load(f)

# Dict of units indexed by name
with open('units.json', encoding='utf-8') as f:
    units = json.load(f)

# Dict of nutrient values indexed by fdc_id:nutrient_id
nutrient_values = {}
# newline='' lets the csv module do its own newline handling, as its
# documentation recommends for file objects passed to a reader.
with open('food_nutrient.csv', newline='') as f:
    for row in csv.DictReader(f):
        k = row['fdc_id'] + ':' + row['nutrient_id']
        nutrient_values[k] = float(row['amount'])
def total_grams_of(iq, name):
    """Return the mass of ingredient *name* in quantity *iq*.

    With the ``'default'`` unit the quantity counts whole items, so the
    number is multiplied by the ingredient's ``unit_mass``. Otherwise the
    number is scaled by the unit's ``factor`` from the ``units`` table; if
    the unit's ``base_unit`` is ``'g'`` that is the result, and for any
    other base unit it is further multiplied by the ingredient's
    ``density`` (presumably a volume-to-grams conversion; confirm against
    units.json).

    Raises:
        KeyError: if *name* is not in ``iq.ings`` or the unit is not in the
            ``units`` table.
    """
    if iq.unit == 'default':
        return iq.number * iq.ings[name]['unit_mass']
    unit_info = units[iq.unit]
    base_number = iq.number * unit_info['factor']
    if unit_info['base_unit'] == 'g':
        return base_number
    return base_number * iq.ings[name]['density']
def total_grams(iq):
    """Return the mass of *iq* averaged over its matched ingredients.

    Each matched ingredient is evaluated for the full quantity via
    total_grams_of, and the mean of those masses is returned. A quantity
    with no matched ingredients yields 0.
    """
    combined = sum(total_grams_of(iq, name) for name in iq.ings)
    # max(..., 1) avoids dividing by zero when nothing was matched.
    return combined / max(len(iq.ings), 1)
def nutrient_grams(iq, nutrient_id):
    """Return the amount of nutrient *nutrient_id* contributed by *iq*.

    For each matched ingredient, the nutrient value of every component id
    (looked up in ``nutrient_values`` under ``'<fdc_id>:<nutrient_id>'``) is
    scaled by the ingredient's mass / 100, and the results are averaged
    over the components. The per-ingredient figures are then averaged over
    the matched ingredients. An ingredient with no component ids
    contributes nothing.

    Raises:
        KeyError: if ``nutrient_values`` has no entry for a component and
            nutrient pair.
    """
    total = 0
    for name, ingredient in iq.ings.items():
        component_ids = ingredient['component_ids']
        if not component_ids:
            # Nothing to average; dividing by len() here would raise.
            continue
        # The mass does not depend on the component, so compute it once.
        grams = total_grams_of(iq, name)
        ingredient_total = 0
        for fdc_id in component_ids:
            ingredient_total += nutrient_values[fdc_id + ':' + nutrient_id] * grams / 100
        total += ingredient_total / len(component_ids)
    return total / max(len(iq.ings), 1)
# Per-ingredient breakdown: the source line, then its parsed quantity, mass
# and nutrient amounts, followed by a blank line.
for iq in iqs:
    print(iq.source)
    print(str(iq) + ', grams: ' + str(total_grams(iq)) + ', ' + ', '.join(
        nutrient_name + ': ' + str(nutrient_grams(iq, nutrient_id))
        for nutrient_id, nutrient_name in nutrients.items()))
    print()

# Recipe totals: one running sum per nutrient, in the order of `nutrients`.
totals = [0.0] * len(nutrients)
total_mass = 0.0
for iq in iqs:
    for n, nutrient_id in enumerate(nutrients):
        totals[n] += nutrient_grams(iq, nutrient_id)
    # Mass is added once per ingredient line, not once per nutrient.
    total_mass += total_grams(iq)

print()
print('Total mass: ' + str(total_mass))
for n, nutrient_id in enumerate(nutrients):
    print(nutrients[nutrient_id] + ': ' + str(totals[n]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment