Last active
April 29, 2020 07:28
-
-
Save jeremyorme/d2a6622494a7cf058974ea5571012657 to your computer and use it in GitHub Desktop.
Classify free text ingredients and aggregate nutrient values
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import csv | |
import re | |
# Input recipe: one free-text ingredient per line. Lines that are blank after
# stripping are skipped by parse_recipe, so the newline right after the opening
# quotes and the one before the closing quotes do not produce entries.
recipe = '''
2 courgettes (zucchini)
1 carrot
1 avocado
1 bunch basil
1 tbsp lemon juice
2 tbsp nutritional yeast
10 olives, sliced
4 garlic cloves, roasted
2 tomatoes, roasted
Pinch of chilli powder or smoked paprika
'''
# | |
# --- Classification --- | |
# | |
# Substitution rule class
class rule:
    """A regex substitution written in a small tagging mini-syntax.

    The pattern and substitution strings are translated once, at construction
    time, into an ordinary ``re`` pattern (``self.p``) and replacement
    template (``self.s``):

    * ``{(?<name>content)}`` in a pattern captures ``content`` as group
      ``T_name``; the capture must be preceded by ``>``, a space or the start
      of the string and followed by ``<``, a space or the end of the string.
    * ``<<nameN>>`` in a pattern matches an existing ``<name>...</name>``
      element and captures its inner text as group ``T_nameN`` (the optional
      trailing digits only distinguish several captures of the same tag).
    * ``<<!nameN>>`` in a pattern asserts that no ``<name>`` opening tag
      (optionally preceded by one space) follows.
    * ``<<nameN>>`` in a substitution emits `` <name>captured text</name> ``.
    """

    def __init__(self, pattern, substitution):
        # Type matches must be translated before type captures: the capture
        # translation wraps whatever the match translation produced.
        self.p = self._translate_type_captures(self._translate_type_matches(pattern))
        self.s = self._translate_type_substitutions(substitution)

    def sub(self, s):
        """Return *s* with every match of the rule replaced."""
        return re.sub(self.p, self.s, s)

    # The helpers below never use instance state; declaring them static makes
    # them callable through the class and through an instance alike.
    @staticmethod
    def _translate_type_captures(s):
        """Rewrite each ``{(?<name>content)}`` into a bounded named group."""
        pat = r'\{\(\?\<(?P<type_and_index>[a-z_]+[0-9]*)\>(?P<content>.*?)\)\}'
        rep = r' ?(?<![^\> ])(?P<T_\g<type_and_index>>\g<content>)(?![^\< ]) ?'
        return re.sub(pat, rep, s)

    @staticmethod
    def _translate_type_matches(s):
        """Rewrite ``<<!name>>`` and ``<<name>>`` into tag-matching regex."""
        # Negated form first, so the positive rewrite below never sees it.
        pat = r'\<\<!(?P<type_and_index>(?P<type>[a-z_]+)[0-9]*)\>\>'
        rep = r'(?! ?\<\g<type>\>)'
        s2 = re.sub(pat, rep, s)
        pat = r'\<\<(?P<type_and_index>(?P<type>[a-z_]+)[0-9]*)\>\>'
        rep = r' ?\<\g<type>\>(?P<T_\g<type_and_index>>(?:(?!\<\/\g<type>\>).)*)\<\/\g<type>\> ?'
        return re.sub(pat, rep, s2)

    @staticmethod
    def _translate_type_substitutions(s):
        """Rewrite ``<<name>>`` in a replacement into a tagged group reference."""
        pat = r'\<\<(?P<type_and_index>(?P<type>[a-z_]+)[0-9]*)\>\>'
        rep = r' <\g<type>>\\g<T_\g<type_and_index>></\g<type>> '
        return re.sub(pat, rep, s)
# Amount substitution rules, applied in list order by classify_amounts.
# Each rule tags the text it recognises: unit rules wrap the matched word in
# <unit><canonical-name>...</canonical-name></unit>, number rules in
# <number>...</number>, and the final rule groups an already-tagged number
# and/or unit into a single <amount>...</amount> element.
amount_rules = [
    # imprecise cooking units
    rule(r'{(?<pinch>[pP]inch(?:es)?)}', ' <unit><<pinch>></unit> '),
    rule(r'{(?<dash>[dD]ash)}', ' <unit><<dash>></unit> '),
    # general units of volume
    rule(r'{(?<ml>mls?|mL|cc|millilitres?|milliliters?)}', ' <unit><<ml>></unit> '),
    rule(r'{(?<tsp>tsps?|t|teaspoons?)}', ' <unit><<tsp>></unit> '),
    rule(r'{(?<tbsp>[tT]bsps?|T|tbl|tbs|[tT]ablespoons?)}', ' <unit><<tbsp>></unit> '),
    rule(r'{(?<floz>fl ?oz|fluid ounces?)}', ' <unit><<floz>></unit> '),
    rule(r'{(?<cup>cups?)}', ' <unit><<cup>></unit> '),
    rule(r'{(?<pt>p|pts?|pints?)}', ' <unit><<pt>></unit> '),
    rule(r'{(?<l>ls?|L|litres?|liters?)}', ' <unit><<l>></unit> '),
    # no trailing '/' after 'gallons?': with it, the spelled-out word could
    # only match when immediately followed by a slash
    rule(r'{(?<gal>gals?|gallons?)}', ' <unit><<gal>></unit> '),
    # plural forms accepted, like every other spelled-out unit in this list
    rule(r'{(?<dl>dls?|dL|decilitres?|deciliters?)}', ' <unit><<dl>></unit> '),
    # general units of mass
    rule(r'{(?<kg>kgs?|kilos?|kilograms?)}', ' <unit><<kg>></unit> '),
    rule(r'{(?<g>gs?|grams?|grammes?)}', ' <unit><<g>></unit> '),
    rule(r'{(?<oz>oz|ounces?)}', ' <unit><<oz>></unit> '),
    rule(r'{(?<lb>lbs?|#|pounds?)}', ' <unit><<lb>></unit> '),
    # numbers: fractions ("1 / 2", "1 1 / 2"), vulgar fraction characters,
    # then plain integers and decimals
    rule(r'{(?<number>(?:\d* )?\d+ ?\/ ?\d+|\d*\s?[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞]|\d+(\.\d+)?)}', '<<number>>'),
    # the articles "a" / "an" are tagged as numbers too
    rule(r'{(?<number>an?)}', '<<number>>'),
    # imprecise amounts
    rule(r'{(?<amount>to taste)}', '<<amount>>'),
    rule(r'{(?<amount>to serve)}', '<<amount>>'),
    rule(r'{(?<amount>for \w+ing)}', '<<amount>>'),
    # general amounts: number followed by unit (optionally joined by a hyphen
    # or en dash), or a number alone, or a unit alone
    rule(r'{(?<amount><<number1>>[\-–]?<<unit1>>|<<number2>>|<<unit2>>)}', '<<amount>>')
]
# Text helper functions | |
def tokenise(s):
    """Split *s* into tokens and return them joined by single spaces.

    A token is a word (letters, ``ñ`` and inner hyphens), a decimal number,
    a run of digits and vulgar-fraction characters, or any single character
    that is neither a word character nor a space. Whitespace-only pieces
    left over from the split are dropped.
    """
    token_pattern = r'([a-zA-Zñ][a-zA-Zñ\-]*|\d+\.\d+|[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞\d]+|[^\w ])'
    tokens = []
    # re.split with a capturing group returns the tokens interleaved with
    # the (mostly blank) text between them.
    for piece in re.split(token_pattern, s):
        if piece.strip():
            tokens.append(piece)
    return ' '.join(tokens)
def extract_typed(s, t):
    """Return the inner text of every ``<t>...</t>`` element in *s*.

    Each element ends at the first ``</t>`` after its opening tag, and the
    inner text cannot span a newline (``.`` is used without DOTALL).

    s -- text containing tagged spans
    t -- tag name, matched literally
    Returns a list of strings, in order of appearance.
    """
    # Escape the tag so it is always matched literally, and keep every
    # fragment a raw string so the backslashes reach the regex engine as-is.
    tag = re.escape(t)
    pattern = r'\<' + tag + r'\>((?:(?!\</' + tag + r'\>).)*)\</' + tag + r'\>'
    return re.findall(pattern, s)
def first_tag(s):
    """Return the text inside the first ``<...>`` pair in *s*, or None."""
    found = re.search(r'\<([^\>]*)\>', s)
    if found is None:
        return None
    return found.group(1)
# Substitution helpers | |
def classify_ingredients(s, ingredients):
    """Tag the longest ingredient keyword that occurs in *s*.

    s           -- text to classify
    ingredients -- dict mapping an ingredient name to a dict that has a
                   'keywords' list
    Every occurrence of the chosen keyword is wrapped as
    ``<ingredient><name>keyword</name></ingredient>``. If no keyword occurs
    in *s* (plain substring test), *s* is returned unchanged.
    """
    best_keyword = None
    best_name = None
    for name, ingredient in ingredients.items():
        for keyword in ingredient['keywords']:
            if keyword not in s:
                continue
            # '>=' so that, among equally long keywords, the one met last wins
            if best_keyword is None or len(keyword) >= len(best_keyword):
                best_keyword = keyword
                best_name = name
    if best_keyword is None:
        return s
    tagged = '<ingredient><' + best_name + '>' + best_keyword + '</' + best_name + '></ingredient>'
    return s.replace(best_keyword, tagged)
def classify_amounts(s):
    """Run every rule in amount_rules over *s*, in list order.

    Each rule works on the output of the previous one; the fully tagged
    text is returned.
    """
    tagged = s
    for amount_rule in amount_rules:
        tagged = amount_rule.sub(tagged)
    return tagged
# Structured ingredient quantity
class ingredient_quantity:
    """One parsed recipe line: a number, a unit and the matched ingredients.

    source -- the original free-text line
    number -- numeric quantity as a float
    unit   -- canonical unit name, or 'default' when the line names no unit
    ings   -- dict mapping each matched ingredient name to a dict with its
              'component_ids', 'unit_mass' and 'density'
    """

    def __init__(self, source, number, unit, ings):
        self.source = source
        self.number = number
        self.unit = unit
        self.ings = ings

    def __repr__(self):
        return 'number: ' + str(self.number) + ', unit: ' + self.unit + ', ingredients: ' + str(self.ings)

    @staticmethod
    def _parse_number(text):
        """Convert the text of a tagged number into a float.

        Handles every form the number rules can tag: plain integers and
        decimals, the articles 'a' / 'an' (counted as 1), fractions such as
        '1 / 2', mixed numbers such as '1 1 / 2', and vulgar fraction
        characters with an optional whole part ('½', '1½', '1 ½').

        Raises ValueError if the text is none of these.
        """
        import unicodedata  # only needed here, for vulgar fraction characters

        text = text.strip()
        if text in ('a', 'an'):
            return 1.0
        try:
            return float(text)
        except ValueError:
            pass
        if not text:
            raise ValueError('empty number')
        if '/' in text:
            head, _, denominator_text = text.partition('/')
            parts = head.split()
            if not parts:
                raise ValueError('fraction without numerator: ' + repr(text))
            # an optional whole part precedes the numerator: '1 1 / 2'
            whole = float(''.join(parts[:-1])) if len(parts) > 1 else 0.0
            denominator = float(denominator_text)
            if denominator == 0:
                raise ValueError('fraction with zero denominator: ' + repr(text))
            return whole + float(parts[-1]) / denominator
        # remaining form: optional whole part followed by one fraction char;
        # unicodedata.numeric raises ValueError for a non-numeric character
        whole_text = text[:-1].strip()
        whole = float(whole_text) if whole_text else 0.0
        return whole + unicodedata.numeric(text[-1])

    @classmethod
    def from_string(cls, iq_str, ingredients):
        """Build an ingredient_quantity from one free-text recipe line.

        iq_str      -- the recipe line
        ingredients -- dict of known ingredients indexed by name; each entry
                       provides 'keywords', 'component_ids', 'unit_mass' and
                       'density'

        The line is tokenised, then ingredient and amount tags are added and
        read back. The number defaults to 1 and the unit to 'default'.
        """
        s = tokenise(iq_str)
        s = classify_ingredients(s, ingredients)
        s = classify_amounts(s)
        number = '1'
        unit = 'default'
        for amount in extract_typed(s, 'amount'):
            numbers = extract_typed(amount, 'number')
            units = extract_typed(amount, 'unit')
            if numbers:
                number = numbers[0]
            if units:
                # the tag nested inside <unit> is the canonical unit name
                unit = first_tag(units[0])
            # an amount carrying both parts is complete; ignore later ones
            if numbers and units:
                break
        # the tag nested inside each <ingredient> is the ingredient name
        names = [first_tag(i) for i in extract_typed(s, 'ingredient')]
        ings = {
            name: {
                'component_ids': ingredients[name]['component_ids'],
                'unit_mass': ingredients[name]['unit_mass'],
                'density': ingredients[name]['density'],
            }
            for name in names
        }
        return cls(iq_str, cls._parse_number(number), unit, ings)
def parse_recipe(recipe, ingredients):
    """Parse a multi-line recipe into a list of ingredient_quantity objects.

    recipe      -- recipe text, one ingredient per line
    ingredients -- dict of known ingredients, passed to
                   ingredient_quantity.from_string for every line
    Lines that are empty after stripping whitespace are skipped.
    """
    parsed = []
    for line in recipe.splitlines():
        if line.strip() == '':
            continue
        parsed.append(ingredient_quantity.from_string(line, ingredients))
    return parsed
# Dict of matchable ingredients indexed by name.
# The encoding is given explicitly: JSON text is UTF-8, while open() would
# otherwise use the platform's locale encoding and could misread non-ASCII
# keywords.
with open('ingredients.json', encoding='utf-8') as f:
    ing_map = json.load(f)
# Parse the recipe text into structured ingredient quantities
iqs = parse_recipe(recipe, ing_map)
# | |
# --- Aggregation --- | |
# | |
# Dict of nutrients indexed by nutrient id (values are the display names
# printed in the report)
with open('nutrients.json', encoding='utf-8') as f:
    nutrients = json.load(f)
# Dict of units indexed by name; each entry has a 'factor' and a 'base_unit'
with open('units.json', encoding='utf-8') as f:
    units = json.load(f)
# Dict of nutrient values indexed by fdc_id:nutrient_id
nutrient_values = {}
# newline='' is what the csv module requires of files handed to its readers,
# so that newlines embedded in quoted fields are interpreted correctly
with open('food_nutrient.csv', newline='', encoding='utf-8') as f:
    rows = csv.DictReader(f)
    for row in rows:
        k = row['fdc_id'] + ':' + row['nutrient_id']
        nutrient_values[k] = float(row['amount'])
def total_grams_of(iq, name):
    """Return the mass in grams of ingredient *name* within quantity *iq*.

    With the 'default' unit the quantity is a count, so the mass is
    number * unit_mass of the ingredient. Otherwise the number is scaled by
    the unit's 'factor' into its base unit; a base unit of 'g' is returned
    as is, and any other base unit is multiplied by the ingredient's
    density (presumably a volume-to-grams conversion -- confirm against
    units.json).
    """
    if iq.unit != 'default':
        amount = iq.number * units[iq.unit]['factor']
        if units[iq.unit]['base_unit'] != 'g':
            amount = amount * iq.ings[name]['density']
        return amount
    return iq.number * iq.ings[name]['unit_mass']
def total_grams(iq):
    """Return the mass in grams of *iq*, averaged over its ingredients.

    When a line matched several ingredients, the result is the mean of
    their individual masses. A line with no matched ingredient gives 0.
    """
    combined = sum(total_grams_of(iq, name) for name in iq.ings)
    # max(..., 1) keeps the division defined when nothing was matched
    return combined / max(len(iq.ings), 1)
def nutrient_grams(iq, nutrient_id):
    """Return the amount of one nutrient contributed by quantity *iq*.

    For each matched ingredient, the nutrient value is averaged over the
    ingredient's component ids, using the fdc_id:nutrient_id keys of
    nutrient_values. Each value is scaled by the ingredient's mass in grams
    divided by 100 (presumably because values are per 100 g -- confirm
    against the data source). The result is then averaged over the matched
    ingredients.

    An ingredient with no component ids contributes 0 instead of raising
    ZeroDivisionError. A missing nutrient value still raises KeyError.
    """
    g = 0
    for name in iq.ings:
        component_ids = iq.ings[name]['component_ids']
        if not component_ids:
            # nothing to average for this ingredient
            continue
        # the mass does not depend on the component, so compute it once
        grams = total_grams_of(iq, name)
        i_g = 0
        for fdc_id in component_ids:
            i_g += nutrient_values[fdc_id + ':' + nutrient_id] * grams / 100
        g += i_g / len(component_ids)
    return g / max(len(iq.ings), 1)
# Per-ingredient report: the source line, then the parsed quantity with its
# mass and the amount of every nutrient, then a blank separator line
for iq in iqs:
    print(iq.source)
    summary = str(iq) + ', grams: ' + str(total_grams(iq))
    nutrient_parts = []
    for nutrient_id, nutrient_name in nutrients.items():
        nutrient_parts.append(nutrient_name + ': ' + str(nutrient_grams(iq, nutrient_id)))
    print(summary + ', ' + ', '.join(nutrient_parts))
    print()
# Recipe totals: one running sum per nutrient (in nutrients order) plus the
# total mass
totals = [0.0] * len(nutrients)
total_mass = 0.0
for iq in iqs:
    for n, nutrient_id in enumerate(nutrients):
        totals[n] += nutrient_grams(iq, nutrient_id)
    total_mass += total_grams(iq)
print()
print('Total mass: ' + str(total_mass))
for n, nutrient_id in enumerate(nutrients):
    print(nutrients[nutrient_id] + ': ' + str(totals[n]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment