Last active
January 28, 2020 17:25
-
-
Save jeremyorme/34504e4966763f1170474fc978f44ddf to your computer and use it in GitHub Desktop.
Ingredient Classification & Aggregation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import json
import re
import unicodedata
# Input recipe: one ingredient per line (blank lines carry no ingredient).
# The trailing " | |" table artefacts that had leaked into the literal are
# removed so each line holds only the ingredient text.
recipe = '''
2 courgettes (zucchini)
1 carrot
1 avocado
1 bunch basil
1 tbsp lemon juice
2 tbsp nutritional yeast
10 olives, sliced
4 garlic cloves, roasted
2 tomatoes, roasted
Pinch of chilli powder or smoked paprika
'''
# Dict of matchable ingredients indexed by name.
# JSON text is read as UTF-8 regardless of the platform default encoding.
with open('ingredients.json', encoding='utf-8') as f:
    ingredients = json.load(f)

# Dict of nutrients indexed by nutrient id
with open('nutrients.json', encoding='utf-8') as f:
    nutrients = json.load(f)

# Dict of units indexed by name
with open('units.json', encoding='utf-8') as f:
    units = json.load(f)

# Dict of nutrient values indexed by "fdc_id:nutrient_id"
nutrient_values = {}
# newline='' lets the csv module do its own newline handling, as its
# documentation requires for file objects passed to a reader.
with open('ingredient_nutrients.csv', newline='') as f:
    rows = csv.DictReader(f)
    for row in rows:
        k = row['fdc_id'] + ':' + row['nutrient_id']
        nutrient_values[k] = float(row['amount'])
# Substitution rule class
class rule:
    """A regex substitution written in a small typed-markup shorthand.

    The constructor expands the shorthand into a plain ``re`` pattern
    (``self.p``) and replacement template (``self.s``):

    * ``{(?<name>content)}`` in the pattern becomes a named group
      ``T_name`` that matches ``content`` only as a whole token: the match
      must be bounded on the left by the start of the string, a space or
      ``>``, and on the right by the end of the string, a space or ``<``.
      One adjacent space on each side is consumed by the match.
    * ``<<type>>`` / ``<<typeN>>`` in the pattern matches an existing
      ``<type>...</type>`` element and captures its inner text as
      ``T_type`` / ``T_typeN``.
    * ``<<!type>>`` in the pattern is a negative lookahead: no ``<type>``
      tag (optionally preceded by one space) may follow.
    * ``<<type>>`` / ``<<typeN>>`` in the substitution emits
      `` <type>captured text</type> `` using the matching ``T_`` group.
    """

    def __init__(self, pattern, substitution):
        # Element matches must be expanded first: the capture translation
        # then wraps the already-expanded content.
        self.p = rule._translate_type_captures(rule._translate_type_matches(pattern))
        self.s = rule._translate_type_substitutions(substitution)

    def sub(self, s):
        """Return ``s`` with every match of this rule replaced."""
        return re.sub(self.p, self.s, s)

    # The helpers below take no ``self`` and are only ever called through
    # the class, so they are declared as static methods.
    @staticmethod
    def _translate_type_captures(s):
        """Expand ``{(?<name>content)}`` into a token-bounded named group."""
        pat = r'\{\(\?\<(?P<type_and_index>[a-z_]+[0-9]*)\>(?P<content>.*?)\)\}'
        rep = r' ?(?<![^\> ])(?P<T_\g<type_and_index>>\g<content>)(?![^\< ]) ?'
        return re.sub(pat, rep, s)

    @staticmethod
    def _translate_type_matches(s):
        """Expand ``<<!type>>`` and ``<<type>>`` pattern shorthand."""
        # Negated form first, so the positive pattern below never sees it.
        pat = r'\<\<!(?P<type_and_index>(?P<type>[a-z_]+)[0-9]*)\>\>'
        rep = r'(?! ?\<\g<type>\>)'
        s2 = re.sub(pat, rep, s)
        pat = r'\<\<(?P<type_and_index>(?P<type>[a-z_]+)[0-9]*)\>\>'
        rep = r' ?\<\g<type>\>(?P<T_\g<type_and_index>>(?:(?!\<\/\g<type>\>).)*)\<\/\g<type>\> ?'
        return re.sub(pat, rep, s2)

    @staticmethod
    def _translate_type_substitutions(s):
        """Expand ``<<type>>`` in a replacement into a tagged group reference."""
        pat = r'\<\<(?P<type_and_index>(?P<type>[a-z_]+)[0-9]*)\>\>'
        # ``\\g`` survives template processing as a literal ``\g`` so the
        # resulting string is itself a valid replacement template.
        rep = r' <\g<type>>\\g<T_\g<type_and_index>></\g<type>> '
        return re.sub(pat, rep, s)
# Amount substitution rules, applied in list order. Units and numbers are
# tagged first so that the final "general amounts" rule can combine the
# resulting <number>/<unit> elements into <amount> elements.
amount_rules = [
    # imprecise cooking units
    rule(r'{(?<pinch>[pP]inch(?:es)?)}', ' <unit><<pinch>></unit> '),
    rule(r'{(?<dash>[dD]ash)}', ' <unit><<dash>></unit> '),
    # general units of volume
    rule(r'{(?<ml>mls?|mL|cc|millilitres?|milliliters?)}', ' <unit><<ml>></unit> '),
    rule(r'{(?<tsp>tsps?|t|teaspoons?)}', ' <unit><<tsp>></unit> '),
    rule(r'{(?<tbsp>[tT]bsps?|T|tbl|tbs|[tT]ablespoons?)}', ' <unit><<tbsp>></unit> '),
    rule(r'{(?<floz>fl ?oz|fluid ounces?)}', ' <unit><<floz>></unit> '),
    rule(r'{(?<cup>cups?)}', ' <unit><<cup>></unit> '),
    rule(r'{(?<pt>p|pts?|pints?)}', ' <unit><<pt>></unit> '),
    rule(r'{(?<l>ls?|L|litres?|liters?)}', ' <unit><<l>></unit> '),
    # Fixed: a stray "/" after "gallons?" meant "gallon(s)" could only
    # match when literally followed by a slash.
    rule(r'{(?<gal>gals?|gallons?)}', ' <unit><<gal>></unit> '),
    # Plural forms accepted, consistent with the other spelled-out units.
    rule(r'{(?<dl>dls?|dL|decilitres?|deciliters?)}', ' <unit><<dl>></unit> '),
    # general units of mass
    rule(r'{(?<kg>kgs?|kilos?|kilograms?)}', ' <unit><<kg>></unit> '),
    rule(r'{(?<g>gs?|grams?|grammes?)}', ' <unit><<g>></unit> '),
    rule(r'{(?<oz>oz|ounces?)}', ' <unit><<oz>></unit> '),
    rule(r'{(?<lb>lbs?|#|pounds?)}', ' <unit><<lb>></unit> '),
    # numbers: mixed/simple fractions, vulgar-fraction characters, decimals
    rule(r'{(?<number>(?:\d* )?\d+ ?\/ ?\d+|\d*\s?[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞]|\d+(\.\d+)?)}', '<<number>>'),
    rule(r'{(?<number>an?)}', '<<number>>'),
    # imprecise amounts
    rule(r'{(?<amount>to taste)}', '<<amount>>'),
    rule(r'{(?<amount>to serve)}', '<<amount>>'),
    rule(r'{(?<amount>for \w+ing)}', '<<amount>>'),
    # general amounts: number+unit, a lone number, or a lone unit
    rule(r'{(?<amount><<number1>>[\-–]?<<unit1>>|<<number2>>|<<unit2>>)}', '<<amount>>'),
]
# Text helper functions
def tokenise(s):
    """Return ``s`` with its tokens separated by single spaces.

    Tokens are: words (letters, ``ñ`` and inner hyphens), decimal numbers,
    runs of digits and vulgar-fraction characters, and any single character
    that is neither a word character nor a space. Whitespace-only pieces
    are dropped.
    """
    # The capturing group makes re.split return the tokens themselves as
    # well as the text between them.
    pieces = re.split(r'([a-zA-Zñ][a-zA-Zñ\-]*|\d+\.\d+|[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞\d]+|[^\w ])', s)
    return ' '.join(piece for piece in pieces if piece.strip())
def extract_typed(s, t):
    """Return the inner text of every ``<t>...</t>`` element found in ``s``.

    Elements are matched left to right without overlap; the inner text
    stops at the first closing ``</t>`` tag and may not span a newline.
    Returns an empty list when ``s`` contains no such element.
    """
    # Escape the tag name so characters such as "." or "-" are matched
    # literally, and keep every fragment a raw string (the original mixed in
    # a non-raw fragment containing the invalid escape "\>").
    tag = re.escape(t)
    pattern = r'\<' + tag + r'\>((?:(?!\</' + tag + r'\>).)*)\</' + tag + r'\>'
    return re.findall(pattern, s)
def first_tag(s):
    """Return the name inside the first ``<...>`` in ``s``, or None if absent.

    The text between the angle brackets is returned verbatim, so a closing
    tag yields its name with the leading ``/``.
    """
    match = re.search(r'\<([^\>]*)\>', s)
    return match.group(1) if match else None
# Substitution helpers
def classify_ingredients(s):
    """Wrap every known ingredient keyword in ``s`` with typed markup.

    Each occurrence of a keyword from the module-level ``ingredients`` dict
    becomes ``<ingredient><name>keyword</name></ingredient>`` where ``name``
    is the key of the ingredient that owns the keyword.

    All keywords are substituted in a single pass. Replacing them one after
    another would re-scan markup inserted for earlier keywords, letting a
    later keyword match inside a tag name or an already wrapped keyword.
    """
    # First owner wins if two ingredients list the same keyword; empty
    # keywords are ignored because they would match at every position.
    keyword_names = {}
    for name, ingredient in ingredients.items():
        for keyword in ingredient['keywords']:
            if keyword:
                keyword_names.setdefault(keyword, name)
    if not keyword_names:
        return s

    # Longest first so that e.g. "chilli powder" wins over a shorter keyword
    # starting at the same position.
    alternatives = sorted(keyword_names, key=len, reverse=True)
    pattern = '|'.join(re.escape(keyword) for keyword in alternatives)

    def wrap(match):
        keyword = match.group(0)
        name = keyword_names[keyword]
        return '<ingredient><' + name + '>' + keyword + '</' + name + '></ingredient>'

    return re.sub(pattern, wrap, s)
def classify_amounts(s):
    """Apply every rule in ``amount_rules`` to ``s``, in list order.

    Each rule operates on the output of the previous one, so the order of
    ``amount_rules`` is significant.
    """
    for r in amount_rules:
        s = r.sub(s)
    return s
# Structured ingredient quantity
class ingredient_quantity:
    """A parsed recipe line: a number, a unit name and ingredient names.

    ``number`` is a float, ``unit`` is either a key of the module-level
    ``units`` dict or the string ``'default'`` (meaning "count of whole
    items"), and ``names`` is a list of keys of the module-level
    ``ingredients`` dict. When several names are present the mass and
    nutrient figures are averaged over them (presumably because they are
    alternatives or synonyms on one line - confirm against intended use).
    """

    def __init__(self, number, unit, names):
        self.number = number
        self.unit = unit
        self.names = names

    def __repr__(self):
        return 'number: ' + str(self.number) + ', unit: ' + self.unit + ', names: ' + str(self.names)

    @staticmethod
    def _parse_number(text):
        """Convert tagged number text to a float.

        Handles every form the number rules can tag: plain integers and
        decimals (``2``, ``2.5``), simple and mixed fractions as spaced out
        by the tokeniser (``1 / 2``, ``1 1 / 2``), vulgar-fraction
        characters with an optional whole part (``½``, ``1½``, ``1 ½``) and
        the articles ``a`` / ``an`` (one).

        Raises:
            ValueError: if the text is not a recognised number or has a
                zero denominator.
        """
        text = text.strip()
        if text in ('a', 'an'):
            return 1.0
        if '/' in text:
            head, _, denominator = text.partition('/')
            parts = head.split()
            if not parts:
                raise ValueError('missing numerator in ' + repr(text))
            divisor = float(denominator)
            if divisor == 0:
                raise ValueError('zero denominator in ' + repr(text))
            whole = float(parts[0]) if len(parts) > 1 else 0.0
            return whole + float(parts[-1]) / divisor
        last = text[-1:]
        # Vulgar-fraction characters have Unicode category "No"; ordinary
        # digits are "Nd" and fall through to float() below.
        if last and unicodedata.category(last) == 'No':
            whole = text[:-1].strip()
            return (float(whole) if whole else 0.0) + unicodedata.numeric(last)
        return float(text)

    @classmethod
    def from_string(cls, iq_str):
        """Parse one recipe line into an ``ingredient_quantity``.

        The line is tokenised and tagged, then the amounts are scanned in
        order: the latest number and unit seen are kept, and scanning stops
        at the first amount that supplies both. Defaults are a number of 1
        and the unit ``'default'``.
        """
        s = tokenise(iq_str)
        s = classify_ingredients(s)
        s = classify_amounts(s)
        number = 1.0
        unit = 'default'
        for amount in extract_typed(s, 'amount'):
            numbers = extract_typed(amount, 'number')
            # Named unit_tags so the module-level ``units`` dict is not shadowed.
            unit_tags = extract_typed(amount, 'unit')
            if numbers:
                number = cls._parse_number(numbers[0])
            if unit_tags:
                unit = first_tag(unit_tags[0])
            if numbers and unit_tags:
                break
        names = [first_tag(i) for i in extract_typed(s, 'ingredient')]
        return cls(number, unit, names)

    def total_grams_of(self, name):
        """Return the mass in grams of this quantity of ingredient ``name``.

        With the ``'default'`` unit the number is multiplied by the
        ingredient's ``unit_mass``. Otherwise the number is scaled by the
        unit's ``factor``; if the unit's ``base_unit`` is ``'g'`` that is
        the result, else it is further multiplied by the ingredient's
        ``density``.

        Raises:
            KeyError: if ``name`` or the unit is not in the loaded data.
            TypeError: if the required ``unit_mass`` or ``density`` is null
                in the ingredient data.
        """
        if self.unit == 'default':
            unit_mass = ingredients[name]['unit_mass']
            return self.number * unit_mass
        factor = units[self.unit]['factor']
        base_number = self.number * factor
        base_unit = units[self.unit]['base_unit']
        if base_unit == 'g':
            return base_number
        density = ingredients[name]['density']
        return base_number * density

    def total_grams(self):
        """Return the mass in grams, averaged over ``names``.

        Returns 0.0 when no ingredient was recognised, rather than dividing
        by zero.
        """
        if not self.names:
            return 0.0
        return sum(self.total_grams_of(name) for name in self.names) / len(self.names)

    def nutrient_grams(self, nutrient_id):
        """Return the mass of nutrient ``nutrient_id``, averaged over ``names``.

        Each name contributes its ``nutrient_values`` entry times its mass
        divided by 100 (assumes the stored amounts are per 100 g - confirm
        against the data source). Returns 0.0 when no ingredient was
        recognised, rather than dividing by zero.
        """
        if not self.names:
            return 0.0
        g = 0
        for name in self.names:
            fdc_id = ingredients[name]['fdc_id']
            g += nutrient_values[fdc_id + ':' + nutrient_id] * self.total_grams_of(name) / 100
        return g / len(self.names)
# Parse ingredient quantities: one per non-blank recipe line
iqs = [ingredient_quantity.from_string(iq_str) for iq_str in recipe.splitlines() if iq_str.strip()]

# Print nutrient masses per ingredient
for iq in iqs:
    nutrient_report = ', '.join(
        nutrient_name + ': ' + str(iq.nutrient_grams(nutrient_id))
        for nutrient_id, nutrient_name in nutrients.items()
    )
    print(str(iq) + ', grams: ' + str(iq.total_grams()) + ', ' + nutrient_report)

# Aggregate nutrient masses; totals[n] belongs to the n-th key of nutrients
totals = [0.0 for n in nutrients.items()]
total_mass = 0.0
for iq in iqs:
    for n, nutrient_id in enumerate(nutrients.keys()):
        totals[n] += iq.nutrient_grams(nutrient_id)
    total_mass += iq.total_grams()

# Print aggregated nutrient masses
print()
print('Total mass: ' + str(total_mass))
for n, nutrient_id in enumerate(nutrients.keys()):
    print(nutrients[nutrient_id] + ': ' + str(totals[n]))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
id | fdc_id | nutrient_id | amount | data_points | derivation_id | min | max | median | footnote | min_year_acquired | |
---|---|---|---|---|---|---|---|---|---|---|---|
1526524 | 170471 | 1003 | 3.33 | 36 | |||||||
1526500 | 170471 | 1004 | 0.52 | 36 | |||||||
1526496 | 170471 | 1005 | 13.47 | 0 | 49 | ||||||
1430596 | 169291 | 1003 | 1.21 | 10 | 43 | 0.91 | 1.5 | ||||
1430544 | 169291 | 1004 | 0.32 | 7 | 42 | 0.1 | 0.45 | ||||
1430545 | 169291 | 1005 | 3.11 | 0 | 49 | ||||||
2696692 | 342354 | 1003 | 0.93 | ||||||||
2696693 | 342354 | 1004 | 0.24 | ||||||||
2696694 | 342354 | 1005 | 9.58 | ||||||||
2643002 | 341528 | 1003 | 2 | ||||||||
2643003 | 341528 | 1004 | 14.66 | ||||||||
2643004 | 341528 | 1005 | 8.53 | ||||||||
2713202 | 342608 | 1003 | 3.15 | ||||||||
2713203 | 342608 | 1004 | 0.64 | ||||||||
2713204 | 342608 | 1005 | 2.65 | ||||||||
2636827 | 341433 | 1003 | 1.1 | ||||||||
2636828 | 341433 | 1004 | 0.3 | ||||||||
2636829 | 341433 | 1005 | 9.32 | ||||||||
2764097 | 343391 | 1003 | 40.44 | ||||||||
2764098 | 343391 | 1004 | 7.61 | ||||||||
2764099 | 343391 | 1005 | 41.22 | ||||||||
2782232 | 343670 | 1003 | 0.88 | ||||||||
2782233 | 343670 | 1004 | 9.54 | ||||||||
2782234 | 343670 | 1005 | 6.06 | ||||||||
2713592 | 342614 | 1003 | 6.36 | ||||||||
2713593 | 342614 | 1004 | 0.5 | ||||||||
2713594 | 342614 | 1005 | 33.06 | ||||||||
2706312 | 342502 | 1003 | 0.88 | ||||||||
2706313 | 342502 | 1004 | 0.2 | ||||||||
2706314 | 342502 | 1005 | 3.89 | ||||||||
1600381 | 171319 | 1003 | 13.46 | 1 | 1 | ||||||
1600336 | 171319 | 1004 | 14.28 | 1 | 1 | ||||||
1600337 | 171319 | 1005 | 49.7 | 0 | 49 | ||||||
1601325 | 171329 | 1003 | 14.14 | 3 | 1 | 13.96 | 14.47 | ||||
1601334 | 171329 | 1004 | 12.89 | 3 | 1 | 11.48 | 14.8 | ||||
1601368 | 171329 | 1005 | 53.99 | 0 | 49 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"mixed-vegetables": { | |
"keywords": [ | |
"different types of veg" | |
], | |
"fdc_id": "170471", | |
"density": 0.59, | |
"unit_mass": null, | |
"unit_name": null | |
}, | |
"courgette": { | |
"keywords": [ | |
"courgette", | |
"zucchini" | |
], | |
"fdc_id": "169291", | |
"density": null, | |
"unit_mass": 217.0, | |
"unit_name": "courgette" | |
}, | |
"carrot": { | |
"keywords": [ | |
"carrot" | |
], | |
"fdc_id": "342354", | |
"density": null, | |
"unit_mass": 46.0, | |
"unit_name": "carrot" | |
}, | |
"avocado": { | |
"keywords": [ | |
"avocado" | |
], | |
"fdc_id": "341528", | |
"density": null, | |
"unit_mass": 201.0, | |
"unit_name": "avocado" | |
}, | |
"basil": { | |
"keywords": [ | |
"basil" | |
], | |
"fdc_id": "342608", | |
"density": null, | |
"unit_mass": 10.0, | |
"unit_name": "bunch" | |
}, | |
"lemon": { | |
"keywords": [ | |
"lemon" | |
], | |
"fdc_id": "341433", | |
"density": 1.0, | |
"unit_mass": 10.0, | |
"unit_name": "lemon (juice of)" | |
}, | |
"yeast": { | |
"keywords": [ | |
"yeast" | |
], | |
"fdc_id": "343391", | |
"density": 0.81, | |
"unit_mass": null, | |
"unit_name": null | |
}, | |
"black-olives": { | |
"keywords": [ | |
"olive" | |
], | |
"fdc_id": "343670", | |
"density": null, | |
"unit_mass": 3.8, | |
"unit_name": "olive" | |
}, | |
"garlic": { | |
"keywords": [ | |
"garlic" | |
], | |
"fdc_id": "342614", | |
"density": null, | |
"unit_mass": 3.0, | |
"unit_name": "clove" | |
}, | |
"tomato": { | |
"keywords": [ | |
"tomato" | |
], | |
"fdc_id": "342502", | |
"density": null, | |
"unit_mass": 123.0, | |
"unit_name": "tomato" | |
}, | |
"chilli-powder": { | |
"keywords": [ | |
"chilli powder" | |
], | |
"fdc_id": "171319", | |
"density": 0.55, | |
"unit_mass": null, | |
"unit_name": null | |
}, | |
"paprika": { | |
"keywords": [ | |
"paprika" | |
], | |
"fdc_id": "171329", | |
"density": 0.47, | |
"unit_mass": null, | |
"unit_name": null | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"1003": "Protein", | |
"1004": "Fat", | |
"1005": "Carbohydrate" | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"g": { | |
"factor": 1.0, | |
"base_unit": "g" | |
}, | |
"oz": { | |
"factor": 28.3495, | |
"base_unit": "g" | |
}, | |
"lb": { | |
"factor": 453.592, | |
"base_unit": "g" | |
}, | |
"kg": { | |
"factor": 1000.0, | |
"base_unit": "g" | |
}, | |
"ml": { | |
"factor": 1.0, | |
"base_unit": "ml" | |
}, | |
"cc": { | |
"factor": 1.0, | |
"base_unit": "ml" | |
}, | |
"pinch": { | |
"factor": 0.73992, | |
"base_unit": "ml" | |
}, | |
"tsp": { | |
"factor": 5.91939, | |
"base_unit": "ml" | |
}, | |
"tbsp": { | |
"factor": 17.7582, | |
"base_unit": "ml" | |
}, | |
"floz": { | |
"factor": 28.4131, | |
"base_unit": "ml" | |
}, | |
"cup": { | |
"factor": 284.131, | |
"base_unit": "ml" | |
}, | |
"pt": { | |
"factor": 568.261, | |
"base_unit": "ml" | |
}, | |
"dl": { | |
"factor": 100.0, | |
"base_unit": "ml" | |
}, | |
"l": { | |
"factor": 1000.0, | |
"base_unit": "ml" | |
}, | |
"gal": { | |
"factor": 4546.09, | |
"base_unit": "ml" | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment