# jeremyorme/ingredient_processor.py

## ingredient_processor.py
import json
import csv
import re

# Input recipe: one ingredient per line. Blank lines (including the ones
# produced by the opening and closing triple quotes) are skipped by
# parse_recipe, which is what turns this text into structured quantities.
recipe = '''
2 courgettes (zucchini)
1 carrot
1 avocado
1 bunch basil
1 tbsp lemon juice
2 tbsp nutritional yeast
10 olives, sliced
4 garlic cloves, roasted
2 tomatoes, roasted
Pinch of chilli powder or smoked paprika
'''

#
# --- Classification ---
#

# Substitution rule class
class rule:
	"""Regex substitution rule written in a small typed-tag shorthand.

	The shorthand is expanded into ordinary ``re`` syntax once, at
	construction time:

	* ``{(?<type>regex)}`` in a pattern captures ``regex`` as group
	  ``T_type``. The capture must be preceded by start of string, a space
	  or ``>`` and followed by end of string, a space or ``<``; one
	  surrounding space on each side is consumed by the match.
	* ``<<type>>`` (optionally with a numeric suffix, e.g. ``<<number1>>``)
	  in a pattern matches an existing ``<type>...</type>`` element and
	  captures its content as group ``T_type`` (``T_number1`` with suffix).
	* ``<<!type>>`` in a pattern is a negative lookahead for ``<type>``.
	* ``<<type>>`` in a substitution emits `` <type>CAPTURE</type> `` where
	  CAPTURE is the text of the group with the same name.
	"""

	def __init__(self, pattern, substitution):
		"""Expand the shorthand pattern and substitution into ``re`` syntax.

		pattern: shorthand pattern, stored expanded in ``self.p``.
		substitution: shorthand replacement, stored expanded in ``self.s``.
		"""
		# Element matches are expanded first so that the capture expansion
		# sees their already-translated form as part of its content.
		self.p = rule._translate_type_captures(rule._translate_type_matches(pattern))
		self.s = rule._translate_type_substitutions(substitution)

	def sub(self, s):
		"""Return ``s`` with every match of the rule replaced."""
		return re.sub(self.p, self.s, s)

	@staticmethod
	def _translate_type_captures(s):
		"""Expand each ``{(?<type>content)}`` into a named capture ``T_type``."""
		pat = r'\{\(\?\<(?P<type_and_index>[a-z_]+[0-9]*)\>(?P<content>.*?)\)\}'
		# The lookarounds keep the capture from matching inside a longer word.
		rep = r' ?(?<![^\> ])(?P<T_\g<type_and_index>>\g<content>)(?![^\< ]) ?'
		return re.sub(pat, rep, s)

	@staticmethod
	def _translate_type_matches(s):
		"""Expand ``<<!type>>`` and ``<<type>>`` pattern references."""
		# Negated reference: the text must not continue with a <type> element.
		pat = r'\<\<!(?P<type_and_index>(?P<type>[a-z_]+)[0-9]*)\>\>'
		rep = r'(?! ?\<\g<type>\>)'
		s2 = re.sub(pat, rep, s)

		# Positive reference: match a whole <type>...</type> element and
		# capture everything up to the first closing tag of that type.
		pat = r'\<\<(?P<type_and_index>(?P<type>[a-z_]+)[0-9]*)\>\>'
		rep = r' ?\<\g<type>\>(?P<T_\g<type_and_index>>(?:(?!\<\/\g<type>\>).)*)\<\/\g<type>\> ?'
		return re.sub(pat, rep, s2)

	@staticmethod
	def _translate_type_substitutions(s):
		"""Expand ``<<type>>`` in a replacement into a tagged back-reference."""
		pat = r'\<\<(?P<type_and_index>(?P<type>[a-z_]+)[0-9]*)\>\>'
		# The doubled backslash produces a literal group reference in the
		# resulting replacement template.
		rep = r' <\g<type>>\\g<T_\g<type_and_index>></\g<type>> '
		return re.sub(pat, rep, s)

# Amount substitution rules
# Rules are applied in list order: units and numbers are tagged first so that
# the final "general amounts" rule can combine the resulting <number> and
# <unit> elements into an <amount> element.
amount_rules = [
	# imprecise cooking units
	rule(r'{(?<pinch>[pP]inch(?:es)?)}', ' <unit><<pinch>></unit> '),
	rule(r'{(?<dash>[dD]ash)}', ' <unit><<dash>></unit> '),

	# general units of volume
	rule(r'{(?<ml>mls?|mL|cc|millilitres?|milliliters?)}', ' <unit><<ml>></unit> '),
	rule(r'{(?<tsp>tsps?|t|teaspoons?)}', ' <unit><<tsp>></unit> '),
	rule(r'{(?<tbsp>[tT]bsps?|T|tbl|tbs|[tT]ablespoons?)}', ' <unit><<tbsp>></unit> '),
	rule(r'{(?<floz>fl ?oz|fluid ounces?)}', ' <unit><<floz>></unit> '),
	rule(r'{(?<cup>cups?)}', ' <unit><<cup>></unit> '),
	rule(r'{(?<pt>p|pts?|pints?)}', ' <unit><<pt>></unit> '),
	rule(r'{(?<l>ls?|L|litres?|liters?)}', ' <unit><<l>></unit> '),
	# No trailing '/' here: with it, "gallon"/"gallons" could never match.
	rule(r'{(?<gal>gals?|gallons?)}', ' <unit><<gal>></unit> '),
	# Plural forms accepted, in line with the other spelled-out units.
	rule(r'{(?<dl>dls?|dL|decilitres?|deciliters?)}', ' <unit><<dl>></unit> '),

	# general units of mass
	rule(r'{(?<kg>kgs?|kilos?|kilograms?)}', ' <unit><<kg>></unit> '),
	rule(r'{(?<g>gs?|grams?|grammes?)}', ' <unit><<g>></unit> '),
	rule(r'{(?<oz>oz|ounces?)}', ' <unit><<oz>></unit> '),
	rule(r'{(?<lb>lbs?|#|pounds?)}', ' <unit><<lb>></unit> '),

	# numbers: fractions ("1 / 2", "1 1 / 2"), vulgar fractions, decimals
	rule(r'{(?<number>(?:\d* )?\d+ ?\/ ?\d+|\d*\s?[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞]|\d+(\.\d+)?)}', '<<number>>'),
	# the indefinite article is tagged as a number too
	rule(r'{(?<number>an?)}', '<<number>>'),

	# imprecise amounts
	rule(r'{(?<amount>to taste)}', '<<amount>>'),
	rule(r'{(?<amount>to serve)}', '<<amount>>'),
	rule(r'{(?<amount>for \w+ing)}', '<<amount>>'),

	# general amounts: number followed by unit, a lone number, or a lone unit
	rule(r'{(?<amount><<number1>>[\-–]?<<unit1>>|<<number2>>|<<unit2>>)}', '<<amount>>')
]

# Text helper functions
def tokenise(s):
	"""Split ``s`` into tokens and return them joined by single spaces.

	A token is a word (letters, ``ñ`` and inner hyphens), a decimal number,
	a run of digits and vulgar-fraction characters, or any single character
	that is neither a word character nor a space. Whitespace-only pieces
	left over from the split are dropped.
	"""
	token_pattern = r'([a-zA-Zñ][a-zA-Zñ\-]*|\d+\.\d+|[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞\d]+|[^\w ])'
	# The capturing group makes re.split return the tokens themselves,
	# interleaved with the text between them.
	pieces = re.split(token_pattern, s)
	tokens = [piece for piece in pieces if piece.strip()]
	return ' '.join(tokens)

def extract_typed(s, t):
	"""Return the content of every ``<t>...</t>`` element found in ``s``.

	s: tagged text to search.
	t: tag name; it is matched literally.

	Returns a list of strings, one per element, in order of appearance.
	Each content string runs up to the first closing ``</t>`` after its
	opening tag and never spans a newline.
	"""
	# Escape the tag so a name containing regex metacharacters cannot match
	# other tags by accident.
	tag = re.escape(t)
	return re.findall(r'<' + tag + r'>((?:(?!</' + tag + r'>).)*)</' + tag + r'>', s)

def first_tag(s):
	"""Return the text inside the first ``<...>`` in ``s``, or None if there is none."""
	found = re.search(r'\<([^\>]*)\>', s)
	if found is None:
		return None
	return found.group(1)

# Substitution helpers
def classify_ingredients(s, ingredients):
	"""Wrap the longest known ingredient keyword found in ``s`` in tags.

	s: text to search (keywords are matched as plain substrings).
	ingredients: dict mapping an ingredient name to a dict that has a
		'keywords' list.

	Returns ``s`` with every occurrence of the chosen keyword replaced by
	``<ingredient><name>keyword</name></ingredient>``, or ``s`` unchanged
	when no keyword occurs in it.
	"""
	best_keyword = None
	best_name = None
	for ingredient_name, entry in ingredients.items():
		for candidate in entry['keywords']:
			if candidate not in s:
				continue
			# '>=' rather than '>': among equally long keywords the one
			# encountered last wins.
			if best_keyword is None or len(candidate) >= len(best_keyword):
				best_keyword = candidate
				best_name = ingredient_name

	if best_keyword is None:
		return s

	wrapped = '<ingredient><{0}>{1}</{0}></ingredient>'.format(best_name, best_keyword)
	return s.replace(best_keyword, wrapped)

def classify_amounts(s):
	"""Return ``s`` after applying every rule in ``amount_rules``, in list order."""
	tagged = s
	for amount_rule in amount_rules:
		# Each rule works on the output of the previous one.
		tagged = amount_rule.sub(tagged)
	return tagged

# Structured ingredient quantity
class ingredient_quantity:
	"""One parsed recipe line: an amount and the ingredient(s) it applies to."""

	# Numeric value of each vulgar-fraction character accepted by the number
	# rule in amount_rules.
	_VULGAR_FRACTIONS = {
		'½': 1.0 / 2, '⅓': 1.0 / 3, '⅔': 2.0 / 3, '¼': 1.0 / 4, '¾': 3.0 / 4,
		'⅕': 1.0 / 5, '⅖': 2.0 / 5, '⅗': 3.0 / 5, '⅘': 4.0 / 5, '⅙': 1.0 / 6,
		'⅚': 5.0 / 6, '⅛': 1.0 / 8, '⅜': 3.0 / 8, '⅝': 5.0 / 8, '⅞': 7.0 / 8,
	}

	def __init__(self, source, number, unit, ings):
		# source: the original, unprocessed recipe line
		self.source = source
		# number: numeric part of the amount
		self.number = number
		# unit: tag name of the unit, or 'default' when the line names none
		self.unit = unit
		# ings: dict of ingredient name -> component_ids / unit_mass / density
		self.ings = ings

	def __repr__(self):
		return 'number: {0}, unit: {1}, ingredients: {2}'.format(self.number, self.unit, self.ings)

	@staticmethod
	def _parse_number(text):
		"""Convert the text of a tagged number into a float.

		Handles everything the number rules can tag: plain and decimal
		numbers ('2', '2.5'), fractions with optional whole part ('1 / 2',
		'1 1 / 2'), vulgar fractions with optional whole part ('½', '1½',
		'1 ½') and the articles 'a' / 'an', which count as 1.

		Raises ValueError when the text is none of these or the fraction
		has a zero denominator.
		"""
		text = text.strip()
		if text.lower() in ('a', 'an'):
			return 1.0

		fractions = ingredient_quantity._VULGAR_FRACTIONS
		if text and text[-1] in fractions:
			whole = text[:-1].strip()
			return (float(whole) if whole else 0.0) + fractions[text[-1]]

		if '/' in text:
			head, _, denominator = text.partition('/')
			parts = head.split()
			if not parts or len(parts) > 2:
				raise ValueError('cannot parse number: ' + repr(text))
			whole = float(parts[0]) if len(parts) == 2 else 0.0
			divisor = float(denominator)
			if divisor == 0:
				raise ValueError('zero denominator in number: ' + repr(text))
			return whole + float(parts[-1]) / divisor

		return float(text)

	@classmethod
	def from_string(cls, iq_str, ingredients):
		"""Parse one recipe line into an ingredient_quantity.

		iq_str: the recipe line.
		ingredients: dict of ingredient name -> dict with 'keywords',
			'component_ids', 'unit_mass' and 'density'.

		The line is tokenised, then ingredient and amount elements are
		tagged and read back. The number defaults to 1 and the unit to
		'default' when the line has no amount.
		"""
		s = tokenise(iq_str)
		s = classify_ingredients(s, ingredients)
		s = classify_amounts(s)

		number = '1'
		unit = 'default'
		for amount in extract_typed(s, 'amount'):
			numbers = extract_typed(amount, 'number')
			unit_tags = extract_typed(amount, 'unit')
			if numbers:
				number = numbers[0]
			if unit_tags:
				unit = first_tag(unit_tags[0])
			# The first amount carrying both a number and a unit is final;
			# otherwise keep scanning and let later amounts fill the gaps.
			if numbers and unit_tags:
				break

		names = [first_tag(i) for i in extract_typed(s, 'ingredient')]

		ings = {name: {
			'component_ids': ingredients[name]['component_ids'],
			'unit_mass': ingredients[name]['unit_mass'],
			'density': ingredients[name]['density']
		} for name in names}

		return cls(iq_str, cls._parse_number(number), unit, ings)

def parse_recipe(recipe, ingredients):
	"""Parse a multi-line recipe into a list of ingredient_quantity objects.

	recipe: text with one ingredient per line; blank lines are ignored.
	ingredients: dict of matchable ingredients, passed through to
		ingredient_quantity.from_string for every line.
	"""
	parsed = []
	for line in recipe.splitlines():
		if line.strip() == '':
			continue
		parsed.append(ingredient_quantity.from_string(line, ingredients))
	return parsed


# Dict of matchable ingredients indexed by name.
# The encoding is given explicitly so that non-ASCII keywords are decoded the
# same way on every platform instead of depending on the locale default.
with open('ingredients.json', encoding='utf-8') as f:
	ing_map = json.load(f)

# Structured quantities, one per non-blank recipe line.
iqs = parse_recipe(recipe, ing_map)

#
# --- Aggregation ---
#

# Dict of nutrients indexed by nutrient id
with open('nutrients.json', encoding='utf-8') as f:
	nutrients = json.load(f)

# Dict of units indexed by name
with open('units.json', encoding='utf-8') as f:
	units = json.load(f)

# Dict of nutrient values indexed by fdc_id:nutrient_id
nutrient_values = {}
# newline='' lets the csv module do its own line-ending handling, as its
# documentation requires for file objects passed to a reader.
with open('food_nutrient.csv', encoding='utf-8', newline='') as f:
	rows = csv.DictReader(f)
	for row in rows:
		k = row['fdc_id'] + ':' + row['nutrient_id']
		nutrient_values[k] = float(row['amount'])

def total_grams_of(iq, name):
	"""Return the mass of ingredient ``name`` for the quantity ``iq``.

	With the 'default' unit the number counts whole items, so it is
	multiplied by the ingredient's 'unit_mass'. Otherwise the number is
	scaled by the unit's 'factor'; when the unit's 'base_unit' is 'g' that
	is the result, and for any other base unit it is multiplied by the
	ingredient's 'density'.
	"""
	if iq.unit == 'default':
		return iq.number * iq.ings[name]['unit_mass']

	amount_in_base_unit = iq.number * units[iq.unit]['factor']
	if units[iq.unit]['base_unit'] == 'g':
		return amount_in_base_unit

	# Non-mass base unit: convert through the ingredient's density.
	return amount_in_base_unit * iq.ings[name]['density']

def total_grams(iq):
	"""Return the mass of ``iq`` averaged over its ingredients.

	The per-ingredient masses from total_grams_of are summed and divided by
	the number of ingredients; with no ingredients the result is 0.
	"""
	masses = [total_grams_of(iq, name) for name in iq.ings]
	# max(..., 1) avoids dividing by zero when nothing was recognised.
	return sum(masses) / max(len(masses), 1)

def nutrient_grams(iq, nutrient_id):
	"""Return the amount of one nutrient contributed by the quantity ``iq``.

	iq: parsed ingredient quantity.
	nutrient_id: nutrient id as used in the keys of ``nutrient_values``.

	For each ingredient the nutrient amount is averaged over its
	'component_ids'; the per-ingredient results are then averaged over the
	ingredients. An ingredient without component ids contributes nothing,
	and a quantity without ingredients yields 0.
	"""
	g = 0
	for name, ingredient in iq.ings.items():
		component_ids = ingredient['component_ids']
		if not component_ids:
			# Nothing to look up; skipping also avoids dividing by zero.
			continue

		# The mass does not depend on the component, so compute it once.
		mass = total_grams_of(iq, name)
		i_g = 0
		for fdc_id in component_ids:
			# NOTE(review): the division by 100 suggests the stored values
			# are per 100 g of food - confirm against the data source.
			i_g += nutrient_values[fdc_id + ':' + nutrient_id] * mass / 100
		g += i_g / len(component_ids)
	return g / max(len(iq.ings), 1)

# Per-ingredient report, accumulating the recipe totals along the way so that
# each nutrient amount is computed only once per ingredient line.
nutrient_ids = list(nutrients.keys())
totals = [0.0 for _ in nutrient_ids]
total_mass = 0.0
for iq in iqs:
	print(iq.source)

	mass = total_grams(iq)
	amounts = [nutrient_grams(iq, nutrient_id) for nutrient_id in nutrient_ids]
	nutrient_report = ', '.join(
		[nutrients[nutrient_id] + ': ' + str(amount) for nutrient_id, amount in zip(nutrient_ids, amounts)])
	print(str(iq) + ', grams: ' + str(mass) + ', ' + nutrient_report)
	print()

	total_mass += mass
	for n, amount in enumerate(amounts):
		totals[n] += amount

# Recipe totals: overall mass followed by one line per nutrient.
print()
print('Total mass: ' + str(total_mass))
for n, nutrient_id in enumerate(nutrient_ids):
	print(nutrients[nutrient_id] + ': ' + str(totals[n]))