# jeremyorme/generate_ingredients.py

## generate_ingredients.py
import csv
import re

# Lower-case substrings that disqualify a food: is_forbidden() lower-cases a
# description and reports a hit for every entry below that occurs in it.
# Entries must stay lower-case, because only the description is lower-cased.
forbidden = [
	'milk',
	'cheese',
	'egg',
	'yoghurt',
	'meat',
	'beef',
	'pork',
	'lamb',
	'chicken',
	'duck',
	'veal',
	'game',
	'bison',
	'fish',
	'mollusk',
	'crustacean',
	'turkey',
	# 'ostrich' and 'emu' are separate entries; without the comma Python joins
	# adjacent string literals into the single term 'ostrichemu'.
	'ostrich',
	'emu',
	'salmon',
	'corn dog',
	'pepperoni',
	'vitasoy',
	'silk',
	'poultry',
	'o\'brien',
	'new zealand',
	'microwave',
	'house foods',
	'clam',
	'v8',
	'the coca-cola company',
	'nestle',
	'lipton',
	'energy',
	'animal',
	'gerolsteiner',
	'ovaltine',
	]

def is_forbidden(ing):
	"""Yield True once for every forbidden term found in ``ing``.

	``ing`` is lower-cased before the substring test; the terms are used as
	written. Nothing is yielded when no term matches, so callers wrap the
	result in ``any(...)`` to get a single boolean.
	"""
	haystack = ing.lower()
	for term in forbidden:
		if term not in haystack:
			continue
		yield True

# food_category_id values to keep. They are strings because they are compared
# against the raw field text of each CSV row.
#    2  Spices and Herbs
#    4  Fats and Oils
#    9  Fruits and Fruit Juices
#   11  Vegetables and Vegetable Products
#   12  Nut and Seed Products
#   14  Beverages
#   16  Legumes and Legume Products
#   20  Cereal Grains and Pasta
#   28  Alcoholic Beverages
include_cats = ['2', '4', '9', '11', '12', '14', '16', '20', '28']

class ingredient:
	"""One food row: its description, its fdc_id and a derived keyword string."""

	def __init__(self, description, fdc_id):
		self.description = description
		self.fdc_id = fdc_id
		# Derived once, at construction time, from the description.
		self.keywords = self.generate_keywords()

	@classmethod
	def from_row(cls, r):
		"""Build an ingredient from a mapping with 'description' and 'fdc_id' keys."""
		description, fdc_id = r['description'], r['fdc_id']
		return cls(description, fdc_id)

	def __repr__(self):
		return '{}=>{}'.format(self.keywords, (self.description, self.fdc_id))

	def __lt__(self, other):
		"""Order by description, which is what lets sorted() work on ingredients."""
		return other.description > self.description

	def generate_keywords(self):
		"""Reduce the description to a short lower-case keyword string.

		The rewrites run in order, each on the result of the previous one:

		1. "<Type>, <sub>" becomes "<sub> <Type>" for the listed generic types
		   (Beans, Oil, Rice, ...).
		2. "Alcoholic Beverage, wine, table, <sub>" becomes "<sub> wine".
		3. A leading "Alcoholic beverage(s)", "Spices", "Beverages" or "Nuts"
		   prefix is dropped, leaving "<sub>".
		4. Everything from the first comma onwards is discarded.
		"""
		rewrites = (
			(r'^(?P<type>Beans|Cabbage|Corn|Lettuce|Mushrooms|Peppers|Pickles|Potatoes|Seaweed|Squash|Tomatoes|Vegetables|Vinegar|Oil|Pasta|Rice), (?P<sub_type>[^,]+)', r'\g<sub_type> \g<type>'),
			(r'^Alcoholic Beverage, wine, table, (?P<sub_type>[^,]+)', r'\g<sub_type> wine'),
			(r'^(?P<type>Alcoholic beverages?|Spices|Beverages|Nuts), (?P<sub_type>[^,]+)', r'\g<sub_type>'),
			(r'(?P<kwd>^[^,]*).*', r'\g<kwd>'),
		)
		text = self.description
		for pattern, replacement in rewrites:
			text = re.sub(pattern, replacement, text)
		return text.lower()

# Read food.csv and keep only the rows whose category is listed in include_cats
# and whose description contains no forbidden term.
with open('food.csv') as f:
	rows = csv.DictReader(f)
	ings = [
		ingredient.from_row(r)
		for r in rows
		if r['food_category_id'] in include_cats
		and not any(is_forbidden(r['description']))
	]

# Group the ingredients by keyword string. Walking them in sorted (description)
# order keeps every group's list ordered by description as well.
kwd_to_ing = {}
for ing in sorted(ings):
	kwd_to_ing.setdefault(ing.keywords, []).append(ing)

def common_name(ings):
	"""Return the name shared by a group of ingredients.

	ings: sequence of objects with a ``description`` string attribute.

	Returns '' for an empty group and the description itself for a single
	ingredient. For larger groups it returns the longest common leading part
	of all descriptions, with trailing spaces and commas removed. When all
	descriptions are identical, or one is a prefix of the others, that full
	common prefix is returned rather than an empty name.
	"""
	if not ings:
		return ''

	if len(ings) == 1:
		return ings[0].description

	descriptions = [ing.description for ing in ings]

	# zip() stops at the shortest description, so the comparison can never
	# index past the end of any of them.
	shared = 0
	for chars in zip(*descriptions):
		if any(c != chars[0] for c in chars):
			break
		shared += 1

	return descriptions[0][:shared].rstrip(' ,')

def component_ids(ings):
	"""Join the fdc_id of every ingredient in ``ings`` with '+', in order."""
	ids = []
	for member in ings:
		ids.append(member.fdc_id)
	return '+'.join(ids)

# Write the ingredient table to stdout as CSV: a header line, then one line per
# keyword group. The density and unit_mass columns are the constants '1.0' and
# '100.0' on every row.
print('keywords,component_ids,density,unit_mass,name')
for k,v in kwd_to_ing.items():
	# The name is written as a quoted field, so a double quote inside it has to
	# be doubled; otherwise the quote would end the field early.
	print(','.join([k, component_ids(v), '1.0', '100.0', '"' + common_name(v).replace('"', '""') + '"']))
	import csv
	import re

	# NOTE(review): this repeats the `forbidden` list defined earlier in the
	# file; the copy looks like a paste artifact - confirm whether it can go.
	# Lower-case substrings that disqualify a food whose lower-cased
	# description contains any of them.
	forbidden = [
		'milk',
		'cheese',
		'egg',
		'yoghurt',
		'meat',
		'beef',
		'pork',
		'lamb',
		'chicken',
		'duck',
		'veal',
		'game',
		'bison',
		'fish',
		'mollusk',
		'crustacean',
		'turkey',
		# 'ostrich' and 'emu' are separate entries; without the comma Python
		# joins adjacent string literals into the single term 'ostrichemu'.
		'ostrich',
		'emu',
		'salmon',
		'corn dog',
		'pepperoni',
		'vitasoy',
		'silk',
		'poultry',
		'o\'brien',
		'new zealand',
		'microwave',
		'house foods',
		'clam',
		'v8',
		'the coca-cola company',
		'nestle',
		'lipton',
		'energy',
		'animal',
		'gerolsteiner',
		'ovaltine',
	]

	# NOTE(review): this repeats is_forbidden() defined earlier in the file; the
	# copy had lost its indentation - confirm whether the duplicate can go.
	def is_forbidden(ing):
		"""Yield True once for every forbidden term found in ``ing``.

		``ing`` is lower-cased before the substring test; the terms are used
		as written. Nothing is yielded when no term matches, so callers wrap
		the result in ``any(...)`` to get a single boolean.
		"""
		ing_lower = ing.lower()
		for f in forbidden:
			if f in ing_lower:
				yield True

	# NOTE(review): this repeats `include_cats` defined earlier in the file; the
	# copy looks like a paste artifact - confirm whether it can go.
	# food_category_id values to keep, as strings because they are compared
	# against the raw field text of each CSV row.
	include_cats = [
		'2',	# Spices and Herbs
		'4',	# Fats and Oils
		'9',	# Fruits and Fruit Juices
		'11',	# Vegetables and Vegetable Products
		'12',	# Nut and Seed Products
		'14',	# Beverages
		'16',	# Legumes and Legume Products
		'20',	# Cereal Grains and Pasta
		'28',	# Alcoholic Beverages
	]

	# NOTE(review): this repeats the `ingredient` class defined earlier in the
	# file. The copy had lost its indentation and its regexes were damaged
	# (alternation bars escaped, '*' quantifiers dropped); both are restored
	# here - confirm whether the duplicate can simply be removed.
	class ingredient:
		"""One food row: its description, its fdc_id and a derived keyword string."""

		def __init__(self, description, fdc_id):
			self.description = description
			self.fdc_id = fdc_id
			# Derived once, at construction time, from the description.
			self.keywords = self.generate_keywords()

		@classmethod
		def from_row(cls, r):
			"""Build an ingredient from a mapping with 'description' and 'fdc_id' keys."""
			return cls(r['description'], r['fdc_id'])

		def __repr__(self):
			return str(self.keywords) + '=>' + str((self.description, self.fdc_id))

		def __lt__(self, other):
			"""Order by description, which is what lets sorted() work on ingredients."""
			return self.description < other.description

		def generate_keywords(self):
			"""Reduce the description to a short lower-case keyword string.

			The rewrites run in order, each on the result of the previous one:

			1. "<Type>, <sub>" becomes "<sub> <Type>" for the listed generic
			   types (Beans, Oil, Rice, ...).
			2. "Alcoholic Beverage, wine, table, <sub>" becomes "<sub> wine".
			3. A leading "Alcoholic beverage(s)", "Spices", "Beverages" or
			   "Nuts" prefix is dropped, leaving "<sub>".
			4. Everything from the first comma onwards is discarded.
			"""
			s = self.description
			s = re.sub(r'^(?P<type>Beans|Cabbage|Corn|Lettuce|Mushrooms|Peppers|Pickles|Potatoes|Seaweed|Squash|Tomatoes|Vegetables|Vinegar|Oil|Pasta|Rice), (?P<sub_type>[^,]+)', r'\g<sub_type> \g<type>', s)
			s = re.sub(r'^Alcoholic Beverage, wine, table, (?P<sub_type>[^,]+)', r'\g<sub_type> wine', s)
			s = re.sub(r'^(?P<type>Alcoholic beverages?|Spices|Beverages|Nuts), (?P<sub_type>[^,]+)', r'\g<sub_type>', s)
			return re.sub(r'(?P<kwd>^[^,]*).*', r'\g<kwd>', s).lower()

	# NOTE(review): this repeats the load-and-group statements found earlier in
	# the file; the copy had lost its indentation - confirm whether it can go.
	# Read food.csv and keep only the rows whose category is listed in
	# include_cats and whose description contains no forbidden term.
	with open('food.csv') as f:
		rows = csv.DictReader(f)
		ings = [ingredient.from_row(r) for r in rows if r['food_category_id'] in include_cats and not any(is_forbidden(r['description']))]

	# Group the ingredients by keyword string; sorted() orders them by
	# description, so every group's list is ordered by description as well.
	kwd_to_ing = {}
	for ing in sorted(ings):
		if ing.keywords not in kwd_to_ing:
			kwd_to_ing[ing.keywords] = []
		kwd_to_ing[ing.keywords].append(ing)

	# NOTE(review): this repeats common_name() defined earlier in the file; the
	# copy had lost its indentation - confirm whether the duplicate can go.
	def common_name(ings):
		"""Return the name shared by a group of ingredients.

		ings: sequence of objects with a ``description`` string attribute.

		Returns '' for an empty group and the description itself for a single
		ingredient. For larger groups it returns the longest common leading
		part of all descriptions, with trailing spaces and commas removed.
		When all descriptions are identical, or one is a prefix of the
		others, that full common prefix is returned rather than an empty name.
		"""
		if not ings:
			return ''

		if len(ings) == 1:
			return ings[0].description

		descriptions = [ing.description for ing in ings]

		# zip() stops at the shortest description, so the comparison can never
		# index past the end of any of them.
		shared = 0
		for chars in zip(*descriptions):
			if any(c != chars[0] for c in chars):
				break
			shared += 1

		return descriptions[0][:shared].rstrip(' ,')

	# NOTE(review): this repeats component_ids() defined earlier in the file;
	# the copy had lost its indentation - confirm whether the duplicate can go.
	def component_ids(ings):
		"""Join the fdc_id of every ingredient in ``ings`` with '+', in order."""
		return '+'.join([i.fdc_id for i in ings])

	# NOTE(review): this repeats the CSV output statements found earlier in the
	# file; the copy had lost its indentation - confirm whether it can go.
	# Write a header line, then one line per keyword group, to stdout.
	print('keywords,component_ids,density,unit_mass,name')
	for k,v in kwd_to_ing.items():
		# The name is written as a quoted field, so a double quote inside it
		# has to be doubled; otherwise the quote would end the field early.
		print(','.join([k, component_ids(v), '1.0', '100.0', '"' + common_name(v).replace('"', '""') + '"']))