Skip to content

Instantly share code, notes, and snippets.

@jeremyorme
Last active February 11, 2020 17:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jeremyorme/f750e110a6e13c5598e063b3f180b883 to your computer and use it in GitHub Desktop.
Save jeremyorme/f750e110a6e13c5598e063b3f180b883 to your computer and use it in GitHub Desktop.
Ingredient list generation
import csv
import re
# Terms that disqualify a food description. is_forbidden() lower-cases the
# description and tests each entry as a plain substring, so every entry here
# must be lower case, and short entries match inside longer words.
forbidden = [
    'milk',
    'cheese',
    'egg',
    'yoghurt',
    'meat',
    'beef',
    'pork',
    'lamb',
    'chicken',
    'duck',
    'veal',
    'game',
    'bison',
    'fish',
    'mollusk',
    'crustacean',
    'turkey',
    'ostrich',  # the comma was missing here, silently producing 'ostrichemu'
    'emu',
    'salmon',
    'corn dog',
    'pepperoni',
    'vitasoy',
    'silk',
    'poultry',
    'o\'brien',
    'new zealand',
    'microwave',
    'house foods',
    'clam',
    'v8',
    'the coca-cola company',
    'nestle',
    'lipton',
    'energy',
    'animal',
    'gerolsteiner',
    'ovaltine',
]
def is_forbidden(ing):
    """Yield ``True`` once for each forbidden term found inside *ing*.

    The comparison is case-insensitive on the *ing* side: the text is
    lower-cased and every entry of the module-level ``forbidden`` list is
    tested as a substring. Nothing is yielded when no term matches, so the
    result is meant to be consumed with ``any(...)``.
    """
    lowered = ing.lower()
    yield from (True for term in forbidden if term in lowered)
# Food category ids that are kept, mapped to their category names.
_INCLUDED_CATEGORY_NAMES = {
    '2': 'Spices and Herbs',
    '4': 'Fats and Oils',
    '9': 'Fruits and Fruit Juices',
    '11': 'Vegetables and Vegetable Products',
    '12': 'Nut and Seed Products',
    '14': 'Beverages',
    '16': 'Legumes and Legume Products',
    '20': 'Cereal Grains and Pasta',
    '28': 'Alcoholic Beverages',
}

# Ids are kept as strings because they are compared against raw CSV fields.
include_cats = list(_INCLUDED_CATEGORY_NAMES)
class ingredient:
    """A food record paired with a short keyword derived from its description.

    Attributes:
        description: the full description text of the food.
        fdc_id: the identifier of the food, stored exactly as given.
        keywords: lower-case keyword computed by ``generate_keywords``.
    """

    # "<Type>, <sub type>, ..." -> "<sub type> <Type>, ..." for the listed types.
    _TYPE_FIRST = re.compile(r'^(?P<type>Beans|Cabbage|Corn|Lettuce|Mushrooms|Peppers|Pickles|Potatoes|Seaweed|Squash|Tomatoes|Vegetables|Vinegar|Oil|Pasta|Rice), (?P<sub_type>[^,]+)')
    # "Alcoholic Beverage, wine, table, <colour>, ..." -> "<colour> wine, ...".
    _TABLE_WINE = re.compile(r'^Alcoholic Beverage, wine, table, (?P<sub_type>[^,]+)')
    # For these generic leading types only the sub type is kept.
    _GENERIC_TYPE = re.compile(r'^(?P<type>Alcoholic beverages?|Spices|Beverages|Nuts), (?P<sub_type>[^,]+)')
    # Everything up to (not including) the first comma.
    _FIRST_FIELD = re.compile(r'(?P<kwd>^[^,]*).*')

    def __init__(self, description, fdc_id):
        """Store the description and id, then derive the keyword."""
        self.description = description
        self.fdc_id = fdc_id
        self.keywords = self.generate_keywords()

    @classmethod
    def from_row(cls, r):
        """Build an instance from a mapping with 'description' and 'fdc_id' keys."""
        return cls(r['description'], r['fdc_id'])

    def __repr__(self):
        return str(self.keywords) + '=>' + str((self.description, self.fdc_id))

    def __lt__(self, other):
        """Order instances alphabetically by description (used by ``sorted``)."""
        if not isinstance(other, ingredient):
            # Let Python try the reflected operation / raise TypeError instead
            # of failing with AttributeError on a foreign type.
            return NotImplemented
        return self.description < other.description

    def generate_keywords(self):
        """Return the lower-case keyword for ``self.description``.

        The rewrites are applied in order; each one only fires when the
        description starts with the pattern it targets. Finally the text is
        cut at the first comma and lower-cased.
        """
        s = self.description
        s = self._TYPE_FIRST.sub(r'\g<sub_type> \g<type>', s)
        s = self._TABLE_WINE.sub(r'\g<sub_type> wine', s)
        s = self._GENERIC_TYPE.sub(r'\g<sub_type>', s)
        return self._FIRST_FIELD.sub(r'\g<kwd>', s).lower()
# Read food.csv and keep the rows whose category is included and whose
# description contains no forbidden term. The csv module requires the file to
# be opened with newline='' so quoted fields containing line breaks parse
# correctly.
with open('food.csv', newline='') as f:
    rows = csv.DictReader(f)
    # The reader is lazy, so the list must be built while the file is open.
    ings = [
        ingredient.from_row(r)
        for r in rows
        if r['food_category_id'] in include_cats
        and not any(is_forbidden(r['description']))
    ]

# Group the ingredients by keyword; iterating in sorted (description) order
# keeps each group's members alphabetically ordered.
kwd_to_ing = {}
for ing in sorted(ings):
    kwd_to_ing.setdefault(ing.keywords, []).append(ing)
def common_name(ings):
    """Return a display name shared by all ingredients in *ings*.

    Args:
        ings: sequence of objects exposing a ``description`` string.

    Returns:
        ``''`` for an empty sequence; the unmodified description for a single
        ingredient; otherwise the longest leading text common to every
        description, with trailing spaces and commas removed.
    """
    if not ings:
        return ''
    if len(ings) == 1:
        return ings[0].description

    descriptions = [ing.description for ing in ings]
    shared = 0
    # zip stops at the shortest description, so a short entry can never be
    # indexed out of range.
    for chars in zip(*descriptions):
        if len(set(chars)) > 1:
            break
        shared += 1
    # When one description is a prefix of all the others, the whole of it is
    # the common name (previously this case fell through and returned '').
    return descriptions[0][:shared].rstrip(' ,')
def component_ids(ings):
    """Return the ``fdc_id`` of every item in *ings* joined with ``+``."""
    ids = (member.fdc_id for member in ings)
    return '+'.join(ids)
# Write the ingredient table to stdout as CSV: one row per keyword, with the
# ids of all grouped foods, fixed density and unit_mass values, and the
# common name of the group.
print('keywords,component_ids,density,unit_mass,name')
for k, v in kwd_to_ing.items():
    # The name is always quoted because it may contain commas; embedded double
    # quotes are doubled so the field remains a single valid CSV field.
    name = '"' + common_name(v).replace('"', '""') + '"'
    print(','.join([k, component_ids(v), '1.0', '100.0', name]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment