|
# -*- coding: utf-8 -*- |
|
|
|
import json |
|
import os |
|
import re |
|
|
|
# Descriptive modifiers (preparation, size, brand, packaging, ...) that are
# stripped from ingredient names.  The alternatives are joined with '|' into a
# single pattern; at any given position the regex engine tries them in the
# order listed, so the order is significant.
#
# Optional leading words are written as non-capturing groups, e.g.
# (?:finely[ ]+)?.  A character class such as [finely ]* matches any run of
# those letters and therefore swallows the tail of the preceding word
# ('yellow onion chopped' -> 'yellow onio').
superlatives = "|".join((
    # Preparation / freshness / size words.
    r"[ ]*(?:finely[ ]+)?chopped[ ]*",
    r"[ ]*fresh[ ]+",
    r"[ ]*freshly[ ]+",
    r"[ ]*peeled[ ]+",
    r"[ ]*crushed[ ]+",
    r"[ ]*(?:petite[ ]+)?diced[ ]+",
    r"[ ]*minced[ ]+",
    r"[ ]*shredded[ ]*",
    r"[ ]*leaves$",
    r"[ ]*coarsely[ ]+",
    r"[ ]*coarse[ ]+",
    r"^fine[ ]+",
    r"[ ]*large[ ]+",
    r"[ ]*small[ ]+",
    r"[ ]*medium[ ]+",
    r"[ ]+minicube[s]*$",
    # Brand names and packaging.
    r"^knorr[®]*[ ]+",
    r"^(hellmann's\xae or best foods\xae)[ ]+",
    r"^[ ]*bottled[ ]+",
    r"^[ ]*canned[ ]+",
    r"^(no-salt-added)[ ]+",
    r"^fire[ ]*",
    r"^(vine ripened)[ ]*",
    r"^rotel[ ]+",
    r"^rotelle$",
    r"^organic[ ]+",
    r"^chunky$",
    r"^pickling[ ]+",
    r"^canning[ ]+",
    r"[ ]*kosher[ ]+",
    r"[ ]*sea[ ]+",
    r"[ ]*light[ ]+",
    # Whole-string entries that are dropped entirely, and trailing unit words.
    r"^juice$",
    r"[ ]+(in juice)$",
    r"[ ]+slices$",
    r"^(?:corn[ ]+)?(tortilla chips)$",
    r"^boiling[ ]+",
    r"^cracked[ ]+",
    r"^frozen[ ]+",
    r"[ ]+kernels$",
    r"[ ]+cloves$",
    r"^seasoning$",
    r"[ ]+crumbles$",
    r"[ ]+sprigs$",
    r"^mini[ ]+",
    r"^seeds$",
    r"^(goya fancy)[ ]+",
    r"^sauce$",
    r"^herbs$",
))
|
|
|
# Standardize ingredients |
|
def ingredient_equals(string):
    """Map a known ingredient variant onto its canonical name.

    The rules are checked in order and the first one that applies decides the
    result.  Some rules fire when the ingredient merely *contains* a phrase,
    others only when the whole ingredient *equals* a name.

    Returns the canonical name, the empty string for plain salsa entries, or
    ``string`` unchanged when no rule applies.  Matching is case-sensitive.

    NOTE(review): the bare 'sugar' fragment is a substring rule, so any
    ingredient containing 'sugar' (e.g. 'brown sugar') becomes 'white sugar'.
    Confirm that this is intended.
    """
    def mentions(*fragments):
        # Substring rule: the ingredient contains at least one fragment.
        return any(fragment in string for fragment in fragments)

    def is_exactly(*names):
        # Whole-string rule: the ingredient is precisely one of the names.
        return string in names

    # (predicate, terms, canonical name) -- order matters, first hit wins.
    rules = (
        (mentions, ('ground salt',), 'salt'),
        (mentions, ('lime juice', 'key lime', 'key lime juice'), 'lime'),
        (mentions, ('lemon juice',), 'lemon'),
        (mentions, ('sweet corn',), 'corn'),
        (mentions, ('pitted cherries', 'sweet cherries'), 'cherries'),
        (mentions, ('cherry tomatoes', 'grape tomatoes'), 'cherry tomatoes'),
        (mentions, ('roma tomatoes', 'plum tomatoes'), 'roma tomatoes'),
        (is_exactly, ('clove',), 'garlic'),
        (mentions, ('granulated sugar', 'white sugar', 'sugar'), 'white sugar'),
        # Plain salsa entries are mapped to the empty string.
        (is_exactly, ('salsa', 'salsa verde', 'tomato salsa'), ''),
        (mentions, ('ground black pepper',), 'black pepper'),
        (mentions, ('ground cumin',), 'cumin'),
        (is_exactly, ('seedless red grapes', 'seedless green grapes'), 'grapes'),
        (is_exactly, ('granny smith apples',), 'green apples'),
        (is_exactly, ('scallions',), 'green onion'),
    )

    for applies, terms, canonical in rules:
        if applies(*terms):
            return canonical

    return string
|
|
|
# Stem plurals |
|
def stem(string):
    """Singularize a hand-picked set of plural endings at the end of ``string``.

    Every rule is anchored with ``$``, so only the tail of the ingredient is
    rewritten ('peppers and onions' -> 'peppers and onion').  The rules are
    applied one after another in the order listed.  Strings that match no rule
    are returned unchanged.
    """
    plural_rules = (
        (r'peppers$', 'pepper'),
        (r'chilies$', 'chili'),
        (r'chiles$', 'chile'),
        (r'onions$', 'onion'),
        (r'tomatoes$', 'tomato'),
        (r'apples$', 'apple'),
        (r'cucumbers$', 'cucumber'),
        (r'tomatillos$', 'tomatillo'),
        (r'seeds$', 'seed'),
        (r'berries$', 'berry'),
        (r'peaches$', 'peach'),
        # Replaces the suffix with itself, i.e. leaves the string unchanged.
        # NOTE(review): possibly a typo for 'shallot' -- confirm.
        (r'shallots$', 'shallots'),
        (r'flakes$', 'flake'),
        (r'persimmons$', 'persimmon'),
        (r'cherries$', 'cherry'),
        (r'grapes$', 'grape'),
        (r'plums$', 'plum'),
        (r'apricots$', 'apricot'),
        (r'sprouts$', 'sprout'),
        (r'beans$', 'bean'),
        (r'jalepenos$', 'jalepeno'),
        (r'nuts$', 'nut'),
        (r'pears$', 'pear'),
        (r'nectarines$', 'nectarine'),
        # Also a self-replacement: 'greens' stays plural.
        (r'greens$', 'greens'),
        (r'papadews$', 'papadew'),
        (r'pimientos$', 'pimiento'),
        (r'segments$', 'segment'),
        (r'shoots$', 'shoot'),
        (r'raisins$', 'raisin'),
    )

    for pattern, singular in plural_rules:
        string = re.sub(pattern, singular, string)

    return string
|
|
|
def _clean_ingredient(raw):
    """Normalize one raw ingredient phrase; may return '' if nothing is left."""
    # Ad-hoc canonical names first, then strip modifiers ('chopped', 'fresh',
    # ...), then singularize the few plurals that occur in the data set.
    name = ingredient_equals(raw.lower())
    name = re.sub(superlatives, ' ', name).strip()
    return stem(name)


def get_ingredients(directory='salsa_search'):
    """Collect normalized ingredient lists from the JSON files in ``directory``.

    Every file in ``directory`` is parsed as JSON and must contain a top-level
    ``'matches'`` list of recipe dicts with the keys ``'rating'``,
    ``'ingredients'`` (a list of strings) and ``'id'``.

    Recipes with a rating <= 0, recipes without ingredients, and recipes whose
    id has already been seen are skipped.

    Args:
        directory: Folder holding the JSON files.  Defaults to
            ``'salsa_search'``, the location that was previously hard-coded.

    Returns:
        A tuple ``(ingredients, ids)`` of two parallel lists:
        ``ingredients[i]`` is the list of cleaned ingredient names of the
        recipe whose id is ``ids[i]``.
    """
    ingredients = []
    ids = []
    # Set mirror of ``ids`` for O(1) duplicate checks (assumes recipe ids are
    # hashable, e.g. strings or numbers).
    seen_ids = set()

    # os.listdir returns entries in arbitrary order; sorting makes the output
    # order, and which copy of a duplicated recipe is kept, reproducible.
    for filename in sorted(os.listdir(directory)):
        with open(os.path.join(directory, filename)) as handle:
            recipes = json.load(handle)['matches']

        for recipe in recipes:
            # Salsa must have a rating > 0.
            if recipe["rating"] <= 0:
                continue

            # Sometimes ingredients contain 'and' or 'with'
            # (e.g. 'salt and pepper'); split those into separate entries.
            parts = []
            for entry in recipe['ingredients']:
                parts.extend(re.split(" with | and ", entry))

            if not parts or recipe['id'] in seen_ids:
                continue

            cleaned = []
            for part in parts:
                name = _clean_ingredient(part)
                if name:
                    cleaned.append(name)

            ingredients.append(cleaned)
            ids.append(recipe['id'])
            seen_ids.add(recipe['id'])

    return ingredients, ids