Skip to content

Instantly share code, notes, and snippets.

@borgesalkan
Created October 2, 2020 04:43
Show Gist options
  • Save borgesalkan/e1ab96a9f2ecfe954bb9db752c1b77a4 to your computer and use it in GitHub Desktop.
Save borgesalkan/e1ab96a9f2ecfe954bb9db752c1b77a4 to your computer and use it in GitHub Desktop.
Extract Food Items
from typing import List

from nltk.stem.porter import PorterStemmer
from textblob import TextBlob
def _extract_ngrams(data: str, num: int):
    """Return every ``num``-word n-gram of ``data`` as a lower-case string.

    The text is split into n-grams by ``TextBlob``; the words of each
    n-gram are joined with a single space and the result is lower-cased.
    """
    blob = TextBlob(data)
    phrases = []
    for word_group in blob.ngrams(num):
        phrases.append(' '.join(word_group).lower())
    return phrases
def _delete_duplicate_food_n_grams(text: str, foods: List[str]) -> List[str]:
    """Drop food phrases that are already covered by a longer matched phrase.

    Candidates are visited from most words to fewest. A candidate is kept
    only if it still occurs, as a plain case-sensitive substring, in what is
    left of ``text``. Every occurrence of a kept candidate is then blanked
    out, so shorter phrases contained in it (e.g. "cream" inside
    "ice cream") are not reported a second time.

    Args:
        text: The text the candidates were extracted from.
        foods: Candidate food phrases, words separated by whitespace.
            The list is not modified.

    Returns:
        The surviving phrases, ordered by descending word count. Candidates
        with the same word count keep their relative order from ``foods``.
    """
    # sorted() rather than list.sort(): the caller's list must not be
    # reordered as a side effect. The sort is stable, so ties keep input order.
    by_word_count = sorted(
        foods, key=lambda phrase: len(phrase.split()), reverse=True
    )

    remaining = text
    kept: List[str] = []
    for food in by_word_count:
        # The empty string is "in" every string, so it is never a real match.
        if not food or food not in remaining:
            continue
        # Blank the match with a space instead of deleting it outright, so
        # the characters on either side cannot fuse into a new, spurious
        # match for a later candidate.
        remaining = remaining.replace(food, ' ')
        kept.append(food)
    return kept
def extract_foods(text: str) -> List[str]:
    """Return the food phrases from ``FOOD_LEXICONS`` that occur in ``text``.

    Every 1- to 6-word n-gram of ``text`` (lower-cased by
    ``_extract_ngrams``), together with the Porter-stemmed form of each
    n-gram string, is looked up in ``FOOD_LEXICONS``. Matches that are
    contained in a longer match are then removed, so "ice cream" is
    reported once rather than as "ice cream" plus "cream".

    Args:
        text: Free text to scan.

    Returns:
        Matched phrases, longest (by word count) first; phrases with the
        same word count are in alphabetical order.
    """
    # NOTE(review): FOOD_LEXICONS is not defined in the visible part of this
    # file; it is assumed to be a collection of lower-case phrases.
    stemmer = PorterStemmer()
    candidates = set()
    for n in range(1, 7):
        n_grams = _extract_ngrams(text, n)
        # The stemmer receives the whole n-gram string, not each word of it.
        candidates.update(n_grams)
        candidates.update(stemmer.stem(n_gram) for n_gram in n_grams)

    # sorted() makes the outcome independent of set iteration order, which
    # otherwise varies between runs for phrases of equal length.
    foods = sorted(candidates.intersection(FOOD_LEXICONS))

    # The candidates are lower-case, so the substring check in the
    # de-duplication step must see lower-cased text as well; with the
    # original casing a food written as "Pizza" would be found above and
    # then silently discarded.
    return _delete_duplicate_food_n_grams(text.lower(), foods)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment