Skip to content

Instantly share code, notes, and snippets.

@atucom
Created August 2, 2018 22:42
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save atucom/5148e9a88a13ee732cb64422c74a9d22 to your computer and use it in GitHub Desktop.
Parses HTML for an ingredient list; customized for Home Chef recipe pages.
import os
from collections import Counter, defaultdict

import lxml
from lxml import html
def getIngredients(htmlFile):
    """Extract the raw text of each ingredient entry from recipe HTML.

    Args:
        htmlFile: The HTML markup to parse (the caller passes the file's
            contents, not a path).

    Returns:
        A list with the ``text_content()`` of every child element of the
        ``<ul>`` whose ``class`` attribute exactly equals the ingredient-list
        class string below, or the string ``"NOPE"`` when lxml raises
        ``ParserError`` for the markup.
    """
    try:
        document = html.fromstring(htmlFile)
    except lxml.etree.ParserError:
        # Sentinel kept as-is; callers receive a str instead of a list here.
        return "NOPE"

    entries = document.xpath('//ul[@class="list--unstyled group position--relative text--center--bpDown2"]/*')
    texts = []
    for entry in entries:
        texts.append(entry.text_content())
    return texts
def cleanIngredientList(ingredientList):
    """Pick the ingredient-name lines out of raw ingredient text blobs.

    Each entry is split on newlines. An entry containing ``"Info"``
    contributes its line at index 8, and an entry longer than two characters
    contributes its line at index 3; an entry matching both conditions
    contributes both values, in that order. Entries matching neither are
    dropped.

    Args:
        ingredientList: Iterable of raw text strings, one per ingredient.

    Returns:
        A flat list of the selected lines, in input order.

    Raises:
        IndexError: If a selected entry has fewer lines than the index used.
    """
    # NOTE(review): the indices 8 and 3 presumably match the line layout of
    # the scraped markup -- confirm against a sample page.
    def _selected_lines(text):
        if "Info" in text:
            yield text.split('\n')[8]
        if len(text) > 2:
            yield text.split('\n')[3]

    return [line for text in ingredientList for line in _selected_lines(text)]
def sortAndCount(ingredientList, top=100):
    """Count how often each ingredient occurs, most frequent first.

    Args:
        ingredientList: Iterable of hashable items (ingredient names).
        top: Maximum number of ``(item, count)`` pairs to return; applied as
            a slice bound, so ``None`` returns every pair. Defaults to 100.

    Returns:
        A list of ``(item, count)`` tuples sorted by descending count. Items
        with equal counts keep the order in which they were first seen.
    """
    # most_common() with no argument sorts all items by count (stable for
    # ties); slicing afterwards keeps the original `[:top]` semantics.
    return Counter(ingredientList).most_common()[:top]
def main():
    """Tally ingredients across every file in the current directory.

    Each regular file in the working directory is read, passed through
    ``getIngredients`` and ``cleanIngredientList``, and the combined results
    are counted with ``sortAndCount``. One ``"<count> <ingredient>"`` line is
    printed per result, most frequent first.
    """
    ingredients = []
    for name in os.listdir():
        # os.listdir() also returns sub-directories; open() would raise on
        # those, so only regular files are read.
        if not os.path.isfile(name):
            continue
        with open(name) as f:
            htmlFile = f.read()
        # Extend in place instead of collecting per-file lists and
        # flattening with sum(..., []), which copies quadratically.
        ingredients.extend(cleanIngredientList(getIngredients(htmlFile)))

    for ingredient, count in sortAndCount(ingredients):
        print(count, ingredient)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment