Gets all publicly available recipes from tasteofhome.com and compiles them into a TSV file. Includes nutrition facts.
import requests as r
import bs4 as bs
import json
# Configuration
input_list = "tasteOfHome.csv" # URL csv read back by tasteOfHomeCsvMake()
base_url = "https://www.tasteofhome.com/recipes/page/"
site = 'https://www.tasteofhome.com'
output_filename = "tasteOfHome.csv" # URL csv for getRecipesList()
final_filename = "tasteOfHomeRecipes.tsv" # Recipe tsv output for tasteOfHomeCsvMake()
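# Each line getRecipesList() appends to the URL CSV looks like this
# (hypothetical recipe path, shown for illustration only):
#   https://www.tasteofhome.com,https://www.tasteofhome.com/recipes/example-recipe/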
######################################################
######################################################
def getRecipesList(output_filename):
    # Counter for the current search-results page
    appended_num = 1
    ######################################################
    # Fetch page 1 first to discover the total page count
    initial_url = base_url + str(appended_num)
    init_request = r.get(initial_url)
    soup = bs.BeautifulSoup(init_request.content, "lxml")
    # Collect the pagination link labels ("1", "2", ..., "Next")
    page_num_list = []
    for number in soup.find_all("a", {"class": "page-numbers"}):
        page_num_list.append(number.text)
    page_num_list = [x.replace(",", "") for x in page_num_list]
    page_num_check_list = []
    # Keep only the labels that are integers, dropping strings like "Next"
    for item in page_num_list:
        try:
            int(item)
            page_num_check_list.append(item)
        except ValueError:
            pass
    # Convert to ints and set the iteration target to the max page number
    page_num_check_list = [int(x) for x in page_num_check_list]
    page_num_max = max(page_num_check_list)
    ######################################################
    # Iterate until we reach the max page number
    while appended_num <= page_num_max:
        # Recipe-link count for this page
        total_links_fetched_this_page = 0
        # Set up the request
        get_url = base_url + str(appended_num)
        request = r.get(get_url)
        soup = bs.BeautifulSoup(request.content, "lxml")
        # Iterate over all recipes found on this page
        for recipe in soup.find_all("li", {"class": "single-recipe"}):
            # Append "site,recipe_href" to the URL CSV
            with open(output_filename, "a") as f:
                join_string = site + "," + recipe.a.get("href") + "\n"
                f.write(join_string)
            total_links_fetched_this_page += 1
        # Print progress so we know it's working
        print(f"Page {appended_num} complete - {total_links_fetched_this_page} fetched")
        appended_num += 1
######################################################
######################################################
def tasteOfHomeCsvMake(input_filename, final_filename):
    # Create the output file with its header row
    csvCreate(final_filename)
    # Open the input file of recipe URLs
    with open(input_filename, "r") as input_file:
        # Iterate over rows ("site,recipe_href")
        for row in input_file:
            try:
                # Fetch the recipe page (strip the trailing newline from the URL)
                request = r.get(row.split(",")[1].strip())
                soup = bs.BeautifulSoup(request.content, "lxml")
                ######################################################
                # Find ingredients and build a dict of ingredient lists
                ingredients = {}
                for ingredient_list in soup.find_all("ul", {"class": "recipe-ingredients__list"}):
                    all_ingredients = ingredient_list.find_all("li")
                    # Iterate over a range to build the JSON object more easily
                    ingredients['MAIN'] = {}
                    for i in range(len(all_ingredients)):
                        # Plain items belong to the main ingredient list
                        if ":" not in all_ingredients[i].text:
                            ingredients['MAIN'][i] = all_ingredients[i].text
                            continue
                        # A "Something:" item starts a sub-recipe; break the rest
                        # of the list out into its own portion of the JSON
                        sub_recipe = all_ingredients[i].text.replace(":", "")
                        ingredients[sub_recipe] = {}
                        sub_recipe_ingredients = all_ingredients[i+1:]
                        for t in range(len(sub_recipe_ingredients)):
                            ingredients[sub_recipe][t] = sub_recipe_ingredients[t].text
                        break
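                # At this point `ingredients` looks roughly like the following
                # (hypothetical example; the keys and items are invented for
                # illustration):
                #   {"MAIN": {0: "2 cups flour", 1: "1 large egg"},
                #    "GLAZE": {0: "1 cup confectioners' sugar"}}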
                ######################################################
                # Get directions the same way (there appear to be no sub-directions)
                directions = {}
                all_directions = soup.find_all("li", {"class": "recipe-directions__item"})
                for i in range(len(all_directions)):
                    directions[i] = all_directions[i].find("span").text.strip()
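                # `directions` ends up flat, e.g. (hypothetical values):
                #   {0: "Preheat oven to 350°.", 1: "Combine the dry ingredients."}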
                ######################################################
                # Fetch nutrition facts
                try:
                    facts_text = soup.find("div", {"class": "recipe-nutrition-facts"}).text
                    # Drop the label before the first ":" and parse each field once
                    nutrition_string = ",".join(facts_text.split(":")[1:])
                    calories = factsFinder(nutrition_string, "calories")
                    total_fat = factsFinder(nutrition_string, "fat")
                    cholesterol = factsFinder(nutrition_string, "cholesterol")
                    sodium = factsFinder(nutrition_string, "sodium")
                    carbohydrate = factsFinder(nutrition_string, "carbohydrate")
                    protein = factsFinder(nutrition_string, "protein")
                    saturated_fat = factsFinder(nutrition_string, "saturated")
                    sugars = factsFinder(nutrition_string, "sugars")
                    fiber = factsFinder(nutrition_string, "fiber")
                except Exception:
                    # No nutrition-facts block on this page; null out every field
                    calories = total_fat = cholesterol = sodium = "null"
                    carbohydrate = protein = saturated_fat = sugars = fiber = "null"
            except Exception:
                # Log the failing URL and move on to the next recipe
                with open("tasteOfHomeRecipes.error.log", "a") as error_file:
                    error_file.write(",".join([site, row.split(",")[1].strip()]) + "\n")
                continue
            writeToCsv(final_filename, site.strip(), row.split(",")[1].strip(),
                       json.dumps(ingredients), json.dumps(directions),
                       calories, total_fat, saturated_fat, cholesterol, sodium,
                       carbohydrate, protein, sugars, fiber, row)
######################################################
######################################################
# Parse a single nutrient value out of the nutrition-facts string
# built in tasteOfHomeCsvMake above
def factsFinder(facts_string, target_nutrient):
    try:
        if facts_string == "null":
            return "null"
        # Normalize the string, then scan its comma-separated fields
        for nutrient in facts_string.replace("(", ",").replace(")", "").replace(".", "").strip().split(","):
            # Return the value from the first field naming the target nutrient
            # (so e.g. total fat matches before saturated fat)
            if target_nutrient in nutrient.strip():
                return nutrient.strip().split(" ")[0]
        # The nutrient never appeared in the string
        return "null"
    except Exception:
        return "null"
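# Hypothetical example of the string this parses and the values it returns
# (the exact text format is an assumption about the site's markup):
#   s = "220 calories, 8g fat (1g saturated fat), 45mg cholesterol"
#   factsFinder(s, "calories")   # -> "220"
#   factsFinder(s, "saturated")  # -> "1g"
#   factsFinder(s, "fiber")      # -> "null" (not present)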
######################################################
######################################################
# Create the TSV file and write its header row
def csvCreate(final_filename):
    with open(final_filename, "w") as f:
        f.write("site\trecipe_addr\tingredients\tdirections\tcalories\ttotal_fat\tsaturated_fat\tcholesterol\tsodium\tcarbohydrate\tprotein\tsugars\tfiber\n")
# Append one recipe row to the TSV
def writeToCsv(final_filename, site, recipe_addr, ingredients, directions, calories, total_fat, saturated_fat, cholesterol, sodium, carbohydrate, protein, sugars, fiber, row):
    try:
        with open(final_filename, "a") as f:
            write_string = "\t".join([site, recipe_addr, ingredients, directions, calories, total_fat, saturated_fat, cholesterol, sodium, carbohydrate, protein, sugars, fiber]) + "\n"
            f.write(write_string)
    except Exception:
        # Log the failing URL and keep going
        with open("tasteOfHomeRecipes.error.log", "a") as error_file:
            error_file.write(",".join([site, row.split(",")[1].strip()]) + "\n")
if __name__ == "__main__":
    getRecipesList(output_filename)
    tasteOfHomeCsvMake(input_list, final_filename)
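# Usage (assuming the script is saved as tasteOfHome.py):
#   python tasteOfHome.py
# Step 1 appends every recipe URL to tasteOfHome.csv; step 2 then reads that
# file back and writes one recipe per row to tasteOfHomeRecipes.tsv.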