Gets all publicly available recipes from tasteofhome.com and compiles them into a TSV file. Includes nutrition facts.
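Running the script end to end first calls getRecipesList(), which builds tasteOfHome.csv (one site,recipe-URL pair per line), and then tasteOfHomeCsvMake(), which fetches each recipe page and writes tasteOfHomeRecipes.tsv; failed fetches are logged to tasteOfHomeRecipes.error.log. Note that the URL file is opened in append mode, so delete it before re-running to avoid duplicate rows.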
import requests as r
import bs4 as bs
import json

# Configuration
input_list = "tasteOfHome.csv"
base_url = "https://www.tasteofhome.com/recipes/page/"
site = "https://www.tasteofhome.com"
output_filename = "tasteOfHome.csv"        # URL CSV written by getRecipesList()
final_filename = "tasteOfHomeRecipes.tsv"  # Recipe TSV written by tasteOfHomeCsvMake()

######################################################
######################################################
def getRecipesList(output_filename):
    # Highest page number found in the pagination links
    page_num_max = 0
    # Current results page
    appended_num = 1
    ######################################################
    # On the first pass, discover the total number of result pages
    if appended_num < 2:
        initial_url = base_url + str(appended_num)
        init_request = r.get(initial_url)
        soup = bs.BeautifulSoup(init_request.content, "lxml")
        # Collect the text of every pagination link
        page_num_list = []
        for number in soup.find_all("a", {"class": "page-numbers"}):
            page_num_list.append(number.text)
        page_num_list = [x.replace(",", "") for x in page_num_list]
        page_num_check_list = []
        # Drop non-numeric entries such as "Next"
        for item in page_num_list:
            try:
                int(item)
                page_num_check_list.append(item)
            except ValueError:
                pass
        # Convert to ints and take the largest as the iteration target
        page_num_check_list = [int(x) for x in page_num_check_list]
        page_num_max = max(page_num_check_list)
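        # Hypothetical example: pagination link texts ["2", "3", "…", "1,234"]
        # reduce to [2, 3, 1234], so page_num_max becomes 1234.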
    ######################################################
    # Iterate until we reach the max page number
    while appended_num <= page_num_max:
        # Count of recipe links found on this page
        total_links_fetched_this_page = 0
        # Fetch and parse this results page
        get_url = base_url + str(appended_num)
        request = r.get(get_url)
        soup = bs.BeautifulSoup(request.content, "lxml")
        # Append every recipe link on this page to the URL file,
        # one "site,recipe_url" pair per line
        with open(output_filename, "a") as f:
            for recipe in soup.find_all("li", {"class": "single-recipe"}):
                f.write(site + "," + recipe.a.get("href") + "\n")
                total_links_fetched_this_page += 1
        # Progress output so we know it's working
        print(f"Page {appended_num} complete - {total_links_fetched_this_page} fetched")
        appended_num += 1
######################################################
######################################################
def tasteOfHomeCsvMake(input_filename, final_filename):
    # Create the output TSV with its header row
    csvCreate(final_filename)
    # Read the URL list produced by getRecipesList()
    with open(input_filename, "r") as input_file:
        for row in input_file:
            try:
                # The second CSV column holds the recipe URL
                request = r.get(row.split(",")[1].strip())
                soup = bs.BeautifulSoup(request.content, "lxml")
                ######################################################
                # Collect ingredients into a JSON-serializable dict
                ingredients = {}
                for ingredient_list in soup.find_all("ul", {"class": "recipe-ingredients__list"}):
                    all_ingredients = ingredient_list.find_all("li")
                    # Index by position so ingredient order is preserved
                    ingredients["MAIN"] = {}
                    for i in range(len(all_ingredients)):
                        if ":" not in all_ingredients[i].text:
                            ingredients["MAIN"][i] = all_ingredients[i].text
                            continue
                        # An item ending in ":" is a sub-recipe header; every
                        # item after it belongs to that sub-recipe
                        sub_recipe = all_ingredients[i].text.replace(":", "")
                        ingredients[sub_recipe] = {}
                        sub_recipe_ingredients = all_ingredients[i + 1:]
                        for t in range(len(sub_recipe_ingredients)):
                            ingredients[sub_recipe][t] = sub_recipe_ingredients[t].text
                        break
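                # Hypothetical example of the resulting structure for a recipe
                # whose ingredient list contains one sub-recipe header ("FROSTING:"):
                #   {"MAIN": {0: "1 cup sugar", 1: "2 large eggs"},
                #    "FROSTING": {0: "1/2 cup butter", 1: "2 cups confectioners' sugar"}}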
                ######################################################
                # Collect directions the same way (the site does not appear
                # to use sub-directions)
                directions = {}
                all_directions = soup.find_all("li", {"class": "recipe-directions__item"})
                for i in range(len(all_directions)):
                    directions[i] = all_directions[i].find("span").text.strip()
                ######################################################
                # Parse the nutrition-facts block; fall back to "null" when
                # a recipe has no such block
                try:
                    nutrition_string = soup.find("div", {"class": "recipe-nutrition-facts"}).text
                    # Drop the serving-size prefix before the first ":"
                    facts = ",".join(nutrition_string.split(":")[1:])
                    calories = factsFinder(facts, "calories")
                    total_fat = factsFinder(facts, "fat")
                    cholesterol = factsFinder(facts, "cholesterol")
                    sodium = factsFinder(facts, "sodium")
                    carbohydrate = factsFinder(facts, "carbohydrate")
                    protein = factsFinder(facts, "protein")
                    saturated_fat = factsFinder(facts, "saturated")
                    sugars = factsFinder(facts, "sugars")
                    fiber = factsFinder(facts, "fiber")
                except AttributeError:
                    calories = total_fat = cholesterol = sodium = "null"
                    carbohydrate = protein = saturated_fat = sugars = fiber = "null"
            except Exception:
                # Log the failing URL and move on to the next recipe
                with open("tasteOfHomeRecipes.error.log", "a") as error_file:
                    error_file.write(",".join([site, row.split(",")[1]]) + "\n")
                continue
            writeToCsv(final_filename, site.strip(), row.split(",")[1].strip(),
                       json.dumps(ingredients), json.dumps(directions),
                       calories, total_fat, saturated_fat, cholesterol, sodium,
                       carbohydrate, protein, sugars, fiber, row)
######################################################
######################################################
# Pull a single nutrient value out of the nutrition-facts string
# built in tasteOfHomeCsvMake() above
def factsFinder(directions_string, target_nutrient):
    try:
        if directions_string == "null":
            return "null"
        for nutrient in directions_string.replace("(", ",").replace(")", "").replace(".", "").strip().split(","):
            # Return the leading amount of the first matching entry
            if target_nutrient in nutrient.strip():
                return nutrient.strip().split(" ")[0]
        # Nutrient not listed for this recipe
        return "null"
    except Exception:
        return "null"
######################################################
######################################################
# Create the TSV file and write its header row
def csvCreate(final_filename):
    with open(final_filename, "w") as f:
        f.write("site\trecipe_addr\tingredients\tdirections\tcalories\ttotal_fat\tsaturated_fat\tcholesterol\tsodium\tcarbohydrate\tprotein\tsugars\tfiber\n")

# Append one recipe row to the TSV
def writeToCsv(final_filename, site, recipe_addr, ingredients, directions, calories, total_fat, saturated_fat, cholesterol, sodium, carbohydrate, protein, sugars, fiber, row):
    try:
        with open(final_filename, "a") as f:
            f.write("\t".join([site, recipe_addr, ingredients, directions, calories, total_fat, saturated_fat, cholesterol, sodium, carbohydrate, protein, sugars, fiber]) + "\n")
    except Exception:
        # Log the failing URL rather than aborting the whole run
        with open("tasteOfHomeRecipes.error.log", "a") as error_file:
            error_file.write(",".join([site, row.split(",")[1]]) + "\n")

if __name__ == "__main__":
    getRecipesList(output_filename)
    tasteOfHomeCsvMake(input_list, final_filename)
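The TSV stores the ingredients and directions columns as JSON strings, so they need to be decoded on read. A minimal sketch of consuming the output with only the standard library, assuming a completed run has produced tasteOfHomeRecipes.tsv:

import csv
import json

# Read the TSV produced above; ingredients and directions are JSON-encoded cells
with open("tasteOfHomeRecipes.tsv", newline="") as f:
    reader = csv.DictReader(f, delimiter="\t")
    for recipe in reader:
        # json.dumps turned the int keys into strings, e.g. {"MAIN": {"0": ...}}
        ingredients = json.loads(recipe["ingredients"])
        directions = json.loads(recipe["directions"])
        print(recipe["recipe_addr"], recipe["calories"], len(ingredients.get("MAIN", {})))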