Gets all publicly available recipes from tasteofhome.com and compiles them into a TSV file. Includes nutrition facts.
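Running the script end to end first calls getRecipesList(), which builds tasteOfHome.csv (one site,recipe-URL pair per line), and then tasteOfHomeCsvMake(), which fetches each recipe page and writes tasteOfHomeRecipes.tsv; failed fetches are logged to tasteOfHomeRecipes.error.log. Note that the URL file is opened in append mode, so delete it before re-running to avoid duplicate rows.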
import requests as r
import bs4 as bs
import json

# Configuration
input_list = "tasteOfHome.csv"
base_url = "https://www.tasteofhome.com/recipes/page/"
site = "https://www.tasteofhome.com"
output_filename = "tasteOfHome.csv"        # URL CSV written by getRecipesList()
final_filename = "tasteOfHomeRecipes.tsv"  # Recipe TSV written by tasteOfHomeCsvMake()

######################################################
######################################################
def getRecipesList(output_filename):
    # Highest page number found in the pagination links
    page_num_max = 0
    # Current results page
    appended_num = 1
    ######################################################
    # On the first pass, discover the total number of result pages
    if appended_num < 2:
        initial_url = base_url + str(appended_num)
        init_request = r.get(initial_url)
        soup = bs.BeautifulSoup(init_request.content, "lxml")
        # Collect the text of every pagination link
        page_num_list = []
        for number in soup.find_all("a", {"class": "page-numbers"}):
            page_num_list.append(number.text)
        page_num_list = [x.replace(",", "") for x in page_num_list]
        page_num_check_list = []
        # Drop non-numeric entries such as "Next"
        for item in page_num_list:
            try:
                int(item)
                page_num_check_list.append(item)
            except ValueError:
                pass
        # Convert to ints and take the largest as the iteration target
        page_num_check_list = [int(x) for x in page_num_check_list]
        page_num_max = max(page_num_check_list)
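        # Hypothetical example: pagination link texts ["2", "3", "…", "1,234"]
        # reduce to [2, 3, 1234], so page_num_max becomes 1234.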
    ######################################################
    # Iterate until we reach the max page number
    while appended_num <= page_num_max:
        # Count of recipe links found on this page
        total_links_fetched_this_page = 0
        # Fetch and parse this results page
        get_url = base_url + str(appended_num)
        request = r.get(get_url)
        soup = bs.BeautifulSoup(request.content, "lxml")
        # Append every recipe link on this page to the URL file,
        # one "site,recipe_url" pair per line
        with open(output_filename, "a") as f:
            for recipe in soup.find_all("li", {"class": "single-recipe"}):
                f.write(site + "," + recipe.a.get("href") + "\n")
                total_links_fetched_this_page += 1
        # Progress output so we know it's working
        print(f"Page {appended_num} complete - {total_links_fetched_this_page} fetched")
        appended_num += 1
######################################################
######################################################
def tasteOfHomeCsvMake(input_filename, final_filename):
    # Create the output TSV with its header row
    csvCreate(final_filename)
    # Read the URL list produced by getRecipesList()
    with open(input_filename, "r") as input_file:
        for row in input_file:
            try:
                # The second CSV column holds the recipe URL
                request = r.get(row.split(",")[1].strip())
                soup = bs.BeautifulSoup(request.content, "lxml")
                ######################################################
                # Collect ingredients into a JSON-serializable dict
                ingredients = {}
                for ingredient_list in soup.find_all("ul", {"class": "recipe-ingredients__list"}):
                    all_ingredients = ingredient_list.find_all("li")
                    # Index by position so ingredient order is preserved
                    ingredients["MAIN"] = {}
                    for i in range(len(all_ingredients)):
                        if ":" not in all_ingredients[i].text:
                            ingredients["MAIN"][i] = all_ingredients[i].text
                            continue
                        # An item ending in ":" is a sub-recipe header; every
                        # item after it belongs to that sub-recipe
                        sub_recipe = all_ingredients[i].text.replace(":", "")
                        ingredients[sub_recipe] = {}
                        sub_recipe_ingredients = all_ingredients[i + 1:]
                        for t in range(len(sub_recipe_ingredients)):
                            ingredients[sub_recipe][t] = sub_recipe_ingredients[t].text
                        break
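                # Hypothetical example of the resulting structure for a recipe
                # whose ingredient list contains one sub-recipe header ("FROSTING:"):
                #   {"MAIN": {0: "1 cup sugar", 1: "2 large eggs"},
                #    "FROSTING": {0: "1/2 cup butter", 1: "2 cups confectioners' sugar"}}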
                ######################################################
                # Collect directions the same way (the site does not appear
                # to use sub-directions)
                directions = {}
                all_directions = soup.find_all("li", {"class": "recipe-directions__item"})
                for i in range(len(all_directions)):
                    directions[i] = all_directions[i].find("span").text.strip()
                ######################################################
                # Parse the nutrition-facts block; fall back to "null" when
                # a recipe has no such block
                try:
                    nutrition_string = soup.find("div", {"class": "recipe-nutrition-facts"}).text
                    # Drop the serving-size prefix before the first ":"
                    facts = ",".join(nutrition_string.split(":")[1:])
                    calories = factsFinder(facts, "calories")
                    total_fat = factsFinder(facts, "fat")
                    cholesterol = factsFinder(facts, "cholesterol")
                    sodium = factsFinder(facts, "sodium")
                    carbohydrate = factsFinder(facts, "carbohydrate")
                    protein = factsFinder(facts, "protein")
                    saturated_fat = factsFinder(facts, "saturated")
                    sugars = factsFinder(facts, "sugars")
                    fiber = factsFinder(facts, "fiber")
                except AttributeError:
                    calories = total_fat = cholesterol = sodium = "null"
                    carbohydrate = protein = saturated_fat = sugars = fiber = "null"
            except Exception:
                # Log the failing URL and move on to the next recipe
                with open("tasteOfHomeRecipes.error.log", "a") as error_file:
                    error_file.write(",".join([site, row.split(",")[1]]) + "\n")
                continue
            writeToCsv(final_filename, site.strip(), row.split(",")[1].strip(),
                       json.dumps(ingredients), json.dumps(directions),
                       calories, total_fat, saturated_fat, cholesterol, sodium,
                       carbohydrate, protein, sugars, fiber, row)
######################################################
######################################################
# Pull a single nutrient value out of the nutrition-facts string
# built in tasteOfHomeCsvMake() above
def factsFinder(directions_string, target_nutrient):
    try:
        if directions_string == "null":
            return "null"
        for nutrient in directions_string.replace("(", ",").replace(")", "").replace(".", "").strip().split(","):
            # Return the leading amount of the first matching entry
            if target_nutrient in nutrient.strip():
                return nutrient.strip().split(" ")[0]
        # Nutrient not listed for this recipe
        return "null"
    except Exception:
        return "null"
######################################################
######################################################
# Create the TSV file and write its header row
def csvCreate(final_filename):
    with open(final_filename, "w") as f:
        f.write("site\trecipe_addr\tingredients\tdirections\tcalories\ttotal_fat\tsaturated_fat\tcholesterol\tsodium\tcarbohydrate\tprotein\tsugars\tfiber\n")

# Append one recipe row to the TSV
def writeToCsv(final_filename, site, recipe_addr, ingredients, directions, calories, total_fat, saturated_fat, cholesterol, sodium, carbohydrate, protein, sugars, fiber, row):
    try:
        with open(final_filename, "a") as f:
            f.write("\t".join([site, recipe_addr, ingredients, directions, calories, total_fat, saturated_fat, cholesterol, sodium, carbohydrate, protein, sugars, fiber]) + "\n")
    except Exception:
        # Log the failing URL rather than aborting the whole run
        with open("tasteOfHomeRecipes.error.log", "a") as error_file:
            error_file.write(",".join([site, row.split(",")[1]]) + "\n")

if __name__ == "__main__":
    getRecipesList(output_filename)
    tasteOfHomeCsvMake(input_list, final_filename)
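The TSV stores the ingredients and directions columns as JSON strings, so they need to be decoded on read. A minimal sketch of consuming the output with only the standard library, assuming a completed run has produced tasteOfHomeRecipes.tsv:

import csv
import json

# Read the TSV produced above; ingredients and directions are JSON-encoded cells
with open("tasteOfHomeRecipes.tsv", newline="") as f:
    reader = csv.DictReader(f, delimiter="\t")
    for recipe in reader:
        # json.dumps turned the int keys into strings, e.g. {"MAIN": {"0": ...}}
        ingredients = json.loads(recipe["ingredients"])
        directions = json.loads(recipe["directions"])
        print(recipe["recipe_addr"], recipe["calories"], len(ingredients.get("MAIN", {})))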