Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@Yankim
Last active May 5, 2021 16:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Yankim/04a52d99942962176ee77690467f018e to your computer and use it in GitHub Desktop.
Save Yankim/04a52d99942962176ee77690467f018e to your computer and use it in GitHub Desktop.
def _ascii_only(text):
    """Return *text* with every non-ASCII character dropped.

    The encode/decode round-trip yields text (not bytes) on both Python 2 and
    Python 3, so the stored value is never a ``"b'...'"`` repr or a binary blob.
    """
    return text.encode('ascii', 'ignore').decode('ascii')


def _first_group(pattern, text):
    """Return the first capture of *pattern* found in *text*, as a ``str``.

    Raises IndexError when nothing matches and TypeError when *text* is not a
    string; callers rely on ``_or_na`` to turn either into 'NA'.
    """
    return str(re.findall(pattern, text)[0])


def _or_na(extract):
    """Call *extract* and return its result, or 'NA' if it raises.

    Only ``Exception`` is caught, so KeyboardInterrupt / SystemExit still
    propagate and a long scraping run can be interrupted.
    """
    try:
        return extract()
    except Exception:
        return 'NA'


def scrape_recipe(br, year, idnumber):
    """Scrape the recipe page currently loaded in *br* and store it in MongoDB.

    Every field is extracted independently; a field that cannot be read is
    recorded as the string 'NA' so that one missing element never aborts the
    whole recipe.

    Args:
        br: browser object exposing the ``find_element(s)_by_*`` methods used
            below (presumably a Selenium WebDriver -- confirm against caller).
        year: value stored under the 'year' key of every inserted document.
        idnumber: value stored under the 'idnumber' key of every inserted
            document.

    Returns:
        None. One document per ingredient is inserted into the module-level
        ``collection2`` and one recipe document into the module-level
        ``collection``.
    """
    def _class_text(class_name):
        # Visible text of the first element carrying the given CSS class.
        return br.find_element_by_class_name(class_name).text

    def _duration(itemprop):
        # Part of the <time datetime="PT..."> attribute that follows "PT".
        node = br.find_element_by_xpath('//time[@itemprop = "%s"]' % itemprop)
        return _first_group(r'PT(\w+)', node.get_attribute('datetime'))

    # Recipe title
    rtitle = _or_na(lambda: br.find_element_by_tag_name('h1').text)
    # Star rating, stored exactly as returned by the attribute lookup
    starrating = _or_na(
        lambda: br.find_element_by_class_name('rating-stars')
        .get_attribute('data-ratingstars'))
    # Number of people who clicked that they "made it"
    madeitcount = _or_na(lambda: _class_text('made-it-count'))
    # Number of reviews
    reviewcount = _or_na(
        lambda: _first_group(r'(\w+) reviews', _class_text('review-count')))
    # Calories per serving
    calcount = _or_na(
        lambda: _first_group(r'(\w+) cals', _class_text('calorie-count')))
    # Prep, cook and total times
    prepTime = _or_na(lambda: _duration('prepTime'))
    cookTime = _or_na(lambda: _duration('cookTime'))
    totalTime = _or_na(lambda: _duration('totalTime'))

    # Collect the text of every ingredient element except the last one.
    # NOTE(review): the original code also skipped the final match; presumably
    # it is a non-ingredient row on the page -- confirm before changing.
    ingred = br.find_elements_by_class_name("checkList__item")
    ingredients = [_ascii_only(item.text) for item in ingred[:-1]]

    # Update MongoDB with one entry per ingredient
    for ingr in ingredients:
        collection2.insert(
            {'idnumber': idnumber, 'year': year, 'ingredient': ingr})

    # Update MongoDB with the recipe entry
    collection.insert({
        'idnumber': idnumber,
        'year': year,
        'recipe_title': _ascii_only(rtitle),
        'star_rating': starrating,
        'made_it_count': madeitcount,
        'review_count': reviewcount,
        'cal_count': calcount,
        'prep_time': prepTime,
        'cook_time': cookTime,
        'total_time': totalTime,
    })
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment