Skip to content

Instantly share code, notes, and snippets.

@benspargo89
Created April 21, 2018 21:42
Show Gist options
  • Save benspargo89/99b9aa5551cd068e4887482f3263f596 to your computer and use it in GitHub Desktop.
Yelp Scraper Python
import requests
from bs4 import BeautifulSoup as soup
import pandas as pd
import time
import json
import time
import urllib.request
def GetReviews(URL):
    """Scrape every review from a Yelp business page and save them to Excel.

    Parameters
    ----------
    URL : str
        A Yelp business page URL, e.g. 'https://www.yelp.com/biz/...'.

    Side effects: writes '<business name>.xls' in the working directory and
    prints progress / elapsed time. Returns None.

    NOTE(review): the CSS class names targeted here match Yelp's 2018 markup
    and will silently find nothing if Yelp changes its page structure.
    """
    start = time.time()
    url_base = URL + '?start='
    all_reviews = []

    # Fetch the first page once (the original fetched it twice: once here and
    # once again on the first loop iteration).
    page_soup = soup(requests.get(URL).content, "html.parser")
    page_marker = page_soup.findAll(
        "span",
        {"class": "pagination-label responsive-hidden-small pagination-links_anchor"})

    # Business name: long names use a variant <h1> class, hence the fallback.
    try:
        company = page_soup.findAll(
            "h1", {"class": "biz-page-title embossed-text-white"})[0].text.strip()
    except IndexError:
        company = page_soup.findAll(
            "h1", {"class": "biz-page-title embossed-text-white shortenough"})[0].text.strip()
    filename = company + '.xls'

    page_number = 20
    while True:
        # Collect ALL reviews on the current page. The original used
        # range(1, len(reviews)), which dropped the first review of every
        # page, and only collected inside the while-loop, which scraped
        # nothing at all for single-page (unpaginated) businesses.
        all_reviews.extend(page_soup.findAll("div", {"class": "review-wrapper"}))
        if len(page_marker) == 0:
            break  # last (or only) page: no pagination marker present
        URL = url_base + str(page_number)
        print("Scraping " + URL)
        page_number = page_number + 20
        page_soup = soup(requests.get(URL).content, "html.parser")
        page_marker = page_soup.findAll(
            "span",
            {"class": "pagination-label responsive-hidden-small pagination-links_anchor"})

    df = pd.DataFrame(all_reviews)
    if df.empty:
        # Guard: df.apply on an empty frame below would raise KeyError.
        print('No reviews found for ' + company)
        return

    # Column 0 holds the raw review <div>; derive the structured columns.
    df['Rating'] = df.apply(
        lambda row: int(row[0].findAll(
            "div", {"class": "biz-rating biz-rating-large clearfix"}
        )[0].div.div.get('title').split(" ")[0][0]),
        axis=1)
    df['Date'] = df.apply(
        lambda row: row[0].findAll("span", {"class": "rating-qualifier"})[0].text.strip(),
        axis=1)
    df['Review'] = df.apply(lambda row: row[0].p.text, axis=1)
    df = df.drop([0], axis=1)  # drop the raw tag column, keep derived ones
    df.to_excel(filename, index=False)

    end = time.time()
    print('Finished in ' + str(end - start)[:4] + ' seconds')
URL = 'https://www.yelp.com/biz/zollikoffee-nashville-2'
def GetCoordinates(URL):
    """Return [latitude, longitude, health_rating] for a Yelp business page.

    Parameters
    ----------
    URL : str
        A Yelp business page URL.

    Returns a 3-element list; health_rating is an int, or the string 'Blank'
    when the listing shows no health score. Sleeps 2 seconds before
    returning to rate-limit batch callers.
    """
    page_soup = soup(requests.get(URL).content, "html.parser")

    # Parse the embedded map JSON once (the original re-fetched and re-parsed
    # the same 'data-map-state' attribute separately for each coordinate).
    map_state = json.loads(
        page_soup.findAll("div", {"class": "lightbox-map hidden"})[0].get('data-map-state'))
    location = map_state["markers"][1]['location']
    latitude = location['latitude']
    longitude = location['longitude']

    # Best-effort health score: the author's commented-out try/except showed
    # this was the intent — listings without a score block previously raised
    # IndexError here.
    try:
        health_rating = int(
            page_soup.findAll('div', {'class': 'score-block custom-result'})[0].text.strip())
    except (IndexError, ValueError):
        health_rating = 'Blank'

    time.sleep(2)  # be polite to Yelp between successive page fetches
    return [latitude, longitude, health_rating]
##############################################################################
URL = 'https://www.yelp.com/biz/etch-restaurant-nashville-5'

##############################################################################
# Script entry point. Guarded so importing this module no longer fires off a
# full scrape; the original also contained dead timing code
# (start = time.time(); print(time.time() - start)) that always printed ~0.
if __name__ == '__main__':
    # Single-business scrape.
    GetReviews(URL)

    # Batch mode: scrape every site listed in the spreadsheet's 'Site' column.
    df = pd.read_excel('NoWait Landing Pages.xlsx')
    sites = df['Site']
    for site in sites:
        GetReviews(site)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment