Created April 21, 2018 at 21:42
-
-
Save benspargo89/99b9aa5551cd068e4887482f3263f596 to your computer and use it in GitHub Desktop.
Yelp Scraper (Python)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup as soup | |
import pandas as pd | |
import time | |
import json | |
import time | |
import urllib.request | |
def GetReviews(URL):
    """Scrape every review from a Yelp business page and save them to Excel.

    Follows the '?start=N' pagination 20 reviews at a time until the page no
    longer shows a pagination marker, then extracts rating, date, and review
    text into a DataFrame and writes it to '<business name>.xls'.

    Parameters:
        URL: the Yelp business page URL, e.g. 'https://www.yelp.com/biz/...'.
    """
    start = time.time()
    url_base = URL + '?start='
    page = requests.get(URL)
    page_soup = soup(page.content, "html.parser")
    page_marker = page_soup.findAll(
        "span",
        {"class": "pagination-label responsive-hidden-small pagination-links_anchor"})
    all_reviews = []
    # The title <h1> gains an extra 'shortenough' class for short names, so
    # try the plain class first and fall back to the short-name variant.
    try:
        company = page_soup.findAll(
            "h1", {"class": "biz-page-title embossed-text-white"})[0].text.strip()
    except IndexError:
        company = page_soup.findAll(
            "h1", {"class": "biz-page-title embossed-text-white shortenough"})[0].text.strip()
    company = company + '.xls'
    page_number = 20
    while len(page_marker) > 0:
        page = requests.get(URL)
        page_soup = soup(page.content, "html.parser")
        page_marker = page_soup.findAll(
            "span",
            {"class": "pagination-label responsive-hidden-small pagination-links_anchor"})
        reviews = page_soup.findAll("div", {"class": "review-wrapper"})
        # BUG FIX: the original looped over range(1, len(reviews)) and silently
        # dropped the first review of every page; collect all of them.
        all_reviews.extend(reviews)
        URL = url_base + str(page_number)
        print("Scraping " + URL)
        page_number = page_number + 20
    df = pd.DataFrame(all_reviews)
    # Column 0 of each row holds the raw review <div>; pull fields out of it.
    # The star rating is encoded in the title attribute, e.g. "4.0 star rating";
    # '.split(" ")[0][0]' takes the leading digit (Yelp ratings are 1-5).
    df['Rating'] = df.apply(
        lambda row: int(row[0].findAll(
            "div", {"class": "biz-rating biz-rating-large clearfix"}
        )[0].div.div.get('title').split(" ")[0][0]), axis=1)
    df['Date'] = df.apply(
        lambda row: row[0].findAll("span", {"class": "rating-qualifier"})[0].text.strip(),
        axis=1)
    df['Review'] = df.apply(lambda row: row[0].p.text, axis=1)
    df = df.drop([0], axis=1)  # drop the raw soup column before export
    df.to_excel(company, index=False)
    end = time.time()
    print('Finished in ' + str(end - start)[:4] + ' seconds')
# NOTE(review): dead assignment — URL is reassigned below (before any use)
# by the driver section; kept only as a sample business page.
URL = 'https://www.yelp.com/biz/zollikoffee-nashville-2'
def GetCoordinates(URL):
    """Return [latitude, longitude, health_rating] for a Yelp business page.

    Coordinates come from the JSON blob embedded in the hidden lightbox map
    element; the health-inspection score comes from the score block.

    Parameters:
        URL: the Yelp business page URL.

    Returns:
        [latitude, longitude, health_rating]; health_rating is the string
        'Blank' when the page has no health-score block.
    """
    page = requests.get(URL)
    page_soup = soup(page.content, "html.parser")
    # Parse the embedded map JSON once instead of reparsing it per field
    # (the original ran findAll + json.loads twice on the same element).
    map_state = json.loads(
        page_soup.findAll("div", {"class": "lightbox-map hidden"})[0].get('data-map-state'))
    location = map_state["markers"][1]['location']
    latitude = location['latitude']
    longitude = location['longitude']
    # Not every business page carries a health score; fall back to 'Blank'
    # instead of crashing (restores the commented-out handler's intent).
    try:
        health_rating = int(
            page_soup.findAll('div', {'class': 'score-block custom-result'})[0].text.strip())
    except IndexError:
        health_rating = 'Blank'
    time.sleep(2)  # be polite between successive requests
    return [latitude, longitude, health_rating]
##############################################################################
# Driver: scrape one hard-coded business page, then every site listed in the
# 'Site' column of the 'NoWait Landing Pages.xlsx' spreadsheet.
URL = 'https://www.yelp.com/biz/etch-restaurant-nashville-5'
##URL = 'https://www.yelp.com/biz/zollikoffee-nashville-2'
start = time.time()
# NOTE(review): this prints ~0.0 — the timer is read immediately after being
# started; GetReviews already reports its own elapsed time internally.
print(time.time() - start)
GetReviews(URL)
df = pd.read_excel('NoWait Landing Pages.xlsx')
sites = df['Site']
for site in sites:
    GetReviews(site)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.