Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save QuantumCalzone/76efbb6d227ff7f794e19b397726c015 to your computer and use it in GitHub Desktop.
Save QuantumCalzone/76efbb6d227ff7f794e19b397726c015 to your computer and use it in GitHub Desktop.
Scrapes a user's public Yelp bookmarks and exports them to a CSV
# Scrapes a user's public Yelp bookmarks and exports them to a CSV file.
#
# Requires beautifulsoup4: run 'pip install bs4' without quotes in a command
# prompt first on Windows, or 'pip3 install bs4' on Mac.
import csv
import re
from urllib.request import urlopen as uReq

import bs4
from bs4 import BeautifulSoup as soup

# Matches the text sitting between a '>' and the next '<' (without crossing a
# line break), i.e. the text content of the tags in the raw category markup.
pattern = re.compile(r"(?<=\>)(.*?)(?=\<)")
debug = "Done!"

# The "start" query parameter is advanced by this many entries per page.
RESULTS_PER_PAGE = 50
CSV_HEADERS = ["BizName", "Rating", "Reviews", "Catagories", "City"]


def fetch_page(url):
    """Download *url* and return it parsed with BeautifulSoup's html.parser."""
    # The context manager closes the connection even if read() raises.
    with uReq(url) as uClient:
        page_html = uClient.read()
    return soup(page_html, "html.parser")


def extract_categories(rawCatagories):
    """Return the tag texts found in *rawCatagories*, joined with "|".

    "&amp;" is spelled out as "and" and every comma becomes "|", so the
    returned string never contains a comma.
    """
    catagories = ""
    for match in pattern.finditer(rawCatagories):
        if catagories:
            catagories += "|"
        catagories += match.group().replace("&amp;", "and").replace(",", "|")
    return catagories


def parse_bookmark(container):
    """Return (bizName, rating, reviews, catagories, city) for one bookmark row.

    *container* is one "js-bookmark-row" list item. An IndexError is raised
    when one of the looked-up elements is not present in it.
    """
    containerBizName = container.findAll("a", {"class": "biz-name js-analytics-click"})
    bizName = containerBizName[0].span.contents[0]

    containerBizRating = container.findAll("div", {"class": "biz-rating biz-rating-large clearfix"})
    # The rating is taken from the title attribute minus its " star rating" text.
    rating = containerBizRating[0].div["title"].replace(" star rating", "")
    # Remove the plural form first so the singular replacement cannot leave
    # a stray "s" behind.
    reviews = containerBizRating[0].span.contents[0].strip()
    reviews = reviews.replace(" reviews", "").replace(" review", "")

    containerCatagories = container.findAll("span", {"class": "category-str-list"})
    catagories = extract_categories(str(containerCatagories[0]))

    containerCity = container.findAll("small", {"class": "biz-city"})
    city = containerCity[0].contents[0]

    return bizName, rating, reviews, catagories, city


print("\n")
userName = input("Enter is the name of the person who owns this Yelp profile: ")
filename = userName + "sYelpBookmarks.csv"
print("\n")
urlSource = input("Paste thir Yelp bookmark URL: ")
print("\n")
pageAmount = int(input("Enter the amount of pages they have: "))
print("\n")
print("Downloading...")

# newline="" is what the csv module expects from the file object; the explicit
# encoding keeps non-ASCII names from failing under a narrow locale encoding.
# The with-block guarantees the file is closed even if a download or a parse
# step raises part-way through.
with open(filename, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(CSV_HEADERS)

    # Exactly pageAmount requests: start=0, 50, ..., (pageAmount - 1) * 50.
    for pageIndex in range(0, pageAmount * RESULTS_PER_PAGE, RESULTS_PER_PAGE):
        page_soup = fetch_page(urlSource + "&start=" + str(pageIndex))

        for container in page_soup.findAll("li", {"class": "js-bookmark-row"}):
            bizName, rating, reviews, catagories, city = parse_bookmark(container)

            print("bizName: " + bizName)
            print("rating: " + rating)
            print("reviews: " + reviews)
            print("catagories: " + catagories)
            print("city: " + city)
            print("\n")

            # Commas are still turned into "|" to keep the established column
            # format; csv.writer takes care of quotes and embedded newlines.
            # One value per header column - no trailing empty field.
            writer.writerow([
                bizName.replace(",", "|"),
                rating,
                reviews,
                catagories.replace(",", "|"),
                city.replace(",", "|"),
            ])

print(debug)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment