Skip to content

Instantly share code, notes, and snippets.

@alkimo
Last active May 23, 2020 21:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alkimo/df36328664a9c2045ea914fbe5bae5fc to your computer and use it in GitHub Desktop.
Save alkimo/df36328664a9c2045ea914fbe5bae5fc to your computer and use it in GitHub Desktop.
ScrapperYielp
# Command-line scraper for Yelp search results.
#
# For every requested results page the script downloads the search page,
# dumps its HTML, then walks a slice of the page's <li> elements and, for each
# one that carries a business link, fetches the business page and prints the
# name, address, phone number, Yelp URL and website.
#
# Usage: <script> -k KEYWORD -l LOCATION [-s START_PAGE] [-e END_PAGE]

# Standard library
import argparse
import sys
from urllib.parse import quote

# Third-party data libraries
from bs4 import BeautifulSoup as bs
import pandas as pd  # NOTE(review): not used by the code below; kept as-is
import requests

YELP_ROOT = 'https://www.yelp.com'
# The `start` query parameter advances by this much from one page to the next.
PAGE_STEP = 10
# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

# CSS class strings used to locate the fields of a listing.
# NOTE(review): these look like build-generated class names and will
# presumably stop matching when the site's markup changes -- confirm.
NAME_CLASS = "lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE"
ADDRESS_CLASS = "lemon--span__373c0__3997G raw__373c0__3rcx7"
NUMBER_CLASS = "lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--black-extra-light__373c0__2OyzO text-align--right__373c0__1f0KI text-size--small__373c0__3NVWO"
WEBSITE_CLASS = "lemon--a__373c0__IEZFH link__373c0__1G70M link-color--blue-dark__373c0__85-Nu link-size--inherit__373c0__1VFlE"


def _fetch_soup(url):
    """Download *url* and return its body parsed with ``html.parser``."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return bs(response.text, 'html.parser')


# Initiate the parser
parser = argparse.ArgumentParser()
# Add long and short arguments
parser.add_argument("--keyword", "-k",
                    help="search term (sent as the find_desc parameter)")
parser.add_argument("--location", "-l", default='None',
                    help="location to search in (sent as the find_loc parameter)")
parser.add_argument("--start_page", "-s", type=int, default=0,
                    help="index of the first results page to fetch (default: 0)")
parser.add_argument("--end_page", "-e", type=int,
                    help="page index to stop before (default: start_page + 1)")
parser.add_argument("--pause", "-p",
                    help="accepted but currently has no effect")
# Read arguments from the command line
args = parser.parse_args()
if args.end_page is None:
    # Without an explicit end, fetch exactly one page.
    args.end_page = args.start_page + 1

# Percent-encode the user-supplied values once, so that characters such as
# '&', '#' or '+' cannot corrupt the query string (spaces still become %20).
# str() keeps a missing keyword rendering as "None", as plain formatting would.
keyword = quote(str(args.keyword), safe='')
location = quote(str(args.location), safe='')

# The counter starts one step *before* the first page because it is printed
# and then advanced at the top of every iteration.
search_query = (args.start_page - 1) * PAGE_STEP
for _ in range(args.end_page - args.start_page):
    print(search_query)
    search_query += PAGE_STEP
    # Yelp search URL built from the parameters given on the CLI
    page_url = (
        f"{YELP_ROOT}/search"
        f"?find_desc={keyword}&find_loc={location}&start={search_query}"
    )
    soup = _fetch_soup(page_url)
    # Skip the first five <li> elements and look at the next twenty only.
    listings = soup.find_all("li")[5:25]
    print(soup.prettify())
    for listing in listings:
        name = listing.find("a", {"class": NAME_CLASS})
        address = listing.find("span", {"class": ADDRESS_CLASS})
        number = listing.find("p", {"class": NUMBER_CLASS})
        if name is None:
            # No business link in this <li>: nothing to report for it.
            continue
        try:
            card_url = YELP_ROOT + name.get('href')
            card_page = _fetch_soup(card_url)
            website = card_page.find(
                'a', {"class": WEBSITE_CLASS, "rel": "noopener nofollow"})
            print(name.contents[0])
            print(address.contents[0])
            print(number.contents[0])
            print(card_url)
            if website is not None and website.contents:
                print(website.contents[0])
            else:
                print("Website not provided")
            print("\n")
        except Exception as exc:
            # Best effort: one malformed or unreachable listing must not stop
            # the run, but report it instead of swallowing it silently.
            # Unlike a bare `except:`, Ctrl-C and SystemExit still propagate.
            print(f"Skipping listing: {exc!r}", file=sys.stderr)
    print(page_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment