Last active
May 23, 2020 21:31
-
-
Save alkimo/df36328664a9c2045ea914fbe5bae5fc to your computer and use it in GitHub Desktop.
ScraperYelp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Yelp search scraper.

Fetches Yelp search-result pages for a keyword/location, visits each
business card found, and prints name, address, phone number, card URL and
website to stdout.

Usage:
    python scraper.py -k pizza -l "New York" -s 1 -e 3 -p 2
"""

# Data / scraping libraries
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import sys

# Standard modules
import argparse
import time
from urllib.parse import urlencode

# --- Command-line interface ------------------------------------------------
parser = argparse.ArgumentParser(description="Scrape Yelp search results.")
parser.add_argument("--keyword", "-k", help="search keyword (Yelp find_desc)")
parser.add_argument("--location", "-l", help="search location (Yelp find_loc)")
parser.add_argument("--start_page", "-s", help="first results page to fetch")
parser.add_argument("--end_page", "-e", help="page to stop before (exclusive)")
parser.add_argument("--pause", "-p", help="seconds to sleep between page fetches")
args = parser.parse_args()

# Defaults match the original script's behaviour exactly.
start_page = int(args.start_page) if args.start_page is not None else 0
end_page = int(args.end_page) if args.end_page is not None else start_page + 1
location = args.location if args.location is not None else "None"

# Yelp paginates search results 10 entries at a time via the `start=` param;
# page N corresponds to offset N * 10.
for page in range(start_page, end_page):
    offset = page * 10
    print(offset)

    # Build the search URL with proper percent-encoding of every parameter
    # (the original only hand-replaced spaces with %20).
    query = urlencode(
        {"find_desc": args.keyword, "find_loc": location, "start": offset}
    )
    base_url = f"https://www.yelp.com/search?{query}"

    html = requests.get(base_url).text
    soup = bs(html, "html.parser")

    # The first few <li> elements are navigation chrome; the next ~20 are
    # the actual result cards.
    listings = soup.find_all("li")[5:25]

    for item in listings:
        # NOTE(review): these class names are Yelp's generated CSS-module
        # hashes — they break whenever Yelp redeploys its frontend.
        name = item.find("a", {"class": "lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE"})
        address = item.find("span", {"class": "lemon--span__373c0__3997G raw__373c0__3rcx7"})
        number = item.find("p", {"class": "lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--black-extra-light__373c0__2OyzO text-align--right__373c0__1f0KI text-size--small__373c0__3NVWO"})

        # Skip <li> elements that are not business listings instead of the
        # original's bare `except: pass`.
        if name is None or name.get("href") is None:
            continue

        card_url = "https://www.yelp.com" + name.get("href")
        try:
            card_html = requests.get(card_url).text
        except requests.RequestException as exc:
            # Report and move on rather than silently swallowing the error.
            print(f"Failed to fetch {card_url}: {exc}", file=sys.stderr)
            continue

        card_page = bs(card_html, "html.parser")
        website = card_page.find("a", {"class": "lemon--a__373c0__IEZFH link__373c0__1G70M link-color--blue-dark__373c0__85-Nu link-size--inherit__373c0__1VFlE", "rel": "noopener nofollow"})

        print(name.contents[0])
        # Address / phone number may be absent on some cards.
        print(address.contents[0] if address and address.contents else "Address not provided")
        print(number.contents[0] if number and number.contents else "Number not provided")
        print(card_url)
        print(website.contents[0] if website and website.contents else "Website not provided")
        print("\n")

    print(base_url)

    # --pause was parsed but never used in the original script; honour it
    # here to rate-limit requests to Yelp.
    if args.pause is not None:
        time.sleep(float(args.pause))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment