Skip to content

Instantly share code, notes, and snippets.

View XWilliamY's full-sized avatar

William Yang XWilliamY

View GitHub Profile
from autoscraper import AutoScraper
# Page to scrape -- swap in whichever URL you need.
url = 'https://www.yelp.com/biz/chun-yang-tea-flushing-new-york-flushing'

# Sample item(s) to learn from; autoscraper has to match each entry exactly
# against the page, so copy the text verbatim.
wanted_list = [
    'A review',  # replace with item(s) of interest
]

# Build the scraper from the samples, then collect everything similar on the
# same page. grouped=True asks for the results as a dictionary.
scraper = AutoScraper()
result = scraper.build(url=url, wanted_list=wanted_list)
groups = scraper.get_result_similar(url=url, grouped=True)

Since `groups` is a dictionary, you can list the names of the rules by calling:

groups.keys()

You can then index into the dictionary with one of those rule names to get the items that rule matched.

# 0: import the scraper class
from autoscraper import AutoScraper

# 1: page to scrape, including its query string
url = (
    "https://www.yelp.com/biz/chun-yang-tea-flushing-new-york-flushing"
    "?osq=bubble%20tea&sort_by=date_desc"
)

# 2: one complete review used as the sample item. The pieces below are joined
#    by implicit string concatenation into a single list entry.
wanted_list = [
    (
        "Tried their Brown Sugar milk tea and it was not bad compare to Tiger Sugar. "
        "I prefer this over Tiger Sugar due to the L size option and sweetness content. "
        "It was my to go bubble tea spot for the last two days straight. "
        "Will visit again!"
    ),
]

# 3: (no code for this step appears in this excerpt)
from bs4 import BeautifulSoup
# Open the review page in the browser.
# NOTE(review): `browser` is not created in this snippet; presumably it is a
# Selenium WebDriver built elsewhere (e.g. by setup_browser) -- confirm.
url = "https://www.yelp.com/biz/chun-yang-tea-flushing-new-york-flushing?osq=bubble%20tea&sort_by=date_desc"
browser.get(url)

# Parse the page source the browser currently holds.
page_html = browser.page_source
response = BeautifulSoup(page_html, 'html.parser')

# Collect every <span> carrying this exact pair of CSS classes.
span_classes = 'lemon--span__373c0__3997G raw__373c0__3rKqk'
published = response.find_all('span', class_=span_classes)
from selenium import webdriver
def setup_browser(driver_path=None, headless=False):
    """Create a Chrome WebDriver.

    Args:
        driver_path: Path to the chromedriver executable. When omitted,
            Selenium is left to locate chromedriver on its own.
        headless: When true, pass the ``headless`` argument to Chrome.

    Returns:
        The ``webdriver.Chrome`` instance that was created.
    """
    op = webdriver.ChromeOptions()
    if headless:
        op.add_argument('headless')
    if driver_path:
        # Was spelled `ebdriver`, which raised NameError whenever a path was
        # supplied.
        # NOTE(review): newer Selenium releases expect the driver path to be
        # wrapped in a Service object rather than passed positionally --
        # confirm against the installed Selenium version.
        return webdriver.Chrome(driver_path, options=op)
    # No explicit path: chromedriver has to be discoverable by Selenium, so
    # this depends on where you install chromedriver.
    return webdriver.Chrome(options=op)
# Attributes of a language-detection result.
# NOTE(review): `detected` is not created in this excerpt; presumably it is
# the object returned by the translator's detect call -- confirm.
# 0: returns the detected language
detected.lang
# 1: returns the confidence in the predicted language
detected.confidence

# Attributes of a translation result.
# NOTE(review): `translated` is not created in this excerpt; presumably it is
# the object returned by the translator's translate call -- confirm.
# 0: returns the detected source language
translated.src
# 1: returns the language the text has been translated to
translated.dest
# 2: returns the original text
translated.origin
# 3: returns the translated text
translated.text
#0 import libraries
from googletrans import Translator
import multiprocessing
import workers
# 1: create the Translator object
translator = Translator()

# 2: create a multiprocessing pool with one worker process per CPU reported
#    by multiprocessing.cpu_count()
# NOTE(review): when worker processes are spawned rather than forked, pool
# creation at module level normally has to sit under an
# `if __name__ == "__main__":` guard -- confirm how this script is launched.
cpu_total = multiprocessing.cpu_count()
pool = multiprocessing.Pool(processes=cpu_total)
from googletrans import Translator
# Single shared Translator instance used by google_translate below.
translator = Translator()


def google_translate(text, dest='en'):
    """Translate ``text`` with the module-level ``translator``.

    Args:
        text: The value handed to ``translator.translate`` unchanged.
        dest: Destination language code; defaults to ``'en'``.

    Returns:
        Whatever ``translator.translate`` returns for ``text``.
    """
    translation = translator.translate(text, dest=dest)
    return translation
# NOTE(review): this is an excerpt from inside a method -- it uses `self`,
# `src` and `kwargs`, none of which are defined in the lines shown, and its
# indentation has been lost. Presumably quoted from the translate method of
# googletrans' Translator -- confirm against the library source.
#
# When `text` is a list, each item is translated separately through
# self.translate (same dest/src/kwargs) and the results are returned as a
# list in the same order.
if isinstance(text, list):
result = []
for item in text:
translated = self.translate(item, dest=dest, src=src, **kwargs)
result.append(translated)
return result