William Yang XWilliamY

## full_autoscraper_example.py
from autoscraper import AutoScraper
# replace with desired url (this is the value handed to scraper.build below)
url = 'https://www.yelp.com/biz/chun-yang-tea-flushing-new-york-flushing'
# make sure that autoscraper can exactly match the items in your wanted_list
wanted_list = ['A review']     # replace with item(s) of interest

# build the scraper: create an AutoScraper, then call build() with the url
# and the wanted_list; whatever build() returns is kept in `result`
scraper = AutoScraper()
result = scraper.build(url, wanted_list)

## finetune_selections.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                XWilliamY
                / finetune_selections.md
            
            
              Last active
              September 10, 2020 15:55
            
          
    groups = scraper.get_result_similar(url, grouped=True)
Since `groups` is a dictionary, you can get the names of the rules by calling
groups.keys()
You can then index into the dictionary with a particular rule name to see the results that rule matched.

  
## autoscraping.py
#0 import the AutoScraper class
from autoscraper import AutoScraper

#1 page to scrape; the query string asks for sort_by=date_desc
url = "https://www.yelp.com/biz/chun-yang-tea-flushing-new-york-flushing?osq=bubble%20tea&sort_by=date_desc"

#2 one sample item of interest, given as a single-element list
#  (the string reads like the full text of one review)
wanted_list = ['Tried their Brown Sugar milk tea and it was not bad compare to Tiger Sugar. I prefer this over Tiger Sugar due to the L size option and sweetness content. It was my to go bubble tea spot for the last two days straight. Will visit again!']

#3 (the code for this step is not shown in this excerpt)

## find_data.py
from bs4 import BeautifulSoup
url = "https://www.yelp.com/biz/chun-yang-tea-flushing-new-york-flushing?osq=bubble%20tea&sort_by=date_desc"
# NOTE(review): `browser` is not defined in this snippet; presumably a
# Selenium WebDriver created beforehand — confirm against the caller
browser.get(url)

# parse the page source held by the browser using the 'html.parser' backend
response = BeautifulSoup(browser.page_source, 'html.parser')
# collect every <span> selected by this class_ filter
# NOTE(review): the class names look auto-generated — verify they still exist on the page
published = response.find_all('span', class_='lemon--span__373c0__3997G raw__373c0__3rKqk')

## headless.py
from selenium import webdriver

def setup_browser(driver_path=None, headless=False):
    """Create a Chrome WebDriver, optionally in headless mode.

    Args:
        driver_path: Path to the chromedriver executable. When falsy,
            Selenium is left to locate the driver on its own.
        headless: When true, the 'headless' argument is added to the
            Chrome options before the browser is started.

    Returns:
        The started ``webdriver.Chrome`` instance.
    """
    op = webdriver.ChromeOptions()
    if headless:
        op.add_argument('headless')

    if driver_path:
        # previously spelled `ebdriver`, which raised NameError on this path
        return webdriver.Chrome(driver_path, options=op)
    # this depends on where you install chromedriver: with no explicit path
    # the driver has to be discoverable by Selenium itself (e.g. on PATH).
    # Previously this path fell off the end and returned None.
    return webdriver.Chrome(options=op)

## detected.py
# NOTE(review): `detected` is not defined in this snippet; presumably the
# object returned by a googletrans language-detection call — confirm
# 0: returns the detected language
detected.lang

# 1: returns the confidence in the predicted language
detected.confidence

## translated.py
# NOTE(review): `translated` is not defined in this snippet; presumably the
# object returned by a googletrans translate call — confirm
# 0: returns the detected source language
translated.src

# 1: returns the language the text has been translated to
translated.dest

# 2: returns the original text
translated.origin

# 3: returns the translated text (the attribute access is not shown in this excerpt)

## multiprocessing_in_nb.py
#0 import libraries
from googletrans import Translator
import multiprocessing
# NOTE(review): `workers` is presumably the local workers.py module — confirm
import workers

#1 create Translator object
translator = Translator()

#2 create multiprocessing pool with as many worker processes as
#  multiprocessing.cpu_count() reports
pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())

## workers.py
from googletrans import Translator
translator = Translator()


def google_translate(text, dest='en'):
    """Translate *text* using the module-level ``translator``.

    ``dest`` is forwarded as the target language (``'en'`` by default) and
    whatever ``translator.translate`` returns is handed back unchanged.
    """
    outcome = translator.translate(text, dest=dest)
    return outcome

## closer_look_at_translate.py
# Excerpt from inside a method (note `self` and `return`); `text`, `dest`,
# `src` and `kwargs` come from the enclosing scope, which is not shown here.
# A list input is handled one element at a time.
if isinstance(text, list):
    result = []
    for item in text:
        # every item gets its own self.translate call
        translated = self.translate(item, dest=dest, src=src, **kwargs)
        result.append(translated)
    # one result per input item, in input order
    return result
	from autoscraper import AutoScraper
	# replace with desired url (this is the value handed to scraper.build below)
	url = 'https://www.yelp.com/biz/chun-yang-tea-flushing-new-york-flushing'
	# make sure that autoscraper can exactly match the items in your wanted_list
	wanted_list = ['A review'] # replace with item(s) of interest

	# build the scraper: create an AutoScraper, then call build() with the url
	# and the wanted_list; whatever build() returns is kept in `result`
	scraper = AutoScraper()
	result = scraper.build(url, wanted_list)
	#0 import the AutoScraper class
	from autoscraper import AutoScraper

	#1 page to scrape; the query string asks for sort_by=date_desc
	url = "https://www.yelp.com/biz/chun-yang-tea-flushing-new-york-flushing?osq=bubble%20tea&sort_by=date_desc"

	#2 one sample item of interest, given as a single-element list
	wanted_list = ['Tried their Brown Sugar milk tea and it was not bad compare to Tiger Sugar. I prefer this over Tiger Sugar due to the L size option and sweetness content. It was my to go bubble tea spot for the last two days straight. Will visit again!']

	#3 (the code for this step is not shown in this excerpt)
	from bs4 import BeautifulSoup
	url = "https://www.yelp.com/biz/chun-yang-tea-flushing-new-york-flushing?osq=bubble%20tea&sort_by=date_desc"
	# NOTE(review): `browser` is not defined in this snippet; presumably a
	# Selenium WebDriver created beforehand — confirm against the caller
	browser.get(url)

	# parse the page source held by the browser using the 'html.parser' backend
	response = BeautifulSoup(browser.page_source, 'html.parser')
	# collect every <span> selected by this class_ filter
	published = response.find_all('span', class_='lemon--span__373c0__3997G raw__373c0__3rKqk')
	from selenium import webdriver

	def setup_browser(driver_path=None, headless=False):
	    """Create a Chrome WebDriver, optionally in headless mode.

	    Args:
	        driver_path: Path to the chromedriver executable. When falsy,
	            Selenium is left to locate the driver on its own.
	        headless: When true, the 'headless' argument is added to the
	            Chrome options before the browser is started.

	    Returns:
	        The started ``webdriver.Chrome`` instance.
	    """
	    op = webdriver.ChromeOptions()
	    if headless:
	        op.add_argument('headless')

	    if driver_path:
	        # previously spelled `ebdriver`, which raised NameError on this path
	        return webdriver.Chrome(driver_path, options=op)
	    # this depends on where you install chromedriver: with no explicit path
	    # the driver has to be discoverable by Selenium itself (e.g. on PATH).
	    # Previously this path fell off the end and returned None.
	    return webdriver.Chrome(options=op)
	# NOTE(review): `detected` is not defined in this snippet; presumably the
	# object returned by a googletrans language-detection call — confirm
	# 0: returns the detected language
	detected.lang

	# 1: returns the confidence in the predicted language
	detected.confidence
	# NOTE(review): `translated` is not defined in this snippet; presumably the
	# object returned by a googletrans translate call — confirm
	# 0: returns the detected source language
	translated.src

	# 1: returns the language the text has been translated to
	translated.dest

	# 2: returns the original text
	translated.origin

	# 3: returns the translated text (the attribute access is not shown in this excerpt)
	#0 import libraries
	from googletrans import Translator
	import multiprocessing
	# NOTE(review): `workers` is presumably the local workers.py module — confirm
	import workers

	#1 create Translator object
	translator = Translator()

	#2 create multiprocessing pool with as many worker processes as
	#  multiprocessing.cpu_count() reports
	pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
	from googletrans import Translator
	translator = Translator()


	def google_translate(text, dest='en'):
	    """Translate *text* using the module-level ``translator``.

	    ``dest`` is forwarded as the target language (``'en'`` by default) and
	    whatever ``translator.translate`` returns is handed back unchanged.
	    """
	    # the body was not indented under the def, which is a syntax error
	    return translator.translate(text, dest=dest)
	if isinstance(text, list):
	result = []
	for item in text:
	translated = self.translate(item, dest=dest, src=src, **kwargs)
	result.append(translated)
	return result