Last active
August 14, 2023 20:21
-
-
Save XWilliamY/69ab4974bf09d6c51e64d39916fdbd52 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Train an AutoScraper model on one page, keep the rule that matches the
# content of interest, save the model, and reuse it later on another page.
from autoscraper import AutoScraper

# replace with desired url
url = 'https://www.yelp.com/biz/chun-yang-tea-flushing-new-york-flushing'
# make sure that autoscraper can exactly match the items in your wanted_list
wanted_list = ['A review']  # replace with item(s) of interest

# build the scraper
scraper = AutoScraper()
result = scraper.build(url, wanted_list)

# get similar results, and check which rules to keep
groups = scraper.get_result_similar(url, grouped=True)
# Print explicitly: a bare expression only displays its value in a
# REPL/notebook and is silently discarded when this runs as a script.
print(list(groups.keys()))
rule_id = 'rule_io6e'  # replace with rule(s) of interest (one of the keys printed above)
print(groups[rule_id])

# keep rules and save the model to disk
# keep_rules is given a list of rule ids, matching the library's documented
# usage; NOTE(review): a bare string appears to be matched by substring
# rather than by exact id -- confirm against the autoscraper source.
scraper.keep_rules([rule_id])
scraper.save('yelp-reviews')  # replace with desired model name

#-------------------------------------------------------------------------
# using the model later
scraper.load('yelp-reviews')
new_url = ""  # replace with desired url
# Only query once a real URL has been filled in; the empty placeholder
# cannot be fetched.
if new_url:
    print(scraper.get_result_similar(new_url))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment