Skip to content

Instantly share code, notes, and snippets.

@AnderRV
Created August 20, 2021 10:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AnderRV/05702a70a0368a51d826acce8faf08d5 to your computer and use it in GitHub Desktop.
Save AnderRV/05702a70a0368a51d826acce8faf08d5 to your computer and use it in GitHub Desktop.
from redis import Redis
# ...
# Module-level Redis client shared by the functions below; selects logical
# database 1 and leaves every other connection setting at the library default.
connection = Redis(db=1)
@app.task
def crawl(url):
    """Fetch *url*, queue the links found on it, and mark it as visited.

    The URL is added to the ``crawling:queued`` set before the page is
    fetched and moved to ``crawling:visited`` after its links have been
    processed. Each extracted link that passes ``allow_url_filter`` and has
    not been ``seen`` is handed to ``add_to_visit``.

    Args:
        url: Address of the page to download and scan for links.

    Note:
        There is no cleanup on failure: if fetching, parsing or queueing
        raises, the URL remains in ``crawling:queued``.
    """
    connection.sadd('crawling:queued', url)  # add URL to set
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    links = extract_links(url, soup)
    for link in links:
        if allow_url_filter(link) and not seen(link):
            print('Add URL to visit queue', link)
            add_to_visit(link)

    # atomically move a URL from queued to visited
    connection.smove('crawling:queued', 'crawling:visited', url)
def allow_url_filter(url):
    """Return True when *url* is eligible for crawling.

    A URL is accepted only if it contains the ``/shop/page/`` path fragment
    and does not contain a ``#`` character.

    Args:
        url: The URL string to check.

    Returns:
        bool: True if both conditions hold, otherwise False.
    """
    return '/shop/page/' in url and '#' not in url
def seen(url):
    """Return a truthy value if *url* is already visited or queued.

    Checks membership in the ``crawling:visited`` set first; the
    ``crawling:queued`` set is only consulted when that first check is falsy.

    Args:
        url: The URL to look up.

    Returns:
        The result of the first truthy ``sismember`` call, or the (falsy)
        result of the second one when the URL is in neither set.
    """
    return (
        connection.sismember('crawling:visited', url)
        or connection.sismember('crawling:queued', url)
    )
def add_to_visit(url):
    """Append *url* to the ``crawling:to_visit`` list if it is not in it yet.

    Args:
        url: The URL to schedule for a later visit.

    Note:
        The presence check and the push are two separate Redis commands, so
        the pair is not atomic.
    """
    # LPOS is issued as a raw command because, per the original note, it is
    # not available as a method in the Redis library in use. A None reply
    # means the URL was not found in the list.
    if connection.execute_command('LPOS', 'crawling:to_visit', url) is None:
        connection.rpush('crawling:to_visit', url)  # add URL to the end of the list
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment