Skip to content

Instantly share code, notes, and snippets.

@AnderRV

AnderRV/tasks.py Secret

Created August 20, 2021 10:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AnderRV/4e06185b0e53da256622a4f85479db14 to your computer and use it in GitHub Desktop.
Save AnderRV/4e06185b0e53da256622a4f85479db14 to your computer and use it in GitHub Desktop.
from celery import Celery
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Celery application backed by a local Redis broker (database 1).
# NOTE(review): the documented constructor parameter is `broker`; here
# `broker_url` is passed as an extra kwarg, which Celery treats as a
# configuration override — confirm this wires up the broker on the
# Celery version in use.
app = Celery('tasks', broker_url='redis://127.0.0.1:6379/1')
@app.task
def crawl(url):
    """Celery task: fetch *url*, parse it, and print the links it contains."""
    page = get_html(url)
    parsed = BeautifulSoup(page, 'html.parser')
    found = extract_links(url, parsed)
    print(found)
def get_html(url):
    """Download *url* and return the raw response body (bytes).

    Best-effort: on any request failure the error is printed and an
    empty string is returned, so the caller can still hand the result
    to BeautifulSoup without crashing.
    """
    try:
        # A timeout is essential here: requests.get has NO default
        # timeout, so a dead host would hang the worker forever.
        response = requests.get(url, timeout=10)
        return response.content
    except requests.RequestException as e:
        # Narrowed from `except Exception`: only network/HTTP-layer
        # failures are expected and safe to swallow; programming
        # errors should still propagate.
        print(e)
        return ''
def extract_links(url, soup):
return list({
urljoin(url, a.get('href'))
for a in soup.find_all('a')
if a.get('href') and not(a.get('rel') and 'nofollow' in a.get('rel'))
})
if __name__ == '__main__':
    # Guarded entry point: the Celery worker imports this module too, and
    # an unguarded crawl.delay(...) at module level would enqueue a brand
    # new crawl on every import. Only seed the queue when run as a script.
    starting_url = 'https://scrapeme.live/shop/page/1/'
    crawl.delay(starting_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment