Skip to content

Instantly share code, notes, and snippets.

@magiskboy
Created September 4, 2020 23:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save magiskboy/efb51ab4a498d4caf57c9a71e4840fcd to your computer and use it in GitHub Desktop.
Save magiskboy/efb51ab4a498d4caf57c9a71e4840fcd to your computer and use it in GitHub Desktop.
Clone lucumr.pocoo.org
import asyncio
import sys
from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup
# Root URL of the site being scraped: it is the first listing page requested
# and the base for the links found inside fetched pages.
site = 'https://lucumr.pocoo.org'
def get_next_page_url(page):
    """Return the absolute URL of the next listing page, or ``None``.

    Args:
        page: Parsed listing page exposing ``select_one`` (a BeautifulSoup
            document in this script).

    Returns:
        The pagination link's target resolved against ``site``, or ``None``
        when the link is absent or has no usable ``href``.
    """
    next_link_sel = 'body > div > div.body > div.pagination > a:nth-child(3)'
    node = page.select_one(next_link_sel)
    if node is None:
        return None

    href = node.get('href')
    if not href:
        # A missing href would fail on ``site + None`` and an empty one would
        # point back at the first page, restarting the crawl forever; both
        # are treated as "no next page".
        return None

    # urljoin handles root-relative, relative and absolute hrefs alike.
    return urljoin(site, href)
def get_article_urls(page):
    """Collect the link targets of the articles listed on a page.

    Args:
        page: Parsed listing page exposing ``select`` (a BeautifulSoup
            document in this script).

    Returns:
        A list of ``href`` values in the order ``select`` yields them.
        Anchors without an ``href`` are skipped so callers never receive
        ``None``. A list (rather than a one-shot iterator) is returned so
        the result can be inspected or iterated more than once.
    """
    article_link_sel = 'body > div > div.body > div.entry-wrapper > div.entry-overview > div.detail > h1 > a'
    hrefs = (link.get('href') for link in page.select(article_link_sel))
    return [href for href in hrefs if href]
def parse_article(page):
    """Extract title, date and content markup from a parsed article page.

    Args:
        page: Parsed article page exposing ``select_one`` (a BeautifulSoup
            document in this script).

    Returns:
        A dict with the keys ``'title'``, ``'date'`` and ``'content'``.
        ``'date'`` has the ``'written on '`` text removed. ``'content'`` is
        the string form of the matched content node. Any element that is
        not found yields an empty string instead of raising on ``None`` or
        storing the literal text ``'None'``.
    """
    article_title_sel = 'body > div > div.body > h1'
    article_date_sel = 'body > div > div.body > p.date'
    article_content_sel = 'body > div.container > div.body'

    title_node = page.select_one(article_title_sel)
    date_node = page.select_one(article_date_sel)
    content_node = page.select_one(article_content_sel)

    title = title_node.get_text() if title_node is not None else ''
    if date_node is not None:
        date = date_node.get_text().replace('written on ', '')
    else:
        date = ''
    content = str(content_node) if content_node is not None else ''

    return {
        'title': title,
        'date': date,
        'content': content,
    }
async def fetch_article(url):
    """Download one article, parse it and print its title and date.

    Args:
        url: Article link as found on a listing page; it is resolved
            against ``site`` before the request is made.

    Returns:
        The dict produced by ``parse_article``, or ``None`` when the server
        did not answer with HTTP 200 (a note is written to stderr in that
        case instead of dropping the article silently).
    """
    article_url = urljoin(site, url)
    async with httpx.AsyncClient() as client:
        r = await client.get(article_url)

    if r.status_code != 200:
        print('Skipping', article_url, '- HTTP', r.status_code, file=sys.stderr)
        return None

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    page = BeautifulSoup(r.text, 'html.parser')
    article = parse_article(page)
    print(article['title'], '-', article['date'])
    return article
def get_page(current_page):
    """Fetch a listing page and return its article links and successor.

    Args:
        current_page: Absolute URL of the listing page to download.

    Returns:
        A ``(article_urls, next_page_url)`` tuple. ``next_page_url`` is
        ``None`` when there is no further page. If the server does not
        answer with HTTP 200, a note is written to stderr and
        ``([], None)`` is returned so the caller's pagination loop ends
        cleanly instead of failing on an unassigned local.
    """
    r = httpx.get(current_page)
    if r.status_code != 200:
        print('Stopping at', current_page, '- HTTP', r.status_code, file=sys.stderr)
        return [], None

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    page = BeautifulSoup(r.text, 'html.parser')
    return get_article_urls(page), get_next_page_url(page)
async def _crawl(start_url):
    """Walk the listing pages from *start_url*, fetching each page's articles concurrently."""
    page_url = start_url
    while page_url:
        # get_page blocks, but the previous batch has already been awaited,
        # so no other task is waiting on the loop at this point.
        article_urls, page_url = get_page(page_url)
        await asyncio.gather(*(fetch_article(url) for url in article_urls))


if __name__ == '__main__':
    try:
        # asyncio.run owns the loop's lifecycle: it creates the loop, cancels
        # any tasks still pending after a failure, and closes it.
        asyncio.run(_crawl(site))
    except Exception as err:
        # Top-level boundary: report the failure and fall through to 'Done'.
        print(str(err), file=sys.stderr)
    finally:
        print('Done')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment