Created
September 4, 2020 23:44
-
-
Save magiskboy/efb51ab4a498d4caf57c9a71e4840fcd to your computer and use it in GitHub Desktop.
Clone lucumr.pocoo.org
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import sys | |
import httpx | |
from bs4 import BeautifulSoup | |
# Base URL of the blog being scraped; article hrefs on index pages are
# relative, so they get prefixed with this before fetching.
site = 'https://lucumr.pocoo.org'
def get_next_page_url(page):
    """Return the absolute URL of the next index page, or None.

    *page* is a BeautifulSoup document for a blog index page; the
    "next" link is located via a fixed CSS selector into the
    pagination widget.
    """
    selector = 'body > div > div.body > div.pagination > a:nth-child(3)'
    anchor = page.select_one(selector)
    if anchor is None:
        return None
    return site + anchor.get('href')
def get_article_urls(page):
    """Return a lazy iterable of the relative URLs of every article
    listed on an index *page* (a BeautifulSoup document)."""
    article_link_sel = 'body > div > div.body > div.entry-wrapper > div.entry-overview > div.detail > h1 > a'
    return (anchor.get('href') for anchor in page.select(article_link_sel))
def parse_article(page):
    """Extract metadata from a parsed article *page*.

    Returns a dict with:
      'title'   — the article headline text,
      'date'    — the date line with its "written on " prefix stripped,
      'content' — the raw HTML of the article body as a string.
    """
    title_node = page.select_one('body > div > div.body > h1')
    date_node = page.select_one('body > div > div.body > p.date')
    body_node = page.select_one('body > div.container > div.body')
    return {
        'title': title_node.get_text(),
        'date': date_node.get_text().replace('written on ', ''),
        'content': str(body_node),
    }
async def fetch_article(url):
    """Download one article (relative *url*) and print its title and date.

    Non-200 responses are silently skipped — best-effort per article so
    one bad link does not abort the whole crawl.
    """
    async with httpx.AsyncClient() as client:
        r = await client.get(site + url)
        if r.status_code == 200:
            # Name the parser explicitly: BeautifulSoup(r.text) with no
            # parser emits GuessedAtParserWarning and can build different
            # trees depending on which parsers are installed locally.
            page = BeautifulSoup(r.text, 'html.parser')
            article = parse_article(page)
            print(article['title'], '-', article['date'])
def get_page(current_page):
    """Fetch the index page at *current_page* (absolute URL).

    Returns a tuple ``(article_urls, next_page_url)`` where
    ``article_urls`` is an iterable of relative article links and
    ``next_page_url`` is the absolute URL of the following index page,
    or None when there is no further page.

    Bug fixed: the original fell through on a non-200 response and
    returned None, which made the caller's tuple unpacking
    (``article_urls, page_url = get_page(...)``) raise TypeError.
    Now a non-200 response yields ``([], None)`` so the crawl stops
    cleanly instead of crashing.
    """
    r = httpx.get(current_page)
    if r.status_code != 200:
        return [], None
    # Explicit parser avoids GuessedAtParserWarning / machine-dependent trees.
    page = BeautifulSoup(r.text, 'html.parser')
    return get_article_urls(page), get_next_page_url(page)
if __name__ == '__main__':
    # Crawl the blog index page by page; every article on the current
    # page is fetched concurrently before moving on to the next page.
    page_url = site
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        while page_url:
            article_urls, page_url = get_page(page_url)
            tasks = [fetch_article(u) for u in article_urls]
            loop.run_until_complete(asyncio.gather(*tasks))
    except Exception as err:
        # Top-level boundary: report the failure, then fall through to cleanup.
        print(str(err), file=sys.stderr)
    finally:
        loop.close()
    print('Done')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment