Created
September 4, 2020 23:44
-
-
Save magiskboy/efb51ab4a498d4caf57c9a71e4840fcd to your computer and use it in GitHub Desktop.
Clone lucumr.pocoo.org
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import sys | |
import httpx | |
from bs4 import BeautifulSoup | |
# Base URL of the blog being scraped; article hrefs on index pages are
# relative, so they get prefixed with this before fetching.
site = 'https://lucumr.pocoo.org'
def get_next_page_url(page):
    """Return the absolute URL of the next index page, or None.

    *page* is a BeautifulSoup document for a blog index page; the
    "next" link is located via a fixed CSS selector into the
    pagination widget.
    """
    selector = 'body > div > div.body > div.pagination > a:nth-child(3)'
    anchor = page.select_one(selector)
    if anchor is None:
        return None
    return site + anchor.get('href')
def get_article_urls(page):
    """Return a lazy iterable of the relative URLs of every article
    listed on an index *page* (a BeautifulSoup document)."""
    article_link_sel = 'body > div > div.body > div.entry-wrapper > div.entry-overview > div.detail > h1 > a'
    return (anchor.get('href') for anchor in page.select(article_link_sel))
def parse_article(page):
    """Extract metadata from a parsed article *page*.

    Returns a dict with:
      'title'   — the article headline text,
      'date'    — the date line with its "written on " prefix stripped,
      'content' — the raw HTML of the article body as a string.
    """
    title_node = page.select_one('body > div > div.body > h1')
    date_node = page.select_one('body > div > div.body > p.date')
    body_node = page.select_one('body > div.container > div.body')
    return {
        'title': title_node.get_text(),
        'date': date_node.get_text().replace('written on ', ''),
        'content': str(body_node),
    }
async def fetch_article(url):
    """Download one article (relative *url*) and print its title and date.

    Non-200 responses are silently skipped — best-effort per article so
    one bad link does not abort the whole crawl.
    """
    async with httpx.AsyncClient() as client:
        r = await client.get(site + url)
        if r.status_code == 200:
            # Name the parser explicitly: BeautifulSoup(r.text) with no
            # parser emits GuessedAtParserWarning and can build different
            # trees depending on which parsers are installed locally.
            page = BeautifulSoup(r.text, 'html.parser')
            article = parse_article(page)
            print(article['title'], '-', article['date'])
def get_page(current_page):
    """Fetch the index page at *current_page* (absolute URL).

    Returns a tuple ``(article_urls, next_page_url)`` where
    ``article_urls`` is an iterable of relative article links and
    ``next_page_url`` is the absolute URL of the following index page,
    or None when there is no further page.

    Bug fixed: the original fell through on a non-200 response and
    returned None, which made the caller's tuple unpacking
    (``article_urls, page_url = get_page(...)``) raise TypeError.
    Now a non-200 response yields ``([], None)`` so the crawl stops
    cleanly instead of crashing.
    """
    r = httpx.get(current_page)
    if r.status_code != 200:
        return [], None
    # Explicit parser avoids GuessedAtParserWarning / machine-dependent trees.
    page = BeautifulSoup(r.text, 'html.parser')
    return get_article_urls(page), get_next_page_url(page)
if __name__ == '__main__':
    # Crawl the blog index page by page; every article on the current
    # page is fetched concurrently before moving on to the next page.
    page_url = site
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        while page_url:
            article_urls, page_url = get_page(page_url)
            tasks = [fetch_article(u) for u in article_urls]
            loop.run_until_complete(asyncio.gather(*tasks))
    except Exception as err:
        # Top-level boundary: report the failure, then fall through to cleanup.
        print(str(err), file=sys.stderr)
    finally:
        loop.close()
    print('Done')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment