U-1F992/re_scraping.py

## re_scraping.py
import re
from urllib import request


def get_document(url: str) -> str:
    """Fetch *url* and return the response body decoded to text.

    The charset declared in the response's ``Content-Type`` header is used
    for decoding. UTF-8 is the fallback when the header declares no charset
    or names a codec Python does not know.

    Args:
        url: Passed unchanged as the first argument of
            ``urllib.request.urlopen``.

    Returns:
        The decoded response body.

    Raises:
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    with request.urlopen(url) as res:
        raw = res.read()
        # Honour the server-declared charset instead of assuming UTF-8.
        charset = res.headers.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset)
    except LookupError:
        # The header named a codec Python has no decoder for.
        return raw.decode("utf-8")


def get_article_urls(doc: str) -> list[str]:
    """Extract absolute article URLs from the HTML of an article-list page.

    Only the ``<section class="mb-64" id="article-tile">`` element is
    inspected. Within it, each ``<li>`` contributes a URL when it contains
    exactly one anchor whose ``href`` starts with
    ``//mainichi.jp/maisho/articles/``.

    Args:
        doc: HTML text of the list page.

    Returns:
        The matched URLs, each prefixed with ``https:``, in document order.
        An empty list is returned unless exactly one article-tile section
        is found.
    """
    sections = re.findall(
        r'<section class="mb-64" id="article-tile">.+?</section>', doc, re.DOTALL)
    if len(sections) != 1:
        return []

    urls: list[str] = []
    for item in re.findall(r'<li>.+?</li>', sections[0], re.DOTALL):
        # Dots are escaped so only the literal host matches, and the value
        # stops at the closing quote so attributes following ``href`` are
        # never swallowed into the URL.
        links = re.findall(
            r'<a href="(//mainichi\.jp/maisho/articles/[^"]+)"', item)
        if len(links) == 1:
            urls.append("https:" + links[0])

    return urls


def get_article(doc: str) -> tuple[str, str]:
    """Extract the title and plain-text body from an article page's HTML.

    The title is the content of ``<h1 class="title-page">``. The body is the
    text of every ``<p>`` inside
    ``<section class="articledetail-body" id="articledetail-body">``, one
    paragraph per line, with ``<ruby>`` annotations reduced to their
    ``<rb>`` base text.

    Args:
        doc: HTML text of the article page.

    Returns:
        ``(title, body)``. ``("", "")`` is returned unless exactly one title
        heading is found; ``(title, "")`` is returned unless exactly one
        body section is found.
    """
    titles = re.findall(
        r'(?<=<h1 class="title-page">).+?(?=</h1>)', doc, re.DOTALL)
    if len(titles) != 1:
        return "", ""

    sections = re.findall(
        r'<section class="articledetail-body" id="articledetail-body">.+?</section>',
        doc, re.DOTALL)
    if len(sections) != 1:
        return titles[0], ""

    # Capture the paragraph text directly. DOTALL lets a paragraph span
    # several lines without leaving its <p> tags behind.
    paragraphs = re.findall(r'<p>(.+?)</p>', sections[0], re.DOTALL)
    body = "\n".join(p.strip() for p in paragraphs)
    # Keep only the base text of each ruby annotation.
    body = re.sub(
        r'<ruby><rb>(.+?)</rb>.+?</ruby>', r'\1', body, flags=re.DOTALL)

    return titles[0], body


if __name__ == "__main__":
    # Index page whose HTML is scanned for article links.
    ARTICLES_URL = "https://mainichi.jp/maisho/ch170361626i/%E6%AF%8E%E5%B0%8F%E3%83%8B%E3%83%A5%E3%83%BC%E3%82%B9"

    # Download the index once, then fetch and print every linked article.
    for article_url in get_article_urls(get_document(ARTICLES_URL)):
        article_html = get_document(article_url)
        title, body = get_article(article_html)
        print(f"""title: {title}
body: {body}
""")
	import re
	from urllib import request


	def get_document(url: str) -> str:
	    """Fetch *url* and return the response body decoded to text.

	    The charset declared in the response's ``Content-Type`` header is used
	    for decoding. UTF-8 is the fallback when the header declares no charset
	    or names a codec Python does not know.

	    Args:
	        url: Passed unchanged as the first argument of
	            ``urllib.request.urlopen``.

	    Returns:
	        The decoded response body.

	    Raises:
	        UnicodeDecodeError: If the body is not valid in the chosen charset.
	    """
	    with request.urlopen(url) as res:
	        raw = res.read()
	        # Honour the server-declared charset instead of assuming UTF-8.
	        charset = res.headers.get_content_charset() or "utf-8"
	    try:
	        return raw.decode(charset)
	    except LookupError:
	        # The header named a codec Python has no decoder for.
	        return raw.decode("utf-8")


	def get_article_urls(doc: str) -> list[str]:
	    """Extract absolute article URLs from the HTML of an article-list page.

	    Only the ``<section class="mb-64" id="article-tile">`` element is
	    inspected. Within it, each ``<li>`` contributes a URL when it contains
	    exactly one anchor whose ``href`` starts with
	    ``//mainichi.jp/maisho/articles/``.

	    Args:
	        doc: HTML text of the list page.

	    Returns:
	        The matched URLs, each prefixed with ``https:``, in document order.
	        An empty list is returned unless exactly one article-tile section
	        is found.
	    """
	    sections = re.findall(
	        r'<section class="mb-64" id="article-tile">.+?</section>', doc, re.DOTALL)
	    if len(sections) != 1:
	        return []

	    urls: list[str] = []
	    for item in re.findall(r'<li>.+?</li>', sections[0], re.DOTALL):
	        # Dots are escaped so only the literal host matches, and the value
	        # stops at the closing quote so attributes following ``href`` are
	        # never swallowed into the URL.
	        links = re.findall(
	            r'<a href="(//mainichi\.jp/maisho/articles/[^"]+)"', item)
	        if len(links) == 1:
	            urls.append("https:" + links[0])

	    return urls


	def get_article(doc: str) -> tuple[str, str]:
	    """Extract the title and plain-text body from an article page's HTML.

	    The title is the content of ``<h1 class="title-page">``. The body is the
	    text of every ``<p>`` inside
	    ``<section class="articledetail-body" id="articledetail-body">``, one
	    paragraph per line, with ``<ruby>`` annotations reduced to their
	    ``<rb>`` base text.

	    Args:
	        doc: HTML text of the article page.

	    Returns:
	        ``(title, body)``. ``("", "")`` is returned unless exactly one title
	        heading is found; ``(title, "")`` is returned unless exactly one
	        body section is found.
	    """
	    titles = re.findall(
	        r'(?<=<h1 class="title-page">).+?(?=</h1>)', doc, re.DOTALL)
	    if len(titles) != 1:
	        return "", ""

	    sections = re.findall(
	        r'<section class="articledetail-body" id="articledetail-body">.+?</section>',
	        doc, re.DOTALL)
	    if len(sections) != 1:
	        return titles[0], ""

	    # Capture the paragraph text directly. DOTALL lets a paragraph span
	    # several lines without leaving its <p> tags behind.
	    paragraphs = re.findall(r'<p>(.+?)</p>', sections[0], re.DOTALL)
	    body = "\n".join(p.strip() for p in paragraphs)
	    # Keep only the base text of each ruby annotation.
	    body = re.sub(
	        r'<ruby><rb>(.+?)</rb>.+?</ruby>', r'\1', body, flags=re.DOTALL)

	    return titles[0], body


	if __name__ == "__main__":
	    # Index page whose HTML is scanned for article links.
	    ARTICLES_URL = "https://mainichi.jp/maisho/ch170361626i/%E6%AF%8E%E5%B0%8F%E3%83%8B%E3%83%A5%E3%83%BC%E3%82%B9"
	    doc = get_document(ARTICLES_URL)

	    # Fetch each linked article in turn and print its title and body.
	    for url in get_article_urls(doc):
	        title, body = get_article(get_document(url))
	        print(f"""title: {title}
	body: {body}
	""")