Skip to content

Instantly share code, notes, and snippets.

@U-1F992
Created April 13, 2023 16:17
Show Gist options
  • Save U-1F992/6a6098bee88d62365ab4ebd6efe84489 to your computer and use it in GitHub Desktop.
Save U-1F992/6a6098bee88d62365ab4ebd6efe84489 to your computer and use it in GitHub Desktop.
正規表現だけでスクレイピングする例
import re
from urllib import request
def get_document(url: str) -> str:
    """Fetch *url* and return the response body as text.

    The body is decoded with the charset declared in the response's
    ``Content-Type`` header. UTF-8 is used when the header declares no
    charset, or when the declared name is not a codec Python knows.

    Args:
        url: Any URL accepted by ``urllib.request.urlopen``.

    Returns:
        The decoded response body.

    Raises:
        UnicodeDecodeError: If the body is not valid in the chosen encoding.

    Errors raised by ``urlopen`` itself propagate unchanged.
    """
    with request.urlopen(url) as res:
        raw = res.read()
        # Honour the server-declared charset instead of assuming UTF-8.
        charset = res.headers.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset)
    except LookupError:
        # The header named a codec Python does not know; use the default.
        return raw.decode("utf-8")
def get_article_urls(doc: str) -> list[str]:
    """Extract absolute article URLs from the article-tile section of *doc*.

    The document must contain exactly one
    ``<section class="mb-64" id="article-tile">`` element; otherwise an empty
    list is returned. Within that section every ``<li>`` is inspected, and an
    item contributes a URL only when it contains exactly one
    ``<a href="//mainichi.jp/maisho/articles/...">`` link where ``href`` is
    the first attribute of the tag.

    Args:
        doc: HTML text of the listing page.

    Returns:
        The matched protocol-relative links, each prefixed with ``https:``,
        in document order.
    """
    section = re.findall(
        r'<section class="mb-64" id="article-tile">.+?</section>',
        doc, re.DOTALL)
    if len(section) != 1:
        return []

    lis: list[str] = re.findall(r'<li>.+?</li>', section[0], re.DOTALL)

    hrefs: list[str] = []
    for li in lis:
        # The host dots are escaped so they match only a literal ".", and
        # the URL is limited to non-quote characters so it cannot run past
        # the closing quote into later attributes of the tag.
        href = re.findall(
            r'(?<=<a href=")//mainichi\.jp/maisho/articles/[^"]+(?=")', li)
        if len(href) != 1:
            # Zero or several candidate links: skip the ambiguous item.
            continue
        hrefs.append("https:" + href[0])
    return hrefs
def get_article(doc: str) -> tuple[str, str]:
    """Extract the title and body text of an article page.

    Args:
        doc: HTML text of the article page.

    Returns:
        A ``(title, body)`` tuple.

        * ``("", "")`` when the page does not contain exactly one
          ``<h1 class="title-page">`` element.
        * ``(title, "")`` when the page does not contain exactly one
          ``<section class="articledetail-body" id="articledetail-body">``.
        * Otherwise ``body`` is the stripped inner text of every
          attribute-less ``<p>`` in that section, joined with newlines,
          with each ``<ruby><rb>X</rb>...</ruby>`` replaced by ``X``.
    """
    title: list[str] = re.findall(
        r'(?<=<h1 class="title-page">).+?(?=</h1>)', doc, re.DOTALL)
    if len(title) != 1:
        return "", ""

    section: list[str] = re.findall(
        r'<section class="articledetail-body" id="articledetail-body">.+?</section>',
        doc, re.DOTALL)
    if len(section) != 1:
        return title[0], ""

    # Capture the paragraph content directly. Stripping the tags in a second
    # pass without DOTALL left "<p>...</p>" in place for any paragraph that
    # spans more than one line.
    paragraphs: list[str] = re.findall(
        r'<p>(.+?)</p>', section[0], re.DOTALL)
    body = "\n".join(p.strip() for p in paragraphs)

    # Keep only the base text of ruby annotations, dropping the reading.
    body = re.sub(r'<ruby><rb>(.+?)</rb>.+?</ruby>', r'\1', body)
    return title[0], body
if __name__ == "__main__":
    # Listing page from which the article links are collected.
    ARTICLES_URL = "https://mainichi.jp/maisho/ch170361626i/%E6%AF%8E%E5%B0%8F%E3%83%8B%E3%83%A5%E3%83%BC%E3%82%B9"

    listing_html = get_document(ARTICLES_URL)
    # Fetch each linked article in turn and print its title and body.
    for article_url in get_article_urls(listing_html):
        article_html = get_document(article_url)
        title, body = get_article(article_html)
        print(f"""title: {title}
body: {body}
""")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment