Skip to content

Instantly share code, notes, and snippets.

@pineapplehunter
Last active October 25, 2022 18:35
Show Gist options
  • Save pineapplehunter/671f99ec1a9107b104dd0f9a9523f608 to your computer and use it in GitHub Desktop.
Save pineapplehunter/671f99ec1a9107b104dd0f9a9523f608 to your computer and use it in GitHub Desktop.
中條研の論文をスクレイピングしてMDを作るスクリプト
import json
import re
from bs4 import BeautifulSoup, UnicodeDammit
import requests
from pathlib import Path
# Scrape the yearly publication pages of the lab web site, cache the extracted
# data as JSON, and render one Markdown file per year.
#
# The network is only hit when the cache file does not exist yet; delete
# ``paper_data_cache.json`` to force a fresh scrape.  The cache is always
# read and written as UTF-8, so a cache produced by an older revision of this
# script on a non-UTF-8 locale should be deleted as well.

cache_file = Path("paper_data_cache.json")
output_dir = Path("markdown")

# Seconds to wait for the web server before giving up on a request.
_REQUEST_TIMEOUT = 30

# Entry markers such as "[1]" or "[12]" that separate the items inside a
# <menu> element.  ``\d+`` (rather than a single ``\d``) is required so that
# sections with ten or more entries are still split at "[10]", "[11]", ...
_ENTRY_MARKER = re.compile(r"\[\d+\]")


def _split_entries(menu_text):
    """Split the text of one <menu> element into cleaned, non-empty entries."""
    entries = []
    for chunk in _ENTRY_MARKER.split(menu_text):
        # Tabs, newlines and spaces are removed everywhere, not just at the
        # ends of the entry.
        cleaned = (
            chunk.strip()
            .replace("\t", "")
            .replace("\n", "")
            .replace(" ", "")
        )
        if cleaned:
            entries.append(cleaned)
    return entries


def _scrape_year(year):
    """Download the publication page for *year* and return its sections.

    Each <menu> element found under ``div.common>div`` becomes one
    ``{"title": ..., "contents": [...]}`` dict.  The title is the text of the
    most recent <h1> seen before that <menu>, or ``"None"`` if there was none.

    Raises:
        RuntimeError: if the page has no ``div.common>div`` element.
    """
    response = requests.get(
        f"http://www.nj.cs.tuat.ac.jp/paper_{year}.html",
        timeout=_REQUEST_TIMEOUT,
    )
    # Let UnicodeDammit choose between the two candidate encodings.
    content = UnicodeDammit(response.content, ["shift_jis", "utf-8"])
    print(year, content.original_encoding)

    html = BeautifulSoup(content.unicode_markup, "html.parser")
    container = html.select_one("div.common>div")
    if container is None:
        raise RuntimeError(
            f"paper_{year}.html: element 'div.common>div' not found"
        )

    sections = []
    title = "None"
    for child in container.findChildren():
        tag = child.name.lower()
        if tag == "h1":
            title = child.get_text().strip()
        elif tag == "menu":
            sections.append(
                {
                    "title": title,
                    "contents": _split_entries(child.get_text()),
                }
            )
    return sections


def _render_markdown(year, sections):
    """Return the Markdown document (front matter plus body) for one year."""
    parts = [
        "---\n"
        f"title: {year}年 論文・発表\n"
        f"description: {year}年の論文や発表のまとめです。\n"
        "---\n"
        "<!-- automaticly generated from script https://gist.github.com/pineapplehunter/671f99ec1a9107b104dd0f9a9523f608 -->\n"
    ]
    for section in sections:
        parts.append(f"## {section['title']}\n")
        parts.extend(f"{entry}\n\n" for entry in section["contents"])
    return "".join(parts)


if not cache_file.exists():
    scraped = {year: _scrape_year(year) for year in range(2003, 2022 + 1)}
    # ensure_ascii=False emits raw Japanese text, so the file encoding must
    # be pinned instead of depending on the platform's locale.
    with open(cache_file, "w", encoding="utf-8") as f:
        json.dump(scraped, f, ensure_ascii=False)

output_dir.mkdir(exist_ok=True)

with open(cache_file, "r", encoding="utf-8") as f:
    # JSON object keys are always strings, so the years load as "2003", ...
    all_docs: dict[str, list[dict]] = json.load(f)

for key, contents in all_docs.items():
    with open(output_dir / f"{key}.md", "w", encoding="utf-8") as f:
        f.write(_render_markdown(key, contents))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment