A script that converts IETF RFCs into JSON that can be imported into Scrapbox.
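The script walks RFC numbers 1 through 499, fetches each RFC's metadata JSON and plain-text body from ietf.org, and writes a single rfc.json in Scrapbox's page-import format: a top-level object with a pages array, where each page has a title, numeric created/updated timestamps, and its body as a list of lines. A minimal sketch of what one generated page might look like (the values below are illustrative, not actual script output):

{
  "pages": [
    {
      "title": "RFC0001",
      "created": 0,
      "updated": 1602910980,
      "lines": [
        "RFC0001",
        "Host Software",
        "",
        "author: [Steve Crocker]",
        "published at: [1969]",
        "doi: https://doi.org/10.17487/RFC0001",
        ""
      ]
    }
  ]
}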
import requests
import json
import re
from datetime import datetime

base_url = "https://www.ietf.org/ietf-ftp/rfc/"
numrange = range(1, 500)
scrapbox_json = {'pages': []}
authors_list = []
abbr_list = []
for num in numrange:
    print(num, end=",")
    file_name = "rfc" + str(num).zfill(4)
    json_url = base_url + file_name + ".json"
    text_url = base_url + file_name + ".txt"
    headers = {"content-type": "application/json;utf-8"}
    res = requests.get(json_url, headers=headers)
    if res.status_code != 200:
        # skip numbers that have no published metadata
        continue
    res.encoding = res.apparent_encoding
    docinfo = res.json()[0]
    # title
    title = docinfo['title'].strip()
    # collect multi-letter all-caps words as abbreviation candidates
    abbrs = re.findall(r'[A-Z]+', title)
    for abbr in abbrs:
        if len(abbr) == 1:
            continue
        if abbr not in abbr_list:
            abbr_list.append(abbr)
    lines = [docinfo['doc_id'], title, ""]
    # author links
    authors = docinfo['authors']
    for author in authors:
        if author not in authors_list:
            authors_list.append(author)
        author_link = "author: [" + author + "]"
        lines.append(author_link)
    # publication-year link
    pub_date = docinfo['pub_date']
    # pub_date comes in two formats:
    #   "April 1978"
    #   "1 April 1978"
    if len(pub_date.split(' ')) == 3:
        pub_year = int(pub_date.split(' ')[2])
        pub_month = pub_date.split(' ')[1]
        pub_day = pub_date.split(' ')[0]
    else:
        pub_year = int(pub_date.split(' ')[1])
        pub_month = pub_date.split(' ')[0]
        pub_day = 1
    pub_year_link = "published at: [" + str(pub_year) + "]"
    lines.append(pub_year_link)
    # unix timestamp of the publication date
    # (kept at 0 for January 1970 and anything earlier)
    unixtimestamp = 0
    if pub_year > 1969:
        if pub_year == 1970 and pub_month == "January":
            unixtimestamp = 0
        else:
            if len(pub_date.split(' ')) == 3:
                pub_datetime = datetime.strptime(pub_date, "%d %B %Y")
            else:
                pub_datetime = datetime.strptime(pub_date, "%B %Y")
            unixtimestamp = int(pub_datetime.timestamp())
    # links to RFCs this one updates/obsoletes and vice versa
    ref_words = ["updates", "updated_by", "obsoletes", "obsoleted_by", "see_also"]
    for word in ref_words:
        for ref in docinfo[word]:
            if ref is not None and ref != '':
                ref = ref.replace(" ", "")
                ref_link = word + ": [" + ref + "]"
                lines.append(ref_link)
    # doi
    if docinfo['doi'] is not None:
        lines.append("doi: https://doi.org/" + docinfo['doi'])
    lines.append("")
headers = {"content-type": "text/plain;utf-8"} | |
res = requests.get(text_url, headers=headers) | |
if res.status_code == 200: | |
res.encoding = res.apparent_encoding | |
body = res.text.split("\n") | |
for l in body: | |
abbrs = re.findall(r'[A-Z]+', l) | |
for abbr in abbrs: | |
if len(abbr) == 1: | |
continue | |
if not abbr in abbr_list: | |
abbr_list.append(abbr) | |
l.replace(abbr, "["+abbr+"]") | |
lines.append(l) | |
    page_json = {
        "title": docinfo['doc_id'],
        "created": unixtimestamp,
        "updated": int(datetime.now().timestamp()),
        "lines": lines
    }
    scrapbox_json['pages'].append(page_json)
# one page per author
for author in authors_list:
    if author == "":
        continue
    print(author, end=",")
    page_json = {
        "title": author,
        "created": int(datetime.now().timestamp()),
        "updated": int(datetime.now().timestamp()),
        "lines": [author, "[author]"]
    }
    scrapbox_json['pages'].append(page_json)
# one page per abbreviation
for abbr in abbr_list:
    if len(abbr) == 1:
        continue
    print(abbr, end=",")
    page_json = {
        "title": abbr,
        "created": int(datetime.now().timestamp()),
        "updated": int(datetime.now().timestamp()),
        "lines": [abbr, "[abbr]"]
    }
    scrapbox_json['pages'].append(page_json)
result = json.dumps(scrapbox_json)
with open('rfc.json', 'w') as f:
    f.write(result)
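To try it, run the script with Python 3 (requests is the only third-party dependency) and then import the generated rfc.json into a Scrapbox project via its page-import feature. Note that the loop makes roughly a thousand HTTP requests to ietf.org, so a full run takes a while. A rough invocation, assuming the gist is saved as rfc_to_scrapbox.py (the filename is not part of the gist):

pip install requests
python3 rfc_to_scrapbox.py
# then import the resulting rfc.json from the Scrapbox project's import page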