@yuiseki
Last active October 17, 2020 05:03
A script that turns IETF RFCs into JSON that can be imported into Scrapbox
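For reference, the output follows Scrapbox's import format: a single JSON object with a pages array, where each page carries a title, created/updated Unix timestamps, and the page body as a list of lines. A minimal sketch of one emitted page, using RFC 1 ("Host Software", April 1969, so created stays 0); the author string and the updated timestamp are illustrative:

{
  "pages": [
    {
      "title": "RFC0001",
      "created": 0,
      "updated": 1602910983,
      "lines": ["RFC0001", "Host Software", "", "author: [S. Crocker]", "published at: [1969]"]
    }
  ]
}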
import requests
import json
import re
from datetime import datetime
base_url = "https://www.ietf.org/ietf-ftp/rfc/"
numrange = range(1, 500)
scrapbox_json = {'pages':[]}
authors_list = []
abbr_list = []

for num in numrange:
    print(num, end=",")
    # Per-RFC metadata and plain-text URLs, e.g. rfc0001.json / rfc0001.txt
    file_name = "rfc" + str(num).zfill(4)
    json_url = base_url + file_name + ".json"
    text_url = base_url + file_name + ".txt"
    headers = {"content-type": "application/json;utf-8"}
    res = requests.get(json_url, headers=headers)
    if res.status_code != 200:
        # Some RFC numbers were never issued; skip them instead of crashing on res.json()
        continue
    res.encoding = res.apparent_encoding
    docinfo = res.json()[0]
    # Title, with surrounding whitespace stripped
    title = docinfo['title'].strip()
    # Collect runs of two or more capital letters as abbreviation candidates
    abbrs = re.findall(r'[A-Z]+', title)
    for abbr in abbrs:
        if len(abbr) == 1:
            continue
        if abbr not in abbr_list:
            abbr_list.append(abbr)
    lines = [docinfo['doc_id'], title, ""]
    # Links to author pages
    authors = docinfo['authors']
    for author in authors:
        if author not in authors_list:
            authors_list.append(author)
        author_link = "author: [" + author + "]"
        lines.append(author_link)
    # Link to the publication year
    pub_date = docinfo['pub_date']
    # Two formats occur:
    #   April 1978
    #   1 April 1978
    if len(pub_date.split(' ')) == 3:
        pub_year = int(pub_date.split(' ')[2])
        pub_month = pub_date.split(' ')[1]
        pub_day = pub_date.split(' ')[0]
    else:
        pub_year = int(pub_date.split(' ')[1])
        pub_month = pub_date.split(' ')[0]
        pub_day = 1
    pub_year_link = "published at: [" + str(pub_year) + "]"
    lines.append(pub_year_link)
    # Unix time of publication; pre-epoch dates stay at 0
    unixtimestamp = 0
    if pub_year > 1969:
        if pub_year == 1970 and pub_month == "January":
            unixtimestamp = 0
        else:
            if len(pub_date.split(' ')) == 3:
                pub_datetime = datetime.strptime(pub_date, "%d %B %Y")
            else:
                pub_datetime = datetime.strptime(pub_date, "%B %Y")
            unixtimestamp = int(pub_datetime.timestamp())
    # Links to related RFCs
    ref_words = ["updates", "updated_by", "obsoletes", "obsoleted_by", "see_also"]
    for word in ref_words:
        for ref in docinfo[word]:
            if ref is not None and ref != '':
                ref = ref.replace(" ", "")
                ref_link = word + ": [" + ref + "]"
                lines.append(ref_link)
    # doi
    if docinfo['doi'] is not None:
        lines.append("doi: https://doi.org/" + docinfo['doi'])
    lines.append("")
    # Fetch the RFC body text and bracket abbreviations into Scrapbox links
    headers = {"content-type": "text/plain;utf-8"}
    res = requests.get(text_url, headers=headers)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        body = res.text.split("\n")
        for l in body:
            abbrs = re.findall(r'[A-Z]+', l)
            for abbr in abbrs:
                if len(abbr) == 1:
                    continue
                if abbr not in abbr_list:
                    abbr_list.append(abbr)
                # str.replace() returns a new string; the original dropped the result
                l = l.replace(abbr, "[" + abbr + "]")
            lines.append(l)
    page_json = {
        "title": docinfo['doc_id'],
        "created": unixtimestamp,
        "updated": datetime.now().timestamp(),
        "lines": lines
    }
    scrapbox_json['pages'].append(page_json)

# Author pages
for author in authors_list:
    if author == "":
        continue
    print(author, end=",")
    page_json = {
        "title": author,
        "created": datetime.now().timestamp(),
        "updated": datetime.now().timestamp(),
        "lines": [author, "[author]"]
    }
    scrapbox_json['pages'].append(page_json)

# Abbreviation pages
for abbr in abbr_list:
    if len(abbr) == 1:
        continue
    print(abbr, end=",")
    page_json = {
        "title": abbr,
        "created": datetime.now().timestamp(),
        "updated": datetime.now().timestamp(),
        "lines": [abbr, "[abbr]"]
    }
    scrapbox_json['pages'].append(page_json)

# Write the Scrapbox import file
result = json.dumps(scrapbox_json)
with open('rfc.json', 'w') as f:
    f.write(result)
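Usage note: run the script with Python 3 (requests is the only third-party dependency) and it writes rfc.json covering RFCs 1–499; adjust numrange for a different range. The resulting file should then be importable into a Scrapbox project via the page-import function on the project's settings page, which accepts JSON in this format.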