@yuiseki
Last active October 17, 2020 05:03
A script that turns IETF RFCs into JSON that can be imported into Scrapbox
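For reference, the output follows Scrapbox's import format: a single JSON object with a pages array, where each page carries a title, created/updated Unix timestamps, and the page body as a list of lines. A minimal sketch of one emitted page, using RFC 1 ("Host Software", April 1969, so created stays 0); the author string and the updated timestamp are illustrative:

{
  "pages": [
    {
      "title": "RFC0001",
      "created": 0,
      "updated": 1602910983,
      "lines": ["RFC0001", "Host Software", "", "author: [S. Crocker]", "published at: [1969]"]
    }
  ]
}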
import requests
import json
import re
from datetime import datetime
base_url = "https://www.ietf.org/ietf-ftp/rfc/"
numrange = range(1, 500)
scrapbox_json = {'pages':[]}
authors_list = []
abbr_list = []

for num in numrange:
    print(num, end=",")
    # Per-RFC metadata and plain-text URLs, e.g. rfc0001.json / rfc0001.txt
    file_name = "rfc" + str(num).zfill(4)
    json_url = base_url + file_name + ".json"
    text_url = base_url + file_name + ".txt"
    headers = {"content-type": "application/json;utf-8"}
    res = requests.get(json_url, headers=headers)
    if res.status_code != 200:
        # Some RFC numbers were never issued; skip them instead of crashing on res.json()
        continue
    res.encoding = res.apparent_encoding
    docinfo = res.json()[0]
    # Title, with surrounding whitespace stripped
    title = docinfo['title'].strip()
    # Collect runs of two or more capital letters as abbreviation candidates
    abbrs = re.findall(r'[A-Z]+', title)
    for abbr in abbrs:
        if len(abbr) == 1:
            continue
        if abbr not in abbr_list:
            abbr_list.append(abbr)
    lines = [docinfo['doc_id'], title, ""]
    # Links to author pages
    authors = docinfo['authors']
    for author in authors:
        if author not in authors_list:
            authors_list.append(author)
        author_link = "author: [" + author + "]"
        lines.append(author_link)
    # Link to the publication year
    pub_date = docinfo['pub_date']
    # Two formats occur:
    #   April 1978
    #   1 April 1978
    if len(pub_date.split(' ')) == 3:
        pub_year = int(pub_date.split(' ')[2])
        pub_month = pub_date.split(' ')[1]
        pub_day = pub_date.split(' ')[0]
    else:
        pub_year = int(pub_date.split(' ')[1])
        pub_month = pub_date.split(' ')[0]
        pub_day = 1
    pub_year_link = "published at: [" + str(pub_year) + "]"
    lines.append(pub_year_link)
    # Unix time of publication; pre-epoch dates stay at 0
    unixtimestamp = 0
    if pub_year > 1969:
        if pub_year == 1970 and pub_month == "January":
            unixtimestamp = 0
        else:
            if len(pub_date.split(' ')) == 3:
                pub_datetime = datetime.strptime(pub_date, "%d %B %Y")
            else:
                pub_datetime = datetime.strptime(pub_date, "%B %Y")
            unixtimestamp = int(pub_datetime.timestamp())
    # Links to related RFCs
    ref_words = ["updates", "updated_by", "obsoletes", "obsoleted_by", "see_also"]
    for word in ref_words:
        for ref in docinfo[word]:
            if ref is not None and ref != '':
                ref = ref.replace(" ", "")
                ref_link = word + ": [" + ref + "]"
                lines.append(ref_link)
    # doi
    if docinfo['doi'] is not None:
        lines.append("doi: https://doi.org/" + docinfo['doi'])
    lines.append("")
    # Fetch the RFC body text and bracket abbreviations into Scrapbox links
    headers = {"content-type": "text/plain;utf-8"}
    res = requests.get(text_url, headers=headers)
    if res.status_code == 200:
        res.encoding = res.apparent_encoding
        body = res.text.split("\n")
        for l in body:
            abbrs = re.findall(r'[A-Z]+', l)
            for abbr in abbrs:
                if len(abbr) == 1:
                    continue
                if abbr not in abbr_list:
                    abbr_list.append(abbr)
                # str.replace() returns a new string; the original dropped the result
                l = l.replace(abbr, "[" + abbr + "]")
            lines.append(l)
    page_json = {
        "title": docinfo['doc_id'],
        "created": unixtimestamp,
        "updated": datetime.now().timestamp(),
        "lines": lines
    }
    scrapbox_json['pages'].append(page_json)

# Author pages
for author in authors_list:
    if author == "":
        continue
    print(author, end=",")
    page_json = {
        "title": author,
        "created": datetime.now().timestamp(),
        "updated": datetime.now().timestamp(),
        "lines": [author, "[author]"]
    }
    scrapbox_json['pages'].append(page_json)

# Abbreviation pages
for abbr in abbr_list:
    if len(abbr) == 1:
        continue
    print(abbr, end=",")
    page_json = {
        "title": abbr,
        "created": datetime.now().timestamp(),
        "updated": datetime.now().timestamp(),
        "lines": [abbr, "[abbr]"]
    }
    scrapbox_json['pages'].append(page_json)

# Write the Scrapbox import file
result = json.dumps(scrapbox_json)
with open('rfc.json', 'w') as f:
    f.write(result)
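Usage note: run the script with Python 3 (requests is the only third-party dependency) and it writes rfc.json covering RFCs 1–499; adjust numrange for a different range. The resulting file should then be importable into a Scrapbox project via the page-import function on the project's settings page, which accepts JSON in this format.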