hetima/makedocset.py

## readme.md

      
    Raw
  

              readme.md
            
          
    これは Python 3.6.1 の docset 日本語版を生成するスクリプトです。
Dash がインストールする英語版の docset をコピーして html を https://docs.python.jp/3/ からダウンロードしてすげ替えています。
仕様は英語版とまったく同じで、サイドバーの下から出てくるページ内インデックスも動作します。
Dash の Preferences で英語版の Python 3.6.1 docset をダウンロードしておく必要があります。
スクリプトを実行するとカレントディレクトリに Python 3-ja.docset という名前で作成されます。けっこう時間がかかります。python3 と BeautifulSoup4 が必要です。
pip3 install beautifulsoup4

2017-05-17 13:23 更新。コードの出力結果部分を消してコピーしやすくするスクリプトが動いてなかったのを修正、処理時間タイマー追加、20%高速化

  
## makedocset.py
#!/usr/bin/env python3

import os
import time
import copy
import urllib.request
import urllib.parse
from shlex import quote
from math import trunc
from bs4 import BeautifulSoup

# 生成される docset の名前。カレントディレクトリに作られる
new_docset_name = 'Python 3-ja.docset'
# 日本語ドキュメントの URL
base_url = 'https://docs.python.jp/3/'
# 英語版 docset のファイルパス
original_docset_path = '~/Library/Application Support/Dash/DocSets/Python_3/Python 3.docset/Contents/Resources/tarix.tgz'


# 日本語 html ファイルをダウンロード
def get_jp_resource(doc_path):
    if doc_path == 'genindex-Symbols.html':
        doc_path = urllib.parse.quote('genindex-記号.html')
    url = urllib.parse.urljoin(base_url, doc_path)
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    return data


# html ファイル1個ごとに作られる class
class Scraper(object):
    def __init__(self, docroot_path, doc_path):
        super(Scraper, self).__init__()
        self.doc_path = doc_path
        self.full_path = os.path.join(docroot_path, doc_path)
        with open(self.full_path) as fp:
            self.en_soup = BeautifulSoup(fp, "html.parser")
        jp_resource = get_jp_resource(doc_path)
        if jp_resource:
            self.ja_soup = BeautifulSoup(jp_resource, "html.parser")
        else:
            print("download failed")
        if self.doc_path == 'genindex.html': self.workaround_genindex_symbols()

    # 保存は上書き
    def save(self):
        # output = self.ja_soup.prettify(formatter=None)
        output = str(self.ja_soup)
        with open(self.full_path, 'w') as f:
            f.write(output)

    # Dash 上では意味のない script を除去
    def strip_unnecessary_elements(self):
        scripts = self.ja_soup.find_all("script")
        for script in scripts:
            script.clear()
            if script.attrs and script.attrs.get('src'):
                if script.attrs.get('src').endswith('_static/version_switch.js'): script.decompose()
                elif script.attrs.get('src').endswith('_static/translations.js'): script.decompose()
                elif script.attrs.get('src').endswith('_static/sidebar.js'): script.decompose()
                elif script.attrs.get('src').endswith('_static/doctools.js'): script.decompose()


    # 英語版の dashAnchor 要素を探して同じ場所に埋め込む
    def copy_dash_anchors(self):
        dash_anchors = self.en_soup.find_all(attrs={"class": "dashAnchor"})
        for dash_anchor in dash_anchors:
            ja_anchor = self.anchor_in_ja(dash_anchor)
            if ja_anchor is None:
                print("ERROR: anchor not found: " + dash_anchor['name'])
            else:
                ja_anchor.insert_before(copy.copy(dash_anchor))

    # dashAnchor と対応する要素を探す
    def anchor_in_ja(self, dash_anchor):
        # dashAnchor の次の要素が対応する要素 今のところ適合率100%
        next_sibling = dash_anchor.find_next_sibling()
        if next_sibling and next_sibling.has_attr('id'):
            label = next_sibling['id']
        else:
            label = os.path.basename(dash_anchor['name'])
        ja_anchor = self.ja_soup.find(attrs={"id": label})
        if ja_anchor is None:
            name = dash_anchor['name']
            label2 = os.path.basename(name)
            # ここだけ内容が異なる
            if name == '//apple_ref/Function/tabnanny.tokeneater': label2 = 'tabnanny.process_tokens'
            ja_anchor = self.ja_soup.find(attrs={"id": label2})
        return ja_anchor

    # 英語と日本語でファイル名が異なるものがあるので対処
    def workaround_genindex_symbols(self):
        bad_links = self.ja_soup.find_all(href="genindex-記号.html")
        for bad_link in bad_links:
            bad_link.attrs['href'] = "genindex-Symbols.html"


class DocsetJP(object):
    def __init__(self):
        super(DocsetJP, self).__init__()
        self.global_start_time = 0
        self.docroot_path = ""

    def start(self):
        self.global_start_time = time.time()
        if os.path.exists(new_docset_name):
            print('ERROR: "' + new_docset_name + '" already exists in current directory')
            return False
        if self.copy_from_original():
            self.do_main()
            m, s = divmod(time.time() - self.global_start_time, 60)
            print('All done. ({:d} min {:d} sec)'.format(trunc(m), trunc(s)))

    # 圧縮 docset の tarix.tgz を解凍。名前と Info.plist を修正して準備する
    def copy_from_original(self):
        archive_path = os.path.expanduser(original_docset_path)
        if not os.path.exists(archive_path):
            print('ERROR: Python 3.docset (and/or tarix.tgz) not found', archive_path)
            return False
        os.system('tar zxf {}'.format(quote(archive_path)))
        if not os.path.exists('Python 3.docset'):
            print('ERROR: could not extract tarix.tgz')
            return False
        os.rename('Python 3.docset', new_docset_name)
        real_path = os.path.realpath(new_docset_name)
        self.docroot_path = os.path.join(real_path, "Contents/Resources/Documents/doc/")
        plist_path = os.path.join(real_path, "Contents/Info.plist")
        os.system('/usr/libexec/PlistBuddy -c "set :CFBundleIdentifier python_ja" ' + quote(plist_path))
        os.system('/usr/libexec/PlistBuddy -c "set :CFBundleName Python 3-ja" ' + quote(plist_path))
        return True

    # docset 内の html を探してして処理
    def do_main(self):
        if not os.path.exists(self.docroot_path):
            print('ERROR: working Python 3.docset not found', self.docroot_path)
        for root, dirs, files in os.walk(self.docroot_path):
            for file in files:
                t, ext = os.path.splitext(file)
                if ext == ".html":
                    doc_path = os.path.relpath(os.path.join(root, file), self.docroot_path)
                    print(doc_path, end=' ...', flush=True)
                    start_time = time.time()
                    scraper = Scraper(self.docroot_path, doc_path)
                    scraper.copy_dash_anchors()
                    scraper.strip_unnecessary_elements()
                    scraper.save()
                    print(' ({:.2f} sec)'.format(time.time() - start_time))


if __name__ == '__main__':
    d = DocsetJP()
    d.start()
	#!/usr/bin/env python3

	import os
	import time
	import copy
	import urllib.request
	import urllib.parse
	from shlex import quote
	from math import trunc
	from bs4 import BeautifulSoup

	# 生成される docset の名前。カレントディレクトリに作られる
	new_docset_name = 'Python 3-ja.docset'
	# 日本語ドキュメントの URL
	base_url = 'https://docs.python.jp/3/'
	# 英語版 docset のファイルパス
	original_docset_path = '~/Library/Application Support/Dash/DocSets/Python_3/Python 3.docset/Contents/Resources/tarix.tgz'


	# 日本語 html ファイルをダウンロード
	def get_jp_resource(doc_path):
	if doc_path == 'genindex-Symbols.html':
	doc_path = urllib.parse.quote('genindex-記号.html')
	url = urllib.parse.urljoin(base_url, doc_path)
	response = urllib.request.urlopen(url)
	data = response.read().decode('utf-8')
	return data


	# html ファイル1個ごとに作られる class
	class Scraper(object):
	def __init__(self, docroot_path, doc_path):
	super(Scraper, self).__init__()
	self.doc_path = doc_path
	self.full_path = os.path.join(docroot_path, doc_path)
	with open(self.full_path) as fp:
	self.en_soup = BeautifulSoup(fp, "html.parser")
	jp_resource = get_jp_resource(doc_path)
	if jp_resource:
	self.ja_soup = BeautifulSoup(jp_resource, "html.parser")
	else:
	print("download failed")
	if self.doc_path == 'genindex.html': self.workaround_genindex_symbols()

	# 保存は上書き
	def save(self):
	# output = self.ja_soup.prettify(formatter=None)
	output = str(self.ja_soup)
	with open(self.full_path, 'w') as f:
	f.write(output)

	# Dash 上では意味のない script を除去
	def strip_unnecessary_elements(self):
	scripts = self.ja_soup.find_all("script")
	for script in scripts:
	script.clear()
	if script.attrs and script.attrs.get('src'):
	if script.attrs.get('src').endswith('_static/version_switch.js'): script.decompose()
	elif script.attrs.get('src').endswith('_static/translations.js'): script.decompose()
	elif script.attrs.get('src').endswith('_static/sidebar.js'): script.decompose()
	elif script.attrs.get('src').endswith('_static/doctools.js'): script.decompose()


	# 英語版の dashAnchor 要素を探して同じ場所に埋め込む
	def copy_dash_anchors(self):
	dash_anchors = self.en_soup.find_all(attrs={"class": "dashAnchor"})
	for dash_anchor in dash_anchors:
	ja_anchor = self.anchor_in_ja(dash_anchor)
	if ja_anchor is None:
	print("ERROR: anchor not found: " + dash_anchor['name'])
	else:
	ja_anchor.insert_before(copy.copy(dash_anchor))

	# dashAnchor と対応する要素を探す
	def anchor_in_ja(self, dash_anchor):
	# dashAnchor の次の要素が対応する要素今のところ適合率100%
	next_sibling = dash_anchor.find_next_sibling()
	if next_sibling and next_sibling.has_attr('id'):
	label = next_sibling['id']
	else:
	label = os.path.basename(dash_anchor['name'])
	ja_anchor = self.ja_soup.find(attrs={"id": label})
	if ja_anchor is None:
	name = dash_anchor['name']
	label2 = os.path.basename(name)
	# ここだけ内容が異なる
	if name == '//apple_ref/Function/tabnanny.tokeneater': label2 = 'tabnanny.process_tokens'
	ja_anchor = self.ja_soup.find(attrs={"id": label2})
	return ja_anchor

	# 英語と日本語でファイル名が異なるものがあるので対処
	def workaround_genindex_symbols(self):
	bad_links = self.ja_soup.find_all(href="genindex-記号.html")
	for bad_link in bad_links:
	bad_link.attrs['href'] = "genindex-Symbols.html"


	class DocsetJP(object):
	def __init__(self):
	super(DocsetJP, self).__init__()
	self.global_start_time = 0
	self.docroot_path = ""

	def start(self):
	self.global_start_time = time.time()
	if os.path.exists(new_docset_name):
	print('ERROR: "' + new_docset_name + '" already exists in current directory')
	return False
	if self.copy_from_original():
	self.do_main()
	m, s = divmod(time.time() - self.global_start_time, 60)
	print('All done. ({:d} min {:d} sec)'.format(trunc(m), trunc(s)))

	# 圧縮 docset の tarix.tgz を解凍。名前と Info.plist を修正して準備する
	def copy_from_original(self):
	archive_path = os.path.expanduser(original_docset_path)
	if not os.path.exists(archive_path):
	print('ERROR: Python 3.docset (and/or tarix.tgz) not found', archive_path)
	return False
	os.system('tar zxf {}'.format(quote(archive_path)))
	if not os.path.exists('Python 3.docset'):
	print('ERROR: could not extract tarix.tgz')
	return False
	os.rename('Python 3.docset', new_docset_name)
	real_path = os.path.realpath(new_docset_name)
	self.docroot_path = os.path.join(real_path, "Contents/Resources/Documents/doc/")
	plist_path = os.path.join(real_path, "Contents/Info.plist")
	os.system('/usr/libexec/PlistBuddy -c "set :CFBundleIdentifier python_ja" ' + quote(plist_path))
	os.system('/usr/libexec/PlistBuddy -c "set :CFBundleName Python 3-ja" ' + quote(plist_path))
	return True

	# docset 内の html を探してして処理
	def do_main(self):
	if not os.path.exists(self.docroot_path):
	print('ERROR: working Python 3.docset not found', self.docroot_path)
	for root, dirs, files in os.walk(self.docroot_path):
	for file in files:
	t, ext = os.path.splitext(file)
	if ext == ".html":
	doc_path = os.path.relpath(os.path.join(root, file), self.docroot_path)
	print(doc_path, end=' ...', flush=True)
	start_time = time.time()
	scraper = Scraper(self.docroot_path, doc_path)
	scraper.copy_dash_anchors()
	scraper.strip_unnecessary_elements()
	scraper.save()
	print(' ({:.2f} sec)'.format(time.time() - start_time))


	if __name__ == '__main__':
	d = DocsetJP()
	d.start()