Created
September 16, 2010 10:17
-
-
Save tkosaka/582211 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- Mode: python -*- | |
# -*- coding: utf-8 -*- | |
# Definition: Convert an Aozora Bunko HTML file so that it is more readable on a Kindle.
# Author: Tomohiko KOSAKA <tomohiko.kosaka@gmail.com> | |
# Version: 0.01 | |
# Date: 2010/09/16 (Originally written in 2010/09/07) | |
# Requirements: KindleGen (http://www.amazon.com/gp/feature.html?ie=UTF8&docId=1000234621) | |
# Usage: | |
# 1. Convert an AOZORA BUNKO's HTML file. | |
# | |
# python aozora_converter.py AOZORA_BUNKO_HTML_FILE OUTPUT_HTML_FILE | |
# | |
# 2. You should also execute another command to create a mobi file. | |
# | |
# kindlegen OUTPUT_HTML_FILE OUTPUT_MOBI_FILE | |
# | |
# 3. Move the generated mobi file to the Kindle's <documents> directory. | |
# | |
# Note: <i> tag does not seem to work. | |
# ------------------ | |
import re | |
# Ordered (pattern, replacement) pairs applied to the raw bytes of the page.
# Order matters: <rb>/<rp>/<rt> must be resolved before <ruby>, because the
# <ruby> pattern only matches once its content no longer contains any tag.
# Patterns are bytes so the input encoding (whatever the file uses) passes
# through untouched; every byte matched literally here is plain ASCII.
_AOZORA_SUBSTITUTIONS = (
    # Gaiji image -> its alt text (surrounding double quotes are kept, as
    # before).  [^>] / [^"] keep one match inside a single <img> tag, so two
    # gaiji images on the same line no longer swallow the text between them.
    (re.compile(br'<img\ssrc=[^>]*?\salt=("[^"]*")\sclass="gaiji"\s/>'),
     br"\1"),
    # Ruby base text: keep the text, drop the tag.
    (re.compile(br"<rb>([^<]+)</rb>"), br"\1"),
    # Ruby fallback parentheses: drop entirely.
    (re.compile(br"<rp>[^<]+</rp>"), b""),
    # Ruby reading: drop entirely.
    (re.compile(br"<rt>[^<]+</rt>"), b""),
    # Now-flat <ruby> wrapper: keep the base text only.
    (re.compile(br"<ruby>([^<]+)</ruby>"), br"\1"),
    # Sesame-dot emphasis -> <i>.
    (re.compile(br'<em\sclass="sesame_dot">([^<]+)</em>'), br"<i>\1</i>"),
)


def _simplify_markup(buf):
    """Return *buf* (bytes) with every substitution applied in order."""
    for pattern, replacement in _AOZORA_SUBSTITUTIONS:
        buf = pattern.sub(replacement, buf)
    return buf


def main(html_in, html_out):
    """Rewrite the HTML file *html_in* into *html_out* with simplified markup.

    Gaiji <img> tags are replaced by their quoted alt text, ruby
    annotations are reduced to their base text, and
    <em class="sesame_dot"> becomes <i>.

    The file is processed as raw bytes, so its character encoding is
    preserved exactly and no decoding errors can occur.

    Args:
        html_in: Path of the HTML file to read.
        html_out: Path of the HTML file to write (overwritten if present).

    Raises:
        IOError/OSError: If either file cannot be opened, read or written.
    """
    # open() instead of the Python 2-only file(); "with" closes the handle
    # even when read()/write() raises.
    with open(html_in, "rb") as f:
        buf = f.read()

    buf = _simplify_markup(buf)

    with open(html_out, "wb") as f:
        f.write(buf)
if __name__ == "__main__":
    import sys

    # Both paths are required; report the expected usage instead of letting
    # a missing argument surface as a bare IndexError traceback.  Extra
    # arguments are ignored, as before.
    if len(sys.argv) < 3:
        sys.exit(
            "Usage: python aozora_converter.py "
            "AOZORA_BUNKO_HTML_FILE OUTPUT_HTML_FILE"
        )
    main(sys.argv[1], sys.argv[2])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment