Skip to content

Instantly share code, notes, and snippets.

@tkosaka
Created September 16, 2010 10:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tkosaka/582211 to your computer and use it in GitHub Desktop.
Save tkosaka/582211 to your computer and use it in GitHub Desktop.
# -*- Mode: python -*-
# -*- coding: utf-8 -*-
# Definition: Convert an Aozora Bunko HTML file to be more readable on a Kindle.
# Author: Tomohiko KOSAKA <tomohiko.kosaka@gmail.com>
# Version: 0.01
# Date: 2010/09/16 (Originally written on 2010/09/07)
# Requirements: KindleGen (http://www.amazon.com/gp/feature.html?ie=UTF8&docId=1000234621)
# Usage:
# 1. Convert an AOZORA BUNKO's HTML file.
#
# python aozora_converter.py AOZORA_BUNKO_HTML_FILE OUTPUT_HTML_FILE
#
# 2. You should also execute another command to create a mobi file.
#
# kindlegen OUTPUT_HTML_FILE OUTPUT_MOBI_FILE
#
# 3. Move the generated mobi file to the Kindle's <documents> directory.
#
# Note: <i> tag does not seem to work.
# ------------------
import re
def main(html_in, html_out):
    """Strip Aozora Bunko-specific markup from an HTML file.

    Reads ``html_in``, applies the substitutions below in order, and writes
    the result to ``html_out``:

    * ``<img src=... alt="..." class="gaiji" />`` is replaced by its ``alt``
      value, including the surrounding double quotes;
    * ``<rb>text</rb>`` is replaced by ``text``;
    * ``<rp>...</rp>`` and ``<rt>...</rt>`` are removed;
    * ``<ruby>text</ruby>`` is replaced by ``text``;
    * ``<em class="sesame_dot">text</em>`` becomes ``<i>text</i>``.

    The file is handled as raw bytes and every pattern is ASCII-only, so
    bytes outside the matched markup are written back unchanged and no
    character encoding has to be known or guessed.

    Args:
        html_in: Path of the HTML file to read.
        html_out: Path of the file to write; it is overwritten if it exists.
    """
    # Order matters: <ruby> can only be unwrapped once the <rb>, <rp> and
    # <rt> children have been reduced to plain text, because the <ruby>
    # pattern does not allow nested tags.
    substitutions = (
        # ``[^>]*?`` and ``[^"]*`` keep the match inside a single tag; a
        # greedy ``.*`` would span from the first image on a line to the
        # last one and swallow everything in between.
        (br'<img\ssrc=[^>]*?\salt=("[^"]*")\sclass="gaiji"\s/>', br"\1"),
        (br"<rb>([^<]+)</rb>", br"\1"),
        (br"<rp>[^<]+</rp>", b""),
        (br"<rt>[^<]+</rt>", b""),
        (br"<ruby>([^<]+)</ruby>", br"\1"),
        (br'<em\sclass="sesame_dot">([^<]+)</em>', br"<i>\1</i>"),
    )

    # The input is read completely before the output is opened, so passing
    # the same path for both arguments is safe.
    with open(html_in, "rb") as f:
        buf = f.read()

    for pattern, replacement in substitutions:
        buf = re.sub(pattern, replacement, buf)

    with open(html_out, "wb") as f:
        f.write(buf)
# Command-line entry point: the first argument is the input HTML file and
# the second is the output HTML file; any further arguments are ignored.
if __name__ == "__main__":
    import sys

    # Both paths are required; print a usage line instead of failing with
    # an IndexError traceback when one is missing.
    if len(sys.argv) < 3:
        sys.exit(
            "usage: %s AOZORA_BUNKO_HTML_FILE OUTPUT_HTML_FILE" % sys.argv[0]
        )
    main(sys.argv[1], sys.argv[2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment