Created
December 6, 2010 21:37
-
-
Save singpolyma/731004 to your computer and use it in GitHub Desktop.
Convert ePub files to XHTML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby
# Usage ./epub2html.rb thing.epub thing.html
#
# ePub files are just XHTML with a ton of (often useless) stuff layered on top
# This script extracts the XHTML content and sticks it together
# It tries to preserve common metadata
# It tries to preserve images
# It converts Google Books' crap .gtxt_heading to <h1>
# Copyright © 2010, Stephen Paul Weber <singpolyma.net>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
require 'mime/types'
require 'nokogiri'
begin
  # rubyzip >= 1.0: the entry point is 'zip' and the archive class is Zip::File.
  require 'zip'
rescue LoadError
  # rubyzip < 1.0: the entry point is 'zip/zip' and the archive class is Zip::ZipFile.
  require 'zip/zip'
end
require 'cgi'

# Namespace binding used for the Dublin Core XPath lookups in the OPF package.
DC_NS = { 'dc' => 'http://purl.org/dc/elements/1.1/' }.freeze
# Media type of the manifest items that get concatenated into the output.
XHTML_TYPE = 'application/xhtml+xml'.freeze
# Archive class of whichever rubyzip generation was loaded above.
ZIP_ARCHIVE = Zip.const_defined?(:File, false) ? Zip::File : Zip::ZipFile

# Resolves +href+ relative to +dir+ and returns a normalized archive entry
# name: no leading slash (or drive prefix), no "." or ".." segments.
# Zip entry names never start with "./", so un-normalized paths such as
# "./chapter.html" (OPF at the archive root) would not be found.
def archive_path(dir, href)
  File.expand_path(File.join(dir, href), '/').sub(%r{\A(?:[A-Za-z]:)?/}, '')
end

# Returns the text of the first dc:+name+ element in +doc+, or nil if absent.
def dc_text(doc, name)
  node = doc.at("//dc:#{name}", DC_NS)
  node ? node.text : nil
end

# Writes the <head> element to +fh+: title/creator/publisher taken from the
# package's Dublin Core metadata, followed by every <meta> in <metadata>.
def write_head(fh, doc)
  fh.puts '<head>'
  title = dc_text(doc, 'title')
  if title
    fh.puts "<title>#{CGI.escapeHTML(title)}</title>"
    fh.puts "<dc:title>#{CGI.escapeHTML(title)}</dc:title>"
  end
  %w[creator publisher].each do |name|
    value = dc_text(doc, name)
    fh.puts "<dc:#{name}>#{CGI.escapeHTML(value)}</dc:#{name}>" if value
  end
  doc.search('metadata meta').each { |meta| fh.puts meta.to_html }
  fh.puts '</head>'
end

# Returns the XHTML manifest items of the package document +doc+ in reading
# order: items referenced by the spine first (in spine order), then any
# remaining XHTML items in manifest order so that no content is dropped.
# With no spine at all this degrades to plain manifest order.
def content_items(doc)
  spine_rank = {}
  doc.search('spine itemref').each_with_index do |ref, pos|
    spine_rank[ref['idref']] ||= pos
  end
  pages = doc.search(%(manifest item[media-type="#{XHTML_TYPE}"])).to_a
  keyed = pages.each_with_index.sort_by do |item, idx|
    id = item['id']
    # The manifest index is the final tie-breaker, keeping the sort stable.
    spine_rank.key?(id) ? [0, spine_rank[id], idx] : [1, idx, idx]
  end
  keyed.map(&:first)
end

# Replaces the src of every archive-local <img> under +body+ with a base64
# data: URI built from the corresponding zip entry. +indir+ is the archive
# directory of the document that contains the images.
def embed_images(zipfile, body, indir)
  body.search('img').each do |img|
    src = img['src'].to_s
    # Nothing to inline for missing, remote or already-inlined sources.
    next if src.empty? || src =~ %r{\A(?:https?:|data:|//)}i

    entry = archive_path(indir, src)
    unless zipfile.find_entry(entry)
      warn "image not found in archive, leaving src untouched: #{entry}"
      next
    end
    mime = MIME::Types.type_for(src).first.to_s
    # 'm0' produces base64 without line breaks, as required inside a URI.
    img['src'] = "data:#{mime};base64,#{[zipfile.read(entry)].pack('m0')}"
  end
end

# Converts the ePub at +epub_path+ into a single XHTML file at +html_path+.
# The output file is only created once the archive and its package document
# have been read, and it is always closed, even when an error is raised.
def convert(epub_path, html_path)
  ZIP_ARCHIVE.open(epub_path) do |zipfile|
    container = Nokogiri::parse(zipfile.read('META-INF/container.xml'))
    rootfile_node = container.at('rootfile')
    rootfile = rootfile_node ? rootfile_node['full-path'].to_s : ''
    raise "#{epub_path}: META-INF/container.xml names no rootfile" if rootfile.empty?

    rootdir = File.dirname(rootfile)
    doc = Nokogiri::parse(zipfile.read(rootfile))

    File.open(html_path, 'w') do |fh|
      fh.puts '<?xml version="1.0" ?>'
      fh.puts '<!DOCTYPE html>'
      fh.puts '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:dc="http://purl.org/dc/elements/1.1/">'
      write_head(fh, doc)
      fh.puts '<body>'
      content_items(doc).each do |item|
        infile = archive_path(rootdir, item['href'].to_s)
        unless zipfile.find_entry(infile)
          warn "manifest item not found in archive, skipping: #{infile}"
          next
        end
        body = Nokogiri::parse(zipfile.read(infile)).at('body')
        unless body
          warn "no <body> in #{infile}, skipping"
          next
        end
        embed_images(zipfile, body, File.dirname(infile))
        # HACK: convert Google's gtxt_heading style to actual headings
        body.search('.gtxt_heading').each { |heading| heading.name = 'h1' }
        # Files are probably sections of some kind
        fh.puts '<section>'
        fh.puts body.to_html
        fh.puts '</section>'
      end
      fh.puts '</body>'
      fh.puts '</html>'
    end
  end
end

if __FILE__ == $PROGRAM_NAME
  abort "Usage: #{$PROGRAM_NAME} thing.epub thing.html" unless ARGV.length == 2
  convert(ARGV[0], ARGV[1])
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
cannot run, any ideas?