singpolyma/epub2html.rb

## epub2html.rb
#!/usr/bin/ruby

# Usage ./epub2html.rb thing.epub thing.html
#
# ePub files are just XHTML with a ton of (often useless) stuff layered on top
# This script extracts the XHTML content and sticks it together
# It tries to preserve common metadata
# It tries to preserve images
# It converts Google Books' crap .gtxt_heading to <h1>

# Copyright © 2010, Stephen Paul Weber <singpolyma.net>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

require 'mime/types'
require 'nokogiri'
require 'zip/zip'
require 'cgi'

# Converts the ePub named by ARGV[0] into a single HTML file at ARGV[1].
#
# Steps: locate the OPF package document through META-INF/container.xml,
# copy the common Dublin Core metadata and <meta> elements into <head>,
# then append the <body> of every XHTML manifest item as a <section>,
# inlining archive images as base64 data: URIs.

# Both the ePub to read and the HTML file to write must be given.
abort "Usage: #{$0} thing.epub thing.html" if ARGV.length < 2
epub_path, html_path = ARGV[0], ARGV[1]

# Namespace binding for the dc:* XPath lookups below.
dc_ns = {'dc' => 'http://purl.org/dc/elements/1.1/'}

# Builds a zip entry name from path segments: "." and ".." segments are
# collapsed against a fake "/" root, then the leading "/" is dropped.
zip_path = lambda { |*parts| File::expand_path(File::join(*parts), '/')[1..-1] }

# File.open rather than Kernel#open: the latter can treat a path starting
# with "|" as a command to run. The block form closes the output file even
# when the conversion raises part-way through.
File.open(html_path, 'w') do |fh|
	fh.puts '<?xml version="1.0" ?>'
	fh.puts '<!DOCTYPE html>'
	fh.puts '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:dc="http://purl.org/dc/elements/1.1/">'

	Zip::ZipFile.open(epub_path) do |zipfile|
		# container.xml names the package (OPF) document; manifest hrefs
		# are resolved relative to the directory that document lives in.
		rootfile = Nokogiri::parse(zipfile.read('META-INF/container.xml')).at('rootfile')['full-path'].to_s
		rootdir = File::dirname(rootfile)
		doc = Nokogiri::parse(zipfile.read(rootfile))

		fh.puts '<head>'
		%w[title creator publisher].each do |field|
			node = doc.at("//dc:#{field}", dc_ns)
			# Metadata fields are optional; skip the ones this book lacks.
			next unless node
			text = CGI::escapeHTML(node.text)
			# The title is additionally emitted as the HTML <title>.
			fh.puts "<title>#{text}</title>" if field == 'title'
			fh.puts "<dc:#{field}>#{text}</dc:#{field}>"
		end
		doc.search('metadata meta').each { |meta| fh.puts meta.to_html }
		fh.puts '</head>'

		fh.puts '<body>'
		doc.search('manifest item[media-type="application/xhtml+xml"]').each do |item|
			# Normalised so that an OPF at the archive root (rootdir ".")
			# yields "chapter.html" rather than "./chapter.html".
			infile = zip_path.call(rootdir, item['href'].to_s)
			indir = File::dirname(infile)
			indoc = Nokogiri::parse(zipfile.read(infile)).at('body')
			# A content document without a <body> has nothing to append.
			next unless indoc

			indoc.search('img').each do |img|
				src = img['src'].to_s
				# Leave alone anything that is not a path inside the archive:
				# a missing src, a URI with a scheme (http:, https:, data:, ...)
				# or a protocol-relative URL.
				next if src.empty? || src =~ %r{\A(?:[a-z][a-z0-9+.-]*:|//)}i

				image_path = zip_path.call(indir, src)
				# One broken image reference should not abort the whole book.
				unless zipfile.find_entry(image_path)
					warn "epub2html: image not found in archive: #{image_path}"
					next
				end

				# pack('m') wraps its base64 output in newlines; strip them
				# so the data: URI is one unbroken token.
				encoded = [zipfile.read(image_path)].pack('m').delete("\n")
				img['src'] = "data:#{MIME::Types::type_for(src).first};base64,#{encoded}"
			end

			# HACK: convert Google's gtxt_heading style to actual headings
			indoc.search('.gtxt_heading').each { |heading| heading.name = 'h1' }

			# Files are probably sections of some kind
			fh.puts '<section>'
			fh.puts indoc.to_html
			fh.puts '</section>'
		end
	end

	fh.puts '</body>'
	fh.puts '</html>'
end
	#!/usr/bin/ruby

	# Usage ./epub2html.rb thing.epub thing.html
	#
	# ePub files are just XHTML with a ton of (often useless) stuff layered on top
	# This script extracts the XHTML content and sticks it together
	# It tries to preserve common metadata
	# It tries to preserve images
	# It converts Google Books' crap .gtxt_heading to <h1>

	# Copyright © 2010, Stephen Paul Weber <singpolyma.net>
	#
	# Permission to use, copy, modify, and/or distribute this software for any
	# purpose with or without fee is hereby granted, provided that the above
	# copyright notice and this permission notice appear in all copies.
	#
	# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
	# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
	# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
	# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
	# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
	# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
	# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

	require 'mime/types'
	require 'nokogiri'
	require 'zip/zip'
	require 'cgi'

	# Converts the ePub named by ARGV[0] into a single HTML file at ARGV[1].
	#
	# Steps: locate the OPF package document through META-INF/container.xml,
	# copy the common Dublin Core metadata and <meta> elements into <head>,
	# then append the <body> of every XHTML manifest item as a <section>,
	# inlining archive images as base64 data: URIs.

	# Both the ePub to read and the HTML file to write must be given.
	abort "Usage: #{$0} thing.epub thing.html" if ARGV.length < 2
	epub_path, html_path = ARGV[0], ARGV[1]

	# Namespace binding for the dc:* XPath lookups below.
	dc_ns = {'dc' => 'http://purl.org/dc/elements/1.1/'}

	# Builds a zip entry name from path segments: "." and ".." segments are
	# collapsed against a fake "/" root, then the leading "/" is dropped.
	zip_path = lambda { |*parts| File::expand_path(File::join(*parts), '/')[1..-1] }

	# File.open rather than Kernel#open: the latter can treat a path starting
	# with "|" as a command to run. The block form closes the output file even
	# when the conversion raises part-way through.
	File.open(html_path, 'w') do |fh|
		fh.puts '<?xml version="1.0" ?>'
		fh.puts '<!DOCTYPE html>'
		fh.puts '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:dc="http://purl.org/dc/elements/1.1/">'

		Zip::ZipFile.open(epub_path) do |zipfile|
			# container.xml names the package (OPF) document; manifest hrefs
			# are resolved relative to the directory that document lives in.
			rootfile = Nokogiri::parse(zipfile.read('META-INF/container.xml')).at('rootfile')['full-path'].to_s
			rootdir = File::dirname(rootfile)
			doc = Nokogiri::parse(zipfile.read(rootfile))

			fh.puts '<head>'
			%w[title creator publisher].each do |field|
				node = doc.at("//dc:#{field}", dc_ns)
				# Metadata fields are optional; skip the ones this book lacks.
				next unless node
				text = CGI::escapeHTML(node.text)
				# The title is additionally emitted as the HTML <title>.
				fh.puts "<title>#{text}</title>" if field == 'title'
				fh.puts "<dc:#{field}>#{text}</dc:#{field}>"
			end
			doc.search('metadata meta').each { |meta| fh.puts meta.to_html }
			fh.puts '</head>'

			fh.puts '<body>'
			doc.search('manifest item[media-type="application/xhtml+xml"]').each do |item|
				# Normalised so that an OPF at the archive root (rootdir ".")
				# yields "chapter.html" rather than "./chapter.html".
				infile = zip_path.call(rootdir, item['href'].to_s)
				indir = File::dirname(infile)
				indoc = Nokogiri::parse(zipfile.read(infile)).at('body')
				# A content document without a <body> has nothing to append.
				next unless indoc

				indoc.search('img').each do |img|
					src = img['src'].to_s
					# Leave alone anything that is not a path inside the archive:
					# a missing src, a URI with a scheme (http:, https:, data:, ...)
					# or a protocol-relative URL.
					next if src.empty? || src =~ %r{\A(?:[a-z][a-z0-9+.-]*:|//)}i

					image_path = zip_path.call(indir, src)
					# One broken image reference should not abort the whole book.
					unless zipfile.find_entry(image_path)
						warn "epub2html: image not found in archive: #{image_path}"
						next
					end

					# pack('m') wraps its base64 output in newlines; strip them
					# so the data: URI is one unbroken token.
					encoded = [zipfile.read(image_path)].pack('m').delete("\n")
					img['src'] = "data:#{MIME::Types::type_for(src).first};base64,#{encoded}"
				end

				# HACK: convert Google's gtxt_heading style to actual headings
				indoc.search('.gtxt_heading').each { |heading| heading.name = 'h1' }

				# Files are probably sections of some kind
				fh.puts '<section>'
				fh.puts indoc.to_html
				fh.puts '</section>'
			end
		end

		fh.puts '</body>'
		fh.puts '</html>'
	end