Skip to content

Instantly share code, notes, and snippets.

@imudak
Forked from tdtds/MOVED
Created January 20, 2011 16:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save imudak/788127 to your computer and use it in GitHub Desktop.
Save imudak/788127 to your computer and use it in GitHub Desktop.
#
# Rakefile for self publishing ebook of Kindle3 made by scanning paper book.
#
# modify parameters by your environment:
# SRC (must): source PDF file name.
# TOP, BOTTOM, LEFT, RIGHT: default margins (pixel) of trimming.
# SIZE: adjust image size by destination format.
# LEVEL (optional): level option of ImageMagick.
#
# for Debian or Ubuntu user, needs packages below:
# poppler-utils poppler-data imagemagick pdftk
#
# Source PDF: taken from the SRC environment variable, otherwise the first
# *.pdf that `ls` reports in the current directory.
SRC = ENV['SRC'] || `ls *.pdf|head -n 1|tr -d "\n"`

# Conversion profile requested through the TYPE environment variable.
requested_type = ENV['TYPE'] || 'bunko'

# Fixed profiles: name => [LEVEL, SIZE]. All of them use zero margins.
fixed_presets = {
  'bunko'     => ['0%,100%,0.3', 'x735'], # for small books reading portrait style
  'comic'     => ['0%,100%,0.5', 'x735'], # for small books reading portrait style
  'landscape' => ['0%,100%,0.5', '722'],  # for large books reading landscape style
  'slide'     => ['0%,100%,0.3', 'x735']  # for small books reading portrait style
}

# "custom" and "test" read every parameter from the environment.
env_driven = %w[custom test].include?( requested_type )

# Unknown profile names fall back to "default". Each constant is assigned
# exactly once, so Ruby no longer warns about already-initialized constants.
TYPE = (fixed_presets.key?( requested_type ) || env_driven) ? requested_type : 'default'

# The "test" profile disables automatic trimming.
TRIM = requested_type == 'test' ? '' : '-fuzz 50% -trim'

if env_driven
  # Values coming from ENV are strings; they are only interpolated into
  # command lines, so no numeric conversion is done here.
  TOP    = ENV['TOP'] || 0
  BOTTOM = ENV['BOTTOM'] || 0
  LEFT   = ENV['LEFT'] || 0
  RIGHT  = ENV['RIGHT'] || 0
  LEVEL  = ENV['LEVEL'] || '0%,100%,0.3'
  SIZE   = ENV['SIZE'] || 'x735'
else
  TOP    = 0
  BOTTOM = 0
  LEFT   = 0
  RIGHT  = 0
  # The "default" profile uses the same values as "comic".
  LEVEL, SIZE = fixed_presets.fetch( requested_type, ['0%,100%,0.5', 'x735'] )
end
# original gist memo.
# SIZE = 'x693' # for generating mobi, portrait style only
#LEVEL = '0%,100%'
#---------------------------------------------------------
# Working directories for intermediate files; each one gets a rake
# `directory` task so other tasks can depend on it.
PGM_DIR = './pgm'
PNG_DIR = './png'
PDF_DIR = './pdf'
[PGM_DIR, PNG_DIR, PDF_DIR].each { |work_dir| directory work_dir }

# Output and side-file names, derived from SRC by swapping a trailing ".pdf".
pdf_suffix = /\.pdf$/
DST  = SRC.sub( pdf_suffix, ".#{TYPE}.pdf" )
MOBI = SRC.sub( pdf_suffix, '.mobi' )
OPF  = SRC.sub( pdf_suffix, '.opf' )
HTML = SRC.sub( pdf_suffix, '.html' )
LOG  = SRC.sub( pdf_suffix, '.log' )
CONVINFO = './convdata.txt'

# The statements below run every time this Rakefile is loaded: they append a
# timestamp and the equivalent rake command line to the log, and rewrite
# CONVINFO as an InfoKey/InfoValue pair holding that command line.
sh "LANG=C date >> #{LOG}"
CMD = "SRC=#{SRC} TYPE=#{TYPE} TOP=#{TOP} BOTTOM=#{BOTTOM} LEFT=#{LEFT} RIGHT=#{RIGHT} LEVEL=#{LEVEL} SIZE=#{SIZE} rake"
sh "echo #{CMD} >> #{LOG}"
sh "echo InfoKey: ConvInfoCmd > #{CONVINFO}"
sh "echo InfoValue: #{CMD} >> #{CONVINFO}"
# Runs `pdfinfo` on SRC and returns its report as a UTF-8 string.
# The command is given as an argument list, so no shell is involved and a
# file name containing spaces or shell metacharacters is passed through
# intact. This also avoids the deprecated `open("|cmd")` pipe form.
def pdfinfo_report
  IO.popen( ['pdfinfo', SRC], 'r:utf-8', &:read )
end

# Returns the first capture group of the first match of +pattern+ in the
# pdfinfo report, or nil when the report has no such line.
def pdfinfo_field( pattern )
  pdfinfo_report[pattern, 1]
end

# Number of pages in SRC according to pdfinfo (0 when no "Pages:" line exists).
def count_pages
  pdfinfo_field( /^Pages:\s*(\d+)/ ).to_i
end

# Title of SRC according to pdfinfo, or nil when no "Title:" line exists.
def book_title
  pdfinfo_field( /^Title:\s*(.+)$/ )
end

# Author of SRC according to pdfinfo, or nil when no "Author:" line exists.
def book_author
  pdfinfo_field( /^Author:\s*(.+)$/ )
end
# Builds the list of per-page image paths "<dir>/tmp-<page>.<ext>" for pages
# 1..count.
#
# pdftoppm (poppler) zero-pads the page number to the number of digits in the
# document's page count, so the padding width is derived from +count+ instead
# of being hard-coded to 2 or 3 digits. Counts from 10 to 999 produce the same
# names as before; fewer than 10 pages and 1000 or more pages now get the
# width that matches the files pdftoppm writes.
#
# dir::   directory part of every path
# ext::   file extension without the dot
# count:: number of pages; 0 or less yields an empty list
def image_list( dir, ext, count )
  digits = count.to_s.length
  (1..count).map do |page|
    "#{dir}/tmp-#{page.to_s.rjust( digits, '0' )}.#{ext}"
  end
end
# Returns the page-geometry width of +png+ as reported by
# `identify -verbose`, or 0 when the report has no "Page geometry:" line.
# The command is passed as an argument list (no shell, no quoting problems)
# and no longer relies on the deprecated `open("|cmd")` pipe form.
def width( png )
  report = IO.popen( ['identify', '-verbose', png], 'r:utf-8', &:read )
  report[/Page geometry:\s+(\d+)x.*/, 1].to_i
end
# Overwrites +png+ with a 1x1 white image when its reported width is greater
# than 10000 pixels; otherwise leaves the file alone.
def checkpng( png )
  sh "convert -size 1x1 xc:white #{png}" if width( png ) > 10000
end
# Converts one extracted page image (+pgm+) into a cropped, resized +png+.
#
# Pipeline passed to ImageMagick `convert`:
# 1. apply the LEVEL adjustment,
# 2. chop LEFT/TOP pixels off the top-left corner,
# 3. chop RIGHT/BOTTOM pixels off the bottom-right corner,
# 4. reset gravity, apply the optional TRIM options, resize to SIZE.
#
# The command is assembled from separate parts joined with a space. The
# previous line-continuation form left no separator between the BOTTOM value
# and the following "-gravity" option.
#
# Afterwards checkpng inspects the result.
def pgm2png( pgm, png )
  command = [
    "convert #{pgm} -level '#{LEVEL}'",
    "-chop #{LEFT}x#{TOP}",
    "-gravity SouthEast -chop #{RIGHT}x#{BOTTOM}",
    "-gravity NorthWest #{TRIM} -resize #{SIZE} #{png}"
  ].join( ' ' )
  sh command
  checkpng( png )
end
# The page count of SRC decides how many intermediate images are expected.
pages = count_pages
PGMS = image_list( PGM_DIR, 'pgm', pages )
PNGS = image_list( PNG_DIR, 'png', pages )

# One pair of file tasks per page.
PGMS.zip( PNGS ).each do |pgm_path, png_path|
  # The PNG is produced from its matching PGM (the second prerequisite).
  file png_path => [PNG_DIR, pgm_path] do |t|
    pgm2png( t.prerequisites[1], t.name )
  end

  # The pdftoppm command carries no page range, so one run covers the whole
  # document; it is skipped once the last expected PGM already exists.
  file pgm_path => [PGM_DIR, SRC] do
    sh "pdftoppm -r 300 -gray #{SRC} #{PGM_DIR}/tmp" unless File.exist?( PGMS.last )
  end
end
task :default => :pdf

desc 'generate pdf file by concat all png files.'
task :pdf => DST

# Builds the final PDF:
# 1. convert the PNGs into intermediate PDFs, 50 pages at a time,
# 2. concatenate the intermediate PDFs with pdftk,
# 3. merge metadata.txt and CONVINFO and write them into the result.
file DST => [PDF_DIR, 'metadata.txt'] + PNGS do
  pdf_list = []
  # each_slice never yields an empty batch. The former `PNGS[i, 50]` loop got
  # an empty (but truthy) array when the page count was a multiple of 50 and
  # then ran `convert` with no input files.
  PNGS.each_slice( 50 ).with_index do |src_pngs, batch|
    # Intermediate files keep their names: 0.pdf, 50.pdf, 100.pdf, ...
    pdf_list << "#{PDF_DIR}/#{batch * 50}.pdf"
    sh "convert #{src_pngs.join ' '} -quality 50 #{pdf_list[-1]}"
  end
  sh "pdftk #{pdf_list.join ' '} cat output #{PDF_DIR}/#{DST}"
  sh "cat metadata.txt #{CONVINFO} > updateinfo.txt"
  sh "pdftk #{PDF_DIR}/#{DST} update_info updateinfo.txt output #{DST}"
  sh "rm updateinfo.txt"
end
desc 'generate metadata file from source pdf.'
task :metadata => 'metadata.txt'

# Dumps the source PDF's metadata with pdftk into ./metadata.txt.
file 'metadata.txt' => SRC do |t|
  inputs = t.prerequisites.join( ' ' )
  sh "pdftk #{inputs} dump_data output ./#{t.name}"
end
desc 'crop pgm files to png files.'
task :png => [PNG_DIR, *PNGS]

# Generic rule: a .png is built from the .pgm named by its first prerequisite.
rule '.png' => '.pgm' do |t|
  pgm2png( t.prerequisites.first, t.name )
end
# Depends on the PGM directory, the source PDF and every per-page PGM file.
desc 'extract image files from source pdf.'
task :pgm => [PGM_DIR, SRC, *PGMS]
desc 'cleanap pgm images.'
# Removes every expected PGM file. rm_f skips files that do not exist, so a
# single missing file no longer stops the removal of the remaining ones
# (the former `rm` aborted at the first missing file and the error was
# silently rescued).
task 'clean-pgm' do
  rm_f PGMS
end
desc 'cleanap png images.'
# Removes every expected PNG file. rm_f skips files that do not exist, so a
# single missing file no longer stops the removal of the remaining ones
# (the former `rm` aborted at the first missing file and the error was
# silently rescued).
task 'clean-png' do
  rm_f PNGS
end
desc 'cleanap temporaly pdf files.'
# Deletes every *.pdf found in the intermediate PDF directory.
task 'clean-pdf' do
  leftover_pdfs = FileList["#{PDF_DIR}/*.pdf"]
  rm leftover_pdfs
end
desc 'cleanap all tmp files.'
# Runs the three partial clean tasks, then removes the generated side files
# and the working directories.
task :clean => ['clean-png', 'clean-pgm', 'clean-pdf'] do
  # rm_f tolerates missing files: the HTML and OPF files only exist after the
  # mobi task has run, and metadata.txt may not have been generated yet.
  rm_f ['metadata.txt', HTML, OPF]
  # Only remove directories that are actually there, so a repeated or early
  # `rake clean` does not abort halfway.
  [PGM_DIR, PNG_DIR, PDF_DIR].each do |work_dir|
    rmdir work_dir if File.directory?( work_dir )
  end
end
desc 'generate MOBI file.'
# Needs the OPF package file, the HTML contents file and every page PNG;
# kindlegen is then run on the OPF.
task :mobi => [OPF, HTML, *PNGS] do |_t|
  sh "kindlegen #{OPF} -unicode -o #{MOBI}"
end
# Writes the OPF package file: title and author come from the source PDF,
# the first PNG is the embedded cover and the generated HTML file is the
# only manifest item.
rule '.opf' => '.pdf' do |t|
  # Title and author come from the PDF and may contain "&", "<" or ">";
  # escape them so the generated XML stays well-formed. nil becomes "".
  xml_text = ->( value ) { value.to_s.encode( xml: :text ) }
  opf = <<-OPF.gsub( /^\t/, '' )
<?xml version="1.0" encoding="utf-8"?>
<package unique-identifier="uid">
<metadata>
<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core"
xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
<dc:Title>#{xml_text.call( book_title )}</dc:Title>
<dc:Language>en-US</dc:Language>
<dc:Creator>#{xml_text.call( book_author )}</dc:Creator>
<dc:Date>#{Time::now.strftime '%m/%d/%Y'}</dc:Date>
</dc-metadata>
<x-metadata>
<output encoding="utf-8" content-type="text/x-oeb1-document"></output>
<EmbeddedCover>#{PNGS[0]}</EmbeddedCover>
</x-metadata>
</metadata>
<manifest>
<item id="contents" media-type="text/html" href="#{HTML}"></item>
</manifest>
<spine>
<itemref idref="contents" />
</spine>
<tours></tours>
<guide>
<reference type="start" title="contents" href="#{HTML}"></reference>
</guide>
</package>
  OPF
  # File.open never interprets the target name as a command, unlike Kernel#open.
  File.open( t.name, 'w:utf-8' ) { |f| f.write( opf ) }
end
# Writes the HTML contents file: one <img> per page PNG, separated by
# Mobipocket page breaks.
rule '.html' => '.pdf' do |t|
  # The title comes from the PDF and may contain "&", "<" or ">"; escape it
  # so the markup stays valid. A missing title becomes an empty string.
  safe_title = book_title.to_s.encode( xml: :text )
  html = <<-HTML.gsub( /^\t/, '' )
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html lang="ja-JP">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>#{safe_title}</title>
</head>
<body style="text-align: right;">
#{PNGS.map{|j| %Q|<img style="height: 100%;" src="#{j}" />|}.join "<mbp:pagebreak />\n\t\t"}
</body>
</html>
  HTML
  # File.open never interprets the target name as a command, unlike Kernel#open.
  File.open( t.name, 'w:utf-8' ) { |f| f.write( html ) }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment