Skip to content

Instantly share code, notes, and snippets.

@chiral

chiral/bookmaker.rb

Last active Dec 14, 2015
Embed
What would you like to do?
Automatic EPUB generator. The XPath of the content to extract can be specified in a YAML file. CSS and image files are included implicitly, and URL references to them are converted.
# -*- coding: utf-8 -*-
require 'erb'
require 'uri'
require 'yaml'
require 'nokogiri'
require 'open-uri'
require 'fileutils'
require 'pathname'
require 'gepub'
# Global configuration shared by the helpers below, loaded once at start-up
# from config.yml (relative path, so resolved against the current directory).
# File.read closes the file as soon as it has been read; the previous
# open(...).read left the handle open until garbage collection.
$config = YAML.load(File.read('config.yml'))
# Collects the inner HTML of every node that +xpath+ selects in +html+.
#
# html  - object responding to #search (the callers pass parsed Nokogiri documents).
# xpath - selector string handed straight to html.search.
#
# Returns an Array with one inner-HTML value per match, in match order.
def extract_all(html, xpath)
  html.search(xpath).map(&:inner_html)
end
# Fetches one page, extracts its title/body, downloads the stylesheets and
# images it references, and renders the page's top-level HTML file.
#
# url - absolute URL of the page (String). Its host selects the per-site
#       title/body selectors under $config['xpath_site'].
# dir - Pathname of the page's working directory; 'css' and 'img' below it
#       receive the downloads, and the rendered page is written into it.
#
# Returns an Array of paths relative to +dir+: the rendered top file followed
# by 'img/<name>' for every downloaded image.
# Raises ArgumentError when no selectors are configured for the URL's host.
def wget_process(url, dir)
  uri = URI.parse(url)
  # URI#open (mixed in by open-uri) rather than Kernel#open: Kernel#open no
  # longer fetches URLs on Ruby 3.0+, and it would run a command for a value
  # starting with '|'. The block form also closes the response IO.
  html = uri.open { |io| Nokogiri::HTML(io) }

  host = uri.host
  site = $config['xpath_site'][host]
  raise ArgumentError, "no xpath_site entry configured for host #{host.inspect}" unless site

  title = extract_all(html, site['title'])
  body = extract_all(html, site['body'])
  css = extract_all(html, $config['xpath']['css'])

  # Images are looked up inside the extracted body fragments only, not in the
  # whole page.
  xpath_img = $config['xpath']['img']
  img = body.flat_map { |fragment| extract_all(Nokogiri::HTML(fragment), xpath_img) }

  css_files = wget_all(uri, css, dir + 'css')
  img_files = wget_all(uri, img, dir + 'img')
  body = replace_img_src(body, img_files)

  vals = {
    'title' => title[0],
    'css' => css_files.values,
    'body' => body
  }
  top_file = fill_erb(vals, dir)

  # Stylesheets are downloaded but intentionally left out of the returned
  # list (the original had that line commented out).
  [top_file] + img_files.values.map { |f| 'img/' + f }
end
# Tells whether a file name must be skipped ("ng" = no good).
#
# s - candidate file name (String).
#
# Returns true when the name contains a '?', false otherwise.
def ng_file(s)
  s.include?('?')
end
# Downloads every URL in +us+ into +dir+ with the external `wget` command.
#
# base_uri - URI of the page the URLs came from; URLs that do not start with
#            'http' are resolved against it.
# us       - Array of URL strings (absolute or relative).
# dir      - Pathname of the target directory; each file is saved under its
#            URL basename.
#
# Names rejected by ng_file are skipped. The result of wget itself is not
# checked, so an entry is recorded even when the download fails.
#
# Returns a Hash mapping each processed URL (as given) to its local file name.
def wget_all(base_uri, us, dir)
  us.each_with_object({}) do |url, saved|
    file = File.basename(url)
    next if ng_file(file)

    uri = URI.parse(url)
    uri = base_uri + uri unless url.start_with?('http')

    # Argument-list form of system: no shell is involved, so characters such
    # as '&', ';' or spaces in a scraped URL can neither break the command
    # nor inject another one.
    system('wget', '-q', uri.to_s, '-O', (dir + file).to_s)
    saved[url] = file
  end
end
# Replaces every occurrence of the literal string +k+ in +s+ with +v+,
# modifying +s+ in place.
#
# s - String to modify (must be mutable).
# k - literal text to look for; an empty key is ignored.
# v - replacement text.
#
# The scan resumes right after each inserted replacement, so a replacement
# that itself contains the key (e.g. 'a.png' -> 'img/a.png') is handled once
# instead of being matched again forever.
#
# Returns nil; callers rely on the in-place mutation only.
def replace(s, k, v)
  return if k.empty?

  pos = 0
  while (pos = s.index(k, pos))
    s[pos, k.length] = v
    pos += v.length
  end
  nil
end
# Rewrites image references in the body fragments so they point at the
# locally saved copies under 'img/'.
#
# body - Array of HTML fragment strings; each string is modified in place.
# img  - Hash mapping an original image URL to its local file name.
#
# Returns a new Array holding the same (now modified) fragment strings.
def replace_img_src(body, img)
  body.map do |fragment|
    img.each { |src, local_name| replace(fragment, src, 'img/' + local_name) }
    fragment
  end
end
# Renders the ERB template for one page and writes the result into +dir+.
#
# c   - Hash with the keys 'title', 'css' and 'body'.
# dir - Pathname of the page directory. The template named by
#       $config['default']['erb'] is read from two levels above it, and the
#       output file named by $config['default']['top'] is written inside it.
#
# Returns the output file name (relative to +dir+).
def fill_erb(c, dir)
  # title, css and body look unused here, but the template reads them
  # through `binding`, so these local names must stay as they are.
  title, css, body = c.values_at('title', 'css', 'body')

  defaults = $config['default']
  erb_file = defaults['erb']
  top_file = defaults['top']

  template = File.read(dir + '../..' + erb_file)
  File.write(dir + top_file, ERB.new(template).result(binding))
  top_file
end
# Packs the collected files into an EPUB using GEPUB::Builder.
#
# files - Array of file paths, added to the book in the given order;
#         presumably relative to +dir+, since +dir+ is passed as the
#         builder's workdir (confirm against the gepub documentation).
# dir   - directory handed to the builder as its workdir (converted with to_s).
#
# The identifier, language, title and the output name 'hoge1.epub' (placed
# next to this script) are hard-coded.
#
# Returns whatever GEPUB's generate_epub returns.
def gen_epub(files, dir)
  epub_path = File.join(File.dirname(__FILE__), 'hoge1.epub')
  resource_root = dir.to_s

  book = GEPUB::Builder.new do
    unique_identifier('lab.adfive.net/test.epub', 'BookId', 'URL')
    language('ja')
    title('EPUBクリッパーのテスト')
    resources(workdir: resource_root) do
      ordered do
        files.each do |name|
          file(name)
        end
      end
    end
  end

  book.generate_epub(epub_path)
end
# Script entry point: downloads every page listed in <name>.yml and bundles
# the results into one EPUB.
#
# <name> is the first command-line argument and defaults to 'src'. Each list
# entry must provide a 'url'. Page i is processed in <cwd>/<name>/<i>/, with
# 'img' and 'css' subdirectories created beforehand.
def main
  name = ARGV[0] || 'src'
  # File.read closes the file right away and never treats the name as a
  # command, unlike the previous open(file).read.
  src = YAML.load(File.read("#{name}.yml"))

  dir = Pathname.pwd
  book_dir = dir + name
  FileUtils.mkdir_p(book_dir)
  Dir.chdir(book_dir)

  files = []
  src.each_with_index do |item, i|
    new_dir = book_dir + i.to_s
    # mkdir_p also creates new_dir itself.
    FileUtils.mkdir_p(new_dir + 'img')
    FileUtils.mkdir_p(new_dir + 'css')
    Dir.chdir(new_dir)
    # Prefix each returned path with the page index so it is relative to
    # book_dir, which is the directory passed to gen_epub.
    wget_process(item['url'], new_dir).each { |f| files.push("#{i}/#{f}") }
  end

  p files
  Dir.chdir(dir)
  gen_epub(files, book_dir)
end

main
# Configuration for the EPUB generator script; its keys match what the script
# looks up in $config after loading config.yml.
# NOTE(review): nesting indentation appears to be lost in this copy. The
# script reads $config['default']['erb'], $config['xpath']['css'] and
# $config['xpath_site'][host]['title'], so the keys below are expected to be
# nested accordingly.

# default.erb: template file name read by fill_erb;
# default.top: name of the HTML file fill_erb writes for each page.
default:
erb: "default.erb"
top: "index.html"
# Selectors applied by wget_process to find the assets to download:
# css is run against the whole page, img against the extracted body fragments.
xpath:
css: "link[rel='stylesheet']/@href"
img: "img/@src"
# Per-site selectors, keyed by the host name of the page URL.
# title: the first match becomes the page title; body: matches become the content.
xpath_site:
gigazine.net:
title: "title"
body: "#maincol"
hatedebu.hatenablog.com:
title: "title"
body: ".entry-content"
anond.hatelabo.jp:
title: "title"
body: "#body"
<!DOCTYPE html>
<html lang="ja" xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta charset="utf-8" />
<title><%= title %></title>
</head>
<body>
<% body.each do |c| %>
<%= c %>
<% end %>
</body>
</html>
# List of pages to fetch. main loads "<name>.yml" (name defaults to 'src')
# and passes each entry's url to wget_process, in this order.
# The host of every url needs a matching entry under xpath_site in the config.
-
url: http://gigazine.net/news/20130301-chicken-ramen-chips/
-
url: http://hatedebu.hatenablog.com/entry/2013/03/01/164135
-
url: http://anond.hatelabo.jp/20130303084659
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment