Last active
December 14, 2015 10:59
-
-
Save chiral/5076305 to your computer and use it in GitHub Desktop.
Automatic EPUB generator. The XPath of the content to extract can be specified per site in a YAML file. CSS and image files referenced by a page are fetched implicitly, and URL references to them are converted.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
require 'erb' | |
require 'uri' | |
require 'yaml' | |
require 'nokogiri' | |
require 'open-uri' | |
require 'fileutils' | |
require 'pathname' | |
require 'gepub' | |
# Global scraper configuration, parsed once from config.yml in the current
# working directory. The rest of the script reads the keys 'default'
# (template and output file names), 'xpath' (asset selectors) and
# 'xpath_site' (per-host title/body selectors).
# File.read closes the file itself; open(...).read left the handle open.
$config = YAML.load(File.read('config.yml'))
# Collects the inner HTML of every node matched by a selector.
#
# html  - any object responding to #search (a Nokogiri document in this script)
# xpath - selector string, passed to #search unchanged
#
# Returns an Array holding one inner_html value per match, in match order.
def extract_all(html, xpath)
  html.search(xpath).map { |node| node.inner_html }
end
# Fetches one page, extracts its title and body, downloads the assets it
# references and renders the result through the ERB template.
#
# url - absolute URL of the page; its host must have an entry under
#       $config['xpath_site'] providing 'title' and 'body' selectors
# dir - Pathname of the per-page output directory ('css' and 'img'
#       sub-directories are expected to exist already)
#
# Returns an Array of paths relative to dir: the rendered top file first,
# followed by 'img/<name>' for every downloaded image.
# Raises ArgumentError when the page's host has no xpath_site entry.
def wget_process(url, dir)
  uri = URI.parse(url)
  host = uri.host
  site = $config['xpath_site'][host]
  raise ArgumentError, "no xpath_site entry for host #{host.inspect} (#{url})" unless site

  # URI#open is provided by open-uri on every Ruby version (bare open(url)
  # stopped handling URLs in Ruby 3.0); the block form closes the handle.
  html = uri.open { |io| Nokogiri::HTML(io) }

  title = extract_all(html, site['title'])
  body = extract_all(html, site['body'])
  css = extract_all(html, $config['xpath']['css'])

  # Only images inside the extracted body fragments are collected; uniq avoids
  # downloading an image that is referenced more than once.
  xpath_img = $config['xpath']['img']
  img = body.flat_map { |fragment| extract_all(Nokogiri::HTML(fragment), xpath_img) }.uniq

  css_files = wget_all(uri, css, dir + 'css')
  img_files = wget_all(uri, img, dir + 'img')
  body = replace_img_src(body, img_files)

  vals = {
    'title' => title[0],
    'css' => css_files.values,
    'body' => body
  }
  top_file = fill_erb(vals, dir)

  # CSS files are downloaded but deliberately left out of the returned list.
  # Two image URLs can share a basename, so de-duplicate the entries.
  [top_file] + img_files.values.uniq.map { |f| 'img/' + f }
end
# Tells whether a file name is unusable as a local name, i.e. it contains
# a '?' (which wget_all uses to skip such URLs).
#
# s - file name String
#
# Returns true or false.
def ng_file(s)
  s.include?('?')
end
# Downloads a list of asset URLs into a directory using the wget command.
#
# base_uri - URI of the page the assets were found on; relative URLs are
#            resolved against it
# us       - Array of URL Strings (absolute or relative)
# dir      - Pathname of the directory that receives the files
#
# URLs whose basename is rejected by ng_file are skipped.
#
# Returns a Hash mapping each processed URL (as given) to the local file
# name (its basename). The entry is recorded even when wget fails, in which
# case a warning is printed.
def wget_all(base_uri, us, dir)
  us.each_with_object({}) do |url, fetched|
    file = File.basename(url)
    next if ng_file(file)

    uri = URI.parse(url)
    # Resolve anything without a scheme (relative or protocol-relative).
    uri = base_uri + uri unless uri.absolute?
    path = dir + file

    # Argument-vector form: no shell is started, so metacharacters in a
    # scraped URL or file name cannot inject commands.
    ok = system('wget', '-q', uri.to_s, '-O', path.to_s)
    warn "wget failed for #{uri}" unless ok

    fetched[url] = file
  end
end
# Replaces every occurrence of a literal substring, modifying the string
# in place.
#
# s - String to modify (must not be frozen)
# k - literal substring to look for (not a pattern)
# v - replacement text, inserted verbatim
#
# Text that has just been inserted is not scanned again, so the call
# terminates even when v contains k (e.g. 'a.png' -> 'img/a.png'); the
# previous index/slice!/insert loop never finished in that case, nor for
# an empty k.
#
# Returns nil; callers rely on the mutation of s only.
def replace(s, k, v)
  return if k.empty?

  # A String pattern is matched literally; the block form keeps backslashes
  # in v from being interpreted as back-references.
  s.gsub!(k) { v }
  nil
end
# Rewrites remote image URLs inside the body fragments so they point at the
# local copies under 'img/'.
#
# body - Array of HTML fragment Strings; each one is modified in place
# img  - Hash of original URL => local file name
#
# Returns a new Array containing the same (now modified) fragment Strings.
def replace_img_src(body, img)
  body.map do |fragment|
    img.each { |remote, local| replace(fragment, remote, 'img/' + local) }
    fragment
  end
end
# Renders the ERB template with the page's values and writes the result
# into the page directory.
#
# c   - Hash with the keys 'title', 'css' and 'body'
# dir - page directory (Pathname, or a String path); the template named by
#       $config['default']['erb'] is looked up two levels above it, and the
#       output is written to $config['default']['top'] inside it
#
# Returns the output file name (relative to dir).
def fill_erb(c, dir)
  # The template is evaluated against this method's binding, so these three
  # locals are its inputs even though nothing below reads them directly.
  title = c['title']
  css = c['css']
  body = c['body']

  erb_file = $config['default']['erb']
  top_file = $config['default']['top']

  # Pathname() makes a String dir work too; String#+ would glue the parts
  # together without separators.
  base = Pathname(dir)
  template = File.read(base + '../..' + erb_file)
  File.write(base + top_file, ERB.new(template).result(binding))
  top_file
end
# Packages the collected files into an EPUB with the GEPUB builder DSL.
#
# files - Array of file paths relative to dir, in reading order
# dir   - directory the files live in (converted with #to_s)
#
# The book is written to 'hoge1.epub' next to this script; identifier,
# language and title are fixed.
# Returns whatever GEPUB's generate_epub returns.
def gen_epub(files, dir)
  source_root = dir.to_s
  target = File.join(File.dirname(__FILE__), 'hoge1.epub')

  book = GEPUB::Builder.new do
    unique_identifier('lab.adfive.net/test.epub', 'BookId', 'URL')
    language('ja')
    title('EPUBクリッパーのテスト')
    resources(workdir: source_root) do
      ordered do
        files.each do |entry|
          file(entry)
        end
      end
    end
  end

  book.generate_epub(target)
end
# Script entry point: reads the list of pages from '<name>.yml', processes
# every page into '<cwd>/<name>/<index>/' and bundles the results into an
# EPUB.
#
# <name> is the first command-line argument and defaults to 'src'. Each list
# item is expected to provide a 'url' key. The collected manifest is printed
# with p before the EPUB is generated.
def main
  name = ARGV[0] || 'src'

  # File.read closes the handle. `|| []` tolerates an empty source file, for
  # which YAML.load returns a falsy value instead of a list.
  src = YAML.load(File.read("#{name}.yml")) || []

  dir = Pathname.pwd
  book_dir = dir + name
  FileUtils.mkdir_p(book_dir)
  Dir.chdir(book_dir)

  files = []
  src.each_with_index do |item, i|
    new_dir = book_dir + i.to_s
    FileUtils.mkdir_p(new_dir)
    %w[img css].each { |sub| FileUtils.mkdir_p(new_dir + sub) }
    Dir.chdir(new_dir)
    # Paths come back relative to the page directory; prefix them with the
    # page index so they are relative to the book directory.
    wget_process(item['url'], new_dir).each { |f| files.push("#{i}/#{f}") }
  end

  p files
  Dir.chdir(dir)
  gen_epub(files, book_dir)
end
# Run only when executed as a script, so the file can be loaded (e.g. from
# a console or another script) without starting a download.
main if $PROGRAM_NAME == __FILE__
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
default:
  erb: "default.erb"
  top: "index.html"
xpath:
  css: "link[rel='stylesheet']/@href"
  img: "img/@src"
xpath_site:
  gigazine.net:
    title: "title"
    body: "#maincol"
  hatedebu.hatenablog.com:
    title: "title"
    body: ".entry-content"
  anond.hatelabo.jp:
    title: "title"
    body: "#body"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<html lang="ja" xmlns="http://www.w3.org/1999/xhtml"> | |
<head> | |
<meta charset="utf-8" /> | |
<title><%= title %></title> | |
</head> | |
<body> | |
<% body.each do |c| %> | |
<%= c %> | |
<% end %> | |
</body> | |
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-
  url: http://gigazine.net/news/20130301-chicken-ramen-chips/
-
  url: http://hatedebu.hatenablog.com/entry/2013/03/01/164135
-
  url: http://anond.hatelabo.jp/20130303084659
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment