Skip to content

Instantly share code, notes, and snippets.

@tdtds
Created January 26, 2011 21:55
Show Gist options
  • Save tdtds/797555 to your computer and use it in GitHub Desktop.
Save tdtds/797555 to your computer and use it in GitHub Desktop.
Internet Watchのfeedから、Kindle向けOPFを作る
#!/usr/bin/env ruby
# -*- coding: utf-8; -*-
#
# scraping internet.watch.impress.co.jp for Kindle
#
require 'nokogiri'
require 'open-uri'
require 'uri'
require 'ostruct'
TOP = 'http://internet.watch.impress.co.jp'
def retry_loop( times )
count = 0
begin
yield
rescue
count += 1
if count >= times
raise
else
$stderr.puts $!
$stderr.puts "#{count} retry."
sleep 1
retry
end
end
end
def item_id( uri )
File::basename( uri.path, '.html' )
end
def get_article( uri )
cache = "cache/#{File::basename uri.path}"
begin
html = open( cache, 'r:Shift_JIS', &:read )
rescue Errno::ENOENT
puts "getting article: #{uri.path}".encode( Encoding::default_external )
html = retry_loop( 5 ) do
open( uri, 'r:Shift_JIS', &:read )
end
open( cache, 'w' ){|f| f.write html }
end
Nokogiri( html.encode 'UTF-8' )
end
def html_header( title )
<<-HTML.gsub( /^\t/, '' )
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
<title>#{title}</title>
<link rel="stylesheet" href="style.css" type="text/css" media="all"></link>
</head>
<body>
<h1>#{title}</h1>
HTML
end
def html_footer
<<-HTML.gsub( /^\t/, '' )
</body>
</html>
HTML
end
items = []
rdf_file = ARGV.shift || "#{TOP}/cda/rss/internet.rdf"
rdf = retry_loop( 5 ) do
Nokogiri( open( rdf_file, 'r:utf-8', &:read ) )
end
(rdf / 'item' ).each do |item|
uri = URI( item.attr( 'about' ) )
next unless /internet\.watch\.impress\.co\.jp/ =~ uri.host
uri.query = nil # remove query of 'from rss'
title = (item / 'title').text
items << OpenStruct::new( :uri => uri, :title => title )
end
now = Time::now
now_str = now.strftime( '%Y-%m-%d %H:%M' )
#
# generating articles in html
#
items.each do |item|
begin
article = get_article( item.uri )
open( "tmp/#{item_id item.uri}.html", 'w' ) do |f|
f.puts html_header( item.title )
contents = (article / 'div.mainContents')
(contents / 'img').each do |img|
org = img.attr( 'src' )
begin
img_file = retry_loop( 5 ) do
open( "#{TOP}#{org}", &:read )
end
cache = "#{org.gsub( /\//, '_' ).sub( /^_/, '' )}"
open( "tmp/#{cache}", 'w' ){|f| f.write img_file}
img.set_attribute( 'src', cache )
rescue OpenURI::HTTPError
$stderr.puts "skipped an image: #{TOP}#{org}"
end
end
f.puts contents.inner_html
f.puts html_footer
end
rescue
$stderr.puts "#{$!.class}: #$!"
$stderr.puts "skipped an article: #{item.uri}"
end
end
#
# generating TOC in html
#
open( "tmp/toc.html", 'w:utf-8' ) do |f|
f.write html_header( 'Table of Contents' )
f.puts "<ul>"
items.each do |item|
f.puts %Q|\t<li><a href="#{item_id item.uri}.html">#{item.title}</a></li>|
end
f.puts "</ul>"
f.write html_footer
end
#
# generating TOC in ncx
#
open( "tmp/toc.ncx", 'w:utf-8' ) do |f|
f.write <<-XML.gsub( /^\t/, '' )
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<docTitle><text>INTERNET Watch (#{now_str})</text></docTitle>
<navMap>
<navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
XML
items.each_with_index do |item, index|
f.puts %Q|\t\t<navPoint id="#{item_id item.uri}" playOrder="#{index}"><navLabel><text>#{item.title}</text></navLabel><content src="#{item_id item.uri}.html" /></navPoint>|
end
f.write <<-XML.gsub( /^\t/, '' )
</navMap>
</ncx>
XML
end
#
# generating OPF
#
open( "tmp/internetwatch.opf", 'w:utf-8' ) do |f|
f.write <<-XML.gsub( /^\t/, '' )
<?xml version="1.0" encoding="utf-8"?>
<package unique-identifier="uid">
<metadata>
<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
<dc:Title>INTERNET Watch (#{now_str})</dc:Title>
<dc:Language>en-US</dc:Language>
<dc:Creator>インプレス</dc:Creator>
<dc:Description>INTERNET Watch、#{now_str}生成</dc:Description>
<dc:Date>#{now.strftime( '%d/%m/%Y' )}</dc:Date>
</dc-metadata>
<x-metadata>
<output encoding="utf-8" content-type="text/x-oeb1-document"></output>
<EmbeddedCover>cover.gif</EmbeddedCover>
</x-metadata>
</metadata>
<manifest>
<item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
<item id="style" media-type="text/css" href="style.css"></item>
<item id="index" media-type="text/html" href="toc.html"></item>
XML
items.each do |item|
f.puts %Q|\t\t<item id="#{item_id item.uri}" media-type="text/html" href="#{item_id item.uri}.html"></item>|
end
f.write <<-XML.gsub( /^\t/, '' )
</manifest>
<spine toc="toc">
<itemref idref="index" />
XML
items.each do |item|
f.puts %Q|\t<itemref idref="#{item_id item.uri}" />\n|
end
f.write <<-XML.gsub( /^\t/, '' )
</spine>
<tours></tours>
<guide>
<reference type="toc" title="Table of Contents" href="toc.html"></reference>
<reference type="start" title="Table of Contents" href="toc.html"></reference>
</guide>
</package>
XML
end
* {
margin: 0px;
padding: 0px;
text-indent: 0px;
}
h1 {
font-size: 150%;
font-weight: bold;
}
h2 {
font-size: 120%;
font-weight: bold;
margin: 1em 0em 0em 0em;
}
p {
text-indent: 0em;
margin: 1em 0em 0em 0em;
line-height: 200%;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment