Created
January 4, 2011 08:51
-
-
Save tdtds/764551 to your computer and use it in GitHub Desktop.
日経電子版をスクレイピングして、Kindle向けOPFを作る
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# -*- coding: utf-8; -*-
#
# Scrapes nikkei.com and writes HTML, NCX and OPF files for building a Kindle book.
#
require 'cgi'
require 'nokogiri'
require 'open-uri'
# Base URL of the site being scraped; site-relative article and image paths
# are appended to it, and it is the default page read by the script.
TOP = 'http://www.nikkei.com'.freeze
# Adds a text-normalisation helper to String; the scraper applies it to the
# titles and paragraphs it extracts.
class String
  # Returns a copy of the string in which every U+FF5E (FULLWIDTH TILDE) is
  # replaced by U+301C (WAVE DASH). The original author's note calls this a
  # workaround "for WAVE DASH problem".
  def canonical
    tr( "\uFF5E", "\u301C" )
  end
end
# Runs the given block, running it again whenever it raises a StandardError.
#
# times - maximum number of attempts. When the last allowed attempt fails,
#         its exception is re-raised to the caller.
#
# Each failed attempt that will be retried is reported on $stderr (the
# exception followed by "<n> retry.").
#
# Returns the value of the block from the first attempt that succeeds.
def retry_loop( times )
  failures = 0
  begin
    yield
  rescue => e
    failures += 1
    raise if failures >= times # out of attempts: propagate the last error
    $stderr.puts e
    $stderr.puts "#{failures} retry."
    retry
  end
end
# Builds the opening part of a generated HTML page: doctype, <head> (with a
# link to nikkei.css) and the start of <body> with the title as <h1>.
#
# title - plain-text page title. It is HTML-escaped before being inserted so
#         that characters such as "&" or "<" cannot break the markup.
#
# Returns the markup as a String ending in a newline; html_footer supplies the
# matching closing tags.
def html_header( title )
  safe_title = CGI.escapeHTML( title.to_s )
  <<~HTML
    <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
    <html>
    <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
    <title>#{safe_title}</title>
    <link rel="stylesheet" href="nikkei.css" type="text/css" media="all"></link>
    </head>
    <body>
    <h1>#{safe_title}</h1>
  HTML
end
# Returns the parsed page for an article, downloading it only when no cached
# copy exists.
#
# uri - article URL, either site-relative or absolute under the nikkei.com
#       host. The argument itself is not modified.
# sub - optional suffix appended to the article id in the cache file name
#       (nil for none).
#
# The page is cached as src/<aid><sub>.html, where <aid> comes from uri2aid.
# A download is attempted up to five times, with a one-second pause after a
# successful fetch; the src directory is created when missing.
#
# Returns the document produced by Nokogiri().
# Raises the download error (after logging the URL to $stderr) when all
# attempts fail.
def get_html_item( uri, sub = nil )
  # Work on a copy: the original stripped the host with sub!, silently
  # rewriting the caller's string.
  path = uri.sub( %r|\Ahttp://www\.nikkei\.com|, '' )
  aid = uri2aid( path )
  cache_file = "src/#{aid}#{sub}.html"
  return Nokogiri( File.read( cache_file, encoding: 'utf-8' ) ) if File.exist?( cache_file )

  html = nil
  begin
    puts "getting html #{aid}#{sub}"
    retry_loop( 5 ) do
      # URI.open is required here: Kernel#open no longer fetches URLs on Ruby 3.0+.
      html = Nokogiri( URI.open( "#{TOP}#{path}", 'r:utf-8', &:read ) )
      sleep 1
    end
  rescue
    $stderr.puts "cannot get #{TOP}#{path}."
    raise
  end

  Dir.mkdir( 'src' ) unless File.directory?( 'src' )
  File.open( cache_file, 'w:utf-8' ) { |f| f.write( html.to_html ) }
  html
end
# Extracts the body of an article page as simplified HTML.
#
# html - parsed article page (as returned by get_html_item).
#
# For every div.cmn-article_text element, in this order:
# * images inside div.cmn-photo_style2 whose src is site-relative are
#   downloaded into tmp/ (with "PN" in the path replaced by "PB") and
#   referenced by file name; a failed download is logged and skipped;
# * each <p> that does not contain an a.cmnc-continue link becomes a <p>
#   with its text HTML-escaped (span.JSID_urlData children are removed first);
# * each <table> is copied as-is.
#
# Returns the concatenated markup as a String ('' when nothing matched).
def scrape_html_item( html )
  fragments = []
  (html / 'div.cmn-article_text').each do |div|
    (div / 'div.cmn-photo_style2 img').each do |image_tag|
      image_url = image_tag.attr( 'src' )
      # Skip images without a src and images hosted elsewhere (absolute URLs).
      next if image_url.nil? || /^http/ =~ image_url
      image_file = File.basename( image_url )
      puts " getting image #{image_file}"
      begin
        # URI.open: Kernel#open no longer fetches URLs on Ruby 3.0+.
        image = URI.open( "#{TOP}#{image_url.sub( /PN/, 'PB' )}", 'rb', &:read )
        # Binary mode keeps image bytes intact on platforms that translate newlines.
        File.open( "tmp/#{image_file}", 'wb' ) { |fp| fp.write( image ) }
        fragments << %Q|\t<p><img src="#{image_file}"></p>|
      rescue
        # Best effort: an article is still usable without one of its images.
        $stderr.puts "FAIL TO DOWNLOAD IMAGE: #{image_url}"
      end
    end

    (div / 'p').each do |text|
      next unless (text / 'a.cmnc-continue').empty?
      (text / 'span.JSID_urlData').remove
      # NOTE(review): after strip, /^ / can only match at the start of a later
      # line; the pattern may originally have targeted an ideographic space
      # (U+3000) -- confirm against the upstream file.
      para = text.text.strip.sub( /^ /, '' ).canonical
      fragments << "\t<p>#{CGI.escapeHTML( para )}</p>" unless para.empty?
    end

    (div / 'table').each { |table| fragments << table.to_html }
  end
  fragments.join
end
# Writes tmp/<aid>.html for one article and returns its table-of-contents entry.
#
# item - link text of the article as shown in the listing.
# uri  - article URL; its id is extracted with uri2aid.
#
# The page title is taken from the first h4/h2.cmn-article_title element and
# falls back to +item+ when the page has none. The hrefs found under
# div.cmn-article_nation are fetched as well (presumably continuation pages --
# confirm against the site markup) and appended to the same file; each one is
# passed to get_html_item with a numeric suffix starting at 2.
#
# Returns an <li> line linking to the generated file, or '' when +uri+ carries
# no article id.
def html_item( item, uri )
  aid = uri2aid( uri )
  return '' unless aid

  html = get_html_item( uri )
  title_node = (html / 'h4.cmn-article_title, h2.cmn-article_title')[0]
  title = title_node ? title_node.text.strip.canonical : item

  File.open( "tmp/#{aid}.html", 'w:utf-8' ) do |f|
    f.puts html_header( title )
    f.puts scrape_html_item( html )
    extra_pages = (html / 'div.cmn-article_nation ul li a').map { |link| link.attr( 'href' ) }
    # compact: an <a> without href would otherwise make sort raise.
    extra_pages.compact.sort.uniq.each_with_index do |link, index|
      f.puts scrape_html_item( get_html_item( link, index + 2 ) )
    end
    f.puts html_footer
  end

  %Q|\t\t<li><a href="#{aid}.html">#{CGI.escapeHTML( item.to_s )}</a></li>|
end
# Returns the closing tags that match the markup opened by html_header,
# as a String ending in a newline.
def html_footer
  <<~HTML
    </body>
    </html>
  HTML
end
# Returns the opening part of the NCX navigation file: XML declaration,
# doctype, a document title stamped with the current local time, the opening
# <navMap> tag and a first navPoint (playOrder 0) pointing at toc.html.
# ncx_footer supplies the matching closing tags.
def ncx_header
  generated_at = Time.now.strftime( '%Y-%m-%d %H:%M' )
  <<~XML
    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
    <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
    <docTitle><text>日経電子版 (#{generated_at})</text></docTitle>
    <navMap>
    <navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
  XML
end
# Builds the NCX navPoint line for one article.
#
# item  - label text; XML-escaped before insertion so that "&" or "<" in a
#         headline cannot make the NCX file ill-formed.
# uri   - article URL; its id (via uri2aid) becomes the navPoint id and the
#         name of the target HTML file.
# index - value written to the playOrder attribute.
#
# Returns the navPoint line, or '' when +uri+ carries no article id.
def ncx_item( item, uri, index )
  aid = uri2aid( uri )
  return '' unless aid

  label = CGI.escapeHTML( item.to_s )
  %Q|\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{label}</text></navLabel><content src="#{aid}.html" /></navPoint>|
end
# Returns the closing tags that match the markup opened by ncx_header,
# as a String ending in a newline.
def ncx_footer
  <<~XML
    </navMap>
    </ncx>
  XML
end
# Returns the opening part of the OPF package file: XML declaration, metadata
# (title, creator, description and date stamped with the current local time,
# cover image nikkei.jpg) and the start of the manifest with the three fixed
# entries toc.ncx, nikkei.css and toc.html. opf_item lines and opf_footer
# complete the document.
#
# The clock is read once so that title, description and date always agree.
#
# NOTE(review): dc:Language is "en-US" although the title and creator are
# Japanese -- confirm whether that is intentional before changing it.
def opf_header
  now = Time.now
  stamp = now.strftime( '%Y-%m-%d %H:%M' )
  <<~XML
    <?xml version="1.0" encoding="utf-8"?>
    <package unique-identifier="uid">
    <metadata>
    <dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
    <dc:Title>日経電子版 (#{stamp})</dc:Title>
    <dc:Language>en-US</dc:Language>
    <dc:Creator>日本経済新聞社</dc:Creator>
    <dc:Description>日経電子版、#{stamp}生成</dc:Description>
    <dc:Date>#{now.strftime( '%d/%m/%Y' )}</dc:Date>
    </dc-metadata>
    <x-metadata>
    <output encoding="utf-8" content-type="text/x-oeb1-document"></output>
    <EmbeddedCover>nikkei.jpg</EmbeddedCover>
    </x-metadata>
    </metadata>
    <manifest>
    <item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
    <item id="style" media-type="text/css" href="nikkei.css"></item>
    <item id="index" media-type="text/html" href="toc.html"></item>
  XML
end
# Builds the OPF manifest line for one article.
#
# uri - article URL; its id (via uri2aid) becomes the item id and the name of
#       the referenced HTML file.
#
# Returns the <item> line, or '' when +uri+ carries no article id.
def opf_item( uri )
  aid = uri2aid( uri )
  return '' unless aid

  %Q|\t\t<item id="#{aid}" media-type="text/html" href="#{aid}.html"></item>|
end
# Returns the closing part of the OPF package file: end of the manifest, the
# spine and the guide.
#
# aids - article ids in reading order. Each becomes an <itemref> in the spine,
#        followed by the fixed "index" entry. The first id is used as the
#        guide's "start" reference; when the list is empty that reference is
#        left out instead of pointing at ".html".
def opf_footer( aids )
  spine_items = aids.map { |aid| %Q|\t<itemref idref="#{aid}" />\n| }.join
  start_reference =
    if aids.empty?
      ''
    else
      %Q|<reference type="start" title="Top Story" href="#{aids.first}.html"></reference>\n|
    end

  spine_open = <<~XML
    </manifest>
    <spine toc="toc">
  XML
  guide_open = <<~XML
    <itemref idref="index" />
    </spine>
    <tours></tours>
    <guide>
    <reference type="toc" title="Table of Contents" href="toc.html"></reference>
  XML
  package_close = <<~XML
    </guide>
    </package>
  XML

  spine_open + spine_items + guide_open + start_reference + package_close
end
# Extracts the article id from a URL.
#
# uri - URL string, or nil.
#
# The id is the text following the first "g=" up to (not including) the next
# ";" or "$" character, or the end of the string.
#
# Returns the id as a String, or nil when +uri+ is nil or has no "g=" part.
def uri2aid( uri )
  return nil if uri.nil?

  uri[/g=([^;$]+)/, 1]
end
# Writes the three book files into tmp/: toc.html (human-readable contents),
# toc.ncx (navigation) and nikkei.opf (package), generating one HTML file per
# article along the way through html_item.
#
# toc - Array of categories. Each category is an Array whose String elements
#       are section headings and whose other elements are [title, uri] pairs.
#
# Behaviour worth knowing:
# * pairs whose uri carries no article id are skipped entirely;
# * an article listed more than once appears in toc.html each time, but gets
#   a single NCX navPoint and OPF item, because their id attributes must be
#   unique; playOrder therefore counts distinct articles without gaps;
# * the tmp directory is created when missing.
def generate( toc )
  Dir.mkdir( 'tmp' ) unless File.directory?( 'tmp' )
  File.open( 'tmp/toc.html', 'w:utf-8' ) do |html|
    File.open( 'tmp/toc.ncx', 'w:utf-8' ) do |ncx|
      File.open( 'tmp/nikkei.opf', 'w:utf-8' ) do |opf|
        first = true
        toc_index = 0
        aids = []
        ncx.puts ncx_header
        opf.puts opf_header

        toc.each do |category|
          category.each do |article|
            if article.is_a?( String )
              # A heading closes the previous list, or opens the page when it
              # is the very first one.
              html.puts first ? html_header( 'Table of Contents' ) : "\t</ul>\n\t<mbp:pagebreak />"
              html.puts "\t<h2>#{article}</h2>"
              html.puts "\t<ul>"
              first = false
            else
              title, uri = article
              aid = uri2aid( uri )
              next unless aid

              html.puts html_item( title, uri )
              next if aids.include?( aid )

              ncx.puts ncx_item( title, uri, toc_index += 1 )
              opf.puts opf_item( uri )
              aids << aid
            end
          end
        end

        if first
          # No heading was ever written: still emit a well-formed, empty page.
          html.puts html_header( 'Table of Contents' )
          html.puts "\t<ul>"
        end
        html.puts "\t</ul>"
        html.puts html_footer
        ncx.puts ncx_footer
        opf.puts opf_footer( aids )
      end
    end
  end
end
# Entry point: read the front page (a local file or URL given as the first
# command-line argument, TOP otherwise), collect the article links per
# category and hand them to generate.
toc = []
source = ARGV[0] || TOP
# Kernel#open no longer fetches URLs on Ruby 3.0+, so pick the opener explicitly.
opener = source =~ %r{\Ahttps?://} ? URI : File
top = Nokogiri( opener.open( source, 'r:utf-8', &:read ) )

#
# scraping top news
#
toc_top = ['TOP NEWS']
%w(first second_alone third fourth).each do |category|
  (top / "div.nx-top_news_#{category} h3 a").each do |a|
    href = a.attr( 'href' )
    next unless href # a link without href cannot be turned into an article
    toc_top << [a.text.strip.canonical, href]
  end
end
toc << toc_top

#
# scraping all categories
#
(top / 'div.cmnc-genre').each do |genre|
  toc_cat = []
  (genre / 'h4.cmnc-genre_title a').each do |cat|
    # Categories whose heading links to a "local" URL are left out.
    next if /local/ =~ cat.attr( 'href' )
    toc_cat << cat.text
    (genre / 'li a').each do |article|
      href = article.attr( 'href' )
      next unless href
      toc_cat << [article.text.canonical, href]
    end
  end
  toc << toc_cat unless toc_cat.empty?
end

begin
  generate( toc )
  exit( 0 )
rescue => e
  $stderr.puts e
  # e.backtrace.each { |line| $stderr.puts line } # enable for a full backtrace
  exit( 1 )
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Styles for the generated pages (presumably the nikkei.css that the
   generated HTML links to -- confirm the file name). */

/* Reset spacing on every element. */
* {
	margin: 0;
	padding: 0;
	text-indent: 0;
}

/* Page title. */
h1 {
	font-size: 150%;
	font-weight: bold;
}

/* Section heading. */
h2 {
	font-size: 120%;
	font-weight: bold;
	margin: 1em 0 0 0;
}

/* Body paragraphs: no first-line indent, double line spacing. */
p {
	text-indent: 0;
	margin: 1em 0 0 0;
	line-height: 200%;
}

/* Tables: the table draws the top and left edges, the cells draw the right
   and bottom edges, giving a single-line grid. */
table {
	border-top: 1px solid #444;
	border-left: 1px solid #444;
	border-collapse: collapse;
	border-spacing: 0;
	background-color: #ffffff;
	padding: 4px;
	text-align: left;
}

th,
td {
	border-right: 1px solid #444;
	border-bottom: 1px solid #444;
	padding: 0.3em 1em;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment