tdtds/nikkei-scraper.rb

## nikkei-scraper.rb
#!/usr/bin/env ruby
# -*- coding: utf-8; -*-
#
# scraping nikkei.com for Kindle
#

require 'cgi/escape'
require 'open-uri'
require 'nokogiri'

TOP = 'http://www.nikkei.com'

class String
	# Returns a copy of the string in which every FULLWIDTH TILDE (U+FF5E)
	# is replaced by WAVE DASH (U+301C), working around the "WAVE DASH
	# problem" between the two look-alike characters.
	def canonical
		gsub( /\uFF5E/ ) { "\u301C" }
	end
end

# Runs the given block, re-running it whenever it raises a StandardError.
#
# times:: upper bound on the number of attempts; once the failure count
#         reaches this value the last exception is re-raised to the caller.
#
# Every failure that is going to be retried is reported on $stderr (the
# exception message followed by "<n> retry."). Returns the block's value.
def retry_loop( times )
	failures = 0
	begin
		yield
	rescue => e
		failures += 1
		raise if failures >= times
		$stderr.puts e
		$stderr.puts "#{failures} retry."
		retry
	end
end

# Builds the opening part of a generated HTML page: doctype, <head> (with a
# link to nikkei.css) and the opening <body> followed by an <h1>.
#
# title:: plain-text page title. It is HTML-escaped here before being placed
#         in <title> and <h1>, so characters such as "&" or "<" in a headline
#         cannot produce broken markup. Callers must pass unescaped text.
#
# Returns the header as a String ending with a newline.
def html_header( title )
	safe_title = CGI.escapeHTML( title.to_s )
	<<-HTML.gsub( /^\t/, '' )
	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
	<html>
	<head>
		<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
		<title>#{safe_title}</title>
		<link rel="stylesheet" href="nikkei.css" type="text/css" media="all"></link>
	</head>
	<body>
		<h1>#{safe_title}</h1>
	HTML
end

# Returns the parsed (Nokogiri) document for an article page, using a file
# cache under src/.
#
# uri:: article URI, either absolute (http://www.nikkei.com/...) or
#       site-relative. The argument is NOT modified.
# sub:: optional suffix appended to the cache file name; html_item passes a
#       number here for continuation pages.
#
# When src/<aid><sub>.html exists it is parsed and returned without touching
# the network. Otherwise the page is downloaded from TOP (up to 5 attempts,
# pausing one second after a successful fetch), stored in the cache and
# returned. If every attempt fails, a message is printed to $stderr and the
# last exception is re-raised.
#
# NOTE(review): when uri2aid finds no id the cache file name degenerates to
# "src/<sub>.html"; callers are assumed to pass URIs that carry an id.
def get_html_item( uri, sub = nil )
	# work on a copy: stripping the host in place would alter the caller's string
	path = uri.sub( %r|\Ahttp://www\.nikkei\.com|, '' )
	aid = uri2aid( path )
	cache_file = "src/#{aid}#{sub}.html"

	if File.exist?( cache_file ) # loading cache
		return Nokogiri( File.open( cache_file, 'r:utf-8', &:read ) )
	end

	html = nil
	begin
		puts "getting html #{aid}#{sub}"
		retry_loop( 5 ) do
			# URI.open (open-uri): Kernel#open no longer opens URLs on Ruby 3.0+
			html = Nokogiri( URI.open( "#{TOP}#{path}", 'r:utf-8', &:read ) )
			sleep 1
		end
	rescue
		$stderr.puts "cannot get #{TOP}#{path}."
		raise
	end
	File.open( cache_file, 'w:utf-8' ) do |f|
		f.write( html.to_html )
	end
	html
end

# Extracts the body of an article page as an HTML fragment.
#
# html:: parsed (Nokogiri) document of an article page.
#
# For every div.cmn-article_text element, in this order:
# * site-relative images inside div.cmn-photo_style2 are downloaded from TOP
#   (with "PN" in the path replaced by "PB") into tmp/ and referenced by an
#   <img> tag; absolute (http...) image URLs and <img> tags without a src
#   are skipped, and a failed download is only reported on $stderr;
# * each <p> that does not contain an a.cmnc-continue link is emitted as a
#   <p> with its text HTML-escaped, after removing span.JSID_urlData nodes
#   and one leading ideographic space; empty paragraphs are dropped;
# * each <table> is copied verbatim.
#
# Returns the concatenated fragment as a String. Note that the
# span.JSID_urlData nodes are removed from the passed document itself.
def scrape_html_item( html )
	result = ''
	(html / 'div.cmn-article_text').each do |div|
		(div / 'div.cmn-photo_style2 img').each do |image_tag|
			image_url = image_tag.attr( 'src' )
			next if image_url.nil? || image_url.empty? # <img> without src: nothing to fetch
			next if /^http/ =~ image_url
			image_file = File.basename( image_url )
			puts "   getting image #{image_file}"
			begin
				# URI.open (open-uri): Kernel#open no longer opens URLs on Ruby 3.0+
				image = URI.open( "#{TOP}#{image_url.sub( /PN/, 'PB' )}", &:read )
				# image data is binary; text mode could corrupt it on some platforms
				File.binwrite( "tmp/#{image_file}", image )
				result << %Q|\t<p><img src="#{image_file}"></p>|
			rescue
				# best effort: a missing picture must not abort the whole article
				$stderr.puts "FAIL TO DOWNLOAD IMAGE: #{image_url}"
			end
		end
		(div / 'p').each do |text|
			next unless (text / 'a.cmnc-continue').empty?
			(text / 'span.JSID_urlData').remove
			para = text.text.strip.sub( /^　/, '' ).canonical
			# para is plain text extracted from the DOM, so escape it for HTML output
			result << "\t<p>#{CGI.escapeHTML( para )}</p>" unless para.empty?
		end
		(div / 'table').each do |table|
			result << table.to_html
		end
	end
	result
end

# Writes tmp/<aid>.html for one article and returns the table-of-contents
# entry (<li>) that links to it.
#
# item:: plain-text headline shown in the table of contents.
# uri::  article URI; its id is extracted with uri2aid.
#
# Returns '' without writing anything when the URI carries no article id.
# The page consists of html_header, the scraped first page, the scraped
# continuation pages (links under div.cmn-article_nation, de-duplicated and
# sorted) and html_footer. When the page has no h4/h2.cmn-article_title
# element the headline from the table of contents is used as the title
# instead of failing on nil.
def html_item( item, uri )
	aid = uri2aid( uri )
	return '' unless aid
	html = get_html_item( uri )

	title_node = (html / 'h4.cmn-article_title, h2.cmn-article_title')[0]
	title = (title_node ? title_node.text.strip : item.to_s).canonical

	# compact: an <a> without href would otherwise make sort raise
	continuation_links = (html / 'div.cmn-article_nation ul li a').map {|link|
		link.attr( 'href' )
	}.compact.sort.uniq

	File.open( "tmp/#{aid}.html", 'w:utf-8' ) do |f|
		f.puts html_header( title )
		f.puts scrape_html_item( html )
		continuation_links.each_with_index do |link, index|
			# continuation pages are cached with a numeric suffix starting at 2
			f.puts scrape_html_item( get_html_item( link, index + 2 ) )
		end
		f.puts html_footer
	end

	# item is plain text, so escape it before embedding it in the list entry
	%Q|\t\t<li><a href="#{aid}.html">#{CGI.escapeHTML( item.to_s )}</a></li>|
end

# Closing markup matching html_header: ends <body> and <html>.
# Returns a fresh two-line String terminated by a newline.
def html_footer
	"</body>\n</html>\n"
end

# Opening part of toc.ncx: XML declaration, NCX doctype, the document title
# stamped with the current local time (YYYY-MM-DD HH:MM), the opening
# <navMap> and the first navPoint (playOrder 0) pointing at toc.html.
# Returns a String ending with a newline.
def ncx_header
	generated_at = Time::now.strftime( '%Y-%m-%d %H:%M' )
	[
		'<?xml version="1.0" encoding="UTF-8"?>',
		'<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">',
		'<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">',
		"<docTitle><text>日経電子版 (#{generated_at})</text></docTitle>",
		'<navMap>',
		%Q|\t<navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>|
	].map {|line| "#{line}\n" }.join
end

# Builds the NCX <navPoint> element for one article.
#
# item::  plain-text headline used as the navigation label; it is
#         XML-escaped here so "&" or "<" in a headline cannot make toc.ncx
#         ill-formed.
# uri::   article URI; its id (via uri2aid) becomes the navPoint id and the
#         target file name.
# index:: value written to the playOrder attribute.
#
# Returns the element as a String, or '' when the URI carries no article id.
def ncx_item( item, uri, index )
	aid = uri2aid( uri )
	return '' unless aid
	label = CGI.escapeHTML( item.to_s )
	%Q|\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{label}</text></navLabel><content src="#{aid}.html" /></navPoint>|
end

# Closing markup matching ncx_header: ends <navMap> and <ncx>.
# Returns a fresh two-line String terminated by a newline.
def ncx_footer
	"</navMap>\n</ncx>\n"
end

# Opening part of nikkei.opf: XML declaration, <package>, the metadata block
# and the start of <manifest> with the three fixed entries (toc.ncx,
# nikkei.css, toc.html). Article <item> elements and opf_footer follow.
#
# The clock is read exactly once so that the title, the description and
# dc:Date always describe the same instant, even when the call happens
# across a minute or day boundary.
#
# Returns a String ending with a newline.
def opf_header
	now = Time::now
	stamp = now.strftime( '%Y-%m-%d %H:%M' )
	date = now.strftime( '%d/%m/%Y' )
	<<-XML.gsub( /^\t/, '' )
	<?xml version="1.0" encoding="utf-8"?>
	<package unique-identifier="uid">
		<metadata>
			<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
				<dc:Title>日経電子版 (#{stamp})</dc:Title>
				<dc:Language>en-US</dc:Language>
				<dc:Creator>日本経済新聞社</dc:Creator>
				<dc:Description>日経電子版、#{stamp}生成</dc:Description>
				<dc:Date>#{date}</dc:Date>
			</dc-metadata>
			<x-metadata>
				<output encoding="utf-8" content-type="text/x-oeb1-document"></output>
				<EmbeddedCover>nikkei.jpg</EmbeddedCover>
			</x-metadata>
		</metadata>
		<manifest>
			<item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
			<item id="style" media-type="text/css" href="nikkei.css"></item>
			<item id="index" media-type="text/html" href="toc.html"></item>
	XML
end

# Builds the OPF manifest <item> for one article.
#
# uri:: article URI; its id (via uri2aid) is used both as the item id and
#       as the base name of the referenced HTML file.
#
# Returns the element as a String, or '' when the URI carries no article id.
def opf_item( uri )
	aid = uri2aid( uri )
	return '' unless aid
	%Q|\t\t<item id="#{aid}" media-type="text/html" href="#{aid}.html"></item>|
end

# Closing part of nikkei.opf: ends <manifest>, writes the <spine> (one
# itemref per article id followed by the table of contents), an empty
# <tours> and the <guide>.
#
# aids:: article ids in reading order. The first one becomes the "start"
#        guide reference; when the list is empty that reference is omitted
#        instead of pointing at the non-existent file ".html".
#
# Returns a String ending with a newline.
def opf_footer( aids )
	r = <<-XML.gsub( /^\t/, '' )
	</manifest>
	<spine toc="toc">
	XML
	aids.each do |aid|
		r << %Q|\t<itemref idref="#{aid}" />\n|
	end
	r << <<-XML.gsub( /^\t/, '' )
		<itemref idref="index" />
	</spine>
	<tours></tours>
	<guide>
	  <reference type="toc" title="Table of Contents" href="toc.html"></reference>
	XML
	unless aids.empty?
		r << %Q|  <reference type="start" title="Top Story" href="#{aids[0]}.html"></reference>\n|
	end
	r << <<-XML.gsub( /^\t/, '' )
	</guide>
	</package>
	XML
	r
end

# Extracts the article id from a URI: the text following the first "g="
# that is followed by at least one character other than ";" and "$".
# (Inside the character class "$" is a literal dollar sign.)
#
# Returns the id as a String, or nil when the URI contains no such part.
def uri2aid( uri )
	uri[ /g=([^;$]+)/, 1 ]
end

# Writes the three index files of the e-book (tmp/toc.html, tmp/toc.ncx and
# tmp/nikkei.opf) and, through html_item, one HTML file per article.
#
# toc:: Array of categories. Each category is an Array whose String elements
#       are section headings and whose other elements are [headline, uri]
#       pairs.
#
# A heading starts a new <h2>/<ul> section in toc.html; the first heading
# also emits the page header, later ones close the previous list and insert
# a page break. Every article gets a list entry and an NCX navPoint; it is
# added to the OPF manifest and spine only the first time its id is seen.
def generate( toc )
	File.open( 'tmp/toc.html', 'w:utf-8' ) do |html|
	File.open( 'tmp/toc.ncx', 'w:utf-8' ) do |ncx|
	File.open( 'tmp/nikkei.opf', 'w:utf-8' ) do |opf|
		ncx.puts ncx_header
		opf.puts opf_header

		heading_seen = false
		play_order = 0
		manifest_ids = []

		toc.each do |category|
			category.each do |entry|
				if entry.instance_of?( String )
					# section heading
					if heading_seen
						html.puts "\t</ul>\n\t<mbp:pagebreak />"
					else
						html.puts html_header( 'Table of Contents' )
					end
					html.puts "\t<h2>#{entry}</h2>"
					html.puts "\t<ul>"
					heading_seen = true
					next
				end

				# [headline, uri] pair
				headline, uri = entry[0], entry[1]
				html.puts html_item( headline, uri )
				play_order += 1
				ncx.puts ncx_item( headline, uri, play_order )

				aid = uri2aid( uri )
				next if manifest_ids.include?( aid )
				opf.puts opf_item( uri )
				manifest_ids << aid if aid
			end
		end

		html.puts "\t</ul>"
		html.puts html_footer
		ncx.puts ncx_footer
		opf.puts opf_footer( manifest_ids )
	end
	end
	end
end


#
# main: build the table of contents from the top page, then generate files
#
toc = []
# The top page is read from the path/URL given as first argument, or from
# TOP. URI.open (open-uri) is required for URLs on Ruby 3.0+, where
# Kernel#open no longer opens them; for a local path it falls back to the
# ordinary file open.
top = Nokogiri( URI.open( ARGV[0] || TOP, 'r:utf-8', &:read ) )

#
# scraping top news
#
toc_top = ['TOP NEWS']

%w(first second_alone third fourth).each do |category|
	(top / "div.nx-top_news_#{category} h3 a").each do |a|
		toc_top << [a.text.strip.canonical, a.attr( 'href' )]
	end
end
toc << toc_top

#
# scraping all categories
#
(top / 'div.cmnc-genre').each do |genre|
	toc_cat = []
	(genre / 'h4.cmnc-genre_title a').each do |cat|
		# genres whose title link contains "local" are left out
		next if /local/ =~ cat.attr( 'href' )
		toc_cat << cat.text
		(genre / 'li a').each do |article|
			toc_cat << [article.text.canonical, article.attr( 'href' )]
		end
	end
	toc << toc_cat
end

# exit status: 0 on success, 1 when generation raised an error
begin
	generate( toc )
	exit( 0 )
rescue => e
	$stderr.puts e
	#e.backtrace.each{|i| $stderr.puts i}
	exit( 1 )
end

## nikkei.css
/* Stylesheet for the generated pages; the HTML header links it as nikkei.css. */

/* Reset: no margin, padding or text indent on any element. */
* {
	margin: 0px;
	padding: 0px;
	text-indent: 0px;
}

/* Page title. */
h1 {
	font-size: 150%;
	font-weight: bold;
}

/* Section heading, separated from the preceding content by a top margin. */
h2 {
	font-size: 120%;
	font-weight: bold;
	margin: 1em 0em 0em 0em;
}

/* Paragraphs: no first-line indent, one line of space above, double line height. */
p {
	text-indent: 0em;
	margin: 1em 0em 0em 0em;
	line-height: 200%;
}

/* Tables: the table draws the top and left border, each cell the right and
   bottom one, which together form a single-line grid. */
table {
	border-top: 1px solid #444;
	border-left: 1px solid #444;
	border-collapse: collapse;
	border-spacing: 0;
	background-color: #ffffff;
	padding: 4px;
	text-align: left;
}
/* Header cells. */
th {
	border-right:1px solid #444;
	border-bottom:1px solid #444;
	padding:0.3em 1em;
}

/* Data cells (same borders and padding as header cells). */
td {
	border-right:1px solid #444;
	border-bottom:1px solid #444;
	padding:0.3em 1em;
}
	#!/usr/bin/env ruby
	# -*- coding: utf-8; -*-
	#
	# scraping nikkei.com for Kindle
	#

	require 'nokogiri'
	require 'open-uri'

	TOP = 'http://www.nikkei.com'

	class String
	def canonical
	self.gsub( /\uFF5E/, "\u301C" ) # for WAVE DASH problem
	end
	end

	def retry_loop( times )
	count = 0
	begin
	yield
	rescue
	count += 1
	if count >= times
	raise
	else
	$stderr.puts $!
	$stderr.puts "#{count} retry."
	retry
	end
	end
	end

	def html_header( title )
	<<-HTML.gsub( /^\t/, '' )
	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
	<html>
	<head>
	<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
	<title>#{title}</title>
	<link rel="stylesheet" href="nikkei.css" type="text/css" media="all"></link>
	</head>
	<body>
	<h1>#{title}</h1>
	HTML
	end

	def get_html_item( uri, sub = nil )
	uri.sub!( %r|^http://www.nikkei.com|, '' )
	aid = uri2aid( uri )
	html = nil
	if File::exist?( "src/#{aid}#{sub}.html" ) # loading cache
	html = Nokogiri( open( "src/#{aid}#{sub}.html", 'r:utf-8', &:read ) )
	else
	begin
	puts "getting html #{aid}#{sub}"
	retry_loop( 5 ) do
	html = Nokogiri( open( "#{TOP}#{uri}", 'r:utf-8', &:read ) )
	sleep 1
	end
	rescue
	$stderr.puts "cannot get #{TOP}#{uri}."
	raise
	end
	open( "src/#{aid}#{sub}.html", 'w:utf-8' ) do |f|
	f.write( html.to_html )
	end
	end
	html
	end

	def scrape_html_item( html )
	result = ''
	(html / 'div.cmn-article_text').each do |div|
	(div / 'div.cmn-photo_style2 img').each do |image_tag|
	image_url = image_tag.attr( 'src' )
	next if /^http/ =~ image_url
	image_file = File::basename( image_url )
	puts " getting image #{image_file}"
	begin
	image = open( "#{TOP}#{image_url.sub /PN/, 'PB'}", &:read )
	open( "tmp/#{image_file}", 'w' ){|fp| fp.write image}
	result << %Q|\t<p><img src="#{image_file}"></p>|
	rescue
	$stderr.puts "FAIL TO DOWNLOAD IMAGE: #{image_url}"
	end
	end
	(div / 'p').each do |text|
	next unless (text / 'a.cmnc-continue').empty?
	(text / 'span.JSID_urlData').remove
	para = text.text.strip.sub( /^　/, '' ).canonical
	result << "\t<p>#{para}</p>" unless para.empty?
	end
	(div / 'table').each do |table|
	result << table.to_html
	end
	end
	result
	end

	def html_item( item, uri )
	aid = uri2aid( uri )
	return '' unless aid
	html = get_html_item( uri )

	open( "tmp/#{aid}.html", 'w:utf-8' ) do |f|
	f.puts html_header( (html / 'h4.cmn-article_title, h2.cmn-article_title')[0].text.strip.canonical )
	f.puts scrape_html_item( html )
	(html / 'div.cmn-article_nation ul li a').map {|link|
	link.attr( 'href' )
	}.sort.uniq.each_with_index do |link,index|
	f.puts scrape_html_item( get_html_item( link, index + 2 ) )
	end
	f.puts html_footer
	end

	%Q|\t\t<li><a href="#{aid}.html">#{item}</a></li>|
	end

	def html_footer
	<<-HTML.gsub( /^\t/, '' )
	</body>
	</html>
	HTML
	end

	def ncx_header
	<<-XML.gsub( /^\t/, '' )
	<?xml version="1.0" encoding="UTF-8"?>
	<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
	<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
	<docTitle><text>日経電子版 (#{Time::now.strftime '%Y-%m-%d %H:%M'})</text></docTitle>
	<navMap>
	<navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
	XML
	end

	def ncx_item( item, uri, index )
	aid = uri2aid( uri )
	aid ? %Q|\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{item}</text></navLabel><content src="#{aid}.html" /></navPoint>| : ''
	end

	def ncx_footer
	<<-XML.gsub( /^\t/, '' )
	</navMap>
	</ncx>
	XML
	end

	def opf_header
	<<-XML.gsub( /^\t/, '' )
	<?xml version="1.0" encoding="utf-8"?>
	<package unique-identifier="uid">
	<metadata>
	<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
	<dc:Title>日経電子版 (#{Time::now.strftime '%Y-%m-%d %H:%M'})</dc:Title>
	<dc:Language>en-US</dc:Language>
	<dc:Creator>日本経済新聞社</dc:Creator>
	<dc:Description>日経電子版、#{Time::now.strftime '%Y-%m-%d %H:%M'}生成</dc:Description>
	<dc:Date>#{Time::now.strftime( '%d/%m/%Y' )}</dc:Date>
	</dc-metadata>
	<x-metadata>
	<output encoding="utf-8" content-type="text/x-oeb1-document"></output>
	<EmbeddedCover>nikkei.jpg</EmbeddedCover>
	</x-metadata>
	</metadata>
	<manifest>
	<item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
	<item id="style" media-type="text/css" href="nikkei.css"></item>
	<item id="index" media-type="text/html" href="toc.html"></item>
	XML
	end

	def opf_item( uri )
	aid = uri2aid( uri )
	aid ? %Q|\t\t<item id="#{aid}" media-type="text/html" href="#{aid}.html"></item>| : ''
	end

	def opf_footer( aids )
	r = <<-XML.gsub( /^\t/, '' )
	</manifest>
	<spine toc="toc">
	XML
	aids.each do |aid|
	r << %Q|\t<itemref idref="#{aid}" />\n|
	end
	r << <<-XML.gsub( /^\t/, '' )
	<itemref idref="index" />
	</spine>
	<tours></tours>
	<guide>
	<reference type="toc" title="Table of Contents" href="toc.html"></reference>
	<reference type="start" title="Top Story" href="#{aids[0]}.html"></reference>
	</guide>
	</package>
	XML
	r
	end

	def uri2aid( uri )
	uri.scan( /g=([^;$]+)/ ).flatten[0]
	end

	def generate( toc )
	open( 'tmp/toc.html', 'w:utf-8' ) do |html|
	open( 'tmp/toc.ncx', 'w:utf-8' ) do |ncx|
	open( 'tmp/nikkei.opf', 'w:utf-8' ) do |opf|
	first = true
	toc_index = 0
	aids = []
	ncx.puts ncx_header
	opf.puts opf_header
	toc.each do |category|
	category.each do |article|
	if article.class == String
	html.puts first ?
	html_header( 'Table of Contents' ) :
	"\t</ul>\n\t<mbp:pagebreak />"
	html.puts "\t<h2>#{article}</h2>"
	html.puts "\t<ul>"
	first = false
	else
	html.puts html_item( article[0], article[1] )
	ncx.puts ncx_item( article[0], article[1], toc_index += 1 )
	unless aids.index( uri2aid( article[1] ) )
	opf.puts opf_item( article[1] )
	aids << uri2aid( article[1] ) if uri2aid( article[1] )
	end
	end
	end
	end
	html.puts "\t</ul>"
	html.puts html_footer
	ncx.puts ncx_footer
	opf.puts opf_footer( aids )
	end
	end
	end
	end


	toc = []
	top = Nokogiri( open( ARGV[0] || TOP, 'r:utf-8', &:read ) )

	#
	# scraping top news
	#
	toc_top = ['TOP NEWS']

	%w(first second_alone third fourth).each do |category|
	(top / "div.nx-top_news_#{category} h3 a").each do |a|
	toc_top << [a.text.strip.canonical, a.attr( 'href' )]
	end
	end
	toc << toc_top

	#
	# scraping all categories
	#
	(top / 'div.cmnc-genre').each do |genre|
	toc_cat = []
	(genre / 'h4.cmnc-genre_title a').each do |cat|
	next if /local/ =~ cat.attr( 'href' )
	toc_cat << cat.text
	(genre / 'li a').each do |article|
	toc_cat << [article.text.canonical, article.attr( 'href' )]
	end
	end
	toc << toc_cat
	end

	begin
	generate( toc )
	exit( 0 )
	rescue
	$stderr.puts $!
	#$@.each{|i| $stderr.puts i}
	exit( 1 )
	end
	* {
	margin: 0px;
	padding: 0px;
	text-indent: 0px;
	}

	h1 {
	font-size: 150%;
	font-weight: bold;
	}

	h2 {
	font-size: 120%;
	font-weight: bold;
	margin: 1em 0em 0em 0em;
	}

	p {
	text-indent: 0em;
	margin: 1em 0em 0em 0em;
	line-height: 200%;
	}

	table {
	border-top: 1px solid #444;
	border-left: 1px solid #444;
	border-collapse: collapse;
	border-spacing: 0;
	background-color: #ffffff;
	padding: 4px;
	text-align: left;
	}
	th {
	border-right:1px solid #444;
	border-bottom:1px solid #444;
	padding:0.3em 1em;
	}

	td {
	border-right:1px solid #444;
	border-bottom:1px solid #444;
	padding:0.3em 1em;
	}