PatrickLerner/JapNewsToKindle

## JapNewsToKindle
#!/usr/bin/env ruby
# encoding: utf-8
# Version: 0.2a 2013-06-28

require 'cgi'
require 'fileutils'
require 'open-uri'
require 'rbconfig'
require 'tmpdir'

require 'nokogiri'
require 'trollop'
# Match offset (truthy) when the host OS name reported by RbConfig contains
# "mswin", "mingw" or "cygwin"; nil otherwise.
$is_windows = (/mswin|mingw|cygwin/ =~ RbConfig::CONFIG['host_os'])

# Converts every ASCII digit in +str+ to its full-width form while keeping
# h2/h3/h4 heading tag names in ASCII.
#
# str - String of text or markup.
#
# Returns a new String; +str+ itself is not modified.
def clean_string(str)
  widened = str.tr('0-9', '０-９')
  # The digit conversion also rewrites heading tag names (<h2> -> <h２>).
  # Restore every occurrence, not just the first, so that the closing tag is
  # repaired together with the opening one.
  widened.gsub('h２', 'h2').gsub('h３', 'h3').gsub('h４', 'h4')
end

# Unwraps every descendant element called +element_name+ below +node+: the
# element itself is removed and its children take its place.
#
# node         - node to search (modified in place).
# element_name - tag name to unwrap, e.g. 'span'.
#
# Returns the set of elements that matched the search.
def strip_element_tags(node, element_name)
  node.search(".//#{element_name}").each do |element|
    # Move the live child nodes instead of re-parsing inner_html. With the
    # re-parse, a nested element of the same name was replaced by a fresh
    # copy that is not part of the matched set, so it was never unwrapped.
    element.replace(element.children)
  end
end

# Drops furigana annotations below +node+: every <rt> element is removed and
# the surrounding <ruby> wrappers are unwrapped via strip_element_tags.
def strip_ruby_tags(node)
  readings = node.search('.//rt')
  readings.remove
  strip_element_tags node, 'ruby'
end

# Base class for a parsed news page.
#
# Subclasses set @doc (the parsed document) and three XPath strings,
# @XPath_title, @XPath_time and @XPath_article, which locate the headline,
# the date and the body. Every getter uses the first node its XPath matches
# and returns '' when nothing matches.
class Article
  # Returns the headline.
  #
  # options[:ruby]  - keep <ruby>/<rt> markup when truthy.
  # options[:clean] - return only the text content instead of markup.
  def get_title(options = {})
    node = first_match(@XPath_title)
    return '' unless node

    strip_ruby_tags node unless options[:ruby]
    return node.content.to_s if options[:clean]

    clean_string(node.to_s)
  end

  # Returns the date element's markup with <span> wrappers removed and
  # digits widened by clean_string. +options+ is accepted but not used.
  def get_date(options = {})
    node = first_match(@XPath_time)
    return '' unless node

    strip_element_tags node, 'span'
    clean_string(node.to_s)
  end

  # Returns the inner markup of the article body with <span> and <a>
  # wrappers removed; <ruby>/<rt> markup is kept only if options[:ruby].
  def get_content(options = {:ruby => false})
    node = first_match(@XPath_article)
    return '' unless node

    strip_ruby_tags node unless options[:ruby]
    strip_element_tags node, 'span'
    strip_element_tags node, 'a'
    clean_string(node.inner_html.to_s)
  end

  private

  # Returns a copy of the first node matching +xpath+, or nil if none does.
  #
  # The getters strip tags from the node they work on. Doing that on a copy
  # keeps @doc intact, so a furigana-free title lookup no longer removes the
  # furigana seen by a later get_title(:ruby => true).
  def first_match(xpath)
    node = @doc.xpath(xpath).first
    node && node.dup
  end
end

# Article whose headline, date and body are found through the element ids
# "newstitle", "newsDate" and "newsarticle".
class NHKEasyArticle < Article
  # Downloads and parses the page at +url+.
  #
  # url - String address of the page to fetch.
  def initialize(url)
    # Kernel#open stopped fetching URLs in Ruby 3.0 and runs a command when
    # its argument starts with "|". URI.open is the open-uri entry point on
    # Ruby 2.5+; older interpreters fall back to the previous call.
    page = URI.respond_to?(:open) ? URI.open(url) : open(url)
    @doc = Nokogiri::HTML(page)
    @XPath_title = '//*[@id="newstitle"]/h2'
    @XPath_time = '//*[@id="newsDate"]'
    @XPath_article = '//*[@id="newsarticle"]'
  end
end

# Article located through positional XPaths below the element with id
# "news". The getters rewrite the markup so that it carries the <h2> and
# <p id="newsDate"> elements that the generated stylesheet targets.
class NHKArticle < Article
  # Downloads and parses the page at +url+.
  #
  # url - String address of the page to fetch.
  def initialize(url)
    # Kernel#open stopped fetching URLs in Ruby 3.0 and runs a command when
    # its argument starts with "|". URI.open is the open-uri entry point on
    # Ruby 2.5+; older interpreters fall back to the previous call.
    page = URI.respond_to?(:open) ? URI.open(url) : open(url)
    @doc = Nokogiri::HTML(page)
    @XPath_title = '//*[@id="news"]/div[2]/div/div/div[1]/h1/span'
    @XPath_time = '//*[@id="news"]/div[2]/div/div/div[1]/h1/div'
    @XPath_article = '//*[@id="news"]/div[2]/div/div/div'
  end

  # Returns the headline with every "span" replaced by "h2", which turns the
  # <span> wrapper into a heading. to_s keeps this working when the base
  # class finds no title and hands back something other than a String.
  def get_title(options = {})
    super.to_s.gsub('span', 'h2')
  end

  # Returns the date, rewritten from <div class="time">..</div> to
  # <p id="newsDate">[..]</p>.
  def get_date(options = {})
    super.to_s.gsub('<div class="time">', '<p id="newsDate">[').gsub('</div>', ']</p>')
  end

  # Returns the body markup: the matched sections up to (not including) the
  # one with id "news_mkanren" are concatenated, and everything before the
  # last <p id="news_textbody"> is dropped.
  def get_content(options = {:ruby => false})
    sections = @doc.xpath(@XPath_article).take_while do |section|
      section['id'].to_s != 'news_mkanren'
    end

    html = sections.map do |section|
      # Strip tags on a copy: the title and date live inside these sections,
      # so editing @doc here would stop their XPaths from matching afterwards.
      copy = section.dup
      strip_ruby_tags copy unless options[:ruby]
      strip_element_tags copy, 'span'
      strip_element_tags copy, 'a'
      clean_string(copy.inner_html.to_s)
    end.join

    html.sub(/.*<p id="news_textbody">/m, '<p id="news_textbody">')
  end
end

# Writes an article to disk as <fileName>.html with a companion
# <fileName>.css. All work is done by the constructor.
class HTMLOutput
  # article  - object responding to get_title, get_date and get_content.
  # fileName - output path without extension; ".html" and ".css" are appended.
  # options  - Hash. :horizontal selects the horizontal stylesheet (the
  #            vertical one is used otherwise); the whole hash is also passed
  #            on to the article getters.
  def initialize(article, fileName, options = {})
    # The plain-text title ends up in element text and in an attribute value,
    # so it has to be escaped before it is inserted into the XHTML header.
    title = CGI.escapeHTML(article.get_title(:ruby => false, :clean => true).to_s)
    # Both files are written next to each other, so the stylesheet link must
    # be the bare file name; repeating the directory part of fileName would
    # be resolved relative to the HTML file and point to the wrong place.
    css_href = CGI.escapeHTML(File.basename(fileName) + '.css')

    @horizontal_css = <<eos
body {
  font-family: serif; }
h2, h3 {
  font-weight: bold;
  padding-top: 2em;
  margin-right: 1em;
  margin-left: 1em; }
h2 {
  font-size: 120%; }
p {
  text-indent: 1em; }
#newsDate {
  font-size: 90%;
  font-weight:bold;
  line-height: 1.5; }
eos

    # The vertical layout is the horizontal rules plus these overrides.
    @vertical_css = @horizontal_css + <<eos
body {
  -webkit-writing-mode: vertical-rl; }
#newsDate {
  padding-top: 10em;
  text-indent: -4em; }
eos

    @html_header = <<eos
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xml:lang="ja" xmlns="http://www.w3.org/1999/xhtml">
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  <meta http-equiv="Content-Style-Type" content="text/css" />
  <meta name="generator" content="pandoc" />
  <title>{{TITLE}}</title>
  <link rel="stylesheet" href="{{CSS_FILE}}" type="text/css" />
  <link rel="Schema.DC" href="http://purl.org/dc/elements/1.1/" />
  <meta name="DC.Title" content="{{TITLE}}" />
  <meta name="DC.Creator" content="NHK" />
  <meta name="DC.Publisher" content="NHK" /></head>
<body>
eos

    @html_footer = <<eos
</body>
</html>
eos

    # Block form: the replacement is inserted literally, so a backslash in
    # the title or path is not read as a back-reference.
    @html_header.gsub!('{{TITLE}}') { title }
    @html_header.gsub!('{{CSS_FILE}}') { css_href }

    File.open(fileName + '.css', 'w') do |file|
      file.write(options[:horizontal] ? @horizontal_css : @vertical_css)
    end

    File.open(fileName + '.html', 'w') do |file|
      file.write(@html_header)
      file.write(article.get_title(options))
      file.write(article.get_date(options))
      file.write(article.get_content(options))
      file.write(@html_footer)
    end
  end
end

# Builds <fileName>.mobi for an article: the HTML, CSS and OPF files are
# written to a temporary directory, kindlegen is run on the OPF file and the
# resulting .mobi is copied to the requested location. All work is done by
# the constructor.
class KindleOutput
  # article  - object responding to get_title, get_date and get_content.
  # fileName - output path without extension; ".mobi" is appended.
  # options  - Hash passed on unchanged to HTMLOutput.
  #
  # Raises Errno::ENOENT when kindlegen leaves no .mobi file behind.
  def initialize(article, fileName, options = {})
    # Escaped because the title is inserted into XML element text.
    title = CGI.escapeHTML(article.get_title(:ruby => false, :clean => true).to_s)
    # The work files sit directly inside the temporary directory, so any
    # directory part of fileName applies only to the final .mobi copy.
    base = File.basename(fileName)

    @opf_file = <<eos
<?xml version="1.0" encoding="UTF-8"?>
<package version="3.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
 <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/">
   <dc:title>{{TITLE}}</dc:title>
   <dc:contributor>NHK</dc:contributor>
   <dc:language>ja</dc:language>
   <dc:publisher>NHK</dc:publisher>
 </metadata>
 <manifest>
  <item id="style" href="{{CSS_FILE}}" media-type="text/css" />
  <item id="titlepage" href="{{FILENAME}}.html" media-type="application/xhtml+xml" />
 </manifest>
 <spine toc="tocncx" page-progression-direction="rtl">
  <itemref idref="titlepage" />
 </spine>
</package>
eos
    # Block form: the replacement is inserted literally, so a backslash in
    # the title or file name is not read as a back-reference.
    @opf_file.gsub!('{{TITLE}}') { title }
    @opf_file.gsub!('{{FILENAME}}') { CGI.escapeHTML(base) }
    @opf_file.gsub!('{{CSS_FILE}}') { CGI.escapeHTML(base + '.css') }

    Dir.mktmpdir do |dir|
      work_path = File.join(dir, base)
      HTMLOutput.new(article, work_path, options)

      File.open(work_path + '.opf', 'w') { |file| file.write(@opf_file) }

      # Argument-list form: no shell is involved, so quotes or "$" in a file
      # name derived from a page title cannot change the command.
      kindlegen = $is_windows ? 'kindlegen.exe' : 'kindlegen'
      system(kindlegen, work_path + '.opf')

      mobi = work_path + '.mobi'
      unless File.exist?(mobi)
        raise Errno::ENOENT, "kindlegen did not produce #{mobi} (is kindlegen in the PATH?)"
      end
      FileUtils.cp mobi, fileName + '.mobi'
    end
  end
end

# main part

# Command-line definition; Trollop parses ARGV into the opts hash.
opts = Trollop::options do
  version "JapNewsToKindle 0.2a (c) 2013 Patrick Lerner [PatrickLerner@me.com]"
  banner <<-EOS
This program dumps Japanese News websites into a kindle compatible mobi file using Amazon's kindlegen (needs to be in path!).

Usage:
       JapNewsToKindle [options]
where [options] are:
EOS

  opt :ruby, "Get furigana if possible", :short => 'r'
  opt :url, "The URL that is supposed to be dumped", :type => String, :short => 'u'
  opt :out, "The output filename", :type => String, :short => 'O'
  opt :horizontal, "Use a horizontal layout instead of the default vertical one", :default => false, :short => 'n'
  opt :open, "Open the generated file in the Kindle Application", :default => false, :short => 'o'
end

# Each entry pairs a URL pattern with the Article subclass that parses it.
# The dots are escaped so that only the literal host name matches.
backends = [
  [/nhk\.or\.jp\/news\/easy\/k[0-9]+\/k[0-9]+\.html/, NHKEasyArticle],
  [/nhk\.or\.jp\/news\/html\/[0-9]+\/[a-z][0-9]+\.html/, NHKArticle]
]

_, backend = backends.find { |pattern, _| pattern.match(opts[:url].to_s) }
Trollop::die :url, "must match against a backend supported by this program" unless backend

article = backend.new(opts[:url])
# A headline may contain "/" or "\", which would otherwise be read as a
# directory separator when it is used as the default output name.
fileName = opts[:out] || article.get_title(:ruby => false, :clean => true).gsub(%r{[/\\]}, '_')
KindleOutput.new(article, fileName, {:ruby => opts[:ruby], :horizontal => opts[:horizontal]})

if opts[:open] && !$is_windows
  # Stop the Kindle application and delete its copy of the book before the
  # new file is opened. Argument lists keep the file name away from a shell.
  system 'killall', 'Kindle'
  kindleFilePath = ENV['HOME'] + "/Library/Application Support/Kindle/My Kindle Content/#{File.basename(fileName)}.mobi"
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  FileUtils.rm kindleFilePath if File.exist?(kindleFilePath)
  system 'open', "#{fileName}.mobi"
end
exit
	#!/usr/bin/env ruby
	# encoding: utf-8
	# Version: 0.2a 2013-06-28

	require 'nokogiri'
	require 'open-uri'
	require 'tmpdir'
	require 'trollop'
	require 'rbconfig'
	# Match offset (truthy) when the host OS name reported by RbConfig contains
	# "mswin", "mingw" or "cygwin"; nil otherwise. The alternation bars must be
	# unescaped: "\|" matches a literal pipe character, which never occurs in
	# a host OS name.
	$is_windows = (RbConfig::CONFIG['host_os'] =~ /mswin|mingw|cygwin/)

	# Converts every ASCII digit in +str+ to its full-width form while keeping
	# h2/h3/h4 heading tag names in ASCII.
	#
	# str - String of text or markup.
	#
	# Returns a new String; +str+ itself is not modified.
	def clean_string(str)
	  widened = str.tr('0-9', '０-９')
	  # The digit conversion also rewrites heading tag names (<h2> -> <h２>).
	  # Restore every occurrence, not just the first, so that the closing tag
	  # is repaired together with the opening one.
	  widened.gsub('h２', 'h2').gsub('h３', 'h3').gsub('h４', 'h4')
	end

	# Unwraps every descendant element called +element_name+ below +node+: the
	# element itself is removed and its children take its place.
	#
	# node         - node to search (modified in place).
	# element_name - tag name to unwrap, e.g. 'span'.
	#
	# Returns the set of elements that matched the search.
	def strip_element_tags(node, element_name)
	  # Block parameters are written |element|; the backslash-escaped bars of
	  # the earlier text are not valid Ruby.
	  node.search(".//#{element_name}").each do |element|
	    # Move the live child nodes instead of re-parsing inner_html. With the
	    # re-parse, a nested element of the same name was replaced by a fresh
	    # copy that is not part of the matched set, so it was never unwrapped.
	    element.replace(element.children)
	  end
	end

	# Drops furigana annotations below +node+: every <rt> element is removed and
	# the surrounding <ruby> wrappers are unwrapped via strip_element_tags.
	def strip_ruby_tags(node)
	  readings = node.search('.//rt')
	  readings.remove
	  strip_element_tags node, 'ruby'
	end

	# Base class for a parsed news page.
	#
	# Subclasses set @doc (the parsed document) and three XPath strings,
	# @XPath_title, @XPath_time and @XPath_article, which locate the headline,
	# the date and the body. Every getter uses the first node its XPath matches
	# and returns '' when nothing matches.
	class Article
	  # Returns the headline.
	  #
	  # options[:ruby]  - keep <ruby>/<rt> markup when truthy.
	  # options[:clean] - return only the text content instead of markup.
	  def get_title(options = {})
	    node = first_match(@XPath_title)
	    return '' unless node

	    strip_ruby_tags node unless options[:ruby]
	    return node.content.to_s if options[:clean]

	    clean_string(node.to_s)
	  end

	  # Returns the date element's markup with <span> wrappers removed and
	  # digits widened by clean_string. +options+ is accepted but not used.
	  def get_date(options = {})
	    node = first_match(@XPath_time)
	    return '' unless node

	    strip_element_tags node, 'span'
	    clean_string(node.to_s)
	  end

	  # Returns the inner markup of the article body with <span> and <a>
	  # wrappers removed; <ruby>/<rt> markup is kept only if options[:ruby].
	  def get_content(options = {:ruby => false})
	    node = first_match(@XPath_article)
	    return '' unless node

	    strip_ruby_tags node unless options[:ruby]
	    strip_element_tags node, 'span'
	    strip_element_tags node, 'a'
	    clean_string(node.inner_html.to_s)
	  end

	  private

	  # Returns a copy of the first node matching +xpath+, or nil if none does.
	  #
	  # The getters strip tags from the node they work on. Doing that on a copy
	  # keeps @doc intact, so a furigana-free title lookup no longer removes the
	  # furigana seen by a later get_title(:ruby => true).
	  def first_match(xpath)
	    node = @doc.xpath(xpath).first
	    node && node.dup
	  end
	end

	# Article whose headline, date and body are found through the element ids
	# "newstitle", "newsDate" and "newsarticle".
	class NHKEasyArticle < Article
	  # Downloads and parses the page at +url+.
	  #
	  # url - String address of the page to fetch.
	  def initialize(url)
	    # Kernel#open stopped fetching URLs in Ruby 3.0 and runs a command when
	    # its argument starts with "|". URI.open is the open-uri entry point on
	    # Ruby 2.5+; older interpreters fall back to the previous call.
	    page = URI.respond_to?(:open) ? URI.open(url) : open(url)
	    @doc = Nokogiri::HTML(page)
	    @XPath_title = '//*[@id="newstitle"]/h2'
	    @XPath_time = '//*[@id="newsDate"]'
	    @XPath_article = '//*[@id="newsarticle"]'
	  end
	end

	# Article located through positional XPaths below the element with id
	# "news". The getters rewrite the markup so that it carries <h2> and
	# <p id="newsDate"> elements.
	class NHKArticle < Article
	  # Downloads and parses the page at +url+.
	  #
	  # url - String address of the page to fetch.
	  def initialize(url)
	    # Kernel#open stopped fetching URLs in Ruby 3.0 and runs a command when
	    # its argument starts with "|". URI.open is the open-uri entry point on
	    # Ruby 2.5+; older interpreters fall back to the previous call.
	    page = URI.respond_to?(:open) ? URI.open(url) : open(url)
	    @doc = Nokogiri::HTML(page)
	    @XPath_title = '//*[@id="news"]/div[2]/div/div/div[1]/h1/span'
	    @XPath_time = '//*[@id="news"]/div[2]/div/div/div[1]/h1/div'
	    @XPath_article = '//*[@id="news"]/div[2]/div/div/div'
	  end

	  # Returns the headline with every "span" replaced by "h2", which turns
	  # the <span> wrapper into a heading. to_s keeps this working when the
	  # base class finds no title and hands back something other than a String.
	  def get_title(options = {})
	    super.to_s.gsub('span', 'h2')
	  end

	  # Returns the date, rewritten from <div class="time">..</div> to
	  # <p id="newsDate">[..]</p>.
	  def get_date(options = {})
	    super.to_s.gsub('<div class="time">', '<p id="newsDate">[').gsub('</div>', ']</p>')
	  end

	  # Returns the body markup: the matched sections up to (not including)
	  # the one with id "news_mkanren" are concatenated, and everything before
	  # the last <p id="news_textbody"> is dropped.
	  def get_content(options = {:ruby => false})
	    # Block parameters are written |section|; the backslash-escaped bars of
	    # the earlier text are not valid Ruby.
	    sections = @doc.xpath(@XPath_article).take_while do |section|
	      section['id'].to_s != 'news_mkanren'
	    end

	    html = sections.map do |section|
	      # Strip tags on a copy: the title and date live inside these
	      # sections, so editing @doc here would stop their XPaths from
	      # matching afterwards.
	      copy = section.dup
	      strip_ruby_tags copy unless options[:ruby]
	      strip_element_tags copy, 'span'
	      strip_element_tags copy, 'a'
	      clean_string(copy.inner_html.to_s)
	    end.join

	    html.sub(/.*<p id="news_textbody">/m, '<p id="news_textbody">')
	  end
	end

	# Writes an article as <fileName>.css and <fileName>.html; all work is done
	# in the constructor. The stylesheet is the horizontal one when
	# options[:horizontal] is set, otherwise the horizontal rules followed by
	# the vertical overrides. The HTML file is the header template (with
	# {{TITLE}} and {{CSS_FILE}} filled in), the article's title, date and
	# content, then the footer.
	#
	# NOTE(review): the heredocs below are opened with `<<eos` but their
	# terminators are tab-indented, and block parameters are written \|file\|.
	# This looks like a Markdown-escaped copy of the script; confirm before
	# treating it as runnable Ruby.
	class HTMLOutput
	# article  - object that provides get_title, get_date and get_content.
	# fileName - output path without extension.
	# options  - Hash; :horizontal is read here and the hash is passed on to
	#            the article getters.
	def initialize (article, fileName, options = {})
	# Plain-text title (no furigana markup) for the <title> and DC.Title fields.
	title = article.get_title(:ruby => false, :clean => true)

	@horizontal_css = <<eos
	body {
	font-family: serif; }
	h2, h3 {
	font-weight: bold;
	padding-top: 2em;
	margin-right: 1em;
	margin-left: 1em; }
	h2 {
	font-size: 120%; }
	p {
	text-indent: 1em; }
	#newsDate {
	font-size: 90%;
	font-weight:bold;
	line-height: 1.5; }
	eos

	@vertical_css = <<eos
	body {
	-webkit-writing-mode: vertical-rl; }
	#newsDate {
	padding-top: 10em;
	text-indent: -4em; }
	eos
	@vertical_css = @horizontal_css + @vertical_css

	@html_header = <<eos
	<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
	<html xml:lang="ja" xmlns="http://www.w3.org/1999/xhtml">
	<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	<meta http-equiv="Content-Style-Type" content="text/css" />
	<meta name="generator" content="pandoc" />
	<title>{{TITLE}}</title>
	<link rel="stylesheet" href="{{CSS_FILE}}" type="text/css" />
	<link rel="Schema.DC" href="http://purl.org/dc/elements/1.1/" />
	<meta name="DC.Title" content="{{TITLE}}" />
	<meta name="DC.Creator" content="NHK" />
	<meta name="DC.Publisher" content="NHK" /></head>
	<body>
	eos

	@html_footer = <<eos
	</body>
	</html>
	eos

	@html_header.gsub! '{{TITLE}}', title
	@html_header.gsub! '{{CSS_FILE}}', fileName + ".css"

	File.open(fileName + ".css", 'w') { \|file\|
	file.write(@horizontal_css) if options[:horizontal]
	file.write(@vertical_css) if not options[:horizontal]
	}

	File.open(fileName + ".html", 'w') { \|file\|
	file.write(@html_header.sub('{{CSS_FILE}}', fileName + ".css"))
	file.write(article.get_title(options))
	file.write(article.get_date(options))
	file.write(article.get_content(options))
	file.write(@html_footer)
	}
	end
	end

	# Builds <fileName>.mobi for an article; all work is done in the
	# constructor. Inside a temporary directory it lets HTMLOutput write the
	# HTML and CSS files, writes an OPF package file (with {{TITLE}},
	# {{FILENAME}} and {{CSS_FILE}} filled in), runs kindlegen (kindlegen.exe
	# when $is_windows is set) on the OPF file and copies the resulting .mobi
	# file to fileName + ".mobi".
	#
	# NOTE(review): the heredoc below is opened with `<<eos` but its terminator
	# is tab-indented, and block parameters are written \|dir\| / \|file\|.
	# This looks like a Markdown-escaped copy of the script; confirm before
	# treating it as runnable Ruby.
	class KindleOutput
	# article  - object that provides get_title (and whatever HTMLOutput calls).
	# fileName - output name without extension.
	# options  - Hash passed on unchanged to HTMLOutput.
	def initialize (article, fileName, options = {})
	# Plain-text title (no furigana markup) for the <dc:title> element.
	title = article.get_title(:ruby => false, :clean => true)

	@opf_file = <<eos
	<?xml version="1.0" encoding="UTF-8"?>
	<package version="3.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
	<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/">
	<dc:title>{{TITLE}}</dc:title>
	<dc:contributor>NHK</dc:contributor>
	<dc:language>ja</dc:language>
	<dc:publisher>NHK</dc:publisher>
	</metadata>
	<manifest>
	<item id="style" href="{{CSS_FILE}}" media-type="text/css" />
	<item id="titlepage" href="{{FILENAME}}.html" media-type="application/xhtml+xml" />
	</manifest>
	<spine toc="tocncx" page-progression-direction="rtl">
	<itemref idref="titlepage" />
	</spine>
	</package>
	eos
	@opf_file.gsub! '{{TITLE}}', title
	@opf_file.gsub! '{{FILENAME}}', fileName
	@opf_file.gsub! '{{CSS_FILE}}', fileName + ".css"

	Dir.mktmpdir { \|dir\|
	HTMLOutput.new(article, dir + "/" + fileName, options)

	File.open(dir + "/" + fileName + ".opf", 'w') { \|file\|
	file.write(@opf_file)
	}
	if $is_windows
	system "kindlegen.exe \"#{dir + "/" + fileName}.opf\""
	else
	system "kindlegen \"#{dir + "/" + fileName}.opf\""
	end
	FileUtils.cp dir + "/" + fileName + ".mobi", fileName + ".mobi"
	}
	end
	end

	# main part

	# Command-line definition; Trollop parses ARGV into the opts hash.
	opts = Trollop::options do
	  version "JapNewsToKindle 0.2a (c) 2013 Patrick Lerner [PatrickLerner@me.com]"
	  banner <<-EOS
	This program dumps Japanese News websites into a kindle compatible mobi file using Amazon's kindlegen (needs to be in path!).

	Usage:
	JapNewsToKindle [options]
	where [options] are:
	EOS

	  opt :ruby, "Get furigana if possible", :short => 'r'
	  opt :url, "The URL that is supposed to be dumped", :type => String, :short => 'u'
	  opt :out, "The output filename", :type => String, :short => 'O'
	  opt :horizontal, "Use a horizontal layout instead of the default vertical one", :default => false, :short => 'n'
	  opt :open, "Open the generated file in the Kindle Application", :default => false, :short => 'o'
	end

	# Each entry pairs a URL pattern with the Article subclass that parses it.
	# The dots are escaped so that only the literal host name matches.
	backends = [
	  [/nhk\.or\.jp\/news\/easy\/k[0-9]+\/k[0-9]+\.html/, NHKEasyArticle],
	  [/nhk\.or\.jp\/news\/html\/[0-9]+\/[a-z][0-9]+\.html/, NHKArticle]
	]

	# Block parameters are written |pattern, _|; the backslash-escaped bars of
	# the earlier text are not valid Ruby.
	_, backend = backends.find { |pattern, _| pattern.match(opts[:url].to_s) }
	Trollop::die :url, "must match against a backend supported by this program" unless backend

	article = backend.new(opts[:url])
	# A headline may contain "/" or "\", which would otherwise be read as a
	# directory separator when it is used as the default output name.
	fileName = opts[:out] || article.get_title(:ruby => false, :clean => true).gsub(%r{[/\\]}, '_')
	KindleOutput.new(article, fileName, {:ruby => opts[:ruby], :horizontal => opts[:horizontal]})

	if opts[:open] && !$is_windows
	  # Stop the Kindle application and delete its copy of the book before the
	  # new file is opened. Argument lists keep the file name away from a shell.
	  system 'killall', 'Kindle'
	  kindleFilePath = ENV['HOME'] + "/Library/Application Support/Kindle/My Kindle Content/#{File.basename(fileName)}.mobi"
	  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
	  FileUtils.rm kindleFilePath if File.exist?(kindleFilePath)
	  system 'open', "#{fileName}.mobi"
	end
	exit