# rummelonp/Tumblr2epub.rb

## Tumblr2epub.rb
# Ruby 1.8 switch that makes string and regexp handling treat text as UTF-8.
# Later interpreters ignore this assignment.
$KCODE = 'u'

require 'rubygems'
require 'open-uri'
require 'tmpdir'
require 'yaml'
require 'xmlsimple'
require 'eeepub'

# Character-oriented helpers used when deriving section titles from post text.
#
# NOTE: this reopens the core String class, so the methods below are visible
# on every string in the process (each_char replaces the built-in of the same
# name on interpreters that provide one).
class String

  # Returns a copy of the string with every "<...>" tag removed; the text
  # between tags is left untouched.
  def strip_tags
    gsub(/<[^>]+?>/i, '')
  end

  # Yields each character in order and returns the receiver.
  #
  # The pattern carries /m so that newlines are yielded as well; a bare /./
  # skips them, which shifted every index used by #substr. Without a block an
  # Enumerator is returned instead of raising LocalJumpError.
  def each_char
    return to_enum(:each_char) unless block_given?
    scan(/./m) do |c|
      yield(c)
    end
  end

  # Number of characters (not bytes) in the string, newlines included.
  def char_count
    scan(/./m).length
  end

  # Returns the characters whose zero-based index i satisfies
  # from <= i < to, joined into a new string.
  #
  # from - first index to keep (values below zero behave like zero).
  # to   - index to stop before; nil yields an empty string.
  def substr(from, to)
    return '' unless to
    picked = []
    # Collect into an array and join once rather than growing a string with
    # +=, which reallocates on every character.
    scan(/./m).each_with_index do |c, i|
      picked << c if i >= from && i < to
    end
    picked.join
  end

end

# Adds Roman-numeral formatting to every Numeric.
class Numeric

  # Value => symbol pairs, including the subtractive forms (IV, IX, ...),
  # so that a greedy largest-first walk produces the canonical numeral.
  RomanTable = {
    1 => 'I',
    4 => 'IV',
    5 => 'V',
    9 => 'IX',
    10 => 'X',
    40 => 'XL',
    50 => 'L',
    90 => 'XC',
    100 => 'C',
    400 => 'CD',
    500 => 'D',
    900 => 'CM',
    1000 => 'M'
  }.freeze

  # Returns the Roman numeral for s (the receiver by default).
  #
  # s - number to convert. Values above 3999 or below 1 cannot be written
  #     with this table and give '-'. A fractional part is dropped, because
  #     whatever is left below 1 has no symbol.
  #
  # Returns a String.
  def roman(s = self)
    return '-' if s > 3999 || s < 1
    remaining = s
    symbols = []
    # Sort explicitly: hash iteration order is not guaranteed on old Rubies.
    RomanTable.keys.sort.reverse.each do |value|
      while remaining >= value
        symbols << RomanTable[value]
        remaining -= value
      end
    end
    symbols.join
  end

end

# Converts a Tumblr blog into an EPUB book.
#
# Tumblr2epub.read fetches posts from the blog's read API, .dump / .load
# persist the fetched data as YAML, and .make renders the posts into an
# .epub file through EeePub.
class Tumblr2epub

  # API endpoint template; :username and :count are substituted by .read.
  URL      = 'http://:username.tumblr.com/api/read?num=50&start=:count'
  # Default output directory for dumps and books.
  USER_DIR = File.expand_path(ENV['HOME'] || '~')
  # Scratch directory for the stylesheet and downloaded photos. TMPDIR is
  # used when set; otherwise fall back to the system temp directory rather
  # than failing at load time on a nil path.
  TEMP_DIR = File.expand_path(ENV['TMPDIR'] || Dir.tmpdir)
  # Posts requested per API page; must match num= in URL.
  PAGE_SIZE   = 50
  # Number of retries after a failed network operation before giving up.
  MAX_RETRIES = 10

  # Fetches posts page by page until at least count posts are collected or
  # the blog runs out of posts.
  #
  # username - blog name, substituted into URL.
  # count    - minimum number of posts wanted. Whole pages are appended, so
  #            the result may hold more than count posts.
  # type     - 'quote', 'photo' or 'regular' to filter; anything else is
  #            ignored.
  # log      - print progress messages when true.
  #
  # Returns a Hash with :tumblelog (blog metadata from the first page) and
  # :posts (Array of post hashes). Re-raises the last network/parse error
  # once the retries are exhausted.
  def self.read(username, count, type = nil, log = false)
    page = 0
    tumblr = {}
    posts = []
    loop do
      puts "Reading #{page + 1} page" if log
      url = URL.gsub(/:username/, username).gsub(/:count/, (page * PAGE_SIZE).to_s)
      url += '&type=' + type if ['quote', 'photo', 'regular'].include?(type)
      xml = fetch_xml(url, log)
      tumblr[:tumblelog] = xml['tumblelog'][0] if page == 0
      # A page without posts has no 'post' key at all; treat it as empty
      # instead of passing nil to concat.
      page_posts = xml['posts'][0]['post'] || []
      posts.concat(page_posts)
      puts "Readed #{page + 1} page, load #{posts.length} posts" if log
      # A short (or empty) page means the blog has no further posts.
      break if posts.length >= count || page_posts.length < PAGE_SIZE
      page += 1
    end
    tumblr[:posts] = posts
    tumblr
  end

  # Loads a structure previously written by .dump.
  #
  # path - YAML file; expanded with File.expand_path.
  # log  - print progress messages when true.
  #
  # Returns whatever the YAML document contains.
  def self.load(path, log = false)
    path = File.expand_path(path)
    puts "Loading yaml from \"#{path}\"" if log
    # NOTE(review): YAML loading can build arbitrary objects on some Ruby
    # versions; only feed this files you produced yourself.
    tumblr = YAML.load_file(path)
    puts "Loaded yaml" if log
    tumblr
  end

  # Writes tumblr as YAML to "<dir>/<blog name>_<timestamp>.yaml".
  #
  # tumblr - Hash as returned by .read.
  # dir    - target directory.
  # log    - print progress messages when true.
  #
  # Returns the full path of the written file.
  def self.dump(tumblr, dir = USER_DIR, log = false)
    puts "Dumping yaml" if log
    filename = timestamped_filename(tumblr[:tumblelog]['name'], 'yaml')
    path = "#{File.expand_path(dir)}/#{filename}"
    # Block form closes (and therefore flushes) the file before returning.
    File.open(path, 'wb') { |file| YAML.dump(tumblr, file) }
    puts "Dumped yaml to \"#{path}\"" if log
    path
  end

  # Builds "<dir>/<blog name>_<timestamp>.epub" with one section per quote,
  # photo or regular post; other post types are skipped.
  #
  # tumblr - Hash as returned by .read.
  # dir    - target directory.
  # log    - print progress messages when true.
  #
  # Returns the file name (not the full path) of the book.
  def self.make(tumblr, dir = USER_DIR, log = false)
    tumblelog = tumblr[:tumblelog]
    posts     = tumblr[:posts]
    blog_url  = "http://#{tumblelog['name']}.tumblr.com"
    epub = EeePub::Easy.new do
      title       tumblelog['title']
      creator     tumblelog['name']
      date        Time.now.strftime('%Y-%m-%d')
      identifier  blog_url, :scheme => 'URL'
      uid         blog_url
    end
    css_path = make_css
    epub.assets << css_path
    css_name = File.basename(css_path)
    filename = nil
    path = nil
    begin
      posts.each do |post|
        if post['type'] == 'photo'
          photo = photo_url_for(post)
          epub.assets << Photo.download(photo['content'], log) if photo
        end
        html = to_html(post)
        epub.sections << [to_title(post), page_html(css_name, html)] if html
      end
      filename = timestamped_filename(tumblelog['name'], 'epub')
      path = "#{File.expand_path(dir)}/#{filename}"
      puts "Making epub" if log
      epub.save(path)
    ensure
      # Remove downloaded photos even when a download or the save fails.
      Photo.clean(log)
    end
    puts "Make epub to \"#{path}\"" if log
    filename
  end

  # Writes the book stylesheet to "#{TEMP_DIR}/style.css" and returns that
  # path.
  def self.make_css
css = <<-CSS
body {
  font: 14px/1.4 Arial, Helvetica, sans-serif;
  margin: 0;
  padding: 0;
}
p, ul, ol, blockquote {
  margin: 3px 0 1px;
}
body :first-child,
body :last-child,
p:last-child,
ul:last-child,
ol:last-child {
  margin-bottom: 0 !important;
}
a img {
  border-width: 0;
}
blockquote {
  border-left: 4px solid #dcdcdc;
  margin-left: 0 !important;
  margin-right: 0 !important;
  padding-left: 10px !important;
  border-width: 4px !important;
  border-color: #a8bccf !important;
}
blockquote blockquote {
  border-color: #839aaf !important;
}
blockquote blockquote blockquote {
  border-color: #6b7d8f !important;
}
blockquote blockquote blockquote blockquote {
  border-color: #4c5e6f !important;
}
blockquote blockquote blockquote blockquote blockquote {
  border-color: #36434f !important;
}
CSS
    path = "#{TEMP_DIR}/style.css"
    File.open(path, 'wb') { |file| file.write(css) }
    path
  end

  # Renders the body markup for one post.
  #
  # Returns a String for quote, photo and regular posts, or nil for any
  # other type (and for a photo post that carries no URL at all).
  def self.to_html(post)
    case post['type']
    when 'quote'
      "#{text_of(post['quote-text'])}\n#{text_of(post['quote-source'])}\n"
    when 'photo'
      photo = photo_url_for(post)
      return nil unless photo
      # Prefer the copy fetched by Photo.download; fall back to the remote
      # URL when the photo was not downloaded.
      local_copy = Photo.get(photo['content'])
      image_path = local_copy ? File.basename(local_copy) : photo['content']
      "<a href=\"#{image_path}\"><img src=\"#{image_path}\" /></a>#{text_of(post['photo-caption'])}"
    when 'regular'
      body = "#{text_of(post['regular-body'])}\n"
      post['regular-title'] ? "<h2>#{text_of(post['regular-title'])}</h2>\n#{body}" : body
    end
  end

  # Builds the table-of-contents title for one post: the capitalised post
  # type, followed by " - " and up to 20 characters of tag-stripped text
  # when the post has any. Relies on the String helpers strip_tags,
  # char_count and substr defined in this file.
  def self.to_title(post)
    type = post['type'].capitalize
    source =
      case post['type']
      when 'quote'   then post['quote-text']
      when 'photo'   then post['photo-caption']
      when 'regular' then post['regular-title'] || post['regular-body']
      end
    return type unless source
    title = text_of(source).strip_tags.gsub(/\s{2,}/, ' ').strip
    if title.char_count > 20
      "#{type} - #{title.substr(0, 20)}..."
    else
      "#{type} - #{title}"
    end
  end

  # Downloads and parses one API page, retrying up to MAX_RETRIES times.
  def self.fetch_xml(url, log)
    failures = 0
    begin
      # URI#open (from open-uri) only ever performs a network fetch, unlike
      # Kernel#open, which also accepts "|command" strings.
      XmlSimple.xml_in(URI.parse(url).open { |io| io.read })
    rescue => error
      raise if failures >= MAX_RETRIES
      failures += 1
      puts error
      puts "Read failure, Retry" if log
      sleep 1
      retry
    end
  end

  # "<blog name with - replaced by _>_<YYYYmmddHHMMSS>.<extension>".
  def self.timestamped_filename(blog_name, extension)
    "#{blog_name.gsub(/-/, '_')}_#{Time.now.strftime('%Y%m%d%H%M%S')}.#{extension}"
  end

  # Wraps a post body in a complete XHTML document linking the stylesheet.
  def self.page_html(css_name, body)
    [
      '<?xml version="1.0" encoding="UTF-8"?>',
      '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">',
      '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="ja">',
      '  <head>',
      '    <title>Tumblr</title>',
      "    <link rel=\"stylesheet\" href=\"#{css_name}\" type=\"text/css\" />",
      '  </head>',
      '  <body>',
      "    #{body}",
      '  </body>',
      '</html>',
      ''
    ].join("\n")
  end

  # Flattens a field value to text. Array values are joined explicitly,
  # because Array#to_s only concatenates on Ruby 1.8 and returns the
  # inspect form on later versions.
  def self.text_of(value)
    value.is_a?(Array) ? value.join : value.to_s
  end

  # Picks the 500px-wide variant of a photo post, falling back to the first
  # listed variant. Returns nil when the post has no photo-url entries.
  def self.photo_url_for(post)
    variants = post['photo-url'] || []
    variants.find { |variant| variant['max-width'] == '500' } || variants.first
  end

  private_class_method :fetch_xml, :timestamped_filename, :page_html,
                       :text_of, :photo_url_for

  # Downloads photos into TEMP_DIR and remembers where each URL was stored.
  class Photo

    # Maps a photo URL to the local path it was downloaded to.
    @images = {}

    # Downloads url into TEMP_DIR under a random name, retrying up to
    # MAX_RETRIES times.
    #
    # url - remote image URL.
    # log - print progress messages when true.
    #
    # Returns the local path. Re-raises the last error once the retries are
    # exhausted.
    def self.download(url, log = false)
      puts "Downloading \"#{url}\"" if log
      ext = File.extname(url)
      # File.extname returns '' (never nil) when there is no extension.
      ext = '.png' if ext.empty?
      failures = 0
      begin
        # Fetch completely before creating the file so that a failed
        # transfer leaves nothing behind in TEMP_DIR.
        data = URI.parse(url).open { |image| image.read }
        path = "#{TEMP_DIR}/#{rand(256**16).to_s(16)}#{ext}"
        File.open(path, 'wb') { |file| file.write(data) }
      rescue => error
        raise if failures >= MAX_RETRIES
        failures += 1
        puts error
        puts "Download failure, Retry" if log
        sleep 1
        retry
      end
      puts "Downloaded to \"#{path}\"" if log
      @images[url] = path
      path
    end

    # Returns the local path recorded for url, or nil if it was never
    # downloaded (or has been cleaned since).
    def self.get(url)
      @images[url]
    end

    # Deletes every downloaded file and forgets all recorded URLs.
    # Always returns true.
    def self.clean(log = false)
      puts "Delete cache files" if log && @images.length > 0
      @images.each_value do |file|
        puts "Deleting \"#{file}\"" if log
        File.delete(file) if File.exist?(file)
      end
      @images = {}
      puts "Deleted all cache files" if log
      true
    end

  end

end
	# Ruby 1.8 switch that makes string and regexp handling treat text as UTF-8.
	# Later interpreters ignore this assignment.
	$KCODE = 'u'

	require 'rubygems'
	require 'open-uri'
	require 'xmlsimple'
	require 'eeepub'

	# Character-oriented helpers used when deriving section titles from post text.
	#
	# NOTE: this reopens the core String class, so the methods below are visible
	# on every string in the process (each_char replaces the built-in of the same
	# name on interpreters that provide one).
	class String

	  # Returns a copy of the string with every "<...>" tag removed; the text
	  # between tags is left untouched.
	  def strip_tags
	    gsub(/<[^>]+?>/i, '')
	  end

	  # Yields each character in order and returns the receiver.
	  #
	  # The pattern carries /m so that newlines are yielded as well; a bare /./
	  # skips them, which shifted every index used by #substr. Without a block an
	  # Enumerator is returned instead of raising LocalJumpError.
	  def each_char
	    return to_enum(:each_char) unless block_given?
	    scan(/./m) do |c|
	      yield(c)
	    end
	  end

	  # Number of characters (not bytes) in the string, newlines included.
	  def char_count
	    scan(/./m).length
	  end

	  # Returns the characters whose zero-based index i satisfies
	  # from <= i < to, joined into a new string.
	  #
	  # from - first index to keep (values below zero behave like zero).
	  # to   - index to stop before; nil yields an empty string.
	  def substr(from, to)
	    return '' unless to
	    picked = []
	    # Collect into an array and join once rather than growing a string with
	    # +=, which reallocates on every character.
	    scan(/./m).each_with_index do |c, i|
	      picked << c if i >= from && i < to
	    end
	    picked.join
	  end

	end

	# Adds Roman-numeral formatting to every Numeric.
	class Numeric

	  # Value => symbol pairs, including the subtractive forms (IV, IX, ...),
	  # so that a greedy largest-first walk produces the canonical numeral.
	  RomanTable = {
	    1 => 'I',
	    4 => 'IV',
	    5 => 'V',
	    9 => 'IX',
	    10 => 'X',
	    40 => 'XL',
	    50 => 'L',
	    90 => 'XC',
	    100 => 'C',
	    400 => 'CD',
	    500 => 'D',
	    900 => 'CM',
	    1000 => 'M'
	  }.freeze

	  # Returns the Roman numeral for s (the receiver by default).
	  #
	  # s - number to convert. Values above 3999 or below 1 cannot be written
	  #     with this table and give '-'. A fractional part is dropped, because
	  #     whatever is left below 1 has no symbol.
	  #
	  # Returns a String.
	  def roman(s = self)
	    return '-' if s > 3999 || s < 1
	    remaining = s
	    symbols = []
	    # Sort explicitly: hash iteration order is not guaranteed on old Rubies.
	    RomanTable.keys.sort.reverse.each do |value|
	      while remaining >= value
	        symbols << RomanTable[value]
	        remaining -= value
	      end
	    end
	    symbols.join
	  end

	end

	# Converts a Tumblr blog into an EPUB book.
	#
	# Tumblr2epub.read fetches posts from the blog's read API, .dump / .load
	# persist the fetched data as YAML, and .make renders the posts into an
	# .epub file through EeePub.
	class Tumblr2epub

	  # API endpoint template; :username and :count are substituted by .read.
	  URL      = 'http://:username.tumblr.com/api/read?num=50&start=:count'
	  # Default output directory for dumps and books.
	  USER_DIR = File.expand_path(ENV['HOME'] || '~')
	  # Scratch directory for the stylesheet and downloaded photos. TMPDIR is
	  # used when set; otherwise fall back to the system temp directory rather
	  # than failing at load time on a nil path.
	  TEMP_DIR = File.expand_path(ENV['TMPDIR'] || Dir.tmpdir)
	  # Posts requested per API page; must match num= in URL.
	  PAGE_SIZE   = 50
	  # Number of retries after a failed network operation before giving up.
	  MAX_RETRIES = 10

	  # Fetches posts page by page until at least count posts are collected or
	  # the blog runs out of posts.
	  #
	  # username - blog name, substituted into URL.
	  # count    - minimum number of posts wanted. Whole pages are appended, so
	  #            the result may hold more than count posts.
	  # type     - 'quote', 'photo' or 'regular' to filter; anything else is
	  #            ignored.
	  # log      - print progress messages when true.
	  #
	  # Returns a Hash with :tumblelog (blog metadata from the first page) and
	  # :posts (Array of post hashes). Re-raises the last network/parse error
	  # once the retries are exhausted.
	  def self.read(username, count, type = nil, log = false)
	    page = 0
	    tumblr = {}
	    posts = []
	    loop do
	      puts "Reading #{page + 1} page" if log
	      url = URL.gsub(/:username/, username).gsub(/:count/, (page * PAGE_SIZE).to_s)
	      url += '&type=' + type if ['quote', 'photo', 'regular'].include?(type)
	      xml = fetch_xml(url, log)
	      tumblr[:tumblelog] = xml['tumblelog'][0] if page == 0
	      # A page without posts has no 'post' key at all; treat it as empty
	      # instead of passing nil to concat.
	      page_posts = xml['posts'][0]['post'] || []
	      posts.concat(page_posts)
	      puts "Readed #{page + 1} page, load #{posts.length} posts" if log
	      # A short (or empty) page means the blog has no further posts.
	      break if posts.length >= count || page_posts.length < PAGE_SIZE
	      page += 1
	    end
	    tumblr[:posts] = posts
	    tumblr
	  end

	  # Loads a structure previously written by .dump.
	  #
	  # path - YAML file; expanded with File.expand_path.
	  # log  - print progress messages when true.
	  #
	  # Returns whatever the YAML document contains.
	  def self.load(path, log = false)
	    path = File.expand_path(path)
	    puts "Loading yaml from \"#{path}\"" if log
	    # NOTE(review): YAML loading can build arbitrary objects on some Ruby
	    # versions; only feed this files you produced yourself.
	    tumblr = YAML.load_file(path)
	    puts "Loaded yaml" if log
	    tumblr
	  end

	  # Writes tumblr as YAML to "<dir>/<blog name>_<timestamp>.yaml".
	  #
	  # tumblr - Hash as returned by .read.
	  # dir    - target directory.
	  # log    - print progress messages when true.
	  #
	  # Returns the full path of the written file.
	  def self.dump(tumblr, dir = USER_DIR, log = false)
	    puts "Dumping yaml" if log
	    filename = timestamped_filename(tumblr[:tumblelog]['name'], 'yaml')
	    path = "#{File.expand_path(dir)}/#{filename}"
	    # Block form closes (and therefore flushes) the file before returning.
	    File.open(path, 'wb') { |file| YAML.dump(tumblr, file) }
	    puts "Dumped yaml to \"#{path}\"" if log
	    path
	  end

	  # Builds "<dir>/<blog name>_<timestamp>.epub" with one section per quote,
	  # photo or regular post; other post types are skipped.
	  #
	  # tumblr - Hash as returned by .read.
	  # dir    - target directory.
	  # log    - print progress messages when true.
	  #
	  # Returns the file name (not the full path) of the book.
	  def self.make(tumblr, dir = USER_DIR, log = false)
	    tumblelog = tumblr[:tumblelog]
	    posts     = tumblr[:posts]
	    blog_url  = "http://#{tumblelog['name']}.tumblr.com"
	    epub = EeePub::Easy.new do
	      title       tumblelog['title']
	      creator     tumblelog['name']
	      date        Time.now.strftime('%Y-%m-%d')
	      identifier  blog_url, :scheme => 'URL'
	      uid         blog_url
	    end
	    css_path = make_css
	    epub.assets << css_path
	    css_name = File.basename(css_path)
	    filename = nil
	    path = nil
	    begin
	      posts.each do |post|
	        if post['type'] == 'photo'
	          photo = photo_url_for(post)
	          epub.assets << Photo.download(photo['content'], log) if photo
	        end
	        html = to_html(post)
	        epub.sections << [to_title(post), page_html(css_name, html)] if html
	      end
	      filename = timestamped_filename(tumblelog['name'], 'epub')
	      path = "#{File.expand_path(dir)}/#{filename}"
	      puts "Making epub" if log
	      epub.save(path)
	    ensure
	      # Remove downloaded photos even when a download or the save fails.
	      Photo.clean(log)
	    end
	    puts "Make epub to \"#{path}\"" if log
	    filename
	  end

	  # Writes the book stylesheet to "#{TEMP_DIR}/style.css" and returns that
	  # path.
	  def self.make_css
	    css = <<-CSS
	body {
	font: 14px/1.4 Arial, Helvetica, sans-serif;
	margin: 0;
	padding: 0;
	}
	p, ul, ol, blockquote {
	margin: 3px 0 1px;
	}
	body :first-child,
	body :last-child,
	p:last-child,
	ul:last-child,
	ol:last-child {
	margin-bottom: 0 !important;
	}
	a img {
	border-width: 0;
	}
	blockquote {
	border-left: 4px solid #dcdcdc;
	margin-left: 0 !important;
	margin-right: 0 !important;
	padding-left: 10px !important;
	border-width: 4px !important;
	border-color: #a8bccf !important;
	}
	blockquote blockquote {
	border-color: #839aaf !important;
	}
	blockquote blockquote blockquote {
	border-color: #6b7d8f !important;
	}
	blockquote blockquote blockquote blockquote {
	border-color: #4c5e6f !important;
	}
	blockquote blockquote blockquote blockquote blockquote {
	border-color: #36434f !important;
	}
	CSS
	    path = "#{TEMP_DIR}/style.css"
	    File.open(path, 'wb') { |file| file.write(css) }
	    path
	  end

	  # Renders the body markup for one post.
	  #
	  # Returns a String for quote, photo and regular posts, or nil for any
	  # other type (and for a photo post that carries no URL at all).
	  def self.to_html(post)
	    case post['type']
	    when 'quote'
	      "#{text_of(post['quote-text'])}\n#{text_of(post['quote-source'])}\n"
	    when 'photo'
	      photo = photo_url_for(post)
	      return nil unless photo
	      # Prefer the copy fetched by Photo.download; fall back to the remote
	      # URL when the photo was not downloaded.
	      local_copy = Photo.get(photo['content'])
	      image_path = local_copy ? File.basename(local_copy) : photo['content']
	      "<a href=\"#{image_path}\"><img src=\"#{image_path}\" /></a>#{text_of(post['photo-caption'])}"
	    when 'regular'
	      body = "#{text_of(post['regular-body'])}\n"
	      post['regular-title'] ? "<h2>#{text_of(post['regular-title'])}</h2>\n#{body}" : body
	    end
	  end

	  # Builds the table-of-contents title for one post: the capitalised post
	  # type, followed by " - " and up to 20 characters of tag-stripped text
	  # when the post has any. Relies on the String helpers strip_tags,
	  # char_count and substr defined in this file.
	  def self.to_title(post)
	    type = post['type'].capitalize
	    source =
	      case post['type']
	      when 'quote'   then post['quote-text']
	      when 'photo'   then post['photo-caption']
	      when 'regular' then post['regular-title'] || post['regular-body']
	      end
	    return type unless source
	    title = text_of(source).strip_tags.gsub(/\s{2,}/, ' ').strip
	    if title.char_count > 20
	      "#{type} - #{title.substr(0, 20)}..."
	    else
	      "#{type} - #{title}"
	    end
	  end

	  # Downloads and parses one API page, retrying up to MAX_RETRIES times.
	  def self.fetch_xml(url, log)
	    failures = 0
	    begin
	      # URI#open (from open-uri) only ever performs a network fetch, unlike
	      # Kernel#open, which also accepts "|command" strings.
	      XmlSimple.xml_in(URI.parse(url).open { |io| io.read })
	    rescue => error
	      raise if failures >= MAX_RETRIES
	      failures += 1
	      puts error
	      puts "Read failure, Retry" if log
	      sleep 1
	      retry
	    end
	  end

	  # "<blog name with - replaced by _>_<YYYYmmddHHMMSS>.<extension>".
	  def self.timestamped_filename(blog_name, extension)
	    "#{blog_name.gsub(/-/, '_')}_#{Time.now.strftime('%Y%m%d%H%M%S')}.#{extension}"
	  end

	  # Wraps a post body in a complete XHTML document linking the stylesheet.
	  # Built from an array of lines so that no source indentation can end up
	  # in front of the XML declaration.
	  def self.page_html(css_name, body)
	    [
	      '<?xml version="1.0" encoding="UTF-8"?>',
	      '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">',
	      '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="ja">',
	      '  <head>',
	      '    <title>Tumblr</title>',
	      "    <link rel=\"stylesheet\" href=\"#{css_name}\" type=\"text/css\" />",
	      '  </head>',
	      '  <body>',
	      "    #{body}",
	      '  </body>',
	      '</html>',
	      ''
	    ].join("\n")
	  end

	  # Flattens a field value to text. Array values are joined explicitly,
	  # because Array#to_s only concatenates on Ruby 1.8 and returns the
	  # inspect form on later versions.
	  def self.text_of(value)
	    value.is_a?(Array) ? value.join : value.to_s
	  end

	  # Picks the 500px-wide variant of a photo post, falling back to the first
	  # listed variant. Returns nil when the post has no photo-url entries.
	  def self.photo_url_for(post)
	    variants = post['photo-url'] || []
	    variants.find { |variant| variant['max-width'] == '500' } || variants.first
	  end

	  private_class_method :fetch_xml, :timestamped_filename, :page_html,
	                       :text_of, :photo_url_for

	  # Downloads photos into TEMP_DIR and remembers where each URL was stored.
	  class Photo

	    # Maps a photo URL to the local path it was downloaded to.
	    @images = {}

	    # Downloads url into TEMP_DIR under a random name, retrying up to
	    # MAX_RETRIES times.
	    #
	    # url - remote image URL.
	    # log - print progress messages when true.
	    #
	    # Returns the local path. Re-raises the last error once the retries are
	    # exhausted.
	    def self.download(url, log = false)
	      puts "Downloading \"#{url}\"" if log
	      ext = File.extname(url)
	      # File.extname returns '' (never nil) when there is no extension.
	      ext = '.png' if ext.empty?
	      failures = 0
	      begin
	        # Fetch completely before creating the file so that a failed
	        # transfer leaves nothing behind in TEMP_DIR.
	        data = URI.parse(url).open { |image| image.read }
	        path = "#{TEMP_DIR}/#{rand(256**16).to_s(16)}#{ext}"
	        File.open(path, 'wb') { |file| file.write(data) }
	      rescue => error
	        raise if failures >= MAX_RETRIES
	        failures += 1
	        puts error
	        puts "Download failure, Retry" if log
	        sleep 1
	        retry
	      end
	      puts "Downloaded to \"#{path}\"" if log
	      @images[url] = path
	      path
	    end

	    # Returns the local path recorded for url, or nil if it was never
	    # downloaded (or has been cleaned since).
	    def self.get(url)
	      @images[url]
	    end

	    # Deletes every downloaded file and forgets all recorded URLs.
	    # Always returns true.
	    def self.clean(log = false)
	      puts "Delete cache files" if log && @images.length > 0
	      @images.each_value do |file|
	        puts "Deleting \"#{file}\"" if log
	        File.delete(file) if File.exist?(file)
	      end
	      @images = {}
	      puts "Deleted all cache files" if log
	      true
	    end

	  end

	end