gangmax/sina_blog_exporter.rb

## sina_blog_exporter.rb
# encoding: UTF-8

require 'nokogiri'
require 'open-uri'
require 'time'

# https://github.com/tenderlove/nokogiri/wiki/

# Plain value holder for one blog post: title, tag list, body text,
# creation time and the URL the post was read from.
class BlogItem
  attr_accessor :title, :tags, :content, :created_time, :link

  # One-line dump of every field, intended for debugging output.
  # NOTE(review): this is to_str rather than to_s, which also makes an
  # instance implicitly convertible to String; kept as-is for compatibility.
  def to_str
    "[title=#{@title}, is_original=#{is_original?} tags=#{@tags}, content=#{@content}, created_time=#{@created_time}, link=#{@link}]"
  end

  # Returns false when the post is marked as a repost, i.e. the title
  # contains '转贴', the title contains 'zt' in any letter case, or one of the
  # tags is exactly '转贴'. Returns true otherwise.
  # A title or tag list that was never assigned is treated as empty.
  def is_original?
    title = @title.to_s
    tags = @tags || []
    reposted = title.include?('转贴') || title.downcase.include?('zt') || tags.include?('转贴')
    !reposted
  end

  # This is the octopress blog item style tag string, separated by a space.
  # Returns an empty string when there are no tags.
  def get_tag_str
    (@tags || []).join(' ').strip
  end
end

# Mixin that turns one blog article page into a BlogItem.
module Parser
  # Downloads and parses a single article page.
  #
  # original_item_url - URL of the article page; also stored as the item link.
  # tags_mapper       - optional Hash; every tag that appears as a key is
  #                     replaced by the corresponding value.
  # given_tags        - extra tags appended after the tags found on the page.
  #
  # Returns a BlogItem. The code assumes the page contains 'div.artical',
  # 'h2.titName', 'div.articalContent', 'div.articalTag' and 'span.time'
  # elements; a missing one ends in a NoMethodError on nil.
  def parse(original_item_url, tags_mapper=nil, *given_tags)
    puts "Parsing blog item '#{original_item_url}'..."
    article = Nokogiri::HTML(open_page(original_item_url)).css('div.artical').first
    item = BlogItem.new
    item.title = article.css('h2.titName').first.content
    item.content = article.css('div.articalContent').first.content.to_str.strip
    tags = article.css('div.articalTag').first.css('a').map { |a| a.content }
    tags.concat(given_tags)
    if tags_mapper
      # Map each tag by position. Rewriting the array in place while looking
      # the slot up by value hits the wrong element for duplicate tags or
      # when a mapped value is itself a key of the mapper.
      tags = tags.map { |t| tags_mapper.key?(t) ? tags_mapper[t] : t }
    end
    item.tags = tags
    # The time text is wrapped in one leading and one trailing character
    # (presumably parentheses), which are dropped before parsing.
    item.created_time = Time.parse(article.css('span.time').first.content[1..-2])
    item.link = original_item_url
    # puts "Parsing finished: #{item.to_str}"
    item
  end

  private

  # Opens the URL through open-uri. Going through URI instead of Kernel#open
  # keeps working on Ruby 3.0+ and never treats the string as a file path or
  # a '|command'.
  def open_page(url)
    URI.parse(url).open
  end
end

# Mixin that renders a blog item as an Octopress-style Markdown post and
# writes it to disk.
module Writer

  # Rewrites blog_item.content so that two consecutive non-blank lines are
  # separated by an empty line (presumably so Markdown renders them as
  # separate paragraphs). Mutates the item and returns it.
  def format_markdown_line(blog_item)
    result = ''
    last_line_is_concrete = false
    blog_item.content.each_line do |line|
      this_line_is_concrete = is_concrete_line?(line)
      result += "\n" if last_line_is_concrete && this_line_is_concrete
      result += line
      last_line_is_concrete = this_line_is_concrete
    end
    blog_item.content = result
    blog_item
  end

  # Builds the full post text: YAML front matter, a note linking back to the
  # original post, the '<!--more-->' marker, the content, and the link
  # reference definition.
  def to_markdown(blog_item)
    # Inside a YAML single-quoted scalar a literal quote must be doubled,
    # otherwise a title containing ' produces invalid front matter.
    title = blog_item.title.to_s.gsub("'", "''")
    [
      '---',
      'layout: post',
      "title: '#{title}'",
      "date: #{blog_item.created_time.strftime('%Y-%m-%d %H:%M')}",
      'comments: true',
      "categories: #{blog_item.get_tag_str}",
      'published: true',
      '---',
      '',
      "Migarated from [here][original_blog_url] at '#{Time.now.strftime('%Y-%m-%d %H:%M:%S')}'.",
      '',
      '<!--more-->',
      '',
      "#{blog_item.content}",
      '',
      '',
      "[original_blog_url]: #{blog_item.link}"
    ].join("\n")
  end

  # Create the markdown file name from the item's creation time and the
  # given running index.
  def markdown_file_name(blog_item, index)
    t = blog_item.created_time
    "#{t.strftime('%Y-%m-%d-%H-%M-%S')}-migrated-from-sina-#{index}.markdown"
  end

  # Writes file_content to file_name, replacing any existing file.
  def write(file_name, file_content)
    File.open(file_name, 'w') do |file|
      file.write(file_content)
      file.flush
    end
  end

  # True when the line contains anything other than whitespace.
  def is_concrete_line?(line)
    line.strip.size > 0
  end
end

# Collects the article links from a set of index pages and exposes the
# Parser and Writer helpers for converting them.
class Converter

  attr_accessor :index_pages, :identity_string, :source_item_links

  # Here I use 韩寒's Sina blog as the example.
  #
  # index_pages     - URLs of the article list pages to scan.
  # identity_string - prefix an anchor's href must start with to be treated
  #                   as an article link.
  #
  # Fetches every index page immediately and stores the matching hrefs in
  # source_item_links.
  def initialize(index_pages=['http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html',
                              'http://blog.sina.com.cn/s/articlelist_1191258123_0_2.html',
                              'http://blog.sina.com.cn/s/articlelist_1191258123_0_3.html',
                              'http://blog.sina.com.cn/s/articlelist_1191258123_0_4.html',
                              'http://blog.sina.com.cn/s/articlelist_1191258123_0_5.html',
                              'http://blog.sina.com.cn/s/articlelist_1191258123_0_6.html'],
                 identity_string = 'http://blog.sina.com.cn/s/blog_4701280b')
    @index_pages = index_pages
    @identity_string = identity_string
    # Stored under the name the public accessor reads, so that
    # source_item_links actually returns the collected links.
    @source_item_links = []
    # Get all the original blog links.
    @index_pages.each do |index_page|
      # URI#open (open-uri) instead of Kernel#open: works on Ruby 3.0+ and
      # never interprets the string as a file path or a '|command'.
      doc = Nokogiri::HTML(URI.parse(index_page).open)
      doc.search('a').each do |tag|
        href = tag['href']
        # Anchors without an href attribute yield nil and are skipped.
        @source_item_links.push(href) if href && href.start_with?(identity_string)
      end
    end
  end

  include Parser

  # Parses every collected link and returns the resulting items in link
  # order. The list is also kept in @blogs.
  def get_blogs
    @blogs = @source_item_links.map { |link| parse(link) }
  end

  include Writer
end

# Driver: export every post, oldest link first, into numbered files in /tmp.
converter = Converter.new
index = 1
converter.get_blogs.reverse.each do |post|
  # Fix illegal blog title: prefix reposts and cut a trailing '(ZT'.
  post.title = '[转贴]' + post.title unless post.is_original?
  post.title = post.title[0..-4] if post.title.end_with?('(ZT')
  # Render the content, derive the target path and generate the file.
  markdown = converter.to_markdown(converter.format_markdown_line(post))
  target = '/tmp/' + converter.markdown_file_name(post, index)
  converter.write(target, markdown)
  index += 1
end
	# encoding: UTF-8

	require 'nokogiri'
	require 'open-uri'
	require 'time'

	# https://github.com/tenderlove/nokogiri/wiki/

	# Plain value holder for one blog post: title, tag list, body text,
	# creation time and the URL the post was read from.
	class BlogItem
	  attr_accessor :title, :tags, :content, :created_time, :link

	  # One-line dump of every field, intended for debugging output.
	  # NOTE(review): this is to_str rather than to_s, which also makes an
	  # instance implicitly convertible to String; kept as-is for compatibility.
	  def to_str
	    "[title=#{@title}, is_original=#{is_original?} tags=#{@tags}, content=#{@content}, created_time=#{@created_time}, link=#{@link}]"
	  end

	  # Returns false when the post is marked as a repost, i.e. the title
	  # contains '转贴', the title contains 'zt' in any letter case, or one of the
	  # tags is exactly '转贴'. Returns true otherwise.
	  # A title or tag list that was never assigned is treated as empty.
	  def is_original?
	    title = @title.to_s
	    tags = @tags || []
	    reposted = title.include?('转贴') || title.downcase.include?('zt') || tags.include?('转贴')
	    !reposted
	  end

	  # This is the octopress blog item style tag string, separated by a space.
	  # Returns an empty string when there are no tags.
	  def get_tag_str
	    (@tags || []).join(' ').strip
	  end
	end

	# Mixin that turns one blog article page into a BlogItem.
	module Parser
	  # Downloads and parses a single article page.
	  #
	  # original_item_url - URL of the article page; also stored as the item link.
	  # tags_mapper       - optional Hash; every tag that appears as a key is
	  #                     replaced by the corresponding value.
	  # given_tags        - extra tags appended after the tags found on the page.
	  #
	  # Returns a BlogItem. The code assumes the page contains 'div.artical',
	  # 'h2.titName', 'div.articalContent', 'div.articalTag' and 'span.time'
	  # elements; a missing one ends in a NoMethodError on nil.
	  def parse(original_item_url, tags_mapper=nil, *given_tags)
	    puts "Parsing blog item '#{original_item_url}'..."
	    article = Nokogiri::HTML(open_page(original_item_url)).css('div.artical').first
	    item = BlogItem.new
	    item.title = article.css('h2.titName').first.content
	    item.content = article.css('div.articalContent').first.content.to_str.strip
	    tags = article.css('div.articalTag').first.css('a').map { |a| a.content }
	    tags.concat(given_tags)
	    if tags_mapper
	      # Map each tag by position. Rewriting the array in place while looking
	      # the slot up by value hits the wrong element for duplicate tags or
	      # when a mapped value is itself a key of the mapper.
	      tags = tags.map { |t| tags_mapper.key?(t) ? tags_mapper[t] : t }
	    end
	    item.tags = tags
	    # The time text is wrapped in one leading and one trailing character
	    # (presumably parentheses), which are dropped before parsing.
	    item.created_time = Time.parse(article.css('span.time').first.content[1..-2])
	    item.link = original_item_url
	    # puts "Parsing finished: #{item.to_str}"
	    item
	  end

	  private

	  # Opens the URL through open-uri. Going through URI instead of Kernel#open
	  # keeps working on Ruby 3.0+ and never treats the string as a file path or
	  # a '|command'.
	  def open_page(url)
	    URI.parse(url).open
	  end
	end

	# Mixin that renders a blog item as an Octopress-style Markdown post and
	# writes it to disk.
	module Writer

	  # Rewrites blog_item.content so that two consecutive non-blank lines are
	  # separated by an empty line (presumably so Markdown renders them as
	  # separate paragraphs). Mutates the item and returns it.
	  def format_markdown_line(blog_item)
	    result = ''
	    last_line_is_concrete = false
	    blog_item.content.each_line do |line|
	      this_line_is_concrete = is_concrete_line?(line)
	      result += "\n" if last_line_is_concrete && this_line_is_concrete
	      result += line
	      last_line_is_concrete = this_line_is_concrete
	    end
	    blog_item.content = result
	    blog_item
	  end

	  # Builds the full post text: YAML front matter, a note linking back to the
	  # original post, the '<!--more-->' marker, the content, and the link
	  # reference definition.
	  def to_markdown(blog_item)
	    # Inside a YAML single-quoted scalar a literal quote must be doubled,
	    # otherwise a title containing ' produces invalid front matter.
	    title = blog_item.title.to_s.gsub("'", "''")
	    [
	      '---',
	      'layout: post',
	      "title: '#{title}'",
	      "date: #{blog_item.created_time.strftime('%Y-%m-%d %H:%M')}",
	      'comments: true',
	      "categories: #{blog_item.get_tag_str}",
	      'published: true',
	      '---',
	      '',
	      "Migarated from [here][original_blog_url] at '#{Time.now.strftime('%Y-%m-%d %H:%M:%S')}'.",
	      '',
	      '<!--more-->',
	      '',
	      "#{blog_item.content}",
	      '',
	      '',
	      "[original_blog_url]: #{blog_item.link}"
	    ].join("\n")
	  end

	  # Create the markdown file name from the item's creation time and the
	  # given running index.
	  def markdown_file_name(blog_item, index)
	    t = blog_item.created_time
	    "#{t.strftime('%Y-%m-%d-%H-%M-%S')}-migrated-from-sina-#{index}.markdown"
	  end

	  # Writes file_content to file_name, replacing any existing file.
	  def write(file_name, file_content)
	    File.open(file_name, 'w') do |file|
	      file.write(file_content)
	      file.flush
	    end
	  end

	  # True when the line contains anything other than whitespace.
	  def is_concrete_line?(line)
	    line.strip.size > 0
	  end
	end

	# Collects the article links from a set of index pages and exposes the
	# Parser and Writer helpers for converting them.
	class Converter

	  attr_accessor :index_pages, :identity_string, :source_item_links

	  # Here I use 韩寒's Sina blog as the example.
	  #
	  # index_pages     - URLs of the article list pages to scan.
	  # identity_string - prefix an anchor's href must start with to be treated
	  #                   as an article link.
	  #
	  # Fetches every index page immediately and stores the matching hrefs in
	  # source_item_links.
	  def initialize(index_pages=['http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html',
	                              'http://blog.sina.com.cn/s/articlelist_1191258123_0_2.html',
	                              'http://blog.sina.com.cn/s/articlelist_1191258123_0_3.html',
	                              'http://blog.sina.com.cn/s/articlelist_1191258123_0_4.html',
	                              'http://blog.sina.com.cn/s/articlelist_1191258123_0_5.html',
	                              'http://blog.sina.com.cn/s/articlelist_1191258123_0_6.html'],
	                 identity_string = 'http://blog.sina.com.cn/s/blog_4701280b')
	    @index_pages = index_pages
	    @identity_string = identity_string
	    # Stored under the name the public accessor reads, so that
	    # source_item_links actually returns the collected links.
	    @source_item_links = []
	    # Get all the original blog links.
	    @index_pages.each do |index_page|
	      # URI#open (open-uri) instead of Kernel#open: works on Ruby 3.0+ and
	      # never interprets the string as a file path or a '|command'.
	      doc = Nokogiri::HTML(URI.parse(index_page).open)
	      doc.search('a').each do |tag|
	        href = tag['href']
	        # Anchors without an href attribute yield nil and are skipped.
	        @source_item_links.push(href) if href && href.start_with?(identity_string)
	      end
	    end
	  end

	  include Parser

	  # Parses every collected link and returns the resulting items in link
	  # order. The list is also kept in @blogs.
	  def get_blogs
	    @blogs = @source_item_links.map { |link| parse(link) }
	  end

	  include Writer
	end

	# Driver: export every post, oldest link first, into numbered files in /tmp.
	converter = Converter.new
	index = 1
	converter.get_blogs.reverse.each do |item|
	  # Fix illegal blog title: prefix reposts and cut a trailing '(ZT'.
	  item.title = '[转贴]' + item.title unless item.is_original?
	  item.title = item.title[0..-4] if item.title.end_with?('(ZT')
	  # Get file name and content.
	  file_name = converter.markdown_file_name(item, index)
	  file_content = converter.to_markdown(converter.format_markdown_line(item))
	  # puts "file_name = #{file_name}, file_content = #{file_content}"
	  index += 1
	  # Generate files.
	  converter.write('/tmp/' + file_name, file_content)
	end