Skip to content

Instantly share code, notes, and snippets.

@gangmax
Created May 23, 2012 07:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gangmax/2773778 to your computer and use it in GitHub Desktop.
Save gangmax/2773778 to your computer and use it in GitHub Desktop.
This program exports the blog posts of any given blogger on Sina Blog (http://blog.sina.com.cn) to Octopress-style Markdown files.
# encoding: UTF-8
require 'nokogiri'
require 'open-uri'
require 'time'
# https://github.com/tenderlove/nokogiri/wiki/
# A single blog post: its title, tags, body text, creation time and the
# link it was read from.
class BlogItem
  # Marker text that flags a post as a repost when it appears in the title
  # or as a tag.
  REPOST_MARKER = '转贴'.freeze

  attr_accessor :title, :tags, :content, :created_time, :link

  # Returns a one-line, human-readable dump of every field.
  def to_str
    "[title=#{@title}, is_original=#{is_original?} tags=#{@tags}, content=#{@content}, " \
      "created_time=#{@created_time}, link=#{@link}]"
  end

  # Returns false when the post looks like a repost, i.e. when the title
  # contains the repost marker, the title contains "zt" (case-insensitive),
  # or one of the tags equals the repost marker. Returns true otherwise.
  #
  # An unset title or unset tags are treated as empty instead of raising.
  def is_original?
    title_text = @title.to_s
    return false if title_text.include?(REPOST_MARKER) || title_text.downcase.include?('zt')

    !(@tags || []).include?(REPOST_MARKER)
  end

  # Returns the tags joined by single spaces, which is the Octopress
  # "categories" format. Returns an empty string when there are no tags.
  def get_tag_str
    (@tags || []).to_a.join(' ').strip
  end
end
# Mixin that reads one blog post page and turns it into a BlogItem.
module Parser
  # Fetches and parses a single blog post.
  #
  # original_item_url - location of the post page; it is also stored as the
  #                     item's link.
  # tags_mapper       - optional Hash; every tag that is a key of it is
  #                     replaced by the corresponding value.
  # given_tags        - extra tags appended after the tags found on the page
  #                     (they go through tags_mapper as well).
  #
  # Returns the populated BlogItem.
  def parse(original_item_url, tags_mapper=nil, *given_tags)
    puts "Parsing blog item '#{original_item_url}'..."
    article = fetch_html(original_item_url).css('div.artical').first
    item = BlogItem.new
    item.title = article.css('h2.titName').first.content
    item.content = article.css('div.articalContent').first.content.to_str.strip
    tags = article.css('div.articalTag').first.css('a').map { |anchor| anchor.content }
    tags.concat(given_tags)
    item.tags = translate_tags(tags, tags_mapper)
    # The first and last character of the time text are dropped before parsing
    # (presumably surrounding brackets -- confirm against the page markup).
    item.created_time = Time.parse(article.css('span.time').first.content[1..-2])
    item.link = original_item_url
    item
  end

  private

  # Opens the page, parses it with Nokogiri and closes the stream again.
  # URI.open is used where open-uri provides it, because Kernel#open no longer
  # accepts URLs on Ruby 3.0+; older Rubies fall back to Kernel#open.
  def fetch_html(url)
    if URI.respond_to?(:open)
      URI.open(url) { |io| Nokogiri::HTML(io) }
    else
      open(url) { |io| Nokogiri::HTML(io) }
    end
  end

  # Returns a new array in which each tag is replaced by its mapped value when
  # the mapper has an entry for it. Building a new array (instead of patching
  # the array while iterating over it) keeps every replacement at its own
  # position, even with duplicate tags or mappings that chain (a => b, b => c).
  def translate_tags(tags, tags_mapper)
    return tags unless tags_mapper

    tags.map { |tag| tags_mapper.key?(tag) ? tags_mapper[tag] : tag }
  end
end
# Mixin that renders blog items as Octopress-style markdown posts and writes
# them to files.
module Writer
  # Rewrites blog_item.content so that two consecutive non-blank lines are
  # separated by an extra newline. Lines that are already separated by a blank
  # line are left alone. A nil content is treated as empty.
  #
  # Returns the same blog_item (its content is modified in place).
  def format_markdown_line(blog_item)
    previous_concrete = false
    pieces = blog_item.content.to_s.each_line.map do |line|
      concrete = is_concrete_line?(line)
      piece = previous_concrete && concrete ? "\n#{line}" : line
      previous_concrete = concrete
      piece
    end
    blog_item.content = pieces.join
    blog_item
  end

  # Returns the full text of the markdown post for blog_item: a YAML front
  # matter block, a "migrated from" notice stamped with the current time, a
  # "more" marker, the content, and the link reference definition.
  #
  # The title is emitted as a single-quoted YAML scalar, so single quotes in
  # it are doubled to keep the front matter valid.
  def to_markdown(blog_item)
    quoted_title = blog_item.title.to_s.gsub("'", "''")
    [
      '---',
      'layout: post',
      "title: '#{quoted_title}'",
      "date: #{blog_item.created_time.strftime('%Y-%m-%d %H:%M')}",
      'comments: true',
      "categories: #{blog_item.get_tag_str}",
      'published: true',
      '---',
      '',
      "Migrated from [here][original_blog_url] at '#{Time.now.strftime('%Y-%m-%d %H:%M:%S')}'.",
      '',
      '<!--more-->',
      '',
      "#{blog_item.content}",
      '',
      '',
      "[original_blog_url]: #{blog_item.link}"
    ].join("\n")
  end

  # Builds the markdown file name from the item's creation time and the given
  # index, e.g. "2012-05-23-07-55-00-migrated-from-sina-1.markdown".
  def markdown_file_name(blog_item, index)
    stamp = blog_item.created_time.strftime('%Y-%m-%d-%H-%M-%S')
    "#{stamp}-migrated-from-sina-#{index}.markdown"
  end

  # Writes file_content to file_name, replacing any existing file.
  def write(file_name, file_content)
    File.open(file_name, 'w') do |file|
      file.write(file_content)
      file.flush
    end
  end

  # Returns true when the line contains anything other than whitespace.
  def is_concrete_line?(line)
    !line.strip.empty?
  end
end
# Collects the links of all blog posts listed on a set of index pages and
# turns them into BlogItems (via Parser); the Writer mixin supplies the
# markdown rendering and file output methods.
class Converter
  include Parser
  include Writer

  # Default index pages. Here I use 韩寒's Sina blog as the example.
  DEFAULT_INDEX_PAGES = (1..6).map do |page|
    "http://blog.sina.com.cn/s/articlelist_1191258123_0_#{page}.html"
  end.freeze

  # Default prefix that identifies a link as a blog post link.
  DEFAULT_IDENTITY_STRING = 'http://blog.sina.com.cn/s/blog_4701280b'.freeze

  attr_accessor :index_pages, :identity_string, :source_item_links

  # Reads every index page and records the post links found on them.
  #
  # index_pages     - Array of index page URLs to scan.
  # identity_string - only links whose href starts with this prefix are kept.
  #
  # Note: the pages are fetched right here, while the object is constructed.
  def initialize(index_pages=DEFAULT_INDEX_PAGES.dup,
                 identity_string=DEFAULT_IDENTITY_STRING)
    @index_pages = index_pages
    @identity_string = identity_string
    # Stored under the name the source_item_links accessor exposes, so the
    # reader returns the collected links and the writer affects get_blogs.
    @source_item_links = @index_pages.flat_map { |index_page| post_links_on(index_page) }
  end

  # Parses every collected post link, in collection order.
  #
  # Returns the Array of parsed items (also kept in @blogs).
  def get_blogs
    @blogs = @source_item_links.map { |link| parse(link) }
  end

  private

  # Returns the hrefs on one index page that start with the identity string.
  # Anchors without an href attribute are skipped.
  def post_links_on(index_page)
    hrefs = read_index(index_page).search('a').map { |anchor| anchor['href'] }
    hrefs.compact.select { |href| href.start_with?(@identity_string) }
  end

  # Opens the index page, parses it with Nokogiri and closes the stream again.
  # URI.open is used where open-uri provides it, because Kernel#open no longer
  # accepts URLs on Ruby 3.0+; older Rubies fall back to Kernel#open.
  def read_index(url)
    if URI.respond_to?(:open)
      URI.open(url) { |io| Nokogiri::HTML(io) }
    else
      open(url) { |io| Nokogiri::HTML(io) }
    end
  end
end
# Driver: export every post, oldest first, as a numbered markdown file in /tmp.
exporter = Converter.new
exporter.get_blogs.reverse_each.with_index(1) do |post, sequence|
  # Normalize the title: flag reposts with a prefix, then drop a trailing '(ZT'.
  post.title = '[转贴]' + post.title unless post.is_original?
  post.title = post.title[0..-4] if post.title.end_with?('(ZT')

  # Work out the target file name and the rendered markdown.
  target_name = exporter.markdown_file_name(post, sequence)
  rendered = exporter.to_markdown(exporter.format_markdown_line(post))

  # Generate the file.
  exporter.write('/tmp/' + target_name, rendered)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment