Created
May 23, 2012 07:55
-
-
Save gangmax/2773778 to your computer and use it in GitHub Desktop.
This program exports the blog posts of any given blogger on Sina blog (http://blog.sina.com.cn) to Octopress-style Markdown files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8 | |
require 'nokogiri' | |
require 'open-uri' | |
require 'time' | |
# https://github.com/tenderlove/nokogiri/wiki/ | |
# One blog post: its title, tags, body text, creation time and source link.
# All fields are plain read/write attributes and start out as nil.
class BlogItem
  attr_accessor :title, :tags, :content, :created_time, :link

  # One-line dump of every field, including the computed originality flag.
  #
  # @return [String]
  def to_str
    "[title=#{@title}, is_original=#{is_original?} tags=#{@tags}, content=#{@content}, created_time=#{@created_time}, link=#{@link}]"
  end

  # A post counts as a repost when its title contains '转贴', its title
  # contains 'zt' in any letter case, or one of its tags is exactly '转贴'.
  # A missing title or missing tags are treated as empty instead of raising.
  #
  # @return [Boolean] true when none of the repost markers is present
  def is_original?
    title = @title.to_s
    reposted = title.include?('转贴') ||
               title.downcase.include?('zt') ||
               Array(@tags).include?('转贴')
    !reposted
  end

  # This is the octopress blog item style tag string, separated by a space.
  #
  # @return [String] the tags joined with single spaces ('' when there are none)
  def get_tag_str
    Array(@tags).join(' ').strip
  end
end
# Mixin that downloads one blog post page and turns it into a BlogItem.
module Parser
  # Fetch and parse a single post page.
  #
  # The page is expected to contain a 'div.artical' element holding
  # 'h2.titName', 'div.articalContent' and 'span.time'; a missing
  # 'div.articalTag' is treated as "no tags".
  #
  # @param original_item_url [String] URL of the post page
  # @param tags_mapper [Hash, nil] optional tag => replacement-tag table
  # @param given_tags [Array<String>] extra tags appended after the scraped ones
  # @return [BlogItem] the populated item
  def parse(original_item_url, tags_mapper = nil, *given_tags)
    puts "Parsing blog item '#{original_item_url}'..."
    # Kernel#open no longer opens URLs on Ruby 3.0+; open-uri's URI.open does.
    article = Nokogiri::HTML(URI.open(original_item_url)).css('div.artical').first

    item = BlogItem.new
    item.title = article.css('h2.titName').first.content
    item.content = article.css('div.articalContent').first.content.to_str.strip

    tag_box = article.css('div.articalTag').first
    scraped_tags = tag_box ? tag_box.css('a').map(&:content) : []
    item.tags = remap_tags(scraped_tags + given_tags, tags_mapper)

    # The time text is wrapped in one delimiter character on each side; drop both.
    item.created_time = Time.parse(article.css('span.time').first.content[1..-2])
    item.link = original_item_url
    item
  end

  private

  # Replace every tag that has an entry in tags_mapper, exactly once each.
  # Builds a new array so a replacement can never be remapped a second time.
  def remap_tags(tags, tags_mapper)
    return tags unless tags_mapper

    tags.map { |tag| tags_mapper.key?(tag) ? tags_mapper[tag] : tag }
  end
end
# Mixin that renders a blog item as an Octopress markdown post and writes it
# to disk. The item only needs to respond to title, content, content=,
# created_time, link and get_tag_str.
module Writer
  # Insert an empty line between every two consecutive non-blank lines so that
  # each one becomes its own markdown paragraph.
  #
  # @param blog_item [#content, #content=] item whose content is rewritten in place
  # @return the same blog_item
  def format_markdown_line(blog_item)
    formatted = +''
    previous_had_text = false
    blog_item.content.each_line do |line|
      has_text = is_concrete_line?(line)
      formatted << "\n" if previous_had_text && has_text
      formatted << line
      previous_had_text = has_text
    end
    blog_item.content = formatted
    blog_item
  end

  # Render the whole post: YAML front matter, a migration note stamped with
  # the current time, the '<!--more-->' marker, the content and the link
  # reference definition.
  #
  # @param blog_item the item to render
  # @return [String] the markdown document (no trailing newline)
  def to_markdown(blog_item)
    # Inside a single-quoted YAML scalar a quote is written as two quotes;
    # without this a title containing ' yields invalid front matter.
    yaml_title = blog_item.title.to_s.gsub("'", "''")
    [
      '---',
      'layout: post',
      "title: '#{yaml_title}'",
      "date: #{blog_item.created_time.strftime('%Y-%m-%d %H:%M')}",
      'comments: true',
      "categories: #{blog_item.get_tag_str}",
      'published: true',
      '---',
      '',
      "Migrated from [here][original_blog_url] at '#{Time.now.strftime('%Y-%m-%d %H:%M:%S')}'.",
      '',
      '<!--more-->',
      '',
      blog_item.content.to_s,
      '',
      '',
      "[original_blog_url]: #{blog_item.link}"
    ].join("\n")
  end

  # Create the markdown file name from the item's creation time and a
  # caller-supplied index.
  #
  # @param blog_item [#created_time]
  # @param index [Integer] number appended to the name
  # @return [String]
  def markdown_file_name(blog_item, index)
    stamp = blog_item.created_time.strftime('%Y-%m-%d-%H-%M-%S')
    "#{stamp}-migrated-from-sina-#{index}.markdown"
  end

  # Write file_content to file_name, replacing any existing file.
  # The block form closes (and therefore flushes) the file on exit.
  #
  # @param file_name [String] destination path
  # @param file_content [String] text to write
  def write(file_name, file_content)
    File.open(file_name, 'w') { |file| file.write(file_content) }
  end

  # @param line [String]
  # @return [Boolean] true when the line has any non-whitespace character
  def is_concrete_line?(line)
    !line.strip.empty?
  end
end
# Collects the post links found on a set of index pages and turns each linked
# post into a BlogItem (via Parser); Writer supplies the markdown output helpers.
class Converter
  include Parser
  include Writer

  # source_item_links holds the post URLs collected by the constructor.
  attr_accessor :index_pages, :identity_string, :source_item_links

  # Downloads every index page immediately and records each anchor whose href
  # starts with identity_string.
  #
  # Here I use Han Han's Sina blog as the example.
  #
  # @param index_pages [Array<String>] URLs of the article-list pages to scan
  # @param identity_string [String] prefix that marks an href as a post link
  def initialize(index_pages=['http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html',
                              'http://blog.sina.com.cn/s/articlelist_1191258123_0_2.html',
                              'http://blog.sina.com.cn/s/articlelist_1191258123_0_3.html',
                              'http://blog.sina.com.cn/s/articlelist_1191258123_0_4.html',
                              'http://blog.sina.com.cn/s/articlelist_1191258123_0_5.html',
                              'http://blog.sina.com.cn/s/articlelist_1191258123_0_6.html'],
                 identity_string = 'http://blog.sina.com.cn/s/blog_4701280b')
    @index_pages = index_pages
    @identity_string = identity_string
    # Stored under the same name the accessor exposes, so the reader returns
    # the collected links and the writer actually affects get_blogs.
    @source_item_links = []
    # Get all the original blog links.
    @index_pages.each do |index_page|
      # Kernel#open no longer opens URLs on Ruby 3.0+; open-uri's URI.open does.
      doc = Nokogiri::HTML(URI.open(index_page))
      doc.search('a').each do |anchor|
        href = anchor['href']
        # Anchors without an href attribute yield nil and are skipped.
        @source_item_links.push(href) if href && href.start_with?(identity_string)
      end
    end
  end

  # Download and parse every collected post link, in collection order.
  #
  # @return [Array<BlogItem>] the parsed items (also kept in @blogs)
  def get_blogs
    @blogs = @source_item_links.map { |link| parse(link) }
  end
end
# Export every post: the fetched list is walked in reverse and the output
# files are numbered from 1 in that order.
converter = Converter.new
converter.get_blogs.reverse_each.with_index(1) do |item, index|
  # Fix illegal blog title: mark reposts, then drop a dangling '(ZT' suffix.
  item.title = '[转贴]' + item.title unless item.is_original?
  item.title = item.title[0..-4] if item.title.end_with?('(ZT')

  # Get file name and content.
  file_name = converter.markdown_file_name(item, index)
  formatted_item = converter.format_markdown_line(item)
  file_content = converter.to_markdown(formatted_item)

  # Generate files.
  converter.write('/tmp/' + file_name, file_content)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment