Skip to content

Instantly share code, notes, and snippets.

@mccxj
Created June 24, 2009 11:29
Show Gist options
  • Save mccxj/135189 to your computer and use it in GitHub Desktop.
Save mccxj/135189 to your computer and use it in GitHub Desktop.
backup_from_myblog
# backup from uptolife.blogspot.com
# Assign the global $KCODE before any library is loaded.
# NOTE(review): presumably meant to enable GBK text handling on Ruby 1.8;
# later Ruby versions no longer honour $KCODE -- confirm the target Ruby version.
$KCODE='gbk'
require 'rubygems'
require 'nokogiri'
require 'open-uri'
gem 'activerecord','2.1.2'
require 'active_record'
# Connect ActiveRecord to the MySQL database that receives the backed-up posts.
#
# Database name, host, user and password can each be overridden through an
# environment variable (BACKUP_DB_NAME, BACKUP_DB_HOST, BACKUP_DB_USER,
# BACKUP_DB_PASSWORD), so credentials do not have to be edited into this file.
# When a variable is unset, the value that used to be hard-coded here is used,
# so running the script without any environment setup behaves as before.
ActiveRecord::Base.establish_connection(
  :adapter  => 'mysql',
  :database => ENV.fetch('BACKUP_DB_NAME', 'test'),
  :host     => ENV.fetch('BACKUP_DB_HOST', '192.168.1.184'),
  :username => ENV.fetch('BACKUP_DB_USER', 'root'),
  :password => ENV.fetch('BACKUP_DB_PASSWORD', '123456'),
  :encoding => 'utf8'
)
# ActiveRecord model for one backed-up blog post.
# Expected columns: id, title, body, tag, post_on, link.
class Post < ActiveRecord::Base; end
# Fetch the blog's front page (through the proxy URL below) and collect the
# href of every archive link whose text ends in four digits preceded by at
# least one other character.
# NOTE(review): presumably this keeps the month-level archive links and drops
# the bare year links -- confirm against the archive widget markup.
archive_page = Nokogiri::HTML(open("http://chi.pazou.net/visit.php?u=Oi8vdXB0b2xpZmUuYmxvZ3Nwb3QuY29t&b=7"))
archive_list = archive_page.search('div#BlogArchive1_ArchiveList').first
archive_anchors = archive_list.search('a.post-count-link')
links = archive_anchors.select { |anchor| anchor.text =~ /^.+[0-9]{4}$/ }.map { |anchor| anchor['href'] }
# Walk the collected archive pages and store every post found on them.
#
# For each archive link the page is downloaded and the direct children of the
# first `div.blog-posts` element are visited in document order:
#   * a child whose class is exactly "date-header" sets the pending date `dh`;
#   * a child whose class is exactly "post" is saved as a Post row
#     (link, title, body, space-separated tags, post_on).
#
# `dh` carries the most recently parsed date header to the next post. When a
# post is reached with no pending date, today's date is used instead.
# NOTE(review): `dh` is cleared after every post, so a second post that follows
# the same date header is stamped with Date.today -- confirm this is intended.
dh = nil

# Start at index 32 of the collected links.
# NOTE(review): presumably resumes an earlier partial run -- confirm.
# Slicing past the end of an array yields nil, hence the `|| []` fallback so a
# short link list results in no work instead of a NoMethodError.
(links[32..-1] || []).each do |link|
  p "try to parse:#{link}"
  page = Nokogiri::HTML(open(link))
  posts = page.search('div.blog-posts').first

  # A page without the posts container has nothing to back up; move on rather
  # than aborting the whole run.
  if posts.nil?
    p "skip (no div.blog-posts):#{link}"
    next
  end

  posts.children.each do |po|
    # Nodes without a class attribute match neither branch.
    case po['class']
    when "date-header"
      # Only the first whitespace-separated token of the header is parsed.
      dh = Date.strptime(po.text.split(' ').first,'%Y年%m月%d日')
    when "post"
      post = Post.new
      dh ||= Date.today

      # Resolve the title step by step so a post without a heading, or with a
      # heading that is not a link, does not raise on nil.
      heading = po.search('h3.post-title').first
      title_link = heading && heading.search('a').first
      title_node = title_link || heading
      post.link = title_link && title_link['href'] # link
      post.title = title_node && title_node.text # title
      post.body = po.search('div.post-body').inner_html # content

      # Same step-by-step lookup for the labels: a missing footer or labels
      # span yields an empty tag string.
      footer = po.search('div.post-footer').first
      labels_span = footer && footer.search('span.post-labels').first
      label_links = labels_span ? labels_span.search('a') : []
      post.tag = label_links.map(&:text).join(' ')

      post.post_on = dh
      p "post :#{dh.to_s}"
      post.save
      dh = nil
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment