Skip to content

Instantly share code, notes, and snippets.

@gangmax
Created June 11, 2012 07:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gangmax/2908932 to your computer and use it in GitHub Desktop.
Save gangmax/2908932 to your computer and use it in GitHub Desktop.
Demonstrate how to export the alumni messages of a specific class from "chinaren.com" with Ruby
# encoding = UTF-8
# http://class.chinaren.com/a/~feed/list.do?start=730&size=10&app=1&cid=class_id&ts=1338876609146
# http://stackoverflow.com/questions/1360808/rubys-open-uri-and-cookies
require 'net/http'
require 'open-uri'
require 'json'
require 'nokogiri'
require 'time'
require 'sqlite3'
# Plain value holder for one scraped message (a top-level post or a reply).
#
# Attributes:
#   message_id    - id string extracted from the page markup
#   author        - display name of the poster
#   creation_time - Time of posting; may be nil when the page had no timestamp node
#   content       - message text
#   is_reply      - true for comments/replies, false for top-level posts
#   reply_id      - id of the reply target, when the markup provides one
class MessageItem
  attr_accessor :message_id, :author, :creation_time, :content, :is_reply, :reply_id

  # Renders the item as a single bracketed line terminated by "\n".
  # A missing creation_time is rendered as an empty field instead of raising.
  def to_str
    timestamp = @creation_time && @creation_time.strftime('%Y-%m-%d %H:%M:%S')
    "[message_id=#{@message_id}, author=#{@author} creation_time=#{timestamp}, content=#{@content}, is_reply=#{@is_reply}, reply_id=#{@reply_id}]\n"
  end
end
# Scraping helpers for the chinaren.com class feed. Each public method fetches
# a JSON endpoint, takes the HTML fragment stored under data.list and converts
# it into MessageItem objects.
module Parser
  # Fetches one page of the feed and returns a flat Array of MessageItem:
  # every top-level message followed by its comments.
  #
  # site, port - host name and TCP port of the server
  # path       - request path (with query string) of the feed page
  # headers    - Hash of HTTP request headers (carries the session cookie)
  def parse_main(site, port, path, headers)
    html = fetch_list_html(site, port, path, headers)
    items = []
    Nokogiri::HTML(html).css('li.msgItem').each do |item|
      message = build_main_message(item)
      items.push(message)
      root_id = message.message_id
      # When the page shows a "more comments" link the full comment list has to
      # be fetched separately; otherwise the comments are embedded in the item.
      # http://class.chinaren.com/a/~comment/all.do?appid=1&itemid=3000000001344149190&allow=true&classid=class_id&ts=1338901364183
      more_link = item.css('li.commentMore').first
      if more_link
        onclick_content = more_link.css('a').first.attribute('onclick').to_str
        item_id = onclick_content[/itemid:'(.{18,21})'/, 1]
        comment_url = "http://class.chinaren.com/a/~comment/all.do?appid=1&itemid=#{item_id}&allow=true&classid=class_id&ts=1338901364183"
        items.concat(parse_more_comments(site, port, comment_url, headers, root_id))
      else
        item.css('div.commentInfo').each do |comment_node|
          author = comment_node.css('a').first.content
          # Inline comments keep everything after '</h4>' up to the end.
          items.push(build_comment(comment_node, author, root_id, -1))
        end
      end
    end
    items
  end

  # Fetches the complete comment list of one message and returns it as an
  # Array of MessageItem (all flagged as replies).
  #
  # path    - request path/URL of the comment endpoint
  # root_id - message_id of the parent message; used as the comment's
  #           message_id when the markup carries no id of its own
  def parse_more_comments(site, port, path, headers, root_id)
    html = fetch_list_html(site, port, path, headers)
    Nokogiri::HTML(html).css('li.commentC').map do |comment_node|
      author = comment_node.css('img.avatar-32').first.attribute('title').content
      # Ending at -10 drops the trailing '</div>' (plus surrounding whitespace).
      build_comment(comment_node, author, root_id, -10)
    end
  end

  # Replaces emoticon markup such as
  # '<span class="emot e-base-13" title="流泪"></span>' with '[表情:流泪]'.
  #
  # content - String to process; nil is returned unchanged
  # prefix  - text that starts an emoticon element
  # postfix - text that ends an emoticon element
  # re      - Regexp whose first capture group is the emoticon title; it is
  #           applied only to the text between prefix and postfix
  #
  # Elements that start at offset 0 are replaced too, and elements without a
  # title are copied through untouched so later emoticons are still handled.
  def replace_emotion(content, prefix='<span class=', postfix='</span>', re=/title="(.+?)">/)
    return content unless content
    # An empty delimiter can never enclose anything (and would not advance).
    return content if prefix.empty? || postfix.empty?

    pieces = []
    rest = content
    loop do
      start = rest.index(prefix)
      break unless start
      finish = rest.index(postfix, start)
      break unless finish
      after = finish + postfix.length
      match = re.match(rest[start...finish])
      if match
        pieces << rest[0...start] << "[表情:#{match[1]}]"
      else
        pieces << rest[0...after]
      end
      rest = rest[after..-1]
    end
    pieces << rest
    pieces.join
  end

  private

  # Performs the GET request and returns the HTML fragment found under
  # data.list of the JSON response. The body is read from the response
  # object because Net::HTTP#get returns only the response on current Rubies.
  def fetch_list_html(site, port, path, headers)
    response = Net::HTTP.new(site, port).get(path, headers)
    JSON.parse(response.body)["data"]["list"]
  end

  # Builds the MessageItem for a top-level 'li.msgItem' node.
  def build_main_message(item)
    message = MessageItem.new
    # The id attribute carries a 9-character prefix before the message id.
    message.message_id = item.css('div.option').first.attribute("id").to_str[9..-1]
    message.author = item.css('img.avatar-48').first.attribute('title').content
    message.is_reply = false
    body = item.css('div.msgBody').first
    body_span = body && body.css('span').first
    if body_span
      message.content = body_span.content.strip
      time_node = item.css('span.meta').first
    else
      # No body text: fall back to the last word of the headline.
      message.content = item.css('div.msgHead').first.css('h4').first.content.split.last.strip
      time_node = item.css('span.time').first
    end
    message.creation_time = Time.parse(time_node.content) if time_node
    message.content = clean_content(message.content)
    message
  end

  # Builds the MessageItem for one comment node.
  #
  # author      - already extracted author name (its location differs per page)
  # root_id     - fallback message_id when the node has no reply link
  # content_end - end index of the content slice taken after '</h4>'
  def build_comment(node, author, root_id, content_end)
    message = MessageItem.new
    message.author = author
    message.creation_time = Time.parse(node.css('span.meta').first.content)
    message.is_reply = true
    em = node.css('em').first
    link = em && em.css('a').first
    if link
      link_info = link.attribute('onclick').to_str
      message.message_id = /\d{18,20}/.match(link_info)[0]
      message.reply_id = link_info[/replyid:'(.{1,10})'/, 1]
    else
      message.message_id = root_id
    end
    html = node.inner_html
    index = html.index('</h4>')
    # index is nil when the node has no headline; content then stays unset.
    if index && index > 0
      message.content = clean_content(html[index+5..content_end].strip)
    end
    message
  end

  # 1. Replace the emotion span; 2. Strip Unicode whitespace (including the
  # "C2 A0" non-breaking space) from both ends of every line, from:
  # http://stackoverflow.com/questions/7689854/ruby-incorrect-method-behaviour-possible-depending-charset
  def clean_content(text)
    replace_emotion(text).gsub(/^\p{Space}+|\p{Space}+$/, "")
  end
end
# Mix the scraping helpers into the top-level object so the script can call
# parse_main directly.
include Parser
site = 'class.chinaren.com'
port = 80
headers = { "Cookie" => 'This is the cookie content copied from Firebug console when accessing chinaren.com' }
page_size = 50
=begin
File.open('/tmp/chinaren.txt', 'w') do |file|
for i in 0..75 do
puts "i=#{i}"
path = "/a/~feed/list.do?start=#{page_size*i}&size=#{page_size}&app=1&cid=class_id&ts=1338876609146"
parse_main(site, port, path, headers).each do |item|
puts item.to_str
file.write(item.to_str)
end
file.flush
end
end
=end
# Create db & table.
db = SQLite3::Database.new '/tmp/message96.db'
begin
  db.execute <<-SQL
    create table if not exists messages (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    message_id TEXT,
    is_reply INTEGER,
    reply_id TEXT,
    author TEXT,
    creation_time TEXT,
    content TEXT);
  SQL
  # Clear table if needed.
  db.execute <<-SQL
    delete from messages;
  SQL
  # Query data and write to db, one feed page (page_size items) per request.
  (0..75).each do |i|
    puts "i=#{i}"
    path = "/a/~feed/list.do?start=#{page_size*i}&size=#{page_size}&app=1&cid=class_id&ts=1338876609146"
    page_items = parse_main(site, port, path, headers)
    # One transaction per page: a page is stored completely or not at all, and
    # pages that were already written survive a failure on a later page.
    db.transaction do
      page_items.each do |item|
        puts item.to_str
        # The timestamp is optional on top-level messages; store NULL when absent.
        created = item.creation_time && item.creation_time.strftime('%Y-%m-%d %H:%M:%S')
        # NOTE(review): is_reply is written as the text 'true'/'false' although the
        # column is declared INTEGER - kept as-is so existing readers are unaffected.
        db.execute "insert into messages values (NULL, :message_id, :is_reply, :reply_id, :author, :creation_time, :content)", {message_id: item.message_id, is_reply: item.is_reply.to_s, reply_id: item.reply_id, author: item.author, creation_time: created, content: item.content}
      end
    end
  end
ensure
  # Release the database handle even when a request or insert fails.
  db.close
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment