Created
June 11, 2012 07:41
-
-
Save gangmax/2908932 to your computer and use it in GitHub Desktop.
Demonstrate how to export the alumni messages of a specific class from "chinaren.com" with Ruby
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8
# http://class.chinaren.com/a/~feed/list.do?start=730&size=10&app=1&cid=class_id&ts=1338876609146 | |
# http://stackoverflow.com/questions/1360808/rubys-open-uri-and-cookies | |
require 'net/http'
require 'open-uri'
require 'json'
require 'time'
require 'nokogiri'
require 'sqlite3'
# Plain value holder for one alumni-board message: either a root post
# (is_reply == false) or a comment/reply pointing back at a root post.
class MessageItem
  attr_accessor :message_id, :author, :creation_time, :content,
                :is_reply, :reply_id

  # One-line human-readable dump used for console output and file export.
  # The format is preserved exactly as the historical output — including
  # the space (not a comma) between the author and creation_time fields.
  def to_str
    stamp = @creation_time.strftime('%Y-%m-%d %H:%M:%S')
    "[message_id=#{@message_id}, author=#{@author} creation_time=#{stamp}, " \
      "content=#{@content}, is_reply=#{@is_reply}, reply_id=#{@reply_id}]\n"
  end
end
# Scraping helpers for the chinaren.com class-feed ("alumni messages") pages.
#
# The feed endpoints return JSON whose "data"/"list" field is an HTML
# fragment; Nokogiri is used to pick the individual messages out of it.
module Parser
  # Fetches one page of the class feed and returns an Array of MessageItem:
  # each root post, immediately followed by all of its comments/replies.
  #
  # site/port - host and port for the Net::HTTP connection.
  # path      - feed URL path (".../~feed/list.do?start=...&size=...").
  # headers   - request headers; must carry the session Cookie.
  def parse_main(site, port, path, headers)
    conn = Net::HTTP.new(site, port)
    response, data = conn.get(path, headers)
    # Ruby 1.8's Net::HTTP#get returned [response, body]; on 1.9+ only the
    # response comes back (data stays nil), so fall back to response.body.
    data ||= response.body
    json = JSON.parse data
    html = json["data"]["list"]
    items = Array.new
    Nokogiri::HTML(html).css('li.msgItem').each do |item|
      # --- parse the current root message -------------------------------
      message = MessageItem.new
      # The option div's id looks like "<9-char prefix><message_id>";
      # strip the prefix to get the numeric id.
      message.message_id = item.css('div.option').first.attribute("id").to_str[9..-1]
      message.author = item.css('img.avatar-48').first.attribute('title').content
      message.is_reply = false
      if item.css('div.msgBody').first and item.css('div.msgBody').first.css('span').first
        message.content = item.css('div.msgBody').first.css('span').first.content.strip
        message.creation_time = Time.parse(item.css('span.meta').first.content) if item.css('span.meta').first
      else
        # Fallback layout: the text lives inside the msgHead <h4> element.
        message.content = item.css('div.msgHead').first.css('h4').first.content.split.last.strip
        message.creation_time = Time.parse(item.css('span.time').first.content) if item.css('span.time').first
      end
      # 1. Replace emotion spans with "[表情:...]"; 2. strip the U+00A0
      # (bytes C2 A0) non-breaking spaces that plain String#strip misses.
      # See http://stackoverflow.com/questions/7689854
      message.content = replace_emotion(message.content).gsub(/^\p{Space}+|\p{Space}+$/, "")
      items.push(message)
      root_id = message.message_id
      # --- collect the comments below the root message ------------------
      # When a "more comments" link is present, a second request is needed:
      # http://class.chinaren.com/a/~comment/all.do?appid=1&itemid=...
      if item.css('li.commentMore').first
        onclick_content = item.css('li.commentMore').first.css('a').first.attribute('onclick').to_str
        onclick_content =~ /itemid:'(.{18,21})'/
        comment_url = "http://class.chinaren.com/a/~comment/all.do?appid=1&itemid=#{$1}&allow=true&classid=class_id&ts=1338901364183"
        comments = parse_more_comments(site, port, comment_url, headers, root_id)
        comments.each {|comment| items.push(comment)}
      else
        # Otherwise the comments are embedded inline in this page.
        inner_comments = item.css('div.commentInfo')
        if inner_comments
          # Renamed block variable (was |item|) to stop shadowing the outer one.
          inner_comments.each do |comment_node|
            message = MessageItem.new
            message.author = comment_node.css('a').first.content
            message.creation_time = Time.parse(comment_node.css('span.meta').first.content)
            message.is_reply = true
            # A reply-to-a-reply carries its own ids in the em>a onclick
            # handler; a direct comment just points at the root message.
            if comment_node.css('em').first
              link_info = comment_node.css('em').first.css('a').first.attribute('onclick').to_str
              message.message_id = /\d{18,20}/.match(link_info)[0]
              link_info =~ /replyid:'(.{1,10})'/
              message.reply_id = $1
            else
              message.message_id = root_id
            end
            # The comment text is everything after the closing </h4> tag.
            comment_html = comment_node.inner_html
            index = comment_html.index('</h4>')
            if index && index > 0   # nil guard: </h4> may be missing
              message.content = comment_html[index+5..-1].strip
              message.content = replace_emotion(message.content).gsub(/^\p{Space}+|\p{Space}+$/, "")
            end
            items.push(message)
          end
        end
      end
    end
    return items
  end

  # Fetches the full comment list for one root message (used when the inline
  # page only shows a truncated list) and returns an Array of MessageItem.
  #
  # root_id - message_id of the root post; assigned to comments that do not
  #           carry their own id in the markup.
  def parse_more_comments(site, port, path, headers, root_id)
    conn = Net::HTTP.new(site, port)
    response, data = conn.get(path, headers)
    data ||= response.body   # Ruby 1.9+ compatibility (see parse_main)
    json = JSON.parse data
    html = json["data"]["list"]
    items = Array.new
    Nokogiri::HTML(html).css('li.commentC').each do |item|
      message = MessageItem.new
      message.author = item.css('img.avatar-32').first.attribute('title').content
      message.creation_time = Time.parse(item.css('span.meta').first.content)
      message.is_reply = true
      # Parse message id and reply_id (same markup as the inline comments).
      if item.css('em').first and item.css('em').first.css('a').first
        link_info = item.css('em').first.css('a').first.attribute('onclick').to_str
        message.message_id = /\d{18,20}/.match(link_info)[0]
        link_info =~ /replyid:'(.{1,10})'/
        message.reply_id = $1
      else
        message.message_id = root_id
      end
      # Parse message content.
      comment_html = item.inner_html
      index = comment_html.index('</h4>')
      if index && index > 0   # nil guard: </h4> may be missing
        # Drop the leading '</h4>' and the trailing markup (historically the
        # last 9 characters, e.g. a closing '</div>' plus whitespace).
        message.content = comment_html[index+5..-10].strip
        message.content = replace_emotion(message.content).gsub(/^\p{Space}+|\p{Space}+$/, "")
      end
      items.push(message)
    end
    return items
  end

  # Replaces each emotion span, e.g.
  #   '<span class="emot e-base-13" title="流泪"></span>'  ->  '[表情:流泪]'
  # Returns the (possibly unchanged) content; nil passes through unchanged.
  #
  # Fixes vs. the original: a span at index 0 is now replaced too (the old
  # `start > 0` guard skipped it), and the postfix length is no longer a
  # hard-coded 7.
  def replace_emotion(content, prefix='<span class=', postfix='</span>', re=/title="(.+?)">/)
    return content unless content
    start = content.index(prefix)
    finish = content.index(postfix)
    while start && finish && finish > start && (m = re.match(content)) do
      content = content[0...start] + '[表情:' + m[1] + ']' + content[(finish + postfix.length)..-1]
      start = content.index(prefix)
      finish = content.index(postfix)
    end
    return content
  end
end
# ---------------------------------------------------------------------------
# Driver script: crawls every feed page of one class on chinaren.com and
# stores each message (root posts and replies) into a local SQLite database.
# ---------------------------------------------------------------------------
include Parser
site = 'class.chinaren.com'
port = 80
# NOTE(review): replace this placeholder with a real session cookie copied
# from the browser (e.g. the Firebug console) — without a valid cookie the
# site will not return the feed data.
headers = { "Cookie" => 'This is the cookie content copied from Firebug console when accessing chinaren.com' }
page_size = 50
=begin
# Earlier variant (kept disabled): dump every message to a plain text file
# instead of SQLite.
File.open('/tmp/chinaren.txt', 'w') do |file|
for i in 0..75 do
puts "i=#{i}"
path = "/a/~feed/list.do?start=#{page_size*i}&size=#{page_size}&app=1&cid=class_id&ts=1338876609146"
parse_main(site, port, path, headers).each do |item|
puts item.to_str
file.write(item.to_str)
end
file.flush
end
end
=end
# Create db & table. creation_time is stored as 'YYYY-MM-DD HH:MM:SS' text;
# is_reply is stored as the strings "true"/"false" (see to_s below).
db = SQLite3::Database.new '/tmp/message96.db'
db.execute <<-SQL
create table if not exists messages (
id INTEGER PRIMARY KEY AUTOINCREMENT,
message_id TEXT,
is_reply INTEGER,
reply_id TEXT,
author TEXT,
creation_time TEXT,
content TEXT);
SQL
# Clear table if needed (makes the script re-runnable without duplicates).
db.execute <<-SQL
delete from messages;
SQL
# Query data and write to db.
# NOTE(review): the page count (0..75) and the "cid"/"classid" placeholders
# are hard-coded for one specific class — adjust both before reuse.
for i in 0..75 do
puts "i=#{i}"
path = "/a/~feed/list.do?start=#{page_size*i}&size=#{page_size}&app=1&cid=class_id&ts=1338876609146"
parse_main(site, port, path, headers).each do |item|
puts item.to_str
# NULL lets SQLite assign the AUTOINCREMENT id; the remaining columns are
# bound from the named placeholders in the hash.
db.execute "insert into messages values (NULL, :message_id, :is_reply, :reply_id, :author, :creation_time, :content)", {message_id: item.message_id, is_reply: item.is_reply.to_s, reply_id: item.reply_id, author: item.author, creation_time: item.creation_time.strftime('%Y-%m-%d %H:%M:%S'), content: item.content}
end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment