Created
June 11, 2012 07:41
-
-
Save gangmax/2908932 to your computer and use it in GitHub Desktop.
Demonstrate how to export the alumni messages of a specific class from "chinaren.com" with Ruby
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8
# http://class.chinaren.com/a/~feed/list.do?start=730&size=10&app=1&cid=class_id&ts=1338876609146 | |
# http://stackoverflow.com/questions/1360808/rubys-open-uri-and-cookies | |
require 'net/http'
require 'open-uri'
require 'json'
require 'time'
require 'nokogiri'
require 'sqlite3'
# Plain value holder for one alumni-board message: either a root post
# (is_reply == false) or a comment/reply pointing back at a root post.
class MessageItem
  attr_accessor :message_id, :author, :creation_time, :content,
                :is_reply, :reply_id

  # One-line human-readable dump used for console output and file export.
  # The format is preserved exactly as the historical output — including
  # the space (not a comma) between the author and creation_time fields.
  def to_str
    stamp = @creation_time.strftime('%Y-%m-%d %H:%M:%S')
    "[message_id=#{@message_id}, author=#{@author} creation_time=#{stamp}, " \
      "content=#{@content}, is_reply=#{@is_reply}, reply_id=#{@reply_id}]\n"
  end
end
# Scraping helpers for the chinaren.com class-feed ("alumni messages") pages.
#
# The feed endpoints return JSON whose "data"/"list" field is an HTML
# fragment; Nokogiri is used to pick the individual messages out of it.
module Parser
  # Fetches one page of the class feed and returns an Array of MessageItem:
  # each root post, immediately followed by all of its comments/replies.
  #
  # site/port - host and port for the Net::HTTP connection.
  # path      - feed URL path (".../~feed/list.do?start=...&size=...").
  # headers   - request headers; must carry the session Cookie.
  def parse_main(site, port, path, headers)
    conn = Net::HTTP.new(site, port)
    response, data = conn.get(path, headers)
    # Ruby 1.8's Net::HTTP#get returned [response, body]; on 1.9+ only the
    # response comes back (data stays nil), so fall back to response.body.
    data ||= response.body
    json = JSON.parse data
    html = json["data"]["list"]
    items = Array.new
    Nokogiri::HTML(html).css('li.msgItem').each do |item|
      # --- parse the current root message -------------------------------
      message = MessageItem.new
      # The option div's id looks like "<9-char prefix><message_id>";
      # strip the prefix to get the numeric id.
      message.message_id = item.css('div.option').first.attribute("id").to_str[9..-1]
      message.author = item.css('img.avatar-48').first.attribute('title').content
      message.is_reply = false
      if item.css('div.msgBody').first and item.css('div.msgBody').first.css('span').first
        message.content = item.css('div.msgBody').first.css('span').first.content.strip
        message.creation_time = Time.parse(item.css('span.meta').first.content) if item.css('span.meta').first
      else
        # Fallback layout: the text lives inside the msgHead <h4> element.
        message.content = item.css('div.msgHead').first.css('h4').first.content.split.last.strip
        message.creation_time = Time.parse(item.css('span.time').first.content) if item.css('span.time').first
      end
      # 1. Replace emotion spans with "[表情:...]"; 2. strip the U+00A0
      # (bytes C2 A0) non-breaking spaces that plain String#strip misses.
      # See http://stackoverflow.com/questions/7689854
      message.content = replace_emotion(message.content).gsub(/^\p{Space}+|\p{Space}+$/, "")
      items.push(message)
      root_id = message.message_id
      # --- collect the comments below the root message ------------------
      # When a "more comments" link is present, a second request is needed:
      # http://class.chinaren.com/a/~comment/all.do?appid=1&itemid=...
      if item.css('li.commentMore').first
        onclick_content = item.css('li.commentMore').first.css('a').first.attribute('onclick').to_str
        onclick_content =~ /itemid:'(.{18,21})'/
        comment_url = "http://class.chinaren.com/a/~comment/all.do?appid=1&itemid=#{$1}&allow=true&classid=class_id&ts=1338901364183"
        comments = parse_more_comments(site, port, comment_url, headers, root_id)
        comments.each {|comment| items.push(comment)}
      else
        # Otherwise the comments are embedded inline in this page.
        inner_comments = item.css('div.commentInfo')
        if inner_comments
          # Renamed block variable (was |item|) to stop shadowing the outer one.
          inner_comments.each do |comment_node|
            message = MessageItem.new
            message.author = comment_node.css('a').first.content
            message.creation_time = Time.parse(comment_node.css('span.meta').first.content)
            message.is_reply = true
            # A reply-to-a-reply carries its own ids in the em>a onclick
            # handler; a direct comment just points at the root message.
            if comment_node.css('em').first
              link_info = comment_node.css('em').first.css('a').first.attribute('onclick').to_str
              message.message_id = /\d{18,20}/.match(link_info)[0]
              link_info =~ /replyid:'(.{1,10})'/
              message.reply_id = $1
            else
              message.message_id = root_id
            end
            # The comment text is everything after the closing </h4> tag.
            comment_html = comment_node.inner_html
            index = comment_html.index('</h4>')
            if index && index > 0   # nil guard: </h4> may be missing
              message.content = comment_html[index+5..-1].strip
              message.content = replace_emotion(message.content).gsub(/^\p{Space}+|\p{Space}+$/, "")
            end
            items.push(message)
          end
        end
      end
    end
    return items
  end

  # Fetches the full comment list for one root message (used when the inline
  # page only shows a truncated list) and returns an Array of MessageItem.
  #
  # root_id - message_id of the root post; assigned to comments that do not
  #           carry their own id in the markup.
  def parse_more_comments(site, port, path, headers, root_id)
    conn = Net::HTTP.new(site, port)
    response, data = conn.get(path, headers)
    data ||= response.body   # Ruby 1.9+ compatibility (see parse_main)
    json = JSON.parse data
    html = json["data"]["list"]
    items = Array.new
    Nokogiri::HTML(html).css('li.commentC').each do |item|
      message = MessageItem.new
      message.author = item.css('img.avatar-32').first.attribute('title').content
      message.creation_time = Time.parse(item.css('span.meta').first.content)
      message.is_reply = true
      # Parse message id and reply_id (same markup as the inline comments).
      if item.css('em').first and item.css('em').first.css('a').first
        link_info = item.css('em').first.css('a').first.attribute('onclick').to_str
        message.message_id = /\d{18,20}/.match(link_info)[0]
        link_info =~ /replyid:'(.{1,10})'/
        message.reply_id = $1
      else
        message.message_id = root_id
      end
      # Parse message content.
      comment_html = item.inner_html
      index = comment_html.index('</h4>')
      if index && index > 0   # nil guard: </h4> may be missing
        # Drop the leading '</h4>' and the trailing markup (historically the
        # last 9 characters, e.g. a closing '</div>' plus whitespace).
        message.content = comment_html[index+5..-10].strip
        message.content = replace_emotion(message.content).gsub(/^\p{Space}+|\p{Space}+$/, "")
      end
      items.push(message)
    end
    return items
  end

  # Replaces each emotion span, e.g.
  #   '<span class="emot e-base-13" title="流泪"></span>'  ->  '[表情:流泪]'
  # Returns the (possibly unchanged) content; nil passes through unchanged.
  #
  # Fixes vs. the original: a span at index 0 is now replaced too (the old
  # `start > 0` guard skipped it), and the postfix length is no longer a
  # hard-coded 7.
  def replace_emotion(content, prefix='<span class=', postfix='</span>', re=/title="(.+?)">/)
    return content unless content
    start = content.index(prefix)
    finish = content.index(postfix)
    while start && finish && finish > start && (m = re.match(content)) do
      content = content[0...start] + '[表情:' + m[1] + ']' + content[(finish + postfix.length)..-1]
      start = content.index(prefix)
      finish = content.index(postfix)
    end
    return content
  end
end
# ---------------------------------------------------------------------------
# Driver script: crawls every feed page of one class on chinaren.com and
# stores each message (root posts and replies) into a local SQLite database.
# ---------------------------------------------------------------------------
include Parser
site = 'class.chinaren.com'
port = 80
# NOTE(review): replace this placeholder with a real session cookie copied
# from the browser (e.g. the Firebug console) — without a valid cookie the
# site will not return the feed data.
headers = { "Cookie" => 'This is the cookie content copied from Firebug console when accessing chinaren.com' }
page_size = 50
=begin
# Earlier variant (kept disabled): dump every message to a plain text file
# instead of SQLite.
File.open('/tmp/chinaren.txt', 'w') do |file|
for i in 0..75 do
puts "i=#{i}"
path = "/a/~feed/list.do?start=#{page_size*i}&size=#{page_size}&app=1&cid=class_id&ts=1338876609146"
parse_main(site, port, path, headers).each do |item|
puts item.to_str
file.write(item.to_str)
end
file.flush
end
end
=end
# Create db & table. creation_time is stored as 'YYYY-MM-DD HH:MM:SS' text;
# is_reply is stored as the strings "true"/"false" (see to_s below).
db = SQLite3::Database.new '/tmp/message96.db'
db.execute <<-SQL
create table if not exists messages (
id INTEGER PRIMARY KEY AUTOINCREMENT,
message_id TEXT,
is_reply INTEGER,
reply_id TEXT,
author TEXT,
creation_time TEXT,
content TEXT);
SQL
# Clear table if needed (makes the script re-runnable without duplicates).
db.execute <<-SQL
delete from messages;
SQL
# Query data and write to db.
# NOTE(review): the page count (0..75) and the "cid"/"classid" placeholders
# are hard-coded for one specific class — adjust both before reuse.
for i in 0..75 do
puts "i=#{i}"
path = "/a/~feed/list.do?start=#{page_size*i}&size=#{page_size}&app=1&cid=class_id&ts=1338876609146"
parse_main(site, port, path, headers).each do |item|
puts item.to_str
# NULL lets SQLite assign the AUTOINCREMENT id; the remaining columns are
# bound from the named placeholders in the hash.
db.execute "insert into messages values (NULL, :message_id, :is_reply, :reply_id, :author, :creation_time, :content)", {message_id: item.message_id, is_reply: item.is_reply.to_s, reply_id: item.reply_id, author: item.author, creation_time: item.creation_time.strftime('%Y-%m-%d %H:%M:%S'), content: item.content}
end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment