Created
October 14, 2014 03:05
-
-
Save songjiayang/488c4919a6f45971e8c8 to your computer and use it in GitHub Desktop.
A robot that crawls http://www.mcbbs.net/forum.php and collects download sources.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'csv'
require 'json'

require 'mechanize'
# Crawler for the mcbbs.net forum.
#
# It walks the index pages of a fixed set of forum sections, collects the
# thread links found there, visits every thread and exports its title,
# download address and original-post address.
#
# Files written to the current working directory:
#   ./nodes.json - the collected sections (name, page count, thread links)
#   ./result.csv - one row per successfully parsed thread
#
# Usage:
#   McbbsRobot.new.perform
class McbbsRobot
  # Site root that thread links are appended to.
  BASE_URL = 'http://www.mcbbs.net'.freeze

  # Forum sections whose index pages are crawled.
  NODE_NAMES = %w(servermod mod skin texture).freeze

  # Header row of ./result.csv.
  CSV_HEADER = ["论坛节点", "抓取链接", "标题", "下载地址", "原帖地址"].freeze

  # Labels of the "typeoption" table rows that are exported per thread.
  DOWNLOAD_LABEL = '下载地址:'.freeze
  ORIGINAL_LABEL = '原帖地址:'.freeze

  def initialize
    @agent = Mechanize.new
    @cache_data = []
    @nodes = []
  end

  # Runs the whole job: collects the thread links of every section, then
  # parses each thread and writes the CSV report.
  def perform
    puts "启动抓取任务...."
    puts "初始化待抓取页面...."
    prepare_work
    puts "开始抓取任务...."
    parse_pages
    puts "结束抓取...."
  end

  private

  # Collects, for every section in NODE_NAMES, the thread links of all its
  # index pages, stores the result in @nodes and dumps it to ./nodes.json.
  def prepare_work
    NODE_NAMES.each do |node_name|
      first_page = @agent.get(node_url(node_name))
      page_count = last_page_number(first_page)

      links = node_page_links(first_page)
      (2..page_count).each do |current_page|
        links += node_page_links(@agent.get(node_url(node_name, current_page)))
      end

      # The same thread can be listed on more than one index page; keep each
      # link once so it is not crawled twice.
      @nodes << { name: node_name, page_count: page_count, links: links.uniq }
    end
    serialize_data('./nodes.json', @nodes)
  end

  # Returns the number of index pages of a section, read from the first
  # number in the href of the "last page" pagination link.
  # Falls back to 1 when the page has no such link.
  def last_page_number(page)
    last_link = page.search(xpaths[:pagenation]).first
    return 1 unless last_link

    [last_link.attributes['href'].value.scan(/\d+/).first.to_i, 1].max
  end

  # Builds the URL of index page +page+ (1-based) of section +node+.
  def node_url(node, page=1)
    "#{BASE_URL}/forum-#{node}-#{page}.html"
  end

  # Returns the href of every thread link found on an index page.
  def node_page_links(page)
    page.search(xpaths[:link]).map { |link| link.attributes['href'].to_s }
  end

  # Visits every collected thread link and writes one CSV row per thread to
  # ./result.csv.
  def parse_pages
    CSV.open("./result.csv", "wb") do |csv|
      csv << CSV_HEADER
      @nodes.each do |node|
        node[:links].each { |link| export_thread(csv, node[:name], link) }
      end
    end
  end

  # Parses one thread and appends its row to +csv+.
  # A thread that fails to download or parse is reported on stderr and
  # skipped, so one bad page does not abort the whole export. Only
  # StandardError is rescued, which leaves Ctrl-C and exit requests working.
  def export_thread(csv, node_name, link)
    data = parse_page(link)
    csv << [node_name, data[:link], data[:title], data[:download_link], data[:original_page]]
  rescue StandardError => e
    warn "skip #{link}: #{e.class}: #{e.message}"
  end

  # Downloads the thread at +link+ (appended to BASE_URL) and returns a hash
  # with :link and :title, plus :download_link and :original_page when the
  # thread's "typeoption" table has the corresponding rows.
  def parse_page(link)
    url = "#{BASE_URL}/#{link}"
    page = @agent.get(url)

    labels = page.search('//div[@class="typeoption"]//th').map { |th| th.children.to_s }
    cells = page.search('//div[@class="typeoption"]//td').map { |td| td.children }

    data = {
      link: url,
      title: page.search("//a[@id='thread_subject']").children.to_s
    }

    download = option_value(labels, cells, DOWNLOAD_LABEL)
    data[:download_link] = download if download

    original = option_value(labels, cells, ORIGINAL_LABEL)
    data[:original_page] = original if original

    data
  end

  # Returns the text of the table cell at the same position as +label+, or
  # nil when the label is absent or has no cell at that position.
  # NOTE(review): each cell already holds the td's children, so the second
  # `.children` descends one more level; this is kept as in the original and
  # presumably unwraps an <a> inside the cell - confirm against real pages.
  def option_value(labels, cells, label)
    position = labels.index(label)
    cell = position && cells[position]
    cell.children.to_s if cell
  end

  # XPath expressions used on index pages (memoized).
  def xpaths
    @xpaths ||= {
      pagenation: '//div[@class="pg"]//a[@class="last"]',
      link: '//form[@id="moderate"]//th//a[@class="xst"]'
    }
  end

  # Writes +data+ to +file_name+ as pretty-printed JSON. The block form of
  # File.open closes the file even if writing raises.
  def serialize_data(file_name, data)
    File.open(file_name, 'w') { |file| file.puts JSON.pretty_generate(data) }
  end
end
# Script entry point: build the crawler and run the full job.
robot = McbbsRobot.new
robot.perform
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment