Skip to content

Instantly share code, notes, and snippets.

@songjiayang
Created October 14, 2014 03:05
Show Gist options
  • Save songjiayang/488c4919a6f45971e8c8 to your computer and use it in GitHub Desktop.
Save songjiayang/488c4919a6f45971e8c8 to your computer and use it in GitHub Desktop.
A robot that scrapes download links from the forums at http://www.mcbbs.net/forum.php.
require 'csv'
require 'json'

require 'mechanize'
# Crawler for the forum boards of http://www.mcbbs.net.
#
# A run has two phases:
# 1. prepare_work walks every index page of each board in NODE_NAMES,
#    collects the thread links found there and dumps the result to
#    ./nodes.json.
# 2. parse_pages visits every collected thread and appends one row per
#    thread (board, url, title, download address, original post address)
#    to ./result.csv.
class McbbsRobot
  # Site root; board index pages and relative thread links hang off it.
  BASE_URL = 'http://www.mcbbs.net/'.freeze

  # Board identifiers as they appear in the index page URLs.
  NODE_NAMES = %w[servermod mod skin texture].freeze

  # XPath selectors used on a board index page: the "last page" pagination
  # link and the thread title links.
  XPATHS = {
    pagenation: '//div[@class="pg"]//a[@class="last"]',
    link: '//form[@id="moderate"]//th//a[@class="xst"]'
  }.freeze

  def initialize
    @agent = Mechanize.new
    @nodes = []
  end

  # Runs the whole crawl, printing a progress line before each phase.
  def perform
    puts "启动抓取任务...."
    puts "初始化待抓取页面...."
    prepare_work
    puts "开始抓取任务...."
    parse_pages
    puts "结束抓取...."
  end

  private

  # Collects the thread links of every board into @nodes and writes the
  # collected nodes to ./nodes.json.
  def prepare_work
    NODE_NAMES.each do |node_name|
      first_page = @agent.get(node_url(node_name))
      total_pages = last_page_number(first_page)

      links = node_page_links(first_page)
      (2..total_pages).each do |current_page|
        links.concat(node_page_links(@agent.get(node_url(node_name, current_page))))
      end

      # uniq returns a new array, so its result has to be the value stored.
      @nodes << { name: node_name, page_count: total_pages, links: links.uniq }
    end
    serialize_data('./nodes.json', @nodes)
  end

  # Reads the number of index pages from the "last page" pagination link.
  # A page without that link is treated as a board with a single page.
  def last_page_number(page)
    last_link = page.search(xpaths[:pagenation]).first
    return 1 unless last_link

    # The first run of digits in the href is taken as the page number.
    last_link.attributes['href'].value.scan(/\d+/).first.to_i
  end

  # Builds the URL of index page +page+ of board +node+.
  def node_url(node, page = 1)
    "#{BASE_URL}forum-#{node}-#{page}.html"
  end

  # Returns the href of every thread link on an index page, as strings.
  def node_page_links(page)
    page.search(xpaths[:link]).map { |link| link.attributes['href'].to_s }
  end

  # Writes the CSV header, then one row per collected thread link.
  def parse_pages
    CSV.open("./result.csv", "wb") do |csv|
      csv << ["论坛节点", "抓取链接", "标题", "下载地址", "原帖地址"]
      @nodes.each do |node|
        node[:links].each { |link| write_row(csv, node[:name], link) }
      end
    end
  end

  # Scrapes one thread and appends its row to +csv+. A failure on a single
  # thread is reported on stderr and skipped so the crawl keeps going; only
  # StandardError is rescued so Ctrl-C and exit still stop the process.
  def write_row(csv, node_name, link)
    data = parse_page(link)
    csv << [node_name, data[:link], data[:title], data[:download_link], data[:original_page]]
  rescue StandardError => e
    warn "抓取失败: #{link} (#{e.class}: #{e.message})"
  end

  # Fetches the thread at +link+ (appended to BASE_URL) and returns a hash
  # with :link and :title, plus :download_link / :original_page when the
  # matching row label is present in the "typeoption" table.
  def parse_page(link)
    url = "#{BASE_URL}#{link}"
    data = { link: url }
    page = @agent.get(url)
    data[:title] = page.search("//a[@id='thread_subject']").children.to_s

    # Labels and cells are matched by position, which assumes every <th>
    # is paired with exactly one <td> in the same order.
    labels = page.search('//div[@class="typeoption"]//th').map { |th| th.children.to_s }
    cells = page.search('//div[@class="typeoption"]//td').map { |td| td.children }

    { download_link: '下载地址:', original_page: '原帖地址:' }.each do |key, label|
      index = labels.index(label)
      # NOTE(review): cells already hold each <td>'s children, so this reads
      # the nodes one level deeper (e.g. the text inside a wrapping <a>).
      # Confirm that is intended.
      data[key] = cells[index].children.to_s if index
    end
    data
  end

  # Selector table, kept as a method for the call sites inside the class.
  def xpaths
    XPATHS
  end

  # Writes +data+ to +file_name+ as pretty-printed JSON. The block form of
  # File.open closes the handle even when generating or writing fails.
  def serialize_data(file_name, data)
    File.open(file_name, 'w') { |file| file.puts JSON.pretty_generate(data) }
  end
end
# Script entry point: build a robot and run the complete crawl.
robot = McbbsRobot.new
robot.perform
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment