Created June 18, 2013 07:03
require "open-uri" | |
def download | |
if $*[0] == nil or $*[1] == nil or $*[2] == nil | |
abort "用法示例:ruby #$0 开始数 结束数 存放的目录 EX:如ruby #$0 200 300 d:\\ " | |
end | |
time = Time.now | |
threads = [] | |
for i in $*[0]..$*[1] | |
exports = "http://blog.ntjy.net/my_blogs/" + i.to_s | |
threads << Thread.new(i) do |thread| | |
begin | |
data = open(exports){|f|f.read} | |
open("#{$*[2]}\\#{thread}.html","wb"){|f|f.write(data)} | |
print thread, "-" | |
rescue | |
end | |
end | |
end | |
threads.each {|thr| thr.join} | |
puts " 下载完成,共耗时:#{Time.now - time}秒" | |
end |
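To try the downloader (a hypothetical invocation; download.rb stands in for whatever name the file is saved under, and the output directory must already exist):

ruby download.rb 200 300 d:\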
#!/usr/bin/env ruby
#encoding: UTF-8
# Count the number of links on each site's homepage
# require 'rubygems' # 1.8.7
require 'ap' # gem install awesome_print
require 'json'
require 'net/http'
require 'nokogiri' # gem install nokogiri
require 'forkmanager' # gem install parallel-forkmanager
require 'beanstalk-client' # gem install beanstalk-client
class MultipleCrawler
  class Crawler
    def initialize(user_agent, redirect_limit=1)
      @user_agent = user_agent
      @redirect_limit = redirect_limit
      @timeout = 20
    end
    attr_accessor :user_agent, :redirect_limit, :timeout
    def fetch(website)
      print "Pid:#{Process.pid}, fetch: #{website}\n"
      redirect, url = @redirect_limit, website
      start_time = Time.now
      begin
        begin
          redirecting = false # reset each pass, so the loop stops once no Location header comes back
          uri = URI.parse(url)
          req = Net::HTTP::Get.new(uri.path.empty? ? '/' : uri.path)
          req.add_field('User-Agent', @user_agent)
          res = Net::HTTP.start(uri.host, uri.port) do |http|
            http.read_timeout = @timeout
            http.request(req)
          end
          if res.header['location'] # on a redirect, point url at the Location header and fetch again
            url = res.header['location']
            redirecting = true
          end
          redirect -= 1
        end while redirecting and redirect >= 0
        opened_time = (Time.now - start_time).round(4) # time spent fetching the page
        encoding = res.body.scan(/<meta.+?charset=["'\s]*([\w-]+)/i)[0]
        encoding = encoding ? encoding[0].upcase : 'GB18030'
        html = 'UTF-8' == encoding ? res.body : res.body.force_encoding('GB2312' == encoding || 'GBK' == encoding ? 'GB18030' : encoding).encode('UTF-8')
        doc = Nokogiri::HTML(html)
        processed_time = (Time.now - start_time - opened_time).round(4) # time spent parsing links; on 1.8.7 use ('%.4f' % float).to_f instead of round(4)
        [opened_time, processed_time, doc.css('a[@href]').size, res.header['server']]
      rescue => e
        e.message
      end
    end
  end
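  # (A sketch, not part of the original: Crawler can also be exercised on its own,
  # e.g. ap MultipleCrawler::Crawler.new('my-agent').fetch('http://www.taobao.com/')
  # returns [opened_time, processed_time, link_count, server] on success, or the
  # exception message string on failure.)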
  def initialize(websites, beanstalk_jobs, pm_max=1, user_agent='', redirect_limit=1)
    @websites = websites # array of site URLs
    @beanstalk_jobs = beanstalk_jobs # beanstalk server address and tube name
    @pm_max = pm_max # maximum number of worker processes to run in parallel
    @user_agent = user_agent # User-Agent string, so requests look like a normal browser
    @redirect_limit = redirect_limit # maximum number of redirects to follow
    @ipc_reader, @ipc_writer = IO.pipe # IPC pipe that collects the results
  end
  attr_accessor :user_agent, :redirect_limit
  def init_beanstalk_jobs # set up the beanstalk job queue
    beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
    # drain any messages left over in the beanstalk queue
    begin
      while job = beanstalk.reserve(0.1)
        job.delete
      end
    rescue Beanstalk::TimedOut
      print "Beanstalk queues cleared!\n"
    end
    @websites.size.times { |i| beanstalk.put(i) } # enqueue one job per website index
    beanstalk.close
  rescue => e
    puts e
    exit
  end
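  # (A sketch of the queue round-trip used above, assuming a beanstalkd daemon on localhost:11300:
  #   pool = Beanstalk::Pool.new(['localhost:11300'], 'crawler-jobs')
  #   pool.put(0)             # producer side: enqueue a website index
  #   job = pool.reserve(0.1) # consumer side: raises Beanstalk::TimedOut when the queue is empty
  #   job.body                # => "0"
  #   job.delete              # acknowledge, so the job is not handed out again
  # )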
  def process_jobs # run the workers
    start_time = Time.now
    pm = Parallel::ForkManager.new(@pm_max)
    @pm_max.times do |i|
      pm.start(i) and next # in the parent start returns non-zero, so it skips ahead immediately; the forked child runs the body below, giving real parallelism
      beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
      @ipc_reader.close # close the read end; child processes only write results back
      loop {
        begin
          job = beanstalk.reserve(0.1) # a short 0.1s timeout is enough, since every job was enqueued up front
          index = job.body
          job.delete
          website = @websites[index.to_i]
          result = Crawler.new(@user_agent).fetch(website)
          @ipc_writer.puts(({website => result}).to_json)
        rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
          break
        end
      }
      @ipc_writer.close
      pm.finish(0)
    end
    @ipc_writer.close
    begin
      pm.wait_all_children # wait for every child process to finish
    rescue SystemExit, Interrupt # report interrupts instead of dying silently
      print "Interrupt wait all children!\n"
    ensure
      results = read_results
      ap results, :indent => -4, :index => false # pretty-print the collected results
      print "Process end, total: #{@websites.size}, crawled: #{results.size}, time: #{'%.4f' % (Time.now - start_time)}s.\n"
    end
  end
  def read_results # read back, via the pipe, what the child processes crawled
    results = {}
    while result = @ipc_reader.gets
      results.merge! JSON.parse(result)
    end
    @ipc_reader.close
    results
  end
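  # (A sketch of the IO.pipe pattern used above: children write, the parent reads until EOF.
  #   reader, writer = IO.pipe
  #   fork { reader.close; writer.puts({'a' => 1}.to_json); writer.close }
  #   writer.close # the parent must close its copy, or gets would never see EOF
  #   reader.gets  # => "{\"a\":1}\n"
  # )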
  def run # entry point
    init_beanstalk_jobs
    process_jobs
  end
end
websites = %w(
  http://www.51buy.com/ http://www.360buy.com/ http://www.tmall.com/ http://www.taobao.com/
  http://china.alibaba.com/ http://www.paipai.com/ http://shop.qq.com/ http://www.lightinthebox.com/
  http://www.amazon.cn/ http://www.newegg.com.cn/ http://www.vancl.com/ http://www.yihaodian.com/
  http://www.dangdang.com/ http://www.m18.com/ http://www.suning.com/ http://www.hstyle.com/
)
beanstalk_jobs = [['localhost:11300'], 'crawler-jobs']
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:13.0) Gecko/20100101 Firefox/13.0'
pm_max = 10
MultipleCrawler.new(websites, beanstalk_jobs, pm_max, user_agent).run
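To run the crawler, a beanstalkd daemon must be listening on localhost:11300 first (its default port), e.g.:

beanstalkd -l 127.0.0.1 -p 11300

The script then prints one website => [opened_time, processed_time, link_count, server] entry per site it managed to crawl.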