Skip to content

Instantly share code, notes, and snippets.

@parano
Last active October 1, 2015 06:28
Show Gist options
  • Save parano/44c086b3d09c329751f0 to your computer and use it in GitHub Desktop.
Save parano/44c086b3d09c329751f0 to your computer and use it in GitHub Desktop.
Multithreaded scraper
#!/usr/bin/ruby
# coding: utf-8
require 'uri'
#require 'iconv'
require 'open-uri'
require 'thread'
# Fetches the pages numbered +from+ through +to+ (inclusive) and appends the
# fields extracted from each one to "data<n>.txt" in the current directory.
#
# The base URL and the patterns are read from instance variables that the
# caller must have set beforehand: @url, @regexp, @regexp_title and
# @regexp_jingdian.
#
# For every page, @regexp selects the fragments of interest; the two other
# patterns are then applied to the stringified list of those fragments.
# A @regexp_title match contributes "<capture 1>,<capture 2>," and a
# @regexp_jingdian match contributes "<capture 1>\n". The same text is echoed
# to standard output, prefixed with "<n>:<page number>:" on a title match.
#
# A Timeout::Error stops the remaining pages of the range: "Time out" is
# written to standard error and whatever was collected so far stays on disk.
# Any other exception propagates to the caller.
#
# @param from [Integer] first page number to fetch
# @param to [Integer] last page number to fetch (inclusive)
# @param n [Integer] worker label, used in the output file name and the echo
# @return [nil]
def scrape(from, to, n)
  # Block form guarantees the file is flushed and closed even when a fetch
  # raises part-way through the range. "w+" truncates any previous file.
  File.open("data#{n}.txt", "w+") do |file|
    (from..to).each do |num|
      # URI.open is used explicitly: plain Kernel#open no longer opens URLs
      # on Ruby 3.0 and later. The block form also closes the response IO.
      body = URI.open(@url + num.to_s, &:read)

      # Array#to_s yields the inspect form of the fragment list (brackets,
      # quoted and escaped strings); the patterns below are matched against
      # exactly that text, as before.
      snippet = body.to_s.scan(@regexp).to_s

      title = @regexp_title.match(snippet)
      if title
        print "#{n}:#{num}:#{title[1]},#{title[2]},"
        file << "#{title[1]},#{title[2]},"
      end

      spot = @regexp_jingdian.match(snippet)
      if spot
        print "#{spot[1]}\n"
        file << "#{spot[1]}\n"
      end
    end
  end
  nil
rescue Timeout::Error
  # The previous handler called an undefined `logger`, turning every timeout
  # into a NameError; report on standard error instead.
  warn "Time out"
end
# Script entry point: runs only when this file is executed directly.
# Sets the base URL and patterns that scrape reads, then splits the page
# range 10376..38144 across 40 threads and waits for all of them to finish.
if __FILE__ == $0
  @url = 'http://jingdian.tuniu.com/fengjing/'
  @regexp = /\<h1\>.*span\>/
  @regexp_jingdian = /\<h1\>(.*)\<\/h1\>/
  @regexp_jingdian1 = /title=\".*\"\>(.*)\<\/a\>\//
  @regexp_jingdian2 = /href=.*"\>(.*)\<\/a\>"/
  @regexp_title = /href=.*"\>(.*)\<\/a\>\/<a.*"\>(.*)\<\/a\>/

  first_page = 10376
  last_page = 38144
  pages_per_worker = 694
  worker_count = 40

  workers = (1..worker_count).map do |n|
    Thread.new do
      start = first_page + (n - 1) * pages_per_worker
      # Every worker takes a fixed-size slice except the last one, which
      # runs through to the final page of the overall range.
      finish = n == worker_count ? last_page : start + pages_per_worker - 1
      scrape(start, finish, n)
    end
  end

  workers.each(&:join)
  puts "shit"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment