Skip to content

Instantly share code, notes, and snippets.

@takuma-saito
Last active May 21, 2018 08:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save takuma-saito/467d9fa92f6b36e1c6bd to your computer and use it in GitHub Desktop.
Save takuma-saito/467d9fa92f6b36e1c6bd to your computer and use it in GitHub Desktop.
google-crawler
# coding: utf-8
require 'nokogiri'
require 'open-uri'
require 'uri'
# Identifier handed to ./change-ip.sh (presumably the EC2 instance whose
# elastic IP gets swapped -- confirm against the script).
# This is a constant rather than a top-level local variable because locals
# defined at the top level are not visible inside method bodies.
INSTANCE_ID = 'xxxxxx'

## If you are blocked by Google, you have to change your IP address.
## You can change it with the "elastic IP address" feature provided by EC2.
#
# Runs ./change-ip.sh (resolved against the current working directory) with
# INSTANCE_ID as its only argument.
#
# @return [String] whatever the script printed on standard output
def change_ip
  `./change-ip.sh #{INSTANCE_ID}`
end
# Turn off Ruby-level output buffering on both standard streams so every
# write is flushed immediately (rows and log lines show up as they are
# produced rather than when a buffer fills).
STDOUT.sync = true
STDERR.sync = true
class Array
  # Treats the receiver as a list of columns and cuts every column down to
  # the length of the shortest one, so the result can be transposed safely.
  #
  # Elements only need to respond to +size+ and +[start, length]+.
  # An empty receiver yields an empty array.
  #
  # @return [Array] a new array; the receiver is not modified
  def align_column
    shortest = map(&:size).min
    map { |column| column.slice(0, shortest) }
  end
end
# The standard library's URI pattern, anchored at both ends. Without the
# anchors any text that merely *contains* something URI-shaped would match.
ANCHORED_URI_PATTERN = /\A#{URI.regexp}\z/.freeze

# Tells whether +url+ is, in its entirety, an absolute URI.
#
# @param url [String, nil] candidate string; nil is treated as invalid
# @return [Boolean] true only when the whole string is a URI
def valid_url?(url)
  !(ANCHORED_URI_PATTERN =~ url).nil?
end
# Extracts the link target of every search hit in +doc+.
#
# For each node matching '.r', takes the href of its first <a> and returns
# the value of the first query-string parameter of that href (e.g.
# "/url?q=http://example.com/&sa=U" gives "http://example.com/").
#
# The result always has exactly one entry per '.r' node so that it stays
# index-aligned with other per-hit columns; hits without a usable link
# contribute nil instead of raising.
#
# @param doc [#css] parsed document (responds to +css+ like a Nokogiri node)
# @return [Array<String, nil>] still percent-encoded targets, nil where absent
def link(doc)
  doc.css('.r').map do |result|
    anchor = result.css('a')[0]
    first_query_value(anchor && anchor[:href])
  end
end

# Returns the value of the first "key=value" pair in the query string of
# +href+, or nil when there is no href, no query string or no pair.
def first_query_value(href)
  return nil unless href

  # Limit 2: only the first "?" / "=" is a separator; later ones belong to
  # the value itself and must not truncate it.
  query = href.split('?', 2)[1]
  return nil unless query

  first_param = query.split('&')[0]
  return nil unless first_param

  first_param.split('=', 2)[1]
end
def text(doc, elem)
return doc.css(elem).map {|x| x.text.tr("\n", "")}
end
# Crawls Google result pages for +keyword+ and yields one row per hit.
#
# Pages are fetched ten hits at a time (start=0, 10, ...), n / 10 pages in
# total. Each yielded row is
#   [keyword, counter, url, title, snippet]
# where counter restarts at 1 on every page.
#
# A page is attempted up to three times. On OpenURI::HTTPError the IP is
# rotated through +change_ip+ before retrying; any other StandardError is
# printed to $stderr. After the third failure the page is skipped silently.
#
# Notes on the implementation:
# * URI.escape / URI.unescape and URL support in Kernel#open were removed in
#   Ruby 3.0, so form-component encoding and URI#open are used instead.
# * Rows are assembled first and invalid links are dropped afterwards, so a
#   rejected link takes its own title/snippet with it instead of shifting
#   every later link onto the wrong title.
#
# @param keyword [String] search phrase
# @param n [Integer] number of hits wanted; only full pages of 10 are fetched
# @param delay [#call] invoked after each successfully processed page
# @yieldparam row [Array] see above
def crawling(keyword, n, delay, &block)
  max_attempts = 3
  encoded_keyword = URI.encode_www_form_component(keyword)

  (n / 10).floor.times do |page|
    query = 'https://www.google.co.jp/search?q=' \
            "#{encoded_keyword}&oe=UTF-8&ie=UTF-8&hl=ja&start=#{page * 10}"
    counter = 1
    failures = 0

    while failures < max_attempts
      begin
        doc = Nokogiri::HTML.parse(URI.parse(query).open)
        columns = [link(doc), text(doc, '.r'), text(doc, 'span.st')]
        columns.align_column.transpose.each do |url, title, snippet|
          next unless valid_url?(url)

          block.call([keyword, counter, decode_result_url(url), title, snippet])
          counter += 1
        end
        delay.call # delay interval
        break
      rescue OpenURI::HTTPError
        # change ip address, and resume
        change_ip
        failures += 1
      rescue StandardError => e
        $stderr.puts e
        failures += 1
      end
    end
  end
end

# Percent-decodes a link target taken from a query string. Malformed escape
# sequences leave the string untouched rather than aborting the whole page.
def decode_result_url(url)
  URI.decode_www_form_component(url)
rescue ArgumentError
  url
end
# Script entry point.
#
# Usage: pass the path of a keywords file (one search phrase per line) as the
# first argument. For every keyword the first 20 hits are crawled and each
# hit is printed to standard output as one CSV line with every field quoted.
raise "ERROR: keywords file must be specified" if ARGV.empty?

# chomp also removes a trailing "\r"; blank lines are skipped so they do not
# trigger searches for an empty keyword.
keywords = File.foreach(ARGV[0]).map(&:chomp).reject(&:empty?)

keywords.each do |keyword|
  # Wait 3-4 seconds between result pages of the same keyword.
  crawling(keyword, 20, -> { sleep(rand(2) + 3) }) do |fields|
    line = fields.map do |field|
      # A double quote inside a quoted CSV field is escaped by doubling it.
      escaped = field.to_s.gsub('"', '""')
      "\"#{escaped}\""
    end
    puts line.join(',')
  end

  # Wait 10-19 seconds between keywords and log the chosen pause.
  pause = rand(10) + 10
  $stderr.puts pause
  sleep(pause)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment