Skip to content

Instantly share code, notes, and snippets.

@yhara yhara/crawl.rb
Created Aug 19, 2009

Embed
What would you like to do?
require 'mechanize'
# Prints a summary of the tallies gathered while scanning code snippets.
#
# r - Hash of counters, read by these keys: :count (lines processed),
#     :req_dir, :require, :dirname and :dollar_zero.
#
# Each category is printed with its share of :count as a percentage.
# Output goes to standard output.
def report(r)
  total = r[:count]
  # With nothing processed the ratio would be 0/0 and print as "NaN";
  # report 0.00 in that case instead.
  percent = ->(n) { "%.2f" % (total.zero? ? 0.0 : n.to_f / total * 100) }
  # Processed lines that fell into none of the four categories.
  rest = total - r[:req_dir] - r[:require] - r[:dirname] - r[:dollar_zero]
  puts <<-EOD
processed #{total} lines
#{r[:req_dir]} contains 'require' and 'dirname' (#{percent[r[:req_dir]]}%)
#{r[:require]} contains 'require' (#{percent[r[:require]]}%)
#{r[:dirname]} contains 'dirname' (#{percent[r[:dirname]]}%)
#{r[:dollar_zero]} contains '$0' (#{percent[r[:dollar_zero]]}%)
#{rest} does not contain any (#{percent[rest]}%)
  EOD
end
# Counters keyed by symbol; any key not yet seen reads as 0.
@result = Hash.new(0)
# Tally keyed by how many times "dirname" occurs on a line; defaults to 0.
@dirnames = Hash.new(0)
# Accumulated HTML per kind. A missing kind reads as a fresh empty string;
# nothing is stored in the hash until a value is assigned.
@codes = Hash.new { |_table, _kind| "" }
# Scans the text of one <pre> element and tallies how each line that
# mentions __FILE__ uses it.
#
# pre - an object responding to #inner_text.
#
# Every line containing __FILE__ bumps @result[:count]. The first matching
# rule below (checked in order) also bumps its own counter in @result.
# Lines in the :dirname category additionally record, in @dirnames, how
# many times "dirname" occurs on the line.
#
# Returns the category of the first categorized line, or :other when no
# line matched any rule.
def process_pre(pre)
  # Order matters: the combined require+dirname pattern must win over the
  # two single-word patterns.
  rules = [
    [/require.*dirname/, :req_dir],
    [/require/, :require],
    [/dirname/, :dirname],
    [/\$0/, :dollar_zero]
  ]
  first_kind = nil
  pre.inner_text.each_line do |text|
    next unless text =~ /__FILE__/
    hit = rules.find { |pattern, _label| text =~ pattern }
    if hit
      label = hit.last
      @result[label] += 1
      first_kind ||= label
      @dirnames[text.scan(/dirname/).size] += 1 if label == :dirname
    end
    @result[:count] += 1
  end
  first_kind || :other
end
# Handles one result page: runs process_pre over every <pre> found inside
# each "div.r" element and files that div's HTML under a kind in @codes.
#
# page - an object whose #root can be searched with `/`.
#
# All <pre> elements of a div are processed (so all of them are tallied),
# but only the kind of the first one decides where the div's HTML goes.
def process(page)
  result_blocks = page.root / "div.r"
  result_blocks.each do |block|
    kinds = (block / :pre).map { |snippet| process_pre(snippet) }
    label = kinds.first
    @codes[label] = @codes[label] + block.to_html
  end
end
# Crawl driver: fetches the search results for "__FILE__" in Ruby code,
# follows the "next" link page by page, then prints the tallies and writes
# the collected snippets to codes.html.

# Mechanize 1.0 moved the agent class to the top-level namespace and 2.0
# removed WWW::Mechanize, so prefer ::Mechanize and only fall back to the
# old name on releases that predate the move.
agent_class = defined?(::Mechanize) ? ::Mechanize : WWW::Mechanize
agent = agent_class.new
style = nil
page = agent.get("http://google.com/codesearch?q=__FILE__+lang%3Aruby")
loop do # swap for `1.times do` to try a single page
  # Progress indicator: one dot per page fetched.
  print "."
  $stdout.flush
  process page
  # Keep the <style> markup of the first page only; it is reused as the
  # header of the generated HTML file.
  style ||= (page/:style).to_html
  next_link = page.links.find { |link| link.text =~ /next/i }
  break if next_link.nil?
  page = next_link.click
end
report @result
p @dirnames
File.open("codes.html", "w") do |f|
  f.puts style
  # One heading per kind, followed by the HTML gathered for that kind.
  @codes.each do |kind, html|
    f.puts "<h1>#{kind}</h1>"
    f.write html
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.