Google search scraper to list all results likely to be MediaWiki installations
#!/usr/bin/env ruby
# Google search scraper to list all results likely to be MediaWiki installations
# CC-0, ArchiveTeam/WikiTeam, 2013
require 'rubygems'
require 'mechanize'
require 'uri'
require 'cgi'
domains = []
a = Mechanize.new do |agent|
  agent.user_agent_alias = 'Linux Konqueror'
end
prng = Random.new
# Italian Google; the "Avanti" ("Next") link text matched below depends on this locale
search_result = a.get('http://www.google.it/') # webhp?num=30&complete=0&hl=it
search_form = search_result.form('f')
# The stock MediaWiki Special:Version page credits these developers, so hits are likely wikis
search_form.q = '"Magnus Manske, Brion Vibber, Lee Daniel Crocker" -site:wikia.com -wikimedia'
search_result = a.submit(search_form, search_form.buttons.first)
# FIXME: keeps clicking "Next" until the links run out; should cap the page count at some point
while !search_result.nil?
  search_result.search("//h3/a").each do |link|
    # The result URLs are in h3 headers and passed through google.com/url?q=;
    # CGI.parse on the whole href files the target under the key "/url?q"
    target = CGI.parse(link['href'])['/url?q'][0]
    unless target.nil?
      # Take each result URI provided
      uri = URI.parse(target)
      # Try to extract the entry URL to MediaWiki: index.php if we're lucky, otherwise the article path.
      # We could try to be smart, open the URL and follow the link rel=EditURI, but it's too recent a feature.
      if uri.query.nil?
        # No query string: the wiki probably uses short URLs or some other rewriting,
        # so the last path segment must be the page title; remove it
        entry = target.split("/")[0..-2].join("/")
      else
        # If there are parameters, perhaps we're lucky: just keep everything up to the path
        # TODO: this looks silly
        entry = uri.scheme + '://' + uri.host + uri.path
      end
      unless domains.include?(entry)
        domains << entry
        print '.'
      end
      # A human would probably click a result every now and then
      if prng.rand(0..3.0) < 1
        begin
          a.get('http://google.com' + link['href'])
        rescue StandardError
          # Nothing to do; we don't care at all
        end
      end
    end
  end
  # Pause like a human would between result pages
  sleep(prng.rand(150..300.0))
  begin
    # "Avanti" is the Italian "Next" link
    search_result = search_result.link_with(:text => 'Avanti').click
  rescue NoMethodError
    begin
      # No "Next" link left: ask Google to repeat the search including the omitted results
      search_result = search_result.link_with(:text => 'ripetere la ricerca includendo i risultati omessi').click
    rescue NoMethodError
      search_result = nil
    end
  rescue Mechanize::ResponseCodeError
    # Mechanize raises this on error responses such as a 503 from rate limiting
    puts "We got a 503, party is over"
    search_result = nil
  end
end
# Print all domains found
domains.each do |domain|
puts domain
end
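
For illustration, here is the entry-URL heuristic from the loop above extracted into a standalone method. This is a minimal sketch: entry_url and the example URLs are hypothetical, not part of the script, but the two branches are the same.

require 'uri'

# With a query string, keep scheme://host/path (typically .../index.php);
# without one, assume short URLs and drop the final path segment,
# which should be the page title.
def entry_url(target)
  uri = URI.parse(target)
  if uri.query.nil?
    target.split("/")[0..-2].join("/")
  else
    uri.scheme + '://' + uri.host + uri.path
  end
end

puts entry_url('http://wiki.example.org/index.php?title=Main_Page')
# => http://wiki.example.org/index.php
puts entry_url('http://wiki.example.org/wiki/Main_Page')
# => http://wiki.example.org/wiki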