Skip to content

Instantly share code, notes, and snippets.

@danneu
Created January 23, 2011 06:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danneu/791887 to your computer and use it in GitHub Desktop.
Save danneu/791887 to your computer and use it in GitHub Desktop.
# Scrapes Google web search to find the ranking position (1-100) of this
# record's website for its keyword text.
#
# Reads from the receiver / enclosing app:
#   self.text            - keyword phrase (words joined with '+' for the query)
#   self.website.url     - domain string to match against result links
#   get_random_user_agent - helper defined elsewhere; supplies the UA header
#
# Returns a Hash:
#   {:position => Integer, :measured_at => Time, :engine => "Google"}
#   where :position is -1 when the domain is not in the top 100 results.
#
# Raises "PageContainsNoLinks" when a results page yields no parseable
# links yet is not Google's "did not match any documents" page.
def scrape
require 'nokogiri'
require 'open-uri'
# keyword = "nokogiri is my favorite"
# domain = "tenderlovemaking.com"
keyword = self.text.split.join("+")
domain = self.website.url
user_agent = get_random_user_agent
log = Logger.new("scraping.log")
log.debug "logger created for '#{keyword}' on '#{domain}'"
start = 0 # Google result offset: pages 1-10 => start 0,10,20..90
i = 1 # absolute result position (1-100), carried across pages
position = nil
# Regex stripping "http://" or "https://"; built once, not per link.
protocol_reg = Regexp.new(["http://", "https://"].map{ |s| "(#{s})" }.join('|'))
until position or start==100 # until position gets set or reaches 10th page
page = (start.to_i+10)/10
log.debug "page: #{page}... "
url = "http://www.google.com/search?q=#{keyword}&start=#{start}"
log.debug "url: #{url}"
#url = Rails.root + "test/google-search/search#{page}.html"
doc = Nokogiri::HTML(open(url, "User-Agent" => user_agent))
links = doc.xpath('//h3/a[contains(@class, "l")]')
#links = doc.search('//h3/a[@class="l"]')
#if links empty, try a more general scrape (sometimes <a> tags didn't have a class!)
if links.empty?
# FIX: original expression ended in a stray ']' ('//h3[@class = "r"]/a]'),
# which is invalid XPath and raised Nokogiri::XML::XPath::SyntaxError,
# so this fallback branch could never find anything.
links = doc.xpath('//h3[@class = "r"]/a')
end
if links.empty?
if doc.at_xpath('//div/p').to_s =~ /did not match any documents/
# Legitimate empty result set: record "not found" and stop paging.
log.debug "Page contains no results"
break
else
# No links, but also no "found no results" page: likely a layout change
# or a block/captcha page -- dump the HTML and raise for investigation.
log.debug doc.to_s
log.debug "Raising: PageContainsNoLinks"
raise "PageContainsNoLinks"
end
end
links.each do |link|
# Normalize the href so it can be prefix-compared against `domain`:
# strip protocol, Google's '/url?q=' redirect wrapper, and 'www.'.
link = link['href'].gsub(protocol_reg, '')
if link[0..6] == '/url?q='
link = link[7..-1]
end
if link[0..3] == 'www.'
link = link[4..-1]
end
log.debug "link #{i}. '#{link[0..domain.length-1]}' against '#{domain}'"
# If the link's prefix matches the domain, record the position and stop.
if domain == link[0..domain.length-1]
position = i
break
end
i += 1
end
sleep 2.seconds # reduce request spam
start += 10
end
position ||= -1 # -1 if not found
log.debug "position: #{position} | measured_at: #{Time.now.to_s(:short)}"
log.debug "============================================================="
log.debug ""
log.debug ""
return {:position => position, :measured_at => Time.now, :engine => "Google"}
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment