danneu/google-scraper.rb

## google-scraper.rb
# Looks up where this keyword's website ranks in Google's web results.
#
# Reads the search phrase from `self.text` and the domain to look for from
# `self.website.url`, then fetches up to ten result pages (start=0,10,..,90),
# pausing two seconds after each page that did not contain the domain.
# Progress is appended to "scraping.log".
#
# A result counts as a match when its href, after stripping "http://" /
# "https://", a leading "/url?q=" and a leading "www.", starts with the domain.
#
# Errors raised by open-uri or Nokogiri are not rescued here.
#
# @return [Hash] :position (1-based rank, or -1 when not found),
#   :measured_at (Time) and :engine ("Google")
# @raise [RuntimeError] "PageContainsNoLinks" when a page has neither result
#   links nor Google's "did not match any documents" notice
def scrape
  require 'logger'
  require 'nokogiri'
  require 'open-uri'

  # keyword = "nokogiri is my favorite"
  # domain = "tenderlovemaking.com"

  # Form-encode the phrase: whitespace still becomes "+", but characters such
  # as "&" or "#" can no longer break the query string.
  keyword = URI.encode_www_form_component(self.text.split.join(' '))
  domain = self.website.url
  user_agent = get_random_user_agent

  log = Logger.new("scraping.log")
  log.debug "logger created for '#{keyword}' on '#{domain}'"

  rank = 1 # running result number across all pages (1-100)
  position = nil

  # start = 0,10,..,90 => result pages 1..10
  0.step(90, 10) do |start|
    page = start / 10 + 1
    log.debug "page: #{page}... "
    url = "http://www.google.com/search?q=#{keyword}&start=#{start}"
    log.debug "url: #{url}"
    #url = Rails.root + "test/google-search/search#{page}.html"

    # URI#open (added by open-uri) works on old and new Rubies alike; the
    # block form closes the response IO once the document is parsed.
    doc = URI.parse(url).open("User-Agent" => user_agent) { |io| Nokogiri::HTML(io) }

    links = doc.xpath('//h3/a[contains(@class, "l")]')
    # If links empty, try a more general scrape (sometimes <a> tags didn't have a class!)
    links = doc.xpath('//h3[@class = "r"]/a') if links.empty?

    if links.empty?
      if doc.at_xpath('//div/p').to_s.include?('did not match any documents')
        # Google reported that the query has no results at all.
        log.debug "Page contains no results"
        break
      end

      # No links, but also no "found no results" page: unexpected markup.
      log.debug doc.to_s
      log.debug "Raising: PageContainsNoLinks"
      raise "PageContainsNoLinks"
    end

    links.each do |link|
      # Strip the protocol first (it may sit behind a "/url?q=" redirect
      # prefix), then the redirect prefix itself, then a leading "www.".
      target = link['href'].to_s.gsub(%r{https?://}, '')
      target = target.sub(%r{\A/url\?q=}, '').sub(/\Awww\./, '')

      log.debug "link #{rank}. '#{target[0, domain.length]}' against '#{domain}'"

      # If match, set position and stop looping. An empty domain never matches.
      if !domain.empty? && target.start_with?(domain)
        position = rank
        break
      end

      rank += 1
    end

    break if position

    sleep 2 # reduce request spam
  end

  position ||= -1 # -1 if not found
  measured_at = Time.now

  # "%d %b %H:%M" is the layout of ActiveSupport's :short time format.
  log.debug "position: #{position} | measured_at: #{measured_at.strftime('%d %b %H:%M')}"
  log.debug "============================================================="
  log.debug ""
  log.debug ""

  { :position => position, :measured_at => measured_at, :engine => "Google" }
ensure
  # Release the log file handle on every exit path, including raises.
  log.close if log
end
	# Looks up where this keyword's website ranks in Google's web results.
	#
	# Reads the search phrase from `self.text` and the domain to look for from
	# `self.website.url`, then fetches up to ten result pages (start=0,10,..,90),
	# pausing two seconds after each page that did not contain the domain.
	# Progress is appended to "scraping.log".
	#
	# A result counts as a match when its href, after stripping "http://" /
	# "https://", a leading "/url?q=" and a leading "www.", starts with the domain.
	#
	# Errors raised by open-uri or Nokogiri are not rescued here.
	#
	# @return [Hash] :position (1-based rank, or -1 when not found),
	#   :measured_at (Time) and :engine ("Google")
	# @raise [RuntimeError] "PageContainsNoLinks" when a page has neither result
	#   links nor Google's "did not match any documents" notice
	def scrape
	  require 'logger'
	  require 'nokogiri'
	  require 'open-uri'

	  # keyword = "nokogiri is my favorite"
	  # domain = "tenderlovemaking.com"

	  # Form-encode the phrase: whitespace still becomes "+", but characters such
	  # as "&" or "#" can no longer break the query string.
	  keyword = URI.encode_www_form_component(self.text.split.join(' '))
	  domain = self.website.url
	  user_agent = get_random_user_agent

	  log = Logger.new("scraping.log")
	  log.debug "logger created for '#{keyword}' on '#{domain}'"

	  rank = 1 # running result number across all pages (1-100)
	  position = nil

	  # start = 0,10,..,90 => result pages 1..10
	  0.step(90, 10) do |start|
	    page = start / 10 + 1
	    log.debug "page: #{page}... "
	    url = "http://www.google.com/search?q=#{keyword}&start=#{start}"
	    log.debug "url: #{url}"
	    #url = Rails.root + "test/google-search/search#{page}.html"

	    # URI#open (added by open-uri) works on old and new Rubies alike; the
	    # block form closes the response IO once the document is parsed.
	    doc = URI.parse(url).open("User-Agent" => user_agent) { |io| Nokogiri::HTML(io) }

	    links = doc.xpath('//h3/a[contains(@class, "l")]')
	    # If links empty, try a more general scrape (sometimes <a> tags didn't have a class!)
	    links = doc.xpath('//h3[@class = "r"]/a') if links.empty?

	    if links.empty?
	      if doc.at_xpath('//div/p').to_s.include?('did not match any documents')
	        # Google reported that the query has no results at all.
	        log.debug "Page contains no results"
	        break
	      end

	      # No links, but also no "found no results" page: unexpected markup.
	      log.debug doc.to_s
	      log.debug "Raising: PageContainsNoLinks"
	      raise "PageContainsNoLinks"
	    end

	    links.each do |link|
	      # Strip the protocol first (it may sit behind a "/url?q=" redirect
	      # prefix), then the redirect prefix itself, then a leading "www.".
	      target = link['href'].to_s.gsub(%r{https?://}, '')
	      target = target.sub(%r{\A/url\?q=}, '').sub(/\Awww\./, '')

	      log.debug "link #{rank}. '#{target[0, domain.length]}' against '#{domain}'"

	      # If match, set position and stop looping. An empty domain never matches.
	      if !domain.empty? && target.start_with?(domain)
	        position = rank
	        break
	      end

	      rank += 1
	    end

	    break if position

	    sleep 2 # reduce request spam
	  end

	  position ||= -1 # -1 if not found
	  measured_at = Time.now

	  # "%d %b %H:%M" is the layout of ActiveSupport's :short time format.
	  log.debug "position: #{position} | measured_at: #{measured_at.strftime('%d %b %H:%M')}"
	  log.debug "============================================================="
	  log.debug ""
	  log.debug ""

	  { :position => position, :measured_at => measured_at, :engine => "Google" }
	ensure
	  # Release the log file handle on every exit path, including raises.
	  log.close if log
	end