Skip to content

Instantly share code, notes, and snippets.

@derwiki
Last active August 29, 2015 14:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save derwiki/e763baf274b6538aa77c to your computer and use it in GitHub Desktop.
Save derwiki/e763baf274b6538aa77c to your computer and use it in GitHub Desktop.
Simple class to scrape Google search results. Useful for scripting reports of SEO rankings.
import pprint
import re
import sys
import urllib
import urllib2
# pip install pyquery
# Import the third-party HTML query library, exiting with a hint when it is
# not installed.
try:
    from pyquery import PyQuery as pq
except ImportError:
    # Catch only ImportError: a bare `except` would also swallow
    # KeyboardInterrupt/SystemExit and misreport them as a missing dependency.
    print("Missing dependency 'pyquery' -- please run `pip install pyquery` "
          "and try again")
    sys.exit(-1)
def _extract_href(raw_href):
    """Pull the destination URL out of a Google result link.

    Tries the ad-click form (``adurl=...``) first, then the redirect form
    (``q=...&``). When neither pattern is present the link is returned
    unchanged instead of failing.
    """
    ad_match = re.search(r'adurl=(.*?)(%3F|&)', raw_href)
    if ad_match:
        return ad_match.group(1)
    redirect_match = re.search(r'q=(.*?)&', raw_href)
    if redirect_match:
        return redirect_match.group(1)
    # Neither pattern matched: keep the raw link rather than calling
    # .group() on None.
    return raw_href


def google_serp(q):
    """Scrape Google result pages for the query ``q``.

    Requests the pages at ``start`` offsets 0, 10, 20, 30 and 40 and collects
    every anchor matching ``li.g h3 a``.

    Returns a list of ``(rank, href)`` tuples, where ``rank`` is the anchor's
    index on its page plus the page offset and ``href`` is the extracted
    destination (still percent-encoded as it appeared in the link).

    Errors raised by ``urllib2.urlopen`` propagate to the caller.
    """
    organic = []
    for offset in xrange(0, 50, 10):
        qs = urllib.urlencode(dict(q=q, start=offset))
        url = "http://www.google.com/search?{}".format(qs)
        req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        response = urllib2.urlopen(req)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        d = pq(html)
        for i, result in enumerate(d('li.g h3 a')):
            raw_href = d(result).attr('href')
            if raw_href is None:
                # Anchor without an href: nothing to extract.
                continue
            organic.append((i + offset, _extract_href(raw_href)))
    return organic
# Command-line entry point: every argument after the script name is joined
# into a single keyword phrase, which is scraped and pretty-printed.
if __name__ == '__main__':
    if not sys.argv[1:]:
        # No keywords supplied: show usage and exit with a failure status.
        print("\n".join((
            "No keyword phrase specified.",
            " Usage: python ./seo.py keyword phrase",
            " Ex: python ./seo.py ad retargeting",
        )))
        sys.exit(-1)

    keyword_phrase = ' '.join(sys.argv[1:])
    print("Scraping ranking results for {}...".format(keyword_phrase))
    pprint.pprint(google_serp(keyword_phrase))
require 'open-uri'
require 'nokogiri'
# Scrapes Google search result pages and reports the organic links and the
# links found in the right-hand-side block (`#rhs_block`).
#
# Expects `open-uri` and `nokogiri` to have been required by the file.
module Seo
  # `start` offsets requested from Google, one request per offset.
  RESULT_OFFSETS = [0, 10, 20, 30, 40, 50].freeze
  # CSS selectors for the two kinds of result anchors.
  ORGANIC_SELECTOR = 'li.g h3 a'.freeze
  PAID_SELECTOR = '#rhs_block li h3 a'.freeze
  # Destination embedded in an ad-click link (`adurl=...`).
  AD_URL_PATTERN = /adurl=(.*?)(%3F|&)/
  # Destination embedded in a redirect link (`q=...&`).
  REDIRECT_URL_PATTERN = /q=(.*?)&/

  class << self
    # Fetches the result pages for the query `q`.
    #
    # @param q [String] the search phrase
    # @return [Array(Array<Hash>, Array<Hash>)] `[organic, paid]`; every entry
    #   is `{ rank:, href:, title: }`. Organic ranks are the anchor's index on
    #   its page plus the page offset; paid ranks are the index on the page.
    def google_serp(q)
      organic = []
      paid = []
      RESULT_OFFSETS.each do |offset|
        doc = fetch_document(search_url(q, offset))
        organic.concat(collect_results(doc, ORGANIC_SELECTOR, offset))
        paid.concat(collect_results(doc, PAID_SELECTOR, 0))
      end
      [organic, paid]
    end

    # `m` for memoize: parses `url` once and returns the cached document on
    # later calls with the same URL.
    #
    # @param url [String] address of the page to fetch
    # @param pattern [Object, nil] accepted for compatibility; not used
    # @return [Object] the parsed document
    def m(url, pattern = nil)
      @cache ||= {}
      @cache[url] ||= fetch_document(url)
    end

    private

    # Builds the search URL for one result page.
    def search_url(query, offset)
      # URI.encode was removed in Ruby 3.0; encode_www_form escapes both
      # parameters as form data.
      "http://www.google.com/search?#{URI.encode_www_form([['q', query], ['start', offset]])}"
    end

    # Downloads and parses a page. URI.open (not Kernel#open) is used so a
    # string starting with `|` can never spawn a subprocess, and the block
    # form closes the handle once parsing is done.
    def fetch_document(url)
      URI.open(url) { |io| Nokogiri::HTML(io) }
    end

    # Turns every anchor matching `selector` into a result hash.
    # Anchors without an `href` attribute are skipped.
    def collect_results(doc, selector, rank_base)
      doc.css(selector).each_with_index.map do |anchor, index|
        raw_href = anchor['href']
        next if raw_href.nil?

        { rank: index + rank_base, href: extract_href(raw_href), title: anchor.text }
      end.compact
    end

    # Extracts the destination from a result link: the ad target when it is
    # present and non-blank, otherwise the redirect target, otherwise the
    # link itself (so an unrecognised link no longer raises).
    def extract_href(raw_href)
      ad_target = raw_href[AD_URL_PATTERN, 1]
      return ad_target unless ad_target.nil? || ad_target.strip.empty?

      raw_href[REDIRECT_URL_PATTERN, 1] || raw_href
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment