Last active
August 29, 2015 14:16
-
-
Save derwiki/e763baf274b6538aa77c to your computer and use it in GitHub Desktop.
Simple class to scrape Google search results. Useful for scripting reports of SEO rankings.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pprint | |
import re | |
import sys | |
import urllib | |
import urllib2 | |
# Third-party dependency: pip install pyquery
try:
    from pyquery import PyQuery as pq
except ImportError:
    # Catch only ImportError: a bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit and unrelated errors raised while pyquery
    # itself is importing, and misreport them as a missing package.
    print("Missing dependency 'pyquery' -- please run `pip install pyquery` "
          "and try again")
    sys.exit(-1)
def _extract_href(raw_href):
    """Pull the destination URL out of a Google result link.

    Tries the `adurl=` parameter first, then the `q=` parameter. Returns the
    link unchanged when neither parameter is present, and None when the
    anchor had no href at all.
    """
    if not raw_href:
        return None
    for pattern in (r'adurl=(.*?)(%3F|&)', r'q=(.*?)&'):
        match = re.search(pattern, raw_href)
        if match:
            return match.group(1)
    # Neither parameter present: keep the link as-is instead of crashing on
    # `None.group(1)`.
    return raw_href


def google_serp(q):
    """Scrape Google result pages for the query `q`.

    Requests the result pages at offsets 0, 10, 20, 30 and 40 and collects
    every link matched by the `li.g h3 a` selector.

    Returns a list of `(rank, href)` tuples, where `rank` is the link's
    position on its page plus the page offset. Anchors without an href are
    skipped.
    """
    organic = []
    for offset in xrange(0, 50, 10):
        qs = urllib.urlencode(dict(q=q, start=offset))
        url = "http://www.google.com/search?{}".format(qs)
        req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        response = urllib2.urlopen(req)
        try:
            html = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
        d = pq(html)
        for i, result in enumerate(d('li.g h3 a')):
            href = _extract_href(d(result).attr('href'))
            if href is None:
                continue
            organic.append((i + offset, href))
    return organic
if __name__ == '__main__':
    # Everything after the script name is treated as one keyword phrase.
    phrase_words = sys.argv[1:]
    if not phrase_words:
        print("No keyword phrase specified.")
        print(" Usage: python ./seo.py keyword phrase")
        print(" Ex: python ./seo.py ad retargeting")
        sys.exit(-1)

    keyword_phrase = ' '.join(phrase_words)
    print("Scraping ranking results for {}...".format(keyword_phrase))
    rankings = google_serp(keyword_phrase)
    pprint.pprint(rankings)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri' | |
require 'nokogiri' | |
# Scrapes Google search result pages and extracts organic and paid listings.
module Seo
  SEARCH_URL = 'http://www.google.com/search'.freeze
  # Values sent as the `start` query parameter, one request per value.
  PAGE_OFFSETS = [0, 10, 20, 30, 40, 50].freeze
  ORGANIC_SELECTOR = 'li.g h3 a'.freeze
  PAID_SELECTOR = '#rhs_block li h3 a'.freeze
  # Destination of an ad link, up to an encoded `?` (%3F) or the next `&`.
  AD_TARGET_PATTERN = /adurl=(.*?)(%3F|&)/
  # Destination carried in a `q=` parameter, up to the next `&`.
  REDIRECT_TARGET_PATTERN = /q=(.*?)&/

  class << self
    # Fetches one result page per entry in PAGE_OFFSETS for the query `q`.
    #
    # @param q [String] the search phrase
    # @return [Array(Array<Hash>, Array<Hash>)] `[organic, paid]`; every entry
    #   is `{ rank:, href:, title: }`. Organic ranks are the link's index on
    #   its page plus the page offset; paid ranks are the index on the page.
    def google_serp(q)
      organic = []
      paid = []
      PAGE_OFFSETS.each do |offset|
        doc = fetch_document(search_url(q, offset))
        organic.concat(listings(doc, ORGANIC_SELECTOR, offset))
        paid.concat(listings(doc, PAID_SELECTOR, 0))
      end
      [organic, paid]
    end

    # `m` for memoize: returns the parsed document for `url`, fetching it only
    # the first time that URL is requested.
    #
    # @param url [String] an http(s)/ftp URL; it is parsed with `URI.parse`
    #   rather than handed to `Kernel#open`, so it can never be interpreted as
    #   a `|command` pipe
    # @param pattern [Object, nil] accepted for compatibility; not used
    # @return [Object] whatever `Nokogiri::HTML` returned for that URL
    def m(url, pattern = nil)
      @cache ||= {}
      @cache[url] ||= fetch_document(url)
    end

    private

    # Builds the search URL, form-encoding the query so characters such as
    # `&`, `=` and `+` inside `q` cannot corrupt the query string.
    def search_url(q, offset)
      "#{SEARCH_URL}?#{URI.encode_www_form(q: q, start: offset)}"
    end

    # Downloads `url` and parses it; the block form closes the stream.
    def fetch_document(url)
      URI.parse(url).open { |io| Nokogiri::HTML(io) }
    end

    # Maps every link matching `selector` to a `{ rank:, href:, title: }` hash.
    def listings(doc, selector, rank_base)
      doc.css(selector).each_with_index.map do |link, index|
        { rank: rank_base + index, href: extract_href(link['href'].to_s), title: link.text }
      end
    end

    # Returns the `adurl=` target when present and non-blank, otherwise the
    # `q=` target, otherwise the href unchanged (instead of raising on nil).
    def extract_href(raw_href)
      ad_target = raw_href[AD_TARGET_PATTERN, 1]
      return ad_target unless ad_target.nil? || ad_target.strip.empty?

      raw_href[REDIRECT_TARGET_PATTERN, 1] || raw_href
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment