Skip to content

Instantly share code, notes, and snippets.

@ulope
Created February 18, 2011 19:22
Show Gist options
  • Save ulope/834249 to your computer and use it in GitHub Desktop.
Save ulope/834249 to your computer and use it in GitHub Desktop.
This is an attempt to make the code from http://blog.jgc.org/2011/02/how-could-i-have-coded-this-better.html a bit more pythonic.
#!/usr/bin/env python
import csv
import itertools
import sys
import urllib
import urllib2
from collections import defaultdict
from urlparse import urlparse

from BeautifulSoup import BeautifulSoup
__doc__ = """
Script to perform Google searches and extract the domain names of
the returned results. The order in which the domain names are
returned is used to determine a ranking between different companies.
"""

# Domains whose ranking we report. Add or remove entries to track
# different sites; the column order of the CSV output follows this list.
DOMAINS = [
    'apple.com',
    'microsoft.com',
    'engadget.com',
    'wired.com',
    'cnet.com',
]

# Every search phrase is a combination of one element from each inner
# tuple (the Cartesian product, joined with spaces by product() below).
# Edit the tuples — or add/remove whole tuples — to change the queries.
SEARCH_TERMS = (
    ('apple', 'microsoft', 'gadget', ),
    ('iphone', 'xbox', 'news', ),
    ('4', '360', ),
)

# Each generated phrase is passed through this format string, so a
# static prefix/suffix can be attached to every query.
SEARCH_MODIFIER = "%s"

# Query URL template; %s receives the URL-quoted search phrase.
GOOGLE_URL = "http://google.com/search?q=%s&num=100&hl=en&start=0"
def google(q):
    """
    Perform a Google search for the query string ``q`` (URL-quoted by
    this function) and return a BeautifulSoup object containing the
    parsed result page.

    Asks for 100 results; Google does not always honour that. The fake
    User-Agent header is required, otherwise Google rejects the search.
    """
    url = GOOGLE_URL % urllib.quote_plus(q)
    # Build the request outside the try block: if Request() itself
    # raises there is no response object to close yet. (The original
    # code's bare "except: pass" around res.close() silently masked a
    # NameError in exactly that situation.)
    req = urllib2.Request(url, None, {
        'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; '
        'en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.94 '
        'Safari/534.13'})
    res = urllib2.urlopen(req)
    try:
        # Only the read needs the cleanup guard; the response is always
        # closed, even if read() fails part-way.
        dat = res.read()
    finally:
        res.close()
    return BeautifulSoup(dat)
def product(*args):
    """
    Cartesian product generator. Identical to itertools.product except
    that each combination is yielded as a single space-separated string
    rather than a tuple.

    With no arguments it yields one empty string, mirroring
    itertools.product() yielding one empty tuple.
    """
    # The original hand-rolled the product with nested list
    # comprehensions; itertools.product does the same thing in C and
    # streams lazily instead of materialising every combination first.
    for combo in itertools.product(*args):
        yield " ".join(combo)
def main():
    """
    Run every generated search query and write a CSV table to stdout:
    one row per query, one column per domain in DOMAINS, each cell the
    result-list position at which that domain was found.
    """
    out = csv.writer(sys.stdout)
    out.writerow(['term'] + DOMAINS)
    for term in product(*SEARCH_TERMS):
        # Apply the static pre-/post-fix, then run the search.
        query = SEARCH_MODIFIER % term
        soup = google(query)
        # Google tags result links with class "l" (shown in green on
        # the results page) — this is the fragile part if Google ever
        # changes their markup. urlparse(href)[1] is the netloc
        # component, i.e. the bare domain name of each hit.
        found = [urlparse(a['href'])[1]
                 for a in soup.findAll("a", {"class": "l"})]
        # Position of each tracked domain among the results.
        # defaultdict(int) makes absent domains report 0.
        # NOTE(review): a domain at result index 0 is indistinguishable
        # from one that never appeared (both report 0), and a later
        # occurrence overwrites an earlier one — confirm both are
        # intended before relying on the numbers.
        rank = defaultdict(int)
        for pos, netloc in enumerate(found):
            for wanted in DOMAINS:
                # Match the exact domain ('foo.com') or any
                # subdomain of it ('*.foo.com').
                if netloc == wanted or netloc.endswith(".%s" % wanted):
                    rank[wanted] = pos
        out.writerow([query] + [rank[d] for d in DOMAINS])


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment