Skip to content

Instantly share code, notes, and snippets.

@maxcutler
Created October 14, 2010 20:59
Show Gist options
  • Save maxcutler/03abb4979ba338704bb8 to your computer and use it in GitHub Desktop.
Save maxcutler/03abb4979ba338704bb8 to your computer and use it in GitHub Desktop.
import csv
import urllib2
import urlparse
# Signature table used by find_cms_in_page(): each key is the CMS / vendor
# label to report, each value is the list of substrings whose presence in a
# fetched page identifies that label. Matching is a plain, case-sensitive
# substring test, so entries must be spelled exactly as they appear in pages.
search_list = {
    'TownNews': ['townnews.com', 'blox'],
    'MatchBin': ['matchbin.com'],
    'Zope.com': ['zope.net'],
    'WordPress': ['wp-content'],
    'Ellington': ['/rss/headlines/', 'Scripps Interactive'],
    'MediaNewsGroup': [
        'mnginteractive.com',
        'cnpapers.com',
        'medianewsgroup.com',
    ],
    'Rush Publishing': ['/scripts/search/', '/help/partners/'],
    'Swift Communications': ['Swift Communications'],
    'Freedom Communications': ['freedom.com'],
    'McClatchy Company': ['McClatchy Company'],
    'Microsoft FrontPage': ['Microsoft FrontPage'],
}
def find_cms_in_page(page):
    """Return the label of the first ``search_list`` entry found in *page*.

    Walks the module-level ``search_list`` mapping and returns the key of the
    first entry (in the dict's iteration order) for which at least one of its
    signature substrings occurs in ``page``. The comparison is a
    case-sensitive ``in`` test.

    Returns the empty string when no signature matches.
    """
    for cms_name, signatures in search_list.items():
        if any(signature in page for signature in signatures):
            return cms_name
    return ''
# Driver: read (label, url) rows from hnews.csv, fetch each URL, detect the
# CMS signature in the returned page, and write one row per site to
# hnews_rev.csv as [label, url, cms, is_first_row_for_this_domain].

total = 0     # number of pages fetched successfully
domains = {}  # netloc -> how many input rows have used that domain so far

# Python 2's csv module expects binary-mode files; text mode produces
# doubled line endings on platforms that translate newlines. The input is
# opened first so a missing hnews.csv does not truncate an existing output.
with open('hnews.csv', 'rb') as sites_file:
    with open('hnews_rev.csv', 'wb') as rev_file:
        sites = csv.reader(sites_file)
        rev = csv.writer(rev_file)
        for site in sites:
            # Blank or short rows have no URL column; skip them instead of
            # dying with IndexError.
            if len(site) < 2:
                continue
            url = site[1]
            print(url)

            # Track repeats per domain; count is the number of *earlier*
            # rows with the same netloc, so count == 0 marks the first one.
            domain = urlparse.urlparse(url).netloc
            count = domains.get(domain, 0)
            domains[domain] = count + 1

            try:
                # A timeout keeps one unresponsive server from stalling
                # the whole run.
                response = urllib2.urlopen(url, timeout=30)
                try:
                    page = response.read()
                finally:
                    response.close()
            except (IOError, ValueError) as err:
                # IOError covers urllib2.URLError/HTTPError and socket
                # errors; ValueError is raised for malformed URLs. Record
                # the site with no detected CMS and keep going rather than
                # aborting the remaining rows.
                print('fetch failed for %s: %s' % (url, err))
                cms = ''
            else:
                total = total + 1
                cms = find_cms_in_page(page)

            rev.writerow([site[0], url, cms, count == 0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment