@mmccollow · Created February 3, 2012
Reddit historical lexical diversity
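This script estimates how the lexical diversity of Reddit's frontpage has changed over time: for each year from 2006 to 2011 it fetches an archived frontpage snapshot from the Wayback Machine, extracts the link titles, computes the ratio of unique words to total words, and writes one year/diversity pair per line to reddit.dat for plotting with GNUplot.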
import urllib2
import HTMLParser
from BeautifulSoup import BeautifulSoup
import time
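# Note: this is Python 2 / BeautifulSoup 3 era code (urllib2, HTMLParser,
# print statements, and Tag.has_key() do not exist as-is in Python 3 / bs4).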
def _fetch_page(url):
    """Fetch a URL and return it as a BeautifulSoup object, or None on failure."""
    try:
        page = urllib2.urlopen(url)
        return BeautifulSoup(page)
    except urllib2.URLError:
        print "Failed to fetch page from: " + url
    except HTMLParser.HTMLParseError:
        print "Failed to parse page from: " + url
    return None
def fetch_reddit(dateStr=None):
    if dateStr is None:
        return _fetch_page('http://www.reddit.com')
    else:
        wayback_url = 'http://web.archive.org/web/' + dateStr + '/http://reddit.com/?'
        return _fetch_page(wayback_url)
def get_valid_wayback_dates(year):
    """Return the Wayback Machine snapshot dates for Reddit in a given year,
    converted to YYYYMMDD strings suitable for fetch_reddit()."""
    dates = []
    page = _fetch_page('http://wayback.archive.org/web/' + year + '*/http://www.reddit.com')
    divs = page.findAll('div')
    for d in divs:
        # Snapshot entries are marked with class="position"; the second child
        # div carries the date in its id, e.g. "Jan-15-2006" -> "20060115".
        if d.has_key('class') and d['class'].strip() == 'position':
            dateTag = d.findChildren('div')[1]
            dates.append(time.strftime("%Y%m%d", time.strptime(dateTag['id'], "%b-%d-%Y")))
    return dates
def get_titles(page):
    """Collect the link titles (anchors with class="title") from a frontpage."""
    titles = []
    for a in page.findAll('a'):
        if a.has_key('class') and a['class'].strip() == 'title':
            titles.append(a.contents[0])
    return titles
def get_words(titles):
    words = []
    for t in titles:
        words += t.split()
    return words
def lex_div(words):
    """Lexical diversity: the ratio of unique words to total words.
    e.g. lex_div(['the', 'cat', 'and', 'the', 'hat']) == 0.8"""
    return 1.0 * len(set(words)) / len(words)
def generate_datfile():
    """
    Write one year/lexical-diversity pair per line to reddit.dat.
    To plot the file this function creates, start GNUplot and run:
    plot "reddit.dat" using 1:2 title "Lexical Diversity" with lines
    """
    years = ['2006', '2007', '2008', '2009', '2010', '2011']
    frontpages = []
    output = []
    for year in years:
        dates = get_valid_wayback_dates(year)
        if dates:  # skip years with no archived snapshot
            frontpages.append((year, fetch_reddit(dates[0])))
    frontpages.append(('2012', fetch_reddit()))  # current frontpage
    for page in frontpages:
        words = get_words(get_titles(page[1]))
        output.append((page[0], lex_div(words)))
    fp = open("reddit.dat", "w+")
    for item in output:
        fp.write(item[0] + "\t" + str(item[1]) + "\n")
    fp.close()
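A minimal way to run the whole pipeline (a sketch: it assumes the Wayback Machine endpoints above still respond and that each year has at least one snapshot):

if __name__ == '__main__':
    generate_datfile()  # writes reddit.dat to the current directory

The resulting file can then be plotted with the GNUplot command quoted in generate_datfile's docstring.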