Skip to content

Instantly share code, notes, and snippets.

@fnielsen
Created April 5, 2011 15:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fnielsen/903857 to your computer and use it in GitHub Desktop.
Save fnielsen/903857 to your computer and use it in GitHub Desktop.
Analysis of Wikipedia coverage
#!/usr/bin/env python
#
# $Id: Royal2008Whats.py,v 1.2 2011/04/05 15:40:37 fn Exp $
from pylab import *
from scipy.signal import medfilt
from scipy.stats.stats import spearmanr
from urllib import FancyURLopener, urlopen, urlencode
import sys
# Python 2-only hack: switch the interpreter-wide default encoding from ASCII
# to UTF-8 so that implicit byte-string <-> unicode conversions do not raise
# UnicodeDecodeError on non-ASCII text. reload() is required because
# sys.setdefaultencoding is removed from the sys module at interpreter startup.
reload(sys)
sys.setdefaultencoding('utf-8')
class MyOpener(FancyURLopener):
    """URL opener that identifies this script's author to the server.

    The only customisation is the ``version`` class attribute, which the
    urllib opener classes use as the User-Agent of their requests.
    """

    version = 'Finn Aarup Nielsen (http://www.imm.dtu.dk/~fn/, fn@imm.dtu.dk)'
# A single opener instance is reused for every request.
myopener = MyOpener()

# Years whose English Wikipedia articles are measured: 1900..2010 inclusive.
years = arange(1900, 2011)
urlbase = 'http://en.wikipedia.org/w/index.php'

# One record per year: the raw wikitext bytes, its length in characters,
# and the year itself.
data = []
for year in years:
    # action=raw asks MediaWiki for the unrendered wikitext of the page
    # whose title is the year number.
    urldata = {'action': 'raw', 'title': year}
    response = myopener.open(urlbase + '?' + urlencode(urldata))
    try:
        wikitext = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    data.append({
        'wikitext': wikitext,
        # Decode explicitly so characters (not bytes) are counted without
        # relying on the process-wide default encoding.
        'length': len(wikitext.decode('utf-8')),
        'year': year,
    })
    # Progress indicator: one line per downloaded article.
    print(year)
# Article length (in characters) for each entry of `years`, in the same order.
lengths = np.asarray([d['length'] for d in data])

# There is a bad end correction in medfilt, so the smoothed trend line is
# left disabled:
# plot(years, medfilt(lengths, 25), color=(0.5,1,0.5), linewidth=5)
plot(years, lengths, 'b')
xlabel('Years')
ylabel('Number of characters of Wikipedia article')

# Rank correlation between year and article length, drawn at a fixed
# position in data coordinates.
text(1910, 12000,
     "Spearman correlation = %.3f" % spearmanr(years, lengths)[0],
     color=(1, 0, 0))
xticks(range(1900, 2011, 10))
grid()

# Label the three longest articles with their year, just right of the point.
for idx in np.argsort(-lengths)[:3]:
    text(years[idx] + 1, lengths[idx], "%s" % (years[idx]), color=(1, 0, 0))

# Label the shortest article among those from index 50 onwards
# (index 50 corresponds to the year 1950).
shortest = np.argmin(lengths[50:]) + 50
text(years[shortest] + 1, lengths[shortest], "%s" % (years[shortest]),
     color=(1, 0, 0))

# Save before show(): with a blocking backend the figure is discarded once
# the window is closed, and a later savefig() would write an empty image.
savefig('Royal2008Whats.png')
show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment