Created
September 13, 2012 17:23
-
-
Save astanin/3715979 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
# public domain | |
import csv | |
import json | |
import gzip | |
import re | |
import sys | |
import time | |
import urllib2 | |
from StringIO import StringIO | |
from operator import itemgetter | |
# Public API of this module.
__all__ = [ "fetch_current_ranks", "load_ranks", "save_ranks",
            "plot_normalized_ranks", "plot_rank_changes",
            "main" ]

# Delay between consecutive HTTP requests, in seconds (politeness throttle).
SLEEP = 0.5

# Matches e.g. "Python <em>is the #1 most popular language" on a GitHub
# language page; group 1 is the language name, group 2 its place.
# Raw string so \w, \d, \+ etc. reach the regex engine verbatim (the value
# is unchanged, but non-raw regex literals are an invalid-escape hazard).
GITHUB_RANK_RE = r"([\w\+\-\.# ]+) <em>is the #(\d+) most popular language"

# Stack Exchange API endpoint for tag info; %s is a ';'-joined tag list.
SO_TAGS_URL = "https://api.stackexchange.com/2.1" + \
              "/tags/%s/info?order=desc&sort=popular&site=stackoverflow"

# Maximum number of tags requested per API call.
SO_PAGESIZE = 30

# translation from GitHub to Stackoverflow names
LANG_SYNONYMS = { "visual-basic": "vb.net", # but there is also [vba], 9k vs 35k
                  "emacs-lisp": "elisp",
                  "julia": "julia-lang",
                  "standard-ml": "sml",
                  "viml": "vimscript",
                  "asp": "asp.net",
                  "max": "max-lang", # doesn't exist yet, but [max] is different
                  "apex": "oracle-apex", # isn't it?
                  "openedge-abl": "progress-4gl",
                  }
def rankdata(a):
    """Return 1-based ranks of the values in *a*, averaging ties.

    A run of equal values all receive the average of the ranks they span,
    truncated to int; results are in the same order as *a*.
    Based on http://stackoverflow.com/a/3071441/25450
    """
    def rank_simple(vector):
        # Indices of *vector* ordered by ascending value (an argsort).
        return sorted(range(len(vector)), key=vector.__getitem__)
    n = len(a)
    ivec = rank_simple(a)
    svec = [a[rank] for rank in ivec]  # values in ascending order
    sumranks = 0
    dupcount = 0
    newarray = [0] * n
    # `range` instead of the py2-only `xrange` keeps this portable.
    for i in range(n):
        sumranks += i
        dupcount += 1
        # End of a run of equal values: give them all the average rank.
        if i == n - 1 or svec[i] != svec[i + 1]:
            averank = sumranks / float(dupcount) + 1
            for j in range(i - dupcount + 1, i + 1):
                newarray[ivec[j]] = averank
            sumranks = 0
            dupcount = 0
    # Truncate averaged ranks to ints; a list comprehension (not py2 map)
    # returns a list on both Python 2 and 3.
    return [int(r) for r in newarray]
def rerank_pairs(pairs):
    """Recompute ranks for (name, value) pairs.

    Useful when some of the ranked pairs have been removed and the
    surviving ranks are no longer contiguous.
    """
    # Comprehensions instead of py2 map(itemgetter(...), ...): portable
    # and always produce lists.
    names = [name for name, _ in pairs]
    values = [value for _, value in pairs]
    ranks = rankdata(values)
    return zip(names, ranks)
def normalize_name(langname):
    """Canonicalize a language name into Stackoverflow tag form."""
    # Lowercase, spaces become dashes, then apply the synonym table.
    tag = langname.lower().replace(" ", "-")
    if tag in LANG_SYNONYMS:
        return LANG_SYNONYMS[tag]
    return tag
def normalize_ranks(ranks):
    """Linearly rescale *ranks* onto the [0, 1] interval as floats.

    Robustness fix: the original raised ZeroDivisionError when every
    rank was equal (including a single-element list) and ValueError on
    an empty list; those cases now map every entry to 0.0 / return [].
    """
    if not ranks:
        return []
    minr = min(ranks)
    maxr = max(ranks)
    span = maxr - minr
    if span == 0:
        # Degenerate case: all ranks identical.
        return [0.0 for _ in ranks]
    return [float(r - minr) / span for r in ranks]
def github_place(rellink):
    """Fetch one GitHub language page and return (normalized_name, place)."""
    link = "https://github.com" + rellink
    sys.stderr.write(link + "\n")  # progress trace
    page = urllib2.urlopen(link).read()
    time.sleep(SLEEP)  # throttle between requests
    matches = re.findall(GITHUB_RANK_RE, page)
    if matches:
        name, place = matches[0]
        place = int(place)
    else:
        # Page without the rank blurb: recover the name from the URL and
        # treat the language as top-ranked.
        # NOTE(review): urllib2.unquote is an undocumented re-export of
        # urllib.unquote — works in py2, but fragile.
        name = urllib2.unquote(rellink[len("/languages/"):])
        place = 1
    return normalize_name(name), place
def github_places():
    """Scrape the GitHub languages index and fetch a place for each one."""
    index_page = urllib2.urlopen("https://github.com/languages").read()
    time.sleep(SLEEP)  # throttle between requests
    # Unique relative links to the per-language pages.
    links = set(re.findall("\"(/languages/[A-Za-z0-9%_]+)\"", index_page))
    return [github_place(link) for link in links]
def github_ranks(langplaces):
    """Turn (name, place) pairs into (name, rank) pairs.

    Places are negated before ranking so that place #1 (most popular)
    receives the highest rank.
    """
    names = []
    neg_places = []
    for name, place in langplaces:
        names.append(name)
        neg_places.append(-place)
    return zip(names, rankdata(neg_places))
def so_tagcounts_onepage(tags):
    """Return JSON response with tag info."""
    joined = ";".join([normalize_name(tag) for tag in tags])
    url = SO_TAGS_URL % urllib2.quote(joined)
    sys.stderr.write(url + "\n")  # progress trace
    raw = urllib2.urlopen(url).read()
    # SO returns gzipped data
    text = gzip.GzipFile(fileobj=StringIO(raw)).read()
    time.sleep(SLEEP)  # throttle between requests
    data = json.loads(text)
    if data["has_more"]:
        sys.stderr.write("Warning: not all tags fetched in one request.\n")
    return data
def so_tagcounts(names):
    """Fetch Stackoverflow tag counts for *names*, SO_PAGESIZE at a time.

    Returns a list of (tag_name, count) pairs sorted by descending count.

    Bug fix: the original looped over len(names)//SO_PAGESIZE + 1 pages,
    which issued one request with an *empty* tag list whenever len(names)
    was an exact multiple of SO_PAGESIZE (including zero).  Stepping
    through the list by SO_PAGESIZE never produces an empty batch.
    """
    tags = []
    for start in range(0, len(names), SO_PAGESIZE):
        somenames = names[start:start + SO_PAGESIZE]
        counts = so_tagcounts_onepage(somenames)
        tags += counts["items"]
    stags = sorted(tags, key=itemgetter("count"), reverse=True)
    return [(t["name"], t["count"]) for t in stags]
def so_ranks(tagcounts):
    """Turn (name, count) pairs into (name, rank) pairs."""
    names = [pair[0] for pair in tagcounts]
    counts = [pair[1] for pair in tagcounts]
    return zip(names, rankdata(counts))
def fetch_current_ranks():
    """Download data and return current language ranking on both sites."""
    ghrs = github_ranks(github_places())
    # Query Stackoverflow for exactly the languages GitHub listed.
    langs = [name for name, _ in ghrs]
    sots = so_tagcounts(langs)
    return {
        "github.rank": ghrs,
        "so.tagcount": sots,
        "so.rank": so_ranks(sots),
    }
def save_ranks(filename_csv, datadict):
    """Save data in CSV format similar to raw data by Drew Conway.

    Writes a header row, then one row per language:
    (name, github rank, SO tag count, SO rank); languages missing from
    the SO data get 0 for the SO columns.
    """
    gh_ranks = datadict.get("github.rank", [])
    so_ranks = dict(datadict.get("so.rank", []))
    so_tags = dict(datadict.get("so.tagcount", []))
    rows = [("language", "github.rank", "so.tagcount", "so.rank")]
    rows += [(normalize_name(n), ghr, so_tags.get(n, 0), so_ranks.get(n, 0))
             for n, ghr in gh_ranks]
    # Resource-leak fix: the original `file(filename_csv, "w")` handle was
    # never closed (and `file` no longer exists in Python 3); the
    # with-statement guarantees flush+close even on error.
    with open(filename_csv, "w") as f:
        csv.writer(f).writerows(rows)
def load_ranks(filename_csv):
    """Load github.rank, so.tagcount and so.rank
    from the original CSV by Drew Conway."""
    # Resource-leak fix: `file(filename_csv)` was never closed; use a
    # with-statement (and `open`, since `file` is gone in Python 3).
    with open(filename_csv) as f:
        rows = list(csv.reader(f))[1:]  # drop the header row
    names, ghrs, sotags, sors = zip(*rows)
    # Comprehensions instead of py2 map(): portable, always lists.
    names = [normalize_name(n) for n in names]
    ghrs = [int(x) for x in ghrs]
    sotags = [int(x) for x in sotags]
    sors = [int(x) for x in sors]
    return {"github.rank": zip(names, ghrs),
            "so.tagcount": zip(names, sotags),
            "so.rank": zip(names, sors)}
def plot_normalized_ranks(rankdata, color="black"):
    """Plot ranking scatter plot.

    NOTE(review): the *rankdata* parameter shadows the module-level
    rankdata() helper; the name is kept for caller compatibility.
    Returns the matplotlib figure.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    gh_ranks = rankdata["github.rank"]
    so_lookup = dict(rankdata["so.rank"])
    names = [pair[0] for pair in gh_ranks]
    ghscores = normalize_ranks([pair[1] for pair in gh_ranks])
    # Align SO scores to the GitHub name order; missing tags score 0.
    soscores = normalize_ranks([so_lookup.get(name, 0) for name in names])
    fig = plt.figure()
    p = fig.add_subplot(111)
    ## scatter plot with a small label at every point
    p.scatter(ghscores, soscores, marker='.', color=color)
    p.set_xlabel("normalized GitHub rank")
    p.set_ylabel("normalized Stackoverflow rank (by tag count)")
    p.grid()
    for name, x, y in zip(names, ghscores, soscores):
        p.text(x, y, name, fontsize=7, alpha=0.5, color=color)
    ## least-squares linear fit drawn over [0, 1]
    slope, intercept = np.polyfit(ghscores, soscores, 1)
    xs = np.linspace(0, 1)
    p.plot(xs, xs * slope + intercept, alpha=0.5, color=color)
    return fig
def plot_rank_changes(olddata, newdata,
                      oldcolor="black",
                      gaincolor="blue",
                      losecolor="red",
                      nochangecolor="black",
                      change_threshold=3):
    """Show displacement of common data points between two sets.

    olddata/newdata are dicts with "github.rank" and "so.rank" keys of
    (name, rank) pairs, as produced by load_ranks()/fetch_current_ranks().
    Languages present in both sets are drawn as arrows from old to new
    position, colored by direction of change; returns the figure.
    """
    import matplotlib.pyplot as plt
    import numpy as np  # NOTE(review): np appears unused in this function
    ## evaluate normalized scores and their changes
    old_ghrs = olddata["github.rank"]
    old_sors = olddata["so.rank"]
    new_ghrs = newdata["github.rank"]
    new_sors = newdata["so.rank"]
    old_langs = map(itemgetter(0),old_ghrs)
    new_langs = map(itemgetter(0),new_ghrs)
    # Only languages appearing in both data sets can show a change.
    common_langs = set(old_langs).intersection(set(new_langs))
    ## show only common
    def filter_common(pairs):
        # Keep only common languages, then re-rank so that the surviving
        # ranks are contiguous again.
        return rerank_pairs([ (n, r) for n, r in pairs if n in common_langs ])
    old_ghrs = filter_common(old_ghrs)
    old_sors = filter_common(old_sors)
    new_ghrs = filter_common(new_ghrs)
    new_sors = filter_common(new_sors)
    old_langs = map(itemgetter(0),old_ghrs)
    new_langs = map(itemgetter(0),new_ghrs)
    def scores_from_pairs(ghpairs, sopairs):
        # Align SO ranks to the GitHub name order; missing names score 0.
        names = map(itemgetter(0), ghpairs)
        sopairs = dict(sopairs)
        ghscores = map(itemgetter(1), ghpairs)
        soscores = [ sopairs.get(n, 0) for n in names ]
        return ghscores, soscores
    old_ghscores, old_soscores = scores_from_pairs(old_ghrs, old_sors)
    new_ghscores, new_soscores = scores_from_pairs(new_ghrs, new_sors)
    # Per-language coordinate lookup tables for both data sets.
    oldgh = dict(zip(old_langs, old_ghscores))
    oldso = dict(zip(old_langs, old_soscores))
    newgh = dict(zip(new_langs, new_ghscores))
    newso = dict(zip(new_langs, new_soscores))
    # One (x, y, dx, dy) tuple per common language: old position plus
    # displacement toward the new position.
    changes = [ ( oldgh.get(lang, 0),
                  oldso.get(lang, 0),
                  newgh.get(lang,0)-oldgh.get(lang, 0),
                  newso.get(lang,0)-oldso.get(lang, 0) )
                for lang in common_langs ] # [ (x,y,dx,dy) ]
    langchanges = dict(zip(common_langs, changes))
    ## colorize labels
    def langcolor(lang):
        # Gain when both ranks improved or the combined change beats the
        # threshold; loss symmetrically; otherwise "no change".
        x,y,dx,dy = langchanges.get(lang, [0,0,0,0])
        change = dx+dy  # NOTE(review): unused; conditions recompute dx+dy
        if dx+dy > change_threshold or (dx > 0 and dy > 0):
            # definitely or mostly improved
            return gaincolor
        elif dx+dy < -change_threshold or (dx < 0 and dy < 0):
            return losecolor
        else:
            return nochangecolor
    langcolors = dict([ (lang, langcolor(lang))
                        for lang in set(old_langs).union(set(new_langs)) ])
    ## plotting
    fig = plt.figure()
    p = fig.add_subplot(111)
    # NOTE(review): this zip relies on iterating common_langs in the same
    # order as when `changes` was built; a set's iteration order is stable
    # within a single run, so the pairing holds.
    for (x,y,dx,dy), lang in zip(changes, common_langs):
        p.arrow(x,y,dx,dy,color=langcolors[lang],alpha=0.33,
                length_includes_head=True, shape="full")
    p.scatter(old_ghscores, old_soscores, color=oldcolor, marker=".")
    def plot_newlangs_with_color(color):
        # Scatter the new positions of all languages labeled *color*.
        names = [ lang for lang in new_langs if langcolor(lang) == color ]
        xs = [ newgh.get(n) for n in names ]
        ys = [ newso.get(n) for n in names ]
        p.scatter(xs,ys,color=color,marker=".")
    plot_newlangs_with_color(gaincolor)
    plot_newlangs_with_color(losecolor)
    plot_newlangs_with_color(nochangecolor)
    p.set_xlabel("GitHub rank")
    p.set_ylabel("Stackoverflow rank (by tag count)")
    # Label every common language at its new position.
    for n, x, y in zip(new_langs, new_ghscores, new_soscores):
        p.text(x,y,n,fontsize=7,alpha=0.5,color=langcolors[n])
    # Languages that vanished from the new data keep their old position,
    # drawn in the neutral color.
    for n in set(old_langs).difference(common_langs):
        x = oldgh.get(n)
        y = oldso.get(n)
        p.text(x,y,n,fontsize=7,alpha=0.5,color=nochangecolor)
    p.grid()
    return fig
def main():
    """Fetch fresh ranks, save them, and plot changes vs. the saved CSV."""
    rankdata = fetch_current_ranks()
    # Bug fix: the original called the nonexistent write_ranks(), which
    # raised NameError; save_ranks() is the writer defined (and exported
    # via __all__) in this module.
    save_ranks("language_ranks.new.csv", rankdata)
    olddata = load_ranks("language_ranks.csv")
    fig = plot_rank_changes(olddata, rankdata)
    fig.savefig("rank_changes.png", dpi=150)

if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment