Skip to content

Instantly share code, notes, and snippets.

@astanin
Created September 13, 2012 17:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save astanin/3715979 to your computer and use it in GitHub Desktop.
Save astanin/3715979 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python2
# public domain
import csv
import json
import gzip
import re
import sys
import time
import urllib2
from StringIO import StringIO
from operator import itemgetter
# Names exported by a star-import of this module.
__all__ = [ "fetch_current_ranks", "load_ranks", "save_ranks",
"plot_normalized_ranks", "plot_rank_changes",
"main" ]
# Seconds to pause after each HTTP request.
SLEEP = 0.5

# Captures (language name, place) from the text of a GitHub language page.
# Raw string: the pattern contains regex escapes such as \w and \d.
GITHUB_RANK_RE = r"([\w\+\-\.# ]+) <em>is the #(\d+) most popular language"

# Stack Exchange API endpoint; %s is replaced by a ";"-separated tag list.
SO_TAGS_URL = ("https://api.stackexchange.com/2.1"
               "/tags/%s/info?order=desc&sort=popular&site=stackoverflow")

# Number of tags requested per API call.
SO_PAGESIZE = 30

# translation from GitHub to Stackoverflow names
LANG_SYNONYMS = {
    "visual-basic": "vb.net",  # but there is also [vba], 9k vs 35k
    "emacs-lisp": "elisp",
    "julia": "julia-lang",
    "standard-ml": "sml",
    "viml": "vimscript",
    "asp": "asp.net",
    "max": "max-lang",  # doesn't exist yet, but [max] is different
    "apex": "oracle-apex",  # isn't it?
    "openedge-abl": "progress-4gl",
}
def rankdata(a):
    """Return 1-based integer ranks for the values in ``a``.

    The smallest value gets rank 1.  Equal values share the average of the
    ranks they span, truncated to an integer.  The result is a list in the
    same order as ``a``.
    """
    # http://stackoverflow.com/a/3071441/25450
    size = len(a)
    # Indices of ``a`` in ascending order of value.
    order = sorted(range(size), key=a.__getitem__)
    ordered = [a[idx] for idx in order]
    ranks = [0] * size
    run_start = 0
    for pos in range(size):
        # Keep extending the current run of ties until the value changes.
        if pos != size - 1 and ordered[pos] == ordered[pos + 1]:
            continue
        tie_count = pos - run_start + 1
        average = sum(range(run_start, pos + 1)) / float(tie_count) + 1
        for j in range(run_start, pos + 1):
            ranks[order[j]] = average
        run_start = pos + 1
    return [int(rank) for rank in ranks]
def rerank_pairs(pairs):
    """Recompute ranks for (name, value) pairs.

    Useful when some of the ranked pairs have been removed: the values are
    passed through rankdata() again, and the names are paired with the
    fresh ranks in the original order.
    """
    labels = [pair[0] for pair in pairs]
    scores = [pair[1] for pair in pairs]
    return list(zip(labels, rankdata(scores)))
def normalize_name(langname):
    """Return the canonical (Stack Overflow style) name for a language.

    The name is lower-cased and spaces become hyphens; if the result has an
    entry in LANG_SYNONYMS the synonym is returned instead.
    """
    slug = langname.lower().replace(" ", "-")
    if slug in LANG_SYNONYMS:
        return LANG_SYNONYMS[slug]
    return slug
def normalize_ranks(ranks):
    """Scale ranks linearly so the minimum maps to 0.0 and the maximum to 1.0.

    Args:
        ranks: iterable of numbers.

    Returns:
        A list of floats in the same order as ``ranks``.  An empty input
        gives an empty list; if all ranks are equal (including a single
        rank) every score is 0.0, since there is no range to scale by.
    """
    ranks = list(ranks)
    if not ranks:
        return []
    lowest = min(ranks)
    span = max(ranks) - lowest
    if span == 0:
        # Avoid dividing by zero when there is no spread in the data.
        return [0.0 for _ in ranks]
    return [float(r - lowest) / span for r in ranks]
def github_place(rellink):
    """Download one GitHub language page and return (name, place).

    ``rellink`` is a path relative to https://github.com.  The URL is logged
    to stderr and the function sleeps SLEEP seconds after the download.
    The name and place are taken from the first GITHUB_RANK_RE match; the
    name is passed through normalize_name().
    """
    url = "https://github.com" + rellink
    sys.stderr.write(url + "\n")
    html = urllib2.urlopen(url).read()
    time.sleep(SLEEP)
    found = re.findall(GITHUB_RANK_RE, html)
    if found:
        name, place = found[0]
        place = int(place)
    else:
        # No "#N most popular" phrase on the page: take the name from the
        # link and use place 1.
        # NOTE(review): presumably only the top language lacks the phrase,
        # and rellink is assumed to start with "/languages/" -- confirm.
        name = urllib2.unquote(rellink[len("/languages/"):])
        place = 1
    return normalize_name(name), place
def github_places():
    """Return a list of (name, place) for every language linked from
    https://github.com/languages.

    Each distinct "/languages/..." link found on the index page is fetched
    with github_place().
    """
    index_html = urllib2.urlopen("https://github.com/languages").read()
    time.sleep(SLEEP)
    # A set removes links that appear more than once on the page.
    unique_links = set(re.findall("\"(/languages/[A-Za-z0-9%_]+)\"", index_html))
    places = []
    for rellink in unique_links:
        places.append(github_place(rellink))
    return places
def github_ranks(langplaces):
    """Convert (name, place) pairs into (name, rank) pairs.

    Places are negated before ranking, so the language with the smallest
    place number receives the largest rank.
    """
    labels = [name for name, place in langplaces]
    negated = [-place for name, place in langplaces]
    return list(zip(labels, rankdata(negated)))
def so_tagcounts_onepage(tags):
    """Return the decoded JSON response with info about ``tags``.

    All tags are normalized, joined with ";" and sent in a single request
    to SO_TAGS_URL.  The URL is logged to stderr, and a warning is logged
    when the response reports that more results are available.
    """
    joined = ";".join(normalize_name(tag) for tag in tags)
    url = SO_TAGS_URL % urllib2.quote(joined)
    sys.stderr.write(url + "\n")
    compressed = StringIO(urllib2.urlopen(url).read())
    # SO returns gzipped data
    payload = gzip.GzipFile(fileobj=compressed).read()
    time.sleep(SLEEP)
    response = json.loads(payload)
    if response["has_more"]:
        sys.stderr.write("Warning: not all tags fetched in one request.\n")
    return response
def so_tagcounts(names, pagesize=None, fetch_page=None):
    """Return (tag name, count) pairs for ``names``, most used tag first.

    The names are requested in chunks; the "items" of every response are
    merged and sorted by their "count" in descending order.

    Args:
        names: iterable of language/tag names.
        pagesize: maximum number of names per request; defaults to
            SO_PAGESIZE.
        fetch_page: callable taking a list of names and returning a dict
            with an "items" list; defaults to so_tagcounts_onepage.

    Returns:
        A list of (name, count) tuples.

    Raises:
        ValueError: if ``pagesize`` is less than 1.
    """
    names = list(names)
    if pagesize is None:
        pagesize = SO_PAGESIZE
    if fetch_page is None:
        fetch_page = so_tagcounts_onepage
    if pagesize < 1:
        raise ValueError("pagesize must be at least 1")
    items = []
    # Stepping by pagesize never yields an empty chunk, so no request is
    # made with an empty tag list (not even when there are no names or the
    # count is an exact multiple of pagesize).
    for start in range(0, len(names), pagesize):
        response = fetch_page(names[start:start + pagesize])
        items.extend(response["items"])
    items.sort(key=itemgetter("count"), reverse=True)
    return [(item["name"], item["count"]) for item in items]
def so_ranks(tagcounts):
    """Convert (name, tag count) pairs into (name, rank) pairs.

    Counts are ranked with rankdata(), so the smallest count gets rank 1.
    """
    labels = [pair[0] for pair in tagcounts]
    totals = [pair[1] for pair in tagcounts]
    return list(zip(labels, rankdata(totals)))
def fetch_current_ranks():
    """Download data and return current language ranking on both sites.

    Returns a dict with three lists of pairs: "github.rank" (name, rank),
    "so.tagcount" (name, count) and "so.rank" (name, rank).
    """
    github = github_ranks(github_places())
    languages = [pair[0] for pair in github]
    tagcounts = so_tagcounts(languages)
    stackoverflow = so_ranks(tagcounts)
    return {
        "github.rank": github,
        "so.tagcount": tagcounts,
        "so.rank": stackoverflow,
    }
def save_ranks(filename_csv, datadict):
    """Save data in CSV format similar to raw data by Drew Conway.

    Args:
        filename_csv: path of the CSV file to (over)write.
        datadict: dict with optional keys "github.rank", "so.rank" and
            "so.tagcount", each a list of (name, value) pairs.  A missing
            key is treated as an empty list.

    One row is written per "github.rank" entry, after a header row.
    Languages without a Stack Overflow count or rank get 0 in that column.
    """
    gh_ranks = datadict.get("github.rank", [])
    so_rank_by_name = dict(datadict.get("so.rank", []))
    so_count_by_name = dict(datadict.get("so.tagcount", []))
    rows = [("language", "github.rank", "so.tagcount", "so.rank")]
    rows.extend(
        (normalize_name(name), gh_rank,
         so_count_by_name.get(name, 0), so_rank_by_name.get(name, 0))
        for name, gh_rank in gh_ranks)
    # The context manager guarantees the file is flushed and closed.
    with open(filename_csv, "w") as out:
        csv.writer(out).writerows(rows)
def load_ranks(filename_csv):
    """Load github.rank, so.tagcount and so.rank
    from the original CSV by Drew Conway.

    Args:
        filename_csv: path of a CSV file whose first row is a header and
            whose remaining rows have four columns: language, github.rank,
            so.tagcount, so.rank.

    Returns:
        A dict with keys "github.rank", "so.tagcount" and "so.rank", each a
        list of (normalized name, int) pairs.  A file without data rows
        gives three empty lists.

    Raises:
        ValueError: if a numeric column cannot be converted to int or the
            rows do not have four columns.
    """
    with open(filename_csv) as src:
        # Blank lines come back from csv.reader as empty rows; drop them so
        # they do not break the column transposition below.
        rows = [row for row in csv.reader(src) if row]
    rows = rows[1:]  # skip the header
    if not rows:
        return {"github.rank": [], "so.tagcount": [], "so.rank": []}
    names, ghrs, sotags, sors = zip(*rows)
    names = [normalize_name(name) for name in names]
    ghrs = [int(value) for value in ghrs]
    sotags = [int(value) for value in sotags]
    sors = [int(value) for value in sors]
    return {"github.rank": list(zip(names, ghrs)),
            "so.tagcount": list(zip(names, sotags)),
            "so.rank": list(zip(names, sors))}
def plot_normalized_ranks(rankdata, color="black"):
    """Plot ranking scatter plot.

    ``rankdata`` is a dict with "github.rank" and "so.rank" lists of
    (name, rank) pairs.  Both rankings are normalized with
    normalize_ranks(), drawn as labelled points, and a first-degree
    polynomial fit is drawn over them.  Languages missing from "so.rank"
    are given rank 0 before normalization.  Returns the matplotlib figure.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    github_pairs = rankdata["github.rank"]
    so_pairs = rankdata["so.rank"]
    fig = plt.figure()
    axes = fig.add_subplot(111)

    names = [pair[0] for pair in github_pairs]
    ghscores = normalize_ranks([pair[1] for pair in github_pairs])
    so_by_name = dict(so_pairs)
    soscores = normalize_ranks([so_by_name.get(name, 0) for name in names])

    ## scatter plot
    axes.scatter(ghscores, soscores, marker='.', color=color)
    axes.set_xlabel("normalized GitHub rank")
    axes.set_ylabel("normalized Stackoverflow rank (by tag count)")
    axes.grid()
    for label, x, y in zip(names, ghscores, soscores):
        axes.text(x, y, label, fontsize=7, alpha=0.5, color=color)

    ## linear fitting
    slope, intercept = np.polyfit(ghscores, soscores, 1)
    xs = np.linspace(0, 1)
    axes.plot(xs, xs * slope + intercept, alpha=0.5, color=color)
    return fig
def plot_rank_changes(olddata, newdata,
                      oldcolor="black",
                      gaincolor="blue",
                      losecolor="red",
                      nochangecolor="black",
                      change_threshold=3):
    """Show displacement of common data points between two sets.

    ``olddata`` and ``newdata`` are dicts with "github.rank" and "so.rank"
    lists of (name, rank) pairs.  Only languages present in the GitHub
    ranking of both sets are shown; their ranks are recomputed among that
    subset.  Each language gets an arrow from its old (GitHub, SO) position
    to its new one.

    A language is drawn in ``gaincolor`` when the sum of both rank changes
    exceeds ``change_threshold`` or both changes are positive, in
    ``losecolor`` in the mirrored case, and in ``nochangecolor`` otherwise.
    Old positions are drawn in ``oldcolor``.  Returns the matplotlib figure.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    def first_items(pairs):
        return [pair[0] for pair in pairs]

    ## evaluate normalized scores and their changes
    old_ghrs = olddata["github.rank"]
    old_sors = olddata["so.rank"]
    new_ghrs = newdata["github.rank"]
    new_sors = newdata["so.rank"]
    common_langs = set(first_items(old_ghrs)).intersection(
        set(first_items(new_ghrs)))

    ## show only common
    def keep_common(pairs):
        kept = [(name, rank) for name, rank in pairs if name in common_langs]
        return list(rerank_pairs(kept))

    old_ghrs = keep_common(old_ghrs)
    old_sors = keep_common(old_sors)
    new_ghrs = keep_common(new_ghrs)
    new_sors = keep_common(new_sors)
    old_langs = first_items(old_ghrs)
    new_langs = first_items(new_ghrs)

    def split_scores(ghpairs, sopairs):
        # SO scores follow the order of the GitHub pairs; 0 when missing.
        so_by_name = dict(sopairs)
        gh = [pair[1] for pair in ghpairs]
        so = [so_by_name.get(pair[0], 0) for pair in ghpairs]
        return gh, so

    old_ghscores, old_soscores = split_scores(old_ghrs, old_sors)
    new_ghscores, new_soscores = split_scores(new_ghrs, new_sors)
    oldgh = dict(zip(old_langs, old_ghscores))
    oldso = dict(zip(old_langs, old_soscores))
    newgh = dict(zip(new_langs, new_ghscores))
    newso = dict(zip(new_langs, new_soscores))

    # Freeze the set's iteration order once so that languages and their
    # changes stay aligned.
    common_order = list(common_langs)
    changes = []  # [ (x,y,dx,dy) ]
    for lang in common_order:
        x0 = oldgh.get(lang, 0)
        y0 = oldso.get(lang, 0)
        changes.append((x0, y0,
                        newgh.get(lang, 0) - x0,
                        newso.get(lang, 0) - y0))
    langchanges = dict(zip(common_order, changes))

    ## colorize labels
    def langcolor(lang):
        _, _, dx, dy = langchanges.get(lang, (0, 0, 0, 0))
        total = dx + dy
        if total > change_threshold or (dx > 0 and dy > 0):
            # definitely or mostly improved
            return gaincolor
        if total < -change_threshold or (dx < 0 and dy < 0):
            return losecolor
        return nochangecolor

    langcolors = dict((lang, langcolor(lang))
                      for lang in set(old_langs).union(set(new_langs)))

    ## plotting
    fig = plt.figure()
    axes = fig.add_subplot(111)
    for lang, (x, y, dx, dy) in zip(common_order, changes):
        axes.arrow(x, y, dx, dy, color=langcolors[lang], alpha=0.33,
                   length_includes_head=True, shape="full")
    axes.scatter(old_ghscores, old_soscores, color=oldcolor, marker=".")

    def scatter_new(color):
        # New positions of the languages that were assigned this color.
        chosen = [lang for lang in new_langs if langcolors[lang] == color]
        xs = [newgh.get(lang) for lang in chosen]
        ys = [newso.get(lang) for lang in chosen]
        axes.scatter(xs, ys, color=color, marker=".")

    for color in (gaincolor, losecolor, nochangecolor):
        scatter_new(color)

    axes.set_xlabel("GitHub rank")
    axes.set_ylabel("Stackoverflow rank (by tag count)")
    for label, x, y in zip(new_langs, new_ghscores, new_soscores):
        axes.text(x, y, label, fontsize=7, alpha=0.5, color=langcolors[label])
    # old_langs was already restricted to common_langs above, so this loop
    # has nothing left to label; it is kept as in the original.
    for label in set(old_langs).difference(common_langs):
        axes.text(oldgh.get(label), oldso.get(label), label,
                  fontsize=7, alpha=0.5, color=nochangecolor)
    axes.grid()
    return fig
def main():
    """Fetch the current rankings, save them and plot the rank changes.

    The new data is written to "language_ranks.new.csv", compared with the
    data loaded from "language_ranks.csv", and the resulting figure is
    saved as "rank_changes.png".
    """
    current = fetch_current_ranks()
    # The module defines save_ranks(); the previously called write_ranks()
    # does not exist and raised NameError.
    save_ranks("language_ranks.new.csv", current)
    previous = load_ranks("language_ranks.csv")
    fig = plot_rank_changes(previous, current)
    fig.savefig("rank_changes.png", dpi=150)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment