Instantly share code, notes, and snippets.
tuxdna/so-tags.py
Created May 9, 2016
Finding Top 20 Tags on StackOverflow main site
import json | |
import pandas as pd | |
import numpy as np | |
import xmltodict | |
import matplotlib.pyplot as plt | |
# Load the Stack Overflow tag dump and rank the tags by usage count.
# Reading inside a context manager guarantees Tags.xml is closed again,
# even if the read raises.
with open("Tags.xml") as f:
    all_data = f.read()

# The parsed document is indexed as o['tags']['row']; each row carries its
# XML attributes under '@'-prefixed keys ('@TagName', '@Count').
o = xmltodict.parse(all_data)
df = pd.DataFrame.from_dict(o['tags']['row'])

# Convert '@Count' to int so the sort below is numeric rather than textual.
df['counts'] = df['@Count'].astype(int)

# Keep the 20 most-used tags, highest count first.
df2 = df.sort_values(by='counts', ascending=False).head(20)[['counts', '@TagName']]
""" | |
In [53]: df2
        counts       @TagName
2      1067078     javascript
11     1025688           java
6       918586             c#
4       885422            php
703     800779        android
422     712360         jquery
10      542985         python
1       511091           html
7       431790            c++
19333   414394            ios
14      380535          mysql
3       372444            css
15      319001            sql
56      282582        asp.net
3199    253474    objective-c
2327    235532  ruby-on-rails
0       227675           .net
86      210953         iphone
5       210835              c
64      170559         arrays
""" | |
# Draw the top tags held in df2 as a bar chart and save it to plot.png.
#
# Series.as_matrix() was removed in pandas 1.0 and raises AttributeError on
# current versions; .values yields the same NumPy array and matches how the
# labels are extracted below.
counts = df2['counts'].values
x = np.arange(len(counts))  # one bar position per tag: 0, 1, 2, ...
labels = df2['@TagName'].values

fig = plt.figure()
fig.set_size_inches(15, 10.5)

# Axes rectangle is [left, bottom, width, height] in figure fractions.
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
ax.set_xlabel("Tags")
ax.set_ylabel("Counts")
ax.bar(x, counts, align='center')

# Put a tick under every bar and label it with the tag name.
ax.set_xticks(x)
ax.set_xticklabels(labels)

fig.show()
fig.savefig('plot.png', format='png')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment