Created
February 9, 2014 16:16
-
-
Save dzlab/8901363 to your computer and use it in GitHub Desktop.
A sample program that mines GitHub repositories to analyze a community of developers with similar interests, starting from a seed repository used for the initial graph crawl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Code example from the book 'Mining the Social Web'
import json
import os
import sys
from collections import Counter
from operator import itemgetter

import networkx as nx
from github import Github
from networkx.readwrite import json_graph
# GitHub personal access token. It is read from the GITHUB_ACCESS_TOKEN
# environment variable when that is set, so the secret does not have to be
# written into this file; otherwise the placeholder below is used as before.
ACCESS_TOKEN = os.environ.get(
    'GITHUB_ACCESS_TOKEN',
    'GET A Personal Access Key from through your Github account')

# Seed repository (owner login and repository name) the graph is grown from.
USER = 'ptwobrussell'
REPO = 'Mining-the-Social-Web'

client = Github(ACCESS_TOKEN, per_page=100)
user = client.get_user(USER)
repo = user.get_repo(REPO)

# Materialize the stargazers once; the list is iterated again further down.
stargazers = list(repo.get_stargazers())
# print("Number of stargazers {0}".format(len(stargazers)))

# Directed graph: node keys are '<name>(repo)' or '<login>(user)', and every
# node/edge carries a 'type' attribute that later queries filter on.
g = nx.DiGraph()
seed_node = repo.name + '(repo)'
g.add_node(seed_node, type='repo', lang=repo.language, owner=user.login)

# One user node per stargazer, with a 'gazes' edge pointing at the seed repo.
for sg in stargazers:
    user_node = sg.login + '(user)'
    g.add_node(user_node, type='user')
    g.add_edge(user_node, seed_node, type='gazes')

print(nx.info(g))
# Scratch queries for poking at the seed graph. They are kept disabled;
# uncomment individual lines to try them.
#
# print(g.node['Mining-the-Social-Web(repo)'])
# print(g.node['ptwobrussell(user)'])
# print(g['ptwobrussell(user)']['Mining-the-Social-Web(repo)'])
# (the edge lookup above throws an error when no such edge can be found)
# print(g['ptwobrussell(user)'])
# print(g['Mining-the-Social-Web(repo)'])
# print(g.in_edges(['ptwobrussell(user)']))
# print(g.out_edges(['ptwobrussell(user)']))
# print(g.in_edges(['Mining-the-Social-Web(repo)']))
# print(g.out_edges(['Mining-the-Social-Web(repo)']))
# Add "follows" edges between stargazers in the graph if any relationship exists
for i, sg in enumerate(stargazers):
    try:
        followed_node = sg.login + '(user)'
        for follower in sg.get_followers():
            follower_node = follower.login + '(user)'
            # Only connect followers that are already nodes in the graph, so
            # the crawl never adds users beyond the original stargazers.
            if follower_node in g:
                g.add_edge(follower_node, followed_node, type='follows')
    except Exception as e:  # e.g. ssl.SSLError
        # Best effort: report the failure on stderr and move on to the next
        # stargazer instead of aborting the whole crawl.
        sys.stderr.write(
            "Encountered an error fetching followers for {0} Skipping.\n"
            .format(sg.login))
        sys.stderr.write("{0}\n".format(e))

    # Progress report after every stargazer.
    print("Processed {0}  stargazers. Num nodes/edges in graph {1} / {2}"
          .format(i + 1, g.number_of_nodes(), g.number_of_edges()))
    print("Rate limit remaining {0}".format(client.rate_limiting))
# Graph key of the seed repository's owner.
owner_node = USER + '(user)'

# Let's see how many social edges we added since last time
print(nx.info(g))

# Collect the (follower, followed) pairs once; every statistic below that
# concerns "follows" edges is derived from this list.
follows = [(u, v)
           for (u, v, d) in g.edges_iter(data=True)
           if d['type'] == 'follows']

# The number of "follows" edges is the difference
print(len(follows))

# The repository owner is possibly one of the more popular users in this graph
print(len([v for (u, v) in follows if v == owner_node]))

# Let's examine the number of adjacent edges to each node
print(sorted(g.degree_iter(), key=itemgetter(1), reverse=True)[:10])

# Consider the ratio of incoming and outgoing edges for a couple of users
# with high node degrees...

# A user who follows many but is not followed back by many
print(len(g.out_edges('hcilab(user)')))
print(len(g.in_edges('hcilab(user)')))

# A user who is followed by many but does not follow back
print(len(g.out_edges(owner_node)))
print(len(g.in_edges(owner_node)))

# Count how often each user is the target of a "follows" edge; users with more
# than one follower inside this graph are treated as popular.
c = Counter(v for (u, v) in follows)
popular_users = [(u, f) for (u, f) in c.most_common() if f > 1]
print("Number of popular users {0}".format(len(popular_users)))
print("Top 10 popular users: {0}".format(popular_users[:10]))
# Save your work by serializing out (pickling) the graph.
# The target directory is created first so the write does not fail on a
# fresh checkout.
if not os.path.isdir('resources'):
    os.makedirs('resources')
nx.write_gpickle(g, "resources/github.github_gpickle.1")

# How to restore the graph: g = nx.read_gpickle("resources/github.github_gpickle.1")
# NOTE: unpickling can execute arbitrary code -- only load pickle files you
# created yourself.
# Create a copy of the graph so that we can experiment on it
h = g.copy()

# Remove the seed of the interest graph, which is a supernode
h.remove_node(REPO + '(repo)')

# Display the centrality measures for the top 10 nodes. Each measure returns
# a {node: score} mapping, which is ranked by score, highest first.
dc = sorted(nx.degree_centrality(h).items(), key=itemgetter(1), reverse=True)
print("Degree Centrality")
print(dc[:10])

bc = sorted(nx.betweenness_centrality(h).items(), key=itemgetter(1), reverse=True)
print("Betweenness Centrality")
print(bc[:10])

cc = sorted(nx.closeness_centrality(h).items(), key=itemgetter(1), reverse=True)
print("Closeness Centrality")
print(cc[:10])
# Let's add each stargazer's additional starred repos and add edges to find
# additional interests
MAX_REPOS = 500  # cap on the starred repos taken per stargazer

for i, sg in enumerate(stargazers):
    print(sg.login)
    try:
        gazer_node = sg.login + '(user)'
        for starred in sg.get_starred()[:MAX_REPOS]:
            repo_node = starred.name + '(repo)'
            g.add_node(repo_node, type='repo', lang=starred.language,
                       owner=starred.owner.login)
            g.add_edge(gazer_node, repo_node, type='gazes')
    except Exception as e:  # e.g. ssl.SSLError
        # Best effort, matching the followers crawl: report the failure and
        # its cause on stderr, then continue with the next stargazer.
        sys.stderr.write(
            "Encountered an error fetching starred repos for {0} Skipping.\n"
            .format(sg.login))
        sys.stderr.write("{0}\n".format(e))

    # Progress report after every stargazer.
    print("Processed {0} stargazers' starred repos".format(i + 1))
    print("Num nodes/edges in graph {0} / {1}"
          .format(g.number_of_nodes(), g.number_of_edges()))

print("Rate limit {0}".format(client.rate_limiting))
# Graph key of the seed repository's owner.
owner_node = USER + '(user)'

# Exploring the graph after updates with starred repositories
print(nx.info(g))

# Get a list of repositories from the graph
repos = [n for n in g.nodes_iter() if g.node[n]['type'] == 'repo']

# Get most popular repositories (ranked by number of incoming edges)
print("Popular repositories")
print(sorted([(n, d)
              for (n, d) in g.in_degree_iter()
              if g.node[n]['type'] == 'repo'],
             key=itemgetter(1), reverse=True)[:10])

# Repositories the owner points at through a 'gazes' edge; both reports
# below are built from this list.
owner_gazes = [n
               for n in g[owner_node]
               if g[owner_node][n]['type'] == 'gazes']

# Projects gazed at by a user
print("Repositories that ptwobrussell has bookmarked")
print([(n, g.node[n]['lang']) for n in owner_gazes])

# Programming languages for each user
print("Programming languages ptwobrussell is interested in")
print(list(set(g.node[n]['lang'] for n in owner_gazes)))

# Find supernodes in the graph by approximating with a high number of
# outgoing edges. The out-edge count is taken once per user node.
print("Supernode candidates")
out_counts = [(n, len(g.out_edges(n)))
              for n in g.nodes_iter()
              if g.node[n]['type'] == 'user']
print(sorted([(n, k) for (n, k) in out_counts if k > 500],
             key=itemgetter(1), reverse=True))
# Updating the graph to include nodes for programming languages
repos = [n
         for n in g.nodes_iter()
         if g.node[n]['type'] == 'repo']

for repo_node in repos:
    # Language nodes use the '(lang)' suffix so they live in their own key
    # space and match the 'Python(lang)'-style lookups made later on.
    # A repository without a language yields the key '(lang)'.
    lang = (g.node[repo_node]['lang'] or "") + "(lang)"

    # Users with a 'gazes' edge into this repository.
    gazers = [u
              for (u, r, d) in g.in_edges_iter(repo_node, data=True)
              if d['type'] == 'gazes']
    if not gazers:
        # Nothing to link; do not create a language node for this repo.
        continue

    g.add_node(lang, type='lang')
    g.add_edge(lang, repo_node, type='implements')
    for gazer in gazers:
        g.add_edge(gazer, lang, type='programs')
# Graph key of the seed repository's owner.
owner_node = USER + '(user)'

# Querying the final graph to answer questions related to languages
print(nx.info(g))

# Languages the owner is linked to through a 'programs' edge
print([n
       for n in g[owner_node]
       if g[owner_node][n]['type'] == 'programs'])

# What is the most popular programming language?
print("Most popular languages")
print(sorted([(n, g.in_degree(n))
              for n in g.nodes_iter()
              if g.node[n]['type'] == 'lang'],
             key=itemgetter(1), reverse=True)[:10])

# How many users program in a particular language?
python_programmers = [u
                      for (u, _) in g.in_edges_iter('Python(lang)')
                      if g.node[u]['type'] == 'user']
print("Number of Python programmers: {0}".format(len(python_programmers)))

# The node key uses the spelling "JavaScript" (capital S).
# NOTE(review): assumes GitHub reports the language as "JavaScript" -- confirm
# against the 'lang' values actually present in the graph.
javascript_programmers = [u
                          for (u, _) in g.in_edges_iter('JavaScript(lang)')
                          if g.node[u]['type'] == 'user']
print("Number of Javascript programmers: {0}"
      .format(len(javascript_programmers)))

# What users program in both Python and Javascript?
print("Number of programmers who use Javascript and Python")
print(len(set(python_programmers).intersection(javascript_programmers)))

# Programmers who use Javascript but not Python
print("Number of programmers who use Javascript but not Python")
print(len(set(javascript_programmers).difference(python_programmers)))

# TODO: determine who is the most polyglot programmer.
# Graph visualization of the social network for the original interest graph
print("Stats on the full graph")
print(nx.info(g))

# Create a subgraph from a collection of nodes. In this case, the collection
# is all of the nodes in the graph whose type is 'user'.
mtsw_users = [n for n in g if g.node[n]['type'] == 'user']
h = g.subgraph(mtsw_users)

print("Stats on the extracted subgraph")
print(nx.info(h))

# Visualize the social network of all people from the original interest graph.
# The node-link data is dumped as JSON; the directory is created first and the
# file is closed deterministically by the with-block.
d = json_graph.node_link_data(h)
if not os.path.isdir('resources'):
    os.makedirs('resources')
with open('resources/force.json', 'w') as out:
    json.dump(d, out)

# IPython Notebook can serve files and display them into inline frames.
# Prepend the path with the 'files' prefix.

# A D3 template for displaying the graph data
viz_file = 'files/resources/force.html'

# Display the D3 visualization
# NOTE(review): display and IFrame are not imported anywhere in this file;
# presumably they are provided by the IPython Notebook session -- confirm.
display(IFrame(viz_file, '100%', '600px'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment