Skip to content

Instantly share code, notes, and snippets.

@dzlab
Created February 9, 2014 16:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dzlab/8901363 to your computer and use it in GitHub Desktop.
Save dzlab/8901363 to your computer and use it in GitHub Desktop.
A sample program that mines GitHub repositories to analyze a community of developers with similar interests, using a seed repository as the starting point for crawling the interest graph.
# Code example from the book 'Mining the Social Web'
import json
import os
import sys
from collections import Counter
from operator import itemgetter

import networkx as nx
from github import Github
from networkx.readwrite import json_graph
ACCESS_TOKEN = 'GET A Personal Access Key from through your Github account'
USER = 'ptwobrussell'
REPO = 'Mining-the-Social-Web'
client = Github(ACCESS_TOKEN, per_page=100)
user = client.get_user(USER)
repo = user.get_repo(REPO)
stargazers = [s for s in repo.get_stargazers() ]
#print "Number of stargazers", len(stargazers)
g = nx.DiGraph()
g.add_node(repo.name + '(repo)', type='repo', lang=repo.language, owner=user.login)
for sg in stargazers:
g.add_node(sg.login + '(user)', type='user')
g.add_edge(sg.login + '(user)', repo.name + '(repo)', type='gazes')
print nx.info(g)
print
'''
print g.node['Mining-the-Social-Web(repo)']
print g.node['ptwobrussell(user)']
print
print g['ptwobrussell(user)']['Mining-the-Social-Web(repo)']
# throwing error when no such edge can be found
print
print g['ptwobrussell(user)']
print g['Mining-the-Social-Web(repo)']
print
print g.in_edges(['ptwobrussell(user)'])
print g.out_edges(['ptwobrussell(user)'])
print
print g.in_edges(['Mining-the-Social-Web(repo)'])
print g.out_edges(['Mining-the-Social-Web(repo)'])
'''
for i, sg in enumerate(stargazers):
#Add "follows" edges between stargazers in the graph if any relationship exists
try:
for follower in sg.get_followers():
if follower.login + '(user)' in g:
g.add_edge(follower.login + '(user)', sg.login + '(user)', type='follows')
except Exception, e: #ssl.SSLError
print >> sys.stderr, "Encountered an error fetching followers for", sg.login, "Skipping."
print >> sys.stderr, e
print "Processed", i+1, " stargazers. Num nodes/edges in graph", g.number_of_nodes(), "/", g.number_of_edges()
print "Rate limit remaining", client.rate_limiting
# Let's see how many social edges we added since last time
print nx.info(g)
print
# The number of "follows" edges is the difference
print len([e for e in g.edges_iter(data=True) if e[2]['type'] == 'follows'])
print
# The repository owner is possibly one of the more popular users in this graph
print len([e for e in g.edges_iter(data=True) if e[2]['type'] == 'follows' and e[1]=='ptwobrussell(user)'])
print
# The number of "follows" edges is the difference
print len([e for e in g.edges_iter(data=True) if e[2]['type'] == 'follows' and e[1] == 'ptwobrussell(user)'])
print
# Let's examine the number of adjacent edges to each node
print sorted([n for n in g.degree_iter()], key=itemgetter(1), reverse=True)[:10]
print
# Consider the ratio of incoming and outgoing edges for a couple of users with high node degrees...
# A user who follows many but is not followed back by many
print len(g.out_edges('hcilab(user)'))
print len(g.in_edges('hcilab(user)'))
# A user who is followed by many but does not followed back
print len(g.out_edges('ptwobrussell(user)'))
print len(g.in_edges('ptwobrussell(user)'))
c = Counter([e[1] for e in g.edges_iter(data=True) if e[2]['type'] == 'follows'])
popular_users = [ (u, f) for (u, f) in c.most_common() if f > 1 ]
print "Number of popular users", len(popular_users)
print "Top 10 popular users:", popular_users[:10]
# save yout work by serializing out (pickling) the graph
nx.write_gpickle(g, "resources/github.github_gpickle.1")
# How to restore the graph: g = nx.read_gpickle("resources/github.github_gpickle.1")
# Create a copy of the graph so that we can experiment on it
h = g.copy()
# Remove the seed of the interest graph, which is a supernode
h.remove_node('Mining-the-Social-Web(repo)')
# Disply the centrality measures for the top 10 nodes
dc = sorted(nx.degree_centrality(h).items(), key=itemgetter(1), reverse=True)
print "Degree Centrality"
print dc[:10]
print
bc = sorted(nx.betweenness_centrality(h).items(), key=itemgetter(1), reverse=True)
print "Betweenness Centrality"
print bc[:10]
print
cc = sorted(nx.closeness_centrality(h).items(), key=itemgetter(1), reverse=True)
print "Closeness Centrality"
print cc[:10]
# Let's add each stargazer's additional starred repos and add edges to find additional interest
MAX_REPOS = 500
for i, sg in enumerate(stargazers):
print sg.login
try:
for starred in sg.get_starred()[:MAX_REPOS]:
g.add_node(starred.name + '(repo)', type='repo', lang=starred.language, owner=starred.owner.login)
g.add_edge(sg.login + '(user)', starred.name + '(repo)', type='gazes')
except Exception, e:
print "Encountered an error fetching starred repos for", sg.login, "Skipping."
print "Processed", i+1, "stargazers' starred repos"
print "Num nodes/edges in graph", g.number_of_nodes(), "/", g.number_of_edges()
print "Rate limit", client.rate_limiting
# Exploring the graph after updates with starred repositories
print nx.info(g)
print
# Get a list of repositories from the graph
repos = [n for n in g.nodes_iter() if g.node[n]['type'] == 'repo']
# Get most popular repositories
print "Popular repositories"
print sorted([(n,d) for (n,d) in g.in_degree_iter() if g.node[n]['type'] == 'repo'], key=itemgetter(1), reverse=True)[:10]
print
# Projects gazed at by a user
print "Repositories that ptwobrussell has bookmarked"
print [(n,g.node[n]['lang'])
for n in g['ptwobrussell(user)']
if g['ptwobrussell(user)'][n]['type'] == 'gazes' ]
print
# Programming languages for each user
print "Programming languages ptwobrussell is interested in"
print list(set([g.node[n]['lang']
for n in g['ptwobrussell(user)']
if g['ptwobrussell(user)'][n]['type'] == 'gazes']))
print
# Find supernodes in the graph by approximating with a high number of outgoing edges
print "Supernode candidates"
print sorted([(n, len(g.out_edges(n)))
for n in g.nodes_iter()
if g.node[n]['type'] == 'user' and len(g.out_edges(n)) > 500], \
key=itemgetter(1), reverse=True)
# Updating the graph to include nodes for Programming languages
repos = [n
for n in g.nodes_iter()
if g.node[n]['type'] == 'repo']
for repo in repos:
lang = (g.node[repo]['lang'] or "") + "(repo)"
stargazers = [u
for (u, r, d) in g.in_edges_iter(repo, data=True)
if d['type'] == 'gazes'
]
for sg in stargazers:
g.add_node(lang, type='lang')
g.add_edge(sg, lang, type='programs')
g.add_edge(lang, repo, type='implements')
# Quering the final graph for answering questions related to languages
print nx.info(g)
print
print [n
for n in g['ptwobrussell(user)']
if g['ptwobrussell(user)'][n]['type'] == 'programs']
# What is the most popular Programming language?
print "Most popular languages"
print sorted([(n, g.in_degree(n))
for n in g.nodes_iter()
if g.node[n]['type'] == 'lang'], key=itemgetter(1), reverse=True)[:10]
print
# How many users program in a particular language?
python_programmers = [u
for (u, l) in g.in_edges_iter('Python(lang)')
if g.node[u]['type'] == 'user']
print "Number of Python programmers:", len(python_programmers)
print
javascript_programmers = [u
for (u, l) in g.in_edges_iter('Javascript(lang)')
if g.node[u]['type'] == 'user']
print "Number of Javascript programmers:", len(python_programmers)
print
# What users program in both Python and Javascript?
print "Number of programmers who use Javascript and Python"
print len(set(python_programmers).intersection(set(javascript_programmers)))
# Programmers who use Javascript but not Python
print "Number of programmers who use Javascript but not Python"
print len(set(python_programmers).difference(set(javascript_programmers)))
# Determine who is the most polygot programmer?
# Graph visualization of the social network for the original interest graph
print "Stats on the full graph"
print nx.info(g)
print
# Create a subgraph from a collection of nodes. In this case, the collection is all of the users in the original interest graph
mtsw_users = [n for n in g if g.node[n]['type'] == 'user']
h = g.subgraph(mtsw_users)
print "Stats on the extracted subgraph"
print nx.info(h)
# Visualize the social network of all people from the original interest graph
d = json_graph.node_link_data(h)
json.dump(d, open('resources/force.json', 'w'))
# IPython Notebook can serve files and display them into inline frames. Prepend the path with the 'files' prefix
# A D3 template for displaying the graph data
viz_file = 'files/resources/force.html'
# Display the D3 visualization
display(IFrame(viz_file, '100%', '600px'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment