# dzlab/github_repo_analyzer

## github_repo_analyzer
# Code example from the book 'Mining the Social Web'
import json
import os
import sys
from collections import Counter
from operator import itemgetter

import networkx as nx
from github import Github
from networkx.readwrite import json_graph

ACCESS_TOKEN = 'GET A Personal Access Key from through your Github account'

USER = 'ptwobrussell'
REPO = 'Mining-the-Social-Web'

client = Github(ACCESS_TOKEN, per_page=100)
user = client.get_user(USER)
repo = user.get_repo(REPO)

stargazers = [s for s in repo.get_stargazers() ]
#print "Number of stargazers", len(stargazers)

g = nx.DiGraph()
g.add_node(repo.name + '(repo)', type='repo', lang=repo.language, owner=user.login)

for sg in stargazers:
	g.add_node(sg.login + '(user)', type='user')
	g.add_edge(sg.login + '(user)', repo.name + '(repo)', type='gazes')

print nx.info(g)
print
'''
print g.node['Mining-the-Social-Web(repo)']
print g.node['ptwobrussell(user)']
print
print g['ptwobrussell(user)']['Mining-the-Social-Web(repo)']

# throwing error when no such edge can be found
print
print g['ptwobrussell(user)']
print g['Mining-the-Social-Web(repo)']
print
print g.in_edges(['ptwobrussell(user)'])
print g.out_edges(['ptwobrussell(user)'])
print
print g.in_edges(['Mining-the-Social-Web(repo)'])
print g.out_edges(['Mining-the-Social-Web(repo)'])
'''

for i, sg in enumerate(stargazers):
	#Add "follows" edges between stargazers in the graph if any relationship exists
	try:
		for follower in sg.get_followers():
			if follower.login + '(user)' in g:
				g.add_edge(follower.login + '(user)', sg.login + '(user)', type='follows')
	except Exception, e: #ssl.SSLError
		print >> sys.stderr, "Encountered an error fetching followers for", sg.login, "Skipping."
		print >> sys.stderr, e

	print "Processed", i+1, " stargazers. Num nodes/edges in graph", g.number_of_nodes(), "/", g.number_of_edges()
	print "Rate limit remaining", client.rate_limiting

# Let's see how many social edges we added since last time
print nx.info(g)
print

# The number of "follows" edges is the difference
print len([e for e in g.edges_iter(data=True) if e[2]['type'] == 'follows'])
print

# The repository owner is possibly one of the more popular users in this graph
print len([e for e in g.edges_iter(data=True) if e[2]['type'] == 'follows' and e[1]=='ptwobrussell(user)'])
print

# The number of "follows" edges is the difference
print len([e for e in g.edges_iter(data=True) if e[2]['type'] == 'follows' and e[1] == 'ptwobrussell(user)'])
print

# Let's examine the number of adjacent edges to each node
print sorted([n for n in g.degree_iter()], key=itemgetter(1), reverse=True)[:10]
print

# Consider the ratio of incoming and outgoing edges for a couple of users with high node degrees...

# A user who follows many but is not followed back by many
print len(g.out_edges('hcilab(user)'))
print len(g.in_edges('hcilab(user)'))

# A user who is followed by many but does not followed back
print len(g.out_edges('ptwobrussell(user)'))
print len(g.in_edges('ptwobrussell(user)'))

c = Counter([e[1] for e in g.edges_iter(data=True) if e[2]['type'] == 'follows'])
popular_users = [ (u, f) for (u, f) in c.most_common() if f > 1 ]
print "Number of popular users", len(popular_users)
print "Top 10 popular users:", popular_users[:10]

# save yout work by serializing out (pickling) the graph
nx.write_gpickle(g, "resources/github.github_gpickle.1")

# How to restore the graph: g = nx.read_gpickle("resources/github.github_gpickle.1")

# Create a copy of the graph so that we can experiment on it
h = g.copy()

# Remove the seed of the interest graph, which is a supernode
h.remove_node('Mining-the-Social-Web(repo)')

# Disply the centrality measures for the top 10 nodes
dc = sorted(nx.degree_centrality(h).items(), key=itemgetter(1), reverse=True)
print "Degree Centrality"
print dc[:10]
print

bc = sorted(nx.betweenness_centrality(h).items(), key=itemgetter(1), reverse=True)
print "Betweenness Centrality"
print bc[:10]
print

cc = sorted(nx.closeness_centrality(h).items(), key=itemgetter(1), reverse=True)
print "Closeness Centrality"
print cc[:10]


# Let's add each stargazer's additional starred repos and add edges to find additional interest

MAX_REPOS = 500

for i, sg in enumerate(stargazers):
	print sg.login
	try:
		for starred in sg.get_starred()[:MAX_REPOS]:
			g.add_node(starred.name + '(repo)', type='repo', lang=starred.language, owner=starred.owner.login)
			g.add_edge(sg.login + '(user)', starred.name + '(repo)', type='gazes')
	except Exception, e:
		print "Encountered an error fetching starred repos for", sg.login, "Skipping."

	print "Processed", i+1, "stargazers' starred repos"
	print "Num nodes/edges in graph", g.number_of_nodes(), "/", g.number_of_edges()
	print "Rate limit", client.rate_limiting

# Exploring the graph after updates with starred repositories

print nx.info(g)
print

# Get a list of repositories from the graph
repos = [n for n in g.nodes_iter() if g.node[n]['type'] == 'repo']

# Get most popular repositories
print "Popular repositories"
print sorted([(n,d) for (n,d) in g.in_degree_iter() if g.node[n]['type'] == 'repo'], key=itemgetter(1), reverse=True)[:10]
print

# Projects gazed at by a user
print "Repositories that ptwobrussell has bookmarked"
print [(n,g.node[n]['lang'])
	for n in g['ptwobrussell(user)']
		if g['ptwobrussell(user)'][n]['type'] == 'gazes' ]
print

# Programming languages for each user
print "Programming languages ptwobrussell is interested in"
print list(set([g.node[n]['lang']
	for n in g['ptwobrussell(user)']
		if g['ptwobrussell(user)'][n]['type'] == 'gazes']))
print

# Find supernodes in the graph by approximating with a high number of outgoing edges
print "Supernode candidates"
print sorted([(n, len(g.out_edges(n)))
	for n in g.nodes_iter()
		if g.node[n]['type'] == 'user' and len(g.out_edges(n)) > 500], \
		key=itemgetter(1), reverse=True)

# Updating the graph to include nodes for Programming languages
repos = [n
		for n in g.nodes_iter()
			if g.node[n]['type'] == 'repo']

for repo in repos:
	lang = (g.node[repo]['lang'] or "") + "(repo)"
	stargazers = [u
					for (u, r, d) in g.in_edges_iter(repo, data=True)
						if d['type'] == 'gazes'
				]
	for sg in stargazers:
		g.add_node(lang, type='lang')
		g.add_edge(sg, lang, type='programs')
		g.add_edge(lang, repo, type='implements')

# Quering the final graph for answering questions related to languages
print nx.info(g)
print

print [n
		for n in g['ptwobrussell(user)']
			if g['ptwobrussell(user)'][n]['type'] == 'programs']

# What is the most popular Programming language?
print "Most popular languages"
print sorted([(n, g.in_degree(n))
			for n in g.nodes_iter()
				if g.node[n]['type'] == 'lang'], key=itemgetter(1), reverse=True)[:10]
print

# How many users program in a particular language?
python_programmers = [u
						for (u, l) in g.in_edges_iter('Python(lang)')
							if g.node[u]['type'] == 'user']
print "Number of Python programmers:", len(python_programmers)
print

javascript_programmers = [u
						for (u, l) in g.in_edges_iter('Javascript(lang)')
							if g.node[u]['type'] == 'user']
print "Number of Javascript programmers:", len(python_programmers)
print

# What users program in both Python and Javascript?
print "Number of programmers who use Javascript and Python"
print len(set(python_programmers).intersection(set(javascript_programmers)))

# Programmers who use Javascript but not Python
print "Number of programmers who use Javascript but not Python"
print len(set(python_programmers).difference(set(javascript_programmers)))

# Determine who is the most polygot programmer?


# Graph visualization of the social network for the original interest graph
print "Stats on the full graph"
print nx.info(g)
print

# Create a subgraph from a collection of nodes. In this case, the collection is all of the users in the original interest graph
mtsw_users = [n for n in g if g.node[n]['type'] == 'user']
h = g.subgraph(mtsw_users)

print "Stats on the extracted subgraph"
print nx.info(h)

# Visualize the social network of all people from the original interest graph
d = json_graph.node_link_data(h)
json.dump(d, open('resources/force.json', 'w'))

# IPython Notebook can serve files and display them into inline frames. Prepend the path with the 'files' prefix
# A D3 template for displaying the graph data
viz_file = 'files/resources/force.html'

# Display the D3 visualization
display(IFrame(viz_file, '100%', '600px'))
	# Code example from the book 'Mining the Social Web'
	import os
	from github import Github
	import networkx as nx
	import sys
	from operator import itemgetter
	from collections import Counter
	from networkx.readwrite import json_graph

	ACCESS_TOKEN = 'GET A Personal Access Key from through your Github account'

	USER = 'ptwobrussell'
	REPO = 'Mining-the-Social-Web'

	client = Github(ACCESS_TOKEN, per_page=100)
	user = client.get_user(USER)
	repo = user.get_repo(REPO)

	stargazers = [s for s in repo.get_stargazers() ]
	#print "Number of stargazers", len(stargazers)

	g = nx.DiGraph()
	g.add_node(repo.name + '(repo)', type='repo', lang=repo.language, owner=user.login)

	for sg in stargazers:
	g.add_node(sg.login + '(user)', type='user')
	g.add_edge(sg.login + '(user)', repo.name + '(repo)', type='gazes')

	print nx.info(g)
	print
	'''
	print g.node['Mining-the-Social-Web(repo)']
	print g.node['ptwobrussell(user)']
	print
	print g['ptwobrussell(user)']['Mining-the-Social-Web(repo)']

	# throwing error when no such edge can be found
	print
	print g['ptwobrussell(user)']
	print g['Mining-the-Social-Web(repo)']
	print
	print g.in_edges(['ptwobrussell(user)'])
	print g.out_edges(['ptwobrussell(user)'])
	print
	print g.in_edges(['Mining-the-Social-Web(repo)'])
	print g.out_edges(['Mining-the-Social-Web(repo)'])
	'''

	for i, sg in enumerate(stargazers):
	#Add "follows" edges between stargazers in the graph if any relationship exists
	try:
	for follower in sg.get_followers():
	if follower.login + '(user)' in g:
	g.add_edge(follower.login + '(user)', sg.login + '(user)', type='follows')
	except Exception, e: #ssl.SSLError
	print >> sys.stderr, "Encountered an error fetching followers for", sg.login, "Skipping."
	print >> sys.stderr, e

	print "Processed", i+1, " stargazers. Num nodes/edges in graph", g.number_of_nodes(), "/", g.number_of_edges()
	print "Rate limit remaining", client.rate_limiting

	# Let's see how many social edges we added since last time
	print nx.info(g)
	print

	# The number of "follows" edges is the difference
	print len([e for e in g.edges_iter(data=True) if e[2]['type'] == 'follows'])
	print

	# The repository owner is possibly one of the more popular users in this graph
	print len([e for e in g.edges_iter(data=True) if e[2]['type'] == 'follows' and e[1]=='ptwobrussell(user)'])
	print

	# The number of "follows" edges is the difference
	print len([e for e in g.edges_iter(data=True) if e[2]['type'] == 'follows' and e[1] == 'ptwobrussell(user)'])
	print

	# Let's examine the number of adjacent edges to each node
	print sorted([n for n in g.degree_iter()], key=itemgetter(1), reverse=True)[:10]
	print

	# Consider the ratio of incoming and outgoing edges for a couple of users with high node degrees...

	# A user who follows many but is not followed back by many
	print len(g.out_edges('hcilab(user)'))
	print len(g.in_edges('hcilab(user)'))

	# A user who is followed by many but does not followed back
	print len(g.out_edges('ptwobrussell(user)'))
	print len(g.in_edges('ptwobrussell(user)'))

	c = Counter([e[1] for e in g.edges_iter(data=True) if e[2]['type'] == 'follows'])
	popular_users = [ (u, f) for (u, f) in c.most_common() if f > 1 ]
	print "Number of popular users", len(popular_users)
	print "Top 10 popular users:", popular_users[:10]

	# save yout work by serializing out (pickling) the graph
	nx.write_gpickle(g, "resources/github.github_gpickle.1")

	# How to restore the graph: g = nx.read_gpickle("resources/github.github_gpickle.1")

	# Create a copy of the graph so that we can experiment on it
	h = g.copy()

	# Remove the seed of the interest graph, which is a supernode
	h.remove_node('Mining-the-Social-Web(repo)')

	# Disply the centrality measures for the top 10 nodes
	dc = sorted(nx.degree_centrality(h).items(), key=itemgetter(1), reverse=True)
	print "Degree Centrality"
	print dc[:10]
	print

	bc = sorted(nx.betweenness_centrality(h).items(), key=itemgetter(1), reverse=True)
	print "Betweenness Centrality"
	print bc[:10]
	print

	cc = sorted(nx.closeness_centrality(h).items(), key=itemgetter(1), reverse=True)
	print "Closeness Centrality"
	print cc[:10]


	# Let's add each stargazer's additional starred repos and add edges to find additional interest

	MAX_REPOS = 500

	for i, sg in enumerate(stargazers):
	print sg.login
	try:
	for starred in sg.get_starred()[:MAX_REPOS]:
	g.add_node(starred.name + '(repo)', type='repo', lang=starred.language, owner=starred.owner.login)
	g.add_edge(sg.login + '(user)', starred.name + '(repo)', type='gazes')
	except Exception, e:
	print "Encountered an error fetching starred repos for", sg.login, "Skipping."

	print "Processed", i+1, "stargazers' starred repos"
	print "Num nodes/edges in graph", g.number_of_nodes(), "/", g.number_of_edges()
	print "Rate limit", client.rate_limiting

	# Exploring the graph after updates with starred repositories

	print nx.info(g)
	print

	# Get a list of repositories from the graph
	repos = [n for n in g.nodes_iter() if g.node[n]['type'] == 'repo']

	# Get most popular repositories
	print "Popular repositories"
	print sorted([(n,d) for (n,d) in g.in_degree_iter() if g.node[n]['type'] == 'repo'], key=itemgetter(1), reverse=True)[:10]
	print

	# Projects gazed at by a user
	print "Repositories that ptwobrussell has bookmarked"
	print [(n,g.node[n]['lang'])
	for n in g['ptwobrussell(user)']
	if g['ptwobrussell(user)'][n]['type'] == 'gazes' ]
	print

	# Programming languages for each user
	print "Programming languages ptwobrussell is interested in"
	print list(set([g.node[n]['lang']
	for n in g['ptwobrussell(user)']
	if g['ptwobrussell(user)'][n]['type'] == 'gazes']))
	print

	# Find supernodes in the graph by approximating with a high number of outgoing edges
	print "Supernode candidates"
	print sorted([(n, len(g.out_edges(n)))
	for n in g.nodes_iter()
	if g.node[n]['type'] == 'user' and len(g.out_edges(n)) > 500], \
	key=itemgetter(1), reverse=True)

	# Updating the graph to include nodes for Programming languages
	repos = [n
	for n in g.nodes_iter()
	if g.node[n]['type'] == 'repo']

	for repo in repos:
	lang = (g.node[repo]['lang'] or "") + "(repo)"
	stargazers = [u
	for (u, r, d) in g.in_edges_iter(repo, data=True)
	if d['type'] == 'gazes'
	]
	for sg in stargazers:
	g.add_node(lang, type='lang')
	g.add_edge(sg, lang, type='programs')
	g.add_edge(lang, repo, type='implements')

	# Quering the final graph for answering questions related to languages
	print nx.info(g)
	print

	print [n
	for n in g['ptwobrussell(user)']
	if g['ptwobrussell(user)'][n]['type'] == 'programs']

	# What is the most popular Programming language?
	print "Most popular languages"
	print sorted([(n, g.in_degree(n))
	for n in g.nodes_iter()
	if g.node[n]['type'] == 'lang'], key=itemgetter(1), reverse=True)[:10]
	print

	# How many users program in a particular language?
	python_programmers = [u
	for (u, l) in g.in_edges_iter('Python(lang)')
	if g.node[u]['type'] == 'user']
	print "Number of Python programmers:", len(python_programmers)
	print

	javascript_programmers = [u
	for (u, l) in g.in_edges_iter('Javascript(lang)')
	if g.node[u]['type'] == 'user']
	print "Number of Javascript programmers:", len(python_programmers)
	print

	# What users program in both Python and Javascript?
	print "Number of programmers who use Javascript and Python"
	print len(set(python_programmers).intersection(set(javascript_programmers)))

	# Programmers who use Javascript but not Python
	print "Number of programmers who use Javascript but not Python"
	print len(set(python_programmers).difference(set(javascript_programmers)))

	# Determine who is the most polygot programmer?


	# Graph visualization of the social network for the original interest graph
	print "Stats on the full graph"
	print nx.info(g)
	print

	# Create a subgraph from a collection of nodes. In this case, the collection is all of the users in the original interest graph
	mtsw_users = [n for n in g if g.node[n]['type'] == 'user']
	h = g.subgraph(mtsw_users)

	print "Stats on the extracted subgraph"
	print nx.info(h)

	# Visualize the social network of all people from the original interest graph
	d = json_graph.node_link_data(h)
	json.dump(d, open('resources/force.json', 'w'))

	# IPython Notebook can serve files and display them into inline frames. Prepend the path with the 'files' prefix
	# A D3 template for displaying the graph data
	viz_file = 'files/resources/force.html'

	# Display the D3 visualization
	display(IFrame(viz_file, '100%', '600px'))