Skip to content

Instantly share code, notes, and snippets.

@kedarbellare
Created July 18, 2012 03:11
Show Gist options
  • Save kedarbellare/3133918 to your computer and use it in GitHub Desktop.
Save kedarbellare/3133918 to your computer and use it in GitHub Desktop.
Finding the max and average centrality in a graph
#
# Write centrality_max to return the maximum distance
# from a node to all the other nodes it can reach
#
from collections import deque
def centrality_max(G, v):
open_list = deque([v])
distance = {v: 0}
while open_list:
head = open_list.popleft()
for nbr in G[head]:
if nbr not in distance:
open_list.append(nbr)
distance[nbr] = distance[head] + 1
return distance[head]
#def centrality_max(G, v):
# # your code here
# open_list = [v]
# distance = {v: 0}
# best_distance = 0
# head, length = 0, 1
# while head < length:
# n = open_list[head]
# head += 1
# for nbr in G[n]:
# if nbr not in distance:
# open_list.append(nbr)
# length += 1
# best_distance = distance[n] + 1
# distance[nbr] = best_distance
# return best_distance
def centrality_max2(G, v):
toCrawl = []
nextDepth = [v]
marked = {v: True}
depth = -1
while nextDepth:
toCrawl = [] + nextDepth
nextDepth = []
depth += 1
while toCrawl:
nodeID = toCrawl.pop()
for e in G[nodeID]:
if e not in marked:
nextDepth.append(e)
marked[e] = True
return depth
def centrality_max_my(G, v):
distance = {v:0}
open_list = deque([v])
while open_list:
item = open_list.popleft()
for neighbor in G[item]:
if neighbor not in distance:
distance[neighbor] = distance[item] + 1
open_list.append(neighbor)
return max(distance.values())
def centrality_max_mine(G, v):
toCrawl = []
nextDepth = [v]
marked = {} #keep track of marked nodes so we don't go over something twice
depth = -1
while nextDepth:
toCrawl, nextDepth = nextDepth,toCrawl
gotMarked = False
while toCrawl:
nodeID = toCrawl.pop()
if nodeID not in marked:
gotMarked = True
marked[nodeID] = True #just assigning it something
for e in G[nodeID]:
if e not in marked:
nextDepth.append(e)
if gotMarked:
depth += 1
return depth
def centrality_max5(G, v):
frontier, distance_map = deque([v]), {v: 0}
while frontier:
current_node = frontier.popleft()
for successor in G[current_node]:
if successor not in distance_map:
distance_map[successor] = distance_map[current_node] + 1
frontier.append(successor)
return distance_map[current_node]
def average_centrality(G, v):
frontier, distance = deque([v]), {v: 0}
while frontier:
head = frontier.popleft()
for nbr in G[head]:
if nbr not in distance:
distance[nbr] = distance[head] + 1
frontier.append(nbr)
return (1.0*sum(distance.values()))/len(distance)
#################
# Testing code
#
def make_link(G, node1, node2):
if node1 not in G:
G[node1] = {}
(G[node1])[node2] = 1
if node2 not in G:
G[node2] = {}
(G[node2])[node1] = 1
return G
def test():
chain = ((1,2), (2,3), (3,4), (4,5), (5,6))
G = {}
for n1, n2 in chain:
make_link(G, n1, n2)
assert centrality_max(G, 1) == 5
assert centrality_max(G, 3) == 3
tree = ((1, 2), (1, 3),
(2, 4), (2, 5),
(3, 6), (3, 7),
(4, 8), (4, 9),
(6, 10), (6, 11))
G = {}
for n1, n2 in tree:
make_link(G, n1, n2)
assert centrality_max(G, 1) == 3
assert centrality_max(G, 11) == 6
G = {}
tree = ((1, 2), (1, 3), (2, 3))
for n1, n2 in tree:
make_link(G, n1, n2)
assert centrality_max(G, 1) == 1
G = {}
euler = [(0, 1), (1, 2), (2, 0), (2, 3), (3, 4), (4, 2), (4, 5), (5, 6), (6, 4), (6, 7), (7, 8), (8, 6), (8, 9), (9, 10), (10, 8), (10, 11), (11, 12), (12, 10), (12, 13), (13, 14), (14, 15), (15, 16), (16, 14), (16, 17), (17, 18), (18, 16), (18, 19), (19, 20), (20, 18), (20, 21), (21, 22), (22, 20), (22, 23), (23, 24), (24, 22), (24, 25), (25, 26), (26, 24), (26, 27), (27, 28), (28, 26), (28, 29), (29, 30), (30, 28), (30, 31), (31, 32), (32, 30), (32, 33), (33, 34), (34, 32), (34, 35), (35, 36), (36, 34), (36, 37), (37, 38), (38, 36), (38, 39), (39, 40), (40, 38), (40, 41), (41, 42), (42, 40), (42, 43), (43, 44), (44, 42), (44, 45), (45, 46), (46, 44), (46, 47), (47, 48), (48, 46), (48, 49), (49, 50), (50, 48), (50, 51), (51, 52), (52, 50), (52, 53), (53, 54), (54, 52), (54, 55), (55, 56), (56, 54), (56, 57), (57, 58), (58, 56), (58, 59), (59, 60), (60, 58), (60, 61), (61, 62), (62, 60), (62, 63), (63, 64), (64, 62), (64, 65), (65, 66), (66, 64), (66, 67), (67, 68), (68, 66), (68, 69), (69, 70), (70, 68), (70, 71), (71, 72), (72, 70), (72, 73), (73, 74), (74, 72), (74, 75), (75, 76), (76, 74), (76, 77), (77, 78), (78, 76), (78, 79), (79, 80), (80, 78), (80, 81), (81, 82), (82, 80), (82, 83), (83, 84), (84, 82), (84, 85), (85, 86), (86, 84), (86, 87), (87, 88), (88, 86), (88, 89), (89, 90), (90, 88), (90, 91), (91, 92), (92, 90), (92, 93), (93, 94), (94, 92), (94, 95), (95, 96), (96, 94), (96, 97), (97, 98), (98, 96), (98, 99), (99, 100), (100, 98)]
for n1, n2 in euler:
make_link(G, n1, n2)
assert centrality_max(G, 0) == 51
return 'all tests pass'
import csv, time
def test_enron():
# helper function for getting time in seconds
def now(): return time.time()*1.0
G = {}
file = csv.reader(open('email-Enron.txt', 'rb'), delimiter='\t')
for row in file:
make_link(G, row[0], row[1])
n = 15
a = now()
max([centrality_max5(G, x) for x in G.keys()[:n]])
a -= now()
b = now()
max([centrality_max2(G, x) for x in G.keys()[:n]])
b -= now()
c = now()
max([centrality_max_mine(G, x) for x in G.keys()[:n]])
c -= now()
d = now()
max([centrality_max_my(G, x) for x in G.keys()[:n]])
d -= now()
e = now()
max([centrality_max(G, x) for x in G.keys()[:n]])
e -= now()
print "%.3f,%.3f,%.3f,%.3f,%.3f" % (-a, -b, -c, -d, -e)
#print test()
#for i in xrange(100):
# test_enron()
def parent(i): return (i-1)/2
def left_child(i): return 2*i+1
def right_child(i): return 2*i+2
def is_leaf(L,i):
return (left_child(i) >= len(L)) and (right_child(i) >= len(L))
def one_child(L,i):
return (left_child(i) < len(L)) and (right_child(i) >= len(L))
def up_heapify(L, i):
if i == 0: return
pi = parent(i)
if L[pi] < L[i]:
L[i], L[pi] = L[pi], L[i]
up_heapify(L, pi)
def down_heapify(L, i):
if is_leaf(L, i):
return
elif one_child(L, i):
# check heap property
li = left_child(i)
if L[i] < L[li]:
L[li], L[i] = L[i], L[li]
else:
li, ri = left_child(i), right_child(i)
swapi = li if L[li] > L[ri] else ri
if L[swapi] > L[i]:
L[swapi], L[i] = L[i], L[swapi]
down_heapify(L, swapi)
def insert_heap(L, v, Kmax):
if len(L) >= Kmax and v >= L[0]: return
if len(L) < Kmax:
L.append(v)
up_heapify(L, len(L)-1)
else:
L[0] = v
down_heapify(L, 0)
def extract_max(L):
L[0], L[len(L)-1] = L[len(L)-1], L[0]
current = L.pop()
down_heapify(L, 0)
return current
def imdb_central(fname, Kmax=20):
G = {}
actors = {}
movies = {}
indices = {}
file = csv.reader(open(fname, 'rb'), delimiter='\t')
index = 0
for row in file:
actor = row[0]
movie = '%s (%s)' % (row[1], row[2])
if actor not in actors:
index += 1
indices[index] = actor
actors[actor] = index
if movie not in movies:
index += 1
indices[index] = movie
movies[movie] = index
make_link(G, actors[actor], movies[movie])
heap = []
done = 0
for actor in actors:
centrality = average_centrality(G, actors[actor])
current = (centrality, actor)
insert_heap(heap, current, Kmax)
done += 1
if done % 200 == 0:
print 'finished %d / %d' % (done, len(actors))
print 'final', min(heap)
while heap:
rank = len(heap)
current = extract_max(heap)
print 'rank %d: %s centrality=%.3f' % (rank, current[1], current[0])
imdb_central('imdb-4.tsv')
#print test()
#for i in xrange(100):
# test_enron()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment