kedarbellare/centrality.py

## centrality.py
#
# Write centrality_max to return the maximum distance
# from a node to all the other nodes it can reach
#

from collections import deque

def centrality_max(G, v):
    open_list = deque([v])
    distance = {v: 0}
    while open_list:
        head = open_list.popleft()
        for nbr in G[head]:
            if nbr not in distance:
                open_list.append(nbr)
                distance[nbr] = distance[head] + 1
    return distance[head]

#def centrality_max(G, v):
#    # your code here
#    open_list = [v]
#    distance = {v: 0}
#    best_distance = 0
#    head, length = 0, 1
#    while head < length:
#        n = open_list[head]
#        head += 1
#        for nbr in G[n]:
#            if nbr not in distance:
#                open_list.append(nbr)
#                length += 1
#                best_distance = distance[n] + 1
#                distance[nbr] = best_distance
#    return best_distance

def centrality_max2(G, v):
    toCrawl = []
    nextDepth = [v]
    marked = {v: True}
    depth = -1
    while nextDepth:
        toCrawl = [] + nextDepth
        nextDepth = []
        depth += 1
        while toCrawl:
            nodeID = toCrawl.pop()
            for e in G[nodeID]:
                if e not in marked:
                    nextDepth.append(e)
                    marked[e] = True
    return depth

def centrality_max_my(G, v):
    distance = {v:0}
    open_list = deque([v])
    while open_list:
        item = open_list.popleft()
        for neighbor in G[item]:
            if neighbor not in distance:
                distance[neighbor] = distance[item] + 1
                open_list.append(neighbor)
    return max(distance.values())

def centrality_max_mine(G, v):
    toCrawl = []
    nextDepth = [v]
    marked = {} #keep track of marked nodes so we don't go over something twice
    depth = -1
    while nextDepth:
        toCrawl, nextDepth = nextDepth,toCrawl
        gotMarked = False
        while toCrawl:
            nodeID = toCrawl.pop()
            if nodeID not in marked:
                gotMarked = True
                marked[nodeID] = True #just assigning it something
                for e in G[nodeID]:
                    if e not in marked:
                        nextDepth.append(e)
        if gotMarked:
            depth += 1
    return depth

def centrality_max5(G, v):
    frontier, distance_map = deque([v]), {v: 0}
    while frontier:
        current_node = frontier.popleft()
        for successor in G[current_node]:
            if successor not in distance_map:
                distance_map[successor] = distance_map[current_node] + 1
                frontier.append(successor)
    return distance_map[current_node]

def average_centrality(G, v):
    frontier, distance = deque([v]), {v: 0}
    while frontier:
        head = frontier.popleft()
        for nbr in G[head]:
            if nbr not in distance:
                distance[nbr] = distance[head] + 1
                frontier.append(nbr)
    return (1.0*sum(distance.values()))/len(distance)

#################
# Testing code
#
def make_link(G, node1, node2):
    if node1 not in G:
        G[node1] = {}
    (G[node1])[node2] = 1
    if node2 not in G:
        G[node2] = {}
    (G[node2])[node1] = 1
    return G

def test():
    chain = ((1,2), (2,3), (3,4), (4,5), (5,6))
    G = {}
    for n1, n2 in chain:
        make_link(G, n1, n2)
    assert centrality_max(G, 1) == 5
    assert centrality_max(G, 3) == 3
    tree = ((1, 2), (1, 3),
            (2, 4), (2, 5),
            (3, 6), (3, 7),
            (4, 8), (4, 9),
            (6, 10), (6, 11))
    G = {}
    for n1, n2 in tree:
        make_link(G, n1, n2)
    assert centrality_max(G, 1) == 3
    assert centrality_max(G, 11) == 6
    G = {}
    tree = ((1, 2), (1, 3), (2, 3))
    for n1, n2 in tree:
        make_link(G, n1, n2)
    assert centrality_max(G, 1) == 1
    G = {}
    euler = [(0, 1), (1, 2), (2, 0), (2, 3), (3, 4), (4, 2), (4, 5), (5, 6), (6, 4), (6, 7), (7, 8), (8, 6), (8, 9), (9, 10), (10, 8), (10, 11), (11, 12), (12, 10), (12, 13), (13, 14), (14, 15), (15, 16), (16, 14), (16, 17), (17, 18), (18, 16), (18, 19), (19, 20), (20, 18), (20, 21), (21, 22), (22, 20), (22, 23), (23, 24), (24, 22), (24, 25), (25, 26), (26, 24), (26, 27), (27, 28), (28, 26), (28, 29), (29, 30), (30, 28), (30, 31), (31, 32), (32, 30), (32, 33), (33, 34), (34, 32), (34, 35), (35, 36), (36, 34), (36, 37), (37, 38), (38, 36), (38, 39), (39, 40), (40, 38), (40, 41), (41, 42), (42, 40), (42, 43), (43, 44), (44, 42), (44, 45), (45, 46), (46, 44), (46, 47), (47, 48), (48, 46), (48, 49), (49, 50), (50, 48), (50, 51), (51, 52), (52, 50), (52, 53), (53, 54), (54, 52), (54, 55), (55, 56), (56, 54), (56, 57), (57, 58), (58, 56), (58, 59), (59, 60), (60, 58), (60, 61), (61, 62), (62, 60), (62, 63), (63, 64), (64, 62), (64, 65), (65, 66), (66, 64), (66, 67), (67, 68), (68, 66), (68, 69), (69, 70), (70, 68), (70, 71), (71, 72), (72, 70), (72, 73), (73, 74), (74, 72), (74, 75), (75, 76), (76, 74), (76, 77), (77, 78), (78, 76), (78, 79), (79, 80), (80, 78), (80, 81), (81, 82), (82, 80), (82, 83), (83, 84), (84, 82), (84, 85), (85, 86), (86, 84), (86, 87), (87, 88), (88, 86), (88, 89), (89, 90), (90, 88), (90, 91), (91, 92), (92, 90), (92, 93), (93, 94), (94, 92), (94, 95), (95, 96), (96, 94), (96, 97), (97, 98), (98, 96), (98, 99), (99, 100), (100, 98)]
    for n1, n2 in euler:
        make_link(G, n1, n2)
    assert centrality_max(G, 0) == 51
    return 'all tests pass'

import csv, time
def test_enron():
    # helper function for getting time in seconds
    def now(): return time.time()*1.0

    G = {}
    file = csv.reader(open('email-Enron.txt', 'rb'), delimiter='\t')
    for row in file:
        make_link(G, row[0], row[1])
    n = 15
    a = now()
    max([centrality_max5(G, x) for x in G.keys()[:n]])
    a -= now()
    b = now()
    max([centrality_max2(G, x) for x in G.keys()[:n]])
    b -= now()
    c = now()
    max([centrality_max_mine(G, x) for x in G.keys()[:n]])
    c -= now()
    d = now()
    max([centrality_max_my(G, x) for x in G.keys()[:n]])
    d -= now()
    e = now()
    max([centrality_max(G, x) for x in G.keys()[:n]])
    e -= now()
    print "%.3f,%.3f,%.3f,%.3f,%.3f" % (-a, -b, -c, -d, -e)

#print test()
#for i in xrange(100):
#    test_enron()

def parent(i): return (i-1)/2
def left_child(i): return 2*i+1
def right_child(i): return 2*i+2
def is_leaf(L,i):
    return (left_child(i) >= len(L)) and (right_child(i) >= len(L))
def one_child(L,i):
    return (left_child(i) < len(L)) and (right_child(i) >= len(L))

def up_heapify(L, i):
    if i == 0: return
    pi = parent(i)
    if L[pi] < L[i]:
        L[i], L[pi] = L[pi], L[i]
        up_heapify(L, pi)

def down_heapify(L, i):
    if is_leaf(L, i):
        return
    elif one_child(L, i):
        # check heap property
        li = left_child(i)
        if L[i] < L[li]:
            L[li], L[i] = L[i], L[li]
    else:
        li, ri = left_child(i), right_child(i)
        swapi = li if L[li] > L[ri] else ri
        if L[swapi] > L[i]:
            L[swapi], L[i] = L[i], L[swapi]
            down_heapify(L, swapi)

def insert_heap(L, v, Kmax):
    if len(L) >= Kmax and v >= L[0]: return
    if len(L) < Kmax:
        L.append(v)
        up_heapify(L, len(L)-1)
    else:
        L[0] = v
        down_heapify(L, 0)

def extract_max(L):
    L[0], L[len(L)-1] = L[len(L)-1], L[0]
    current = L.pop()
    down_heapify(L, 0)
    return current

def imdb_central(fname, Kmax=20):
    G = {}
    actors = {}
    movies = {}
    indices = {}
    file = csv.reader(open(fname, 'rb'), delimiter='\t')
    index = 0
    for row in file:
        actor = row[0]
        movie = '%s (%s)' % (row[1], row[2])
        if actor not in actors:
            index += 1
            indices[index] = actor
            actors[actor] = index
        if movie not in movies:
            index += 1
            indices[index] = movie
            movies[movie] = index
        make_link(G, actors[actor], movies[movie])
    heap = []
    done = 0
    for actor in actors:
        centrality = average_centrality(G, actors[actor])
        current = (centrality, actor)
        insert_heap(heap, current, Kmax)
        done += 1
        if done % 200 == 0:
            print 'finished %d / %d' % (done, len(actors))
    print 'final', min(heap)
    while heap:
        rank = len(heap)
        current = extract_max(heap)
        print 'rank %d: %s centrality=%.3f' % (rank, current[1], current[0])

imdb_central('imdb-4.tsv')
#print test()
#for i in xrange(100):
#    test_enron()
	#
	# Write centrality_max to return the maximum distance
	# from a node to all the other nodes it can reach
	#

	from collections import deque

	def centrality_max(G, v):
	open_list = deque([v])
	distance = {v: 0}
	while open_list:
	head = open_list.popleft()
	for nbr in G[head]:
	if nbr not in distance:
	open_list.append(nbr)
	distance[nbr] = distance[head] + 1
	return distance[head]

	#def centrality_max(G, v):
	# # your code here
	# open_list = [v]
	# distance = {v: 0}
	# best_distance = 0
	# head, length = 0, 1
	# while head < length:
	# n = open_list[head]
	# head += 1
	# for nbr in G[n]:
	# if nbr not in distance:
	# open_list.append(nbr)
	# length += 1
	# best_distance = distance[n] + 1
	# distance[nbr] = best_distance
	# return best_distance

	def centrality_max2(G, v):
	toCrawl = []
	nextDepth = [v]
	marked = {v: True}
	depth = -1
	while nextDepth:
	toCrawl = [] + nextDepth
	nextDepth = []
	depth += 1
	while toCrawl:
	nodeID = toCrawl.pop()
	for e in G[nodeID]:
	if e not in marked:
	nextDepth.append(e)
	marked[e] = True
	return depth

	def centrality_max_my(G, v):
	distance = {v:0}
	open_list = deque([v])
	while open_list:
	item = open_list.popleft()
	for neighbor in G[item]:
	if neighbor not in distance:
	distance[neighbor] = distance[item] + 1
	open_list.append(neighbor)
	return max(distance.values())

	def centrality_max_mine(G, v):
	toCrawl = []
	nextDepth = [v]
	marked = {} #keep track of marked nodes so we don't go over something twice
	depth = -1
	while nextDepth:
	toCrawl, nextDepth = nextDepth,toCrawl
	gotMarked = False
	while toCrawl:
	nodeID = toCrawl.pop()
	if nodeID not in marked:
	gotMarked = True
	marked[nodeID] = True #just assigning it something
	for e in G[nodeID]:
	if e not in marked:
	nextDepth.append(e)
	if gotMarked:
	depth += 1
	return depth

	def centrality_max5(G, v):
	frontier, distance_map = deque([v]), {v: 0}
	while frontier:
	current_node = frontier.popleft()
	for successor in G[current_node]:
	if successor not in distance_map:
	distance_map[successor] = distance_map[current_node] + 1
	frontier.append(successor)
	return distance_map[current_node]

	def average_centrality(G, v):
	frontier, distance = deque([v]), {v: 0}
	while frontier:
	head = frontier.popleft()
	for nbr in G[head]:
	if nbr not in distance:
	distance[nbr] = distance[head] + 1
	frontier.append(nbr)
	return (1.0*sum(distance.values()))/len(distance)

	#################
	# Testing code
	#
	def make_link(G, node1, node2):
	if node1 not in G:
	G[node1] = {}
	(G[node1])[node2] = 1
	if node2 not in G:
	G[node2] = {}
	(G[node2])[node1] = 1
	return G

	def test():
	chain = ((1,2), (2,3), (3,4), (4,5), (5,6))
	G = {}
	for n1, n2 in chain:
	make_link(G, n1, n2)
	assert centrality_max(G, 1) == 5
	assert centrality_max(G, 3) == 3
	tree = ((1, 2), (1, 3),
	(2, 4), (2, 5),
	(3, 6), (3, 7),
	(4, 8), (4, 9),
	(6, 10), (6, 11))
	G = {}
	for n1, n2 in tree:
	make_link(G, n1, n2)
	assert centrality_max(G, 1) == 3
	assert centrality_max(G, 11) == 6
	G = {}
	tree = ((1, 2), (1, 3), (2, 3))
	for n1, n2 in tree:
	make_link(G, n1, n2)
	assert centrality_max(G, 1) == 1
	G = {}
	euler = [(0, 1), (1, 2), (2, 0), (2, 3), (3, 4), (4, 2), (4, 5), (5, 6), (6, 4), (6, 7), (7, 8), (8, 6), (8, 9), (9, 10), (10, 8), (10, 11), (11, 12), (12, 10), (12, 13), (13, 14), (14, 15), (15, 16), (16, 14), (16, 17), (17, 18), (18, 16), (18, 19), (19, 20), (20, 18), (20, 21), (21, 22), (22, 20), (22, 23), (23, 24), (24, 22), (24, 25), (25, 26), (26, 24), (26, 27), (27, 28), (28, 26), (28, 29), (29, 30), (30, 28), (30, 31), (31, 32), (32, 30), (32, 33), (33, 34), (34, 32), (34, 35), (35, 36), (36, 34), (36, 37), (37, 38), (38, 36), (38, 39), (39, 40), (40, 38), (40, 41), (41, 42), (42, 40), (42, 43), (43, 44), (44, 42), (44, 45), (45, 46), (46, 44), (46, 47), (47, 48), (48, 46), (48, 49), (49, 50), (50, 48), (50, 51), (51, 52), (52, 50), (52, 53), (53, 54), (54, 52), (54, 55), (55, 56), (56, 54), (56, 57), (57, 58), (58, 56), (58, 59), (59, 60), (60, 58), (60, 61), (61, 62), (62, 60), (62, 63), (63, 64), (64, 62), (64, 65), (65, 66), (66, 64), (66, 67), (67, 68), (68, 66), (68, 69), (69, 70), (70, 68), (70, 71), (71, 72), (72, 70), (72, 73), (73, 74), (74, 72), (74, 75), (75, 76), (76, 74), (76, 77), (77, 78), (78, 76), (78, 79), (79, 80), (80, 78), (80, 81), (81, 82), (82, 80), (82, 83), (83, 84), (84, 82), (84, 85), (85, 86), (86, 84), (86, 87), (87, 88), (88, 86), (88, 89), (89, 90), (90, 88), (90, 91), (91, 92), (92, 90), (92, 93), (93, 94), (94, 92), (94, 95), (95, 96), (96, 94), (96, 97), (97, 98), (98, 96), (98, 99), (99, 100), (100, 98)]
	for n1, n2 in euler:
	make_link(G, n1, n2)
	assert centrality_max(G, 0) == 51
	return 'all tests pass'

	import csv, time
	def test_enron():
	# helper function for getting time in seconds
	def now(): return time.time()*1.0

	G = {}
	file = csv.reader(open('email-Enron.txt', 'rb'), delimiter='\t')
	for row in file:
	make_link(G, row[0], row[1])
	n = 15
	a = now()
	max([centrality_max5(G, x) for x in G.keys()[:n]])
	a -= now()
	b = now()
	max([centrality_max2(G, x) for x in G.keys()[:n]])
	b -= now()
	c = now()
	max([centrality_max_mine(G, x) for x in G.keys()[:n]])
	c -= now()
	d = now()
	max([centrality_max_my(G, x) for x in G.keys()[:n]])
	d -= now()
	e = now()
	max([centrality_max(G, x) for x in G.keys()[:n]])
	e -= now()
	print "%.3f,%.3f,%.3f,%.3f,%.3f" % (-a, -b, -c, -d, -e)

	#print test()
	#for i in xrange(100):
	# test_enron()

	def parent(i): return (i-1)/2
	def left_child(i): return 2*i+1
	def right_child(i): return 2*i+2
	def is_leaf(L,i):
	return (left_child(i) >= len(L)) and (right_child(i) >= len(L))
	def one_child(L,i):
	return (left_child(i) < len(L)) and (right_child(i) >= len(L))

	def up_heapify(L, i):
	if i == 0: return
	pi = parent(i)
	if L[pi] < L[i]:
	L[i], L[pi] = L[pi], L[i]
	up_heapify(L, pi)

	def down_heapify(L, i):
	if is_leaf(L, i):
	return
	elif one_child(L, i):
	# check heap property
	li = left_child(i)
	if L[i] < L[li]:
	L[li], L[i] = L[i], L[li]
	else:
	li, ri = left_child(i), right_child(i)
	swapi = li if L[li] > L[ri] else ri
	if L[swapi] > L[i]:
	L[swapi], L[i] = L[i], L[swapi]
	down_heapify(L, swapi)

	def insert_heap(L, v, Kmax):
	if len(L) >= Kmax and v >= L[0]: return
	if len(L) < Kmax:
	L.append(v)
	up_heapify(L, len(L)-1)
	else:
	L[0] = v
	down_heapify(L, 0)

	def extract_max(L):
	L[0], L[len(L)-1] = L[len(L)-1], L[0]
	current = L.pop()
	down_heapify(L, 0)
	return current

	def imdb_central(fname, Kmax=20):
	G = {}
	actors = {}
	movies = {}
	indices = {}
	file = csv.reader(open(fname, 'rb'), delimiter='\t')
	index = 0
	for row in file:
	actor = row[0]
	movie = '%s (%s)' % (row[1], row[2])
	if actor not in actors:
	index += 1
	indices[index] = actor
	actors[actor] = index
	if movie not in movies:
	index += 1
	indices[index] = movie
	movies[movie] = index
	make_link(G, actors[actor], movies[movie])
	heap = []
	done = 0
	for actor in actors:
	centrality = average_centrality(G, actors[actor])
	current = (centrality, actor)
	insert_heap(heap, current, Kmax)
	done += 1
	if done % 200 == 0:
	print 'finished %d / %d' % (done, len(actors))
	print 'final', min(heap)
	while heap:
	rank = len(heap)
	current = extract_max(heap)
	print 'rank %d: %s centrality=%.3f' % (rank, current[1], current[0])

	imdb_central('imdb-4.tsv')
	#print test()
	#for i in xrange(100):
	# test_enron()