Created
October 18, 2012 13:24
-
-
Save sheymann/3911792 to your computer and use it in GitHub Desktop.
Internal Link Detection on Bipartite Graph
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
#!/usr/bin/env python | |
""" | |
========================================== | |
Internal Link Detection on Bipartite Graph | |
========================================== | |
# Copyright (C) 2012 by | |
# Sébastien Heymann <sebastien.heymann@lip6.fr> | |
# All rights reserved. | |
# BSD license. | |
Detect internal links in a bipartite graph, using the top set of nodes. | |
An internal link is a link in the bipartite graph which does not change | |
the projection of the graph onto the top set of nodes if it is removed. | |
References | |
---------- | |
O. Allali, L. Tabourier, C. Magnien, M. Latapy, | |
"Internal links and pairs as a new tool for the analysis of bipartite complex networks", | |
To appear in Social Network Analysis and Mining, 2012. | |
http://www.complexnetworks.fr/wp-content/uploads/2012/01/Allali_internal_links.pdf | |
Input | |
----- | |
List of links (may be compressed in GZIP). | |
One link (top node, bottom node) per line with extremities separated by a space. | |
Each node should have a unique string identifier and may be delimited by double quotes. | |
The file should NOT contain comment lines (e.g. starting with '#'). | |
Example: | |
u0 v0 | |
u0 v2 | |
u0 v3 | |
u1 v1 | |
u1 v3 | |
u2 v2 | |
u2 v1 | |
u2 v4 | |
u2 v0 | |
u2 v3 | |
u3 v2 | |
u3 v4 | |
Output | |
------ | |
List of internal links. | |
Requires | |
-------- | |
Python 2.7 | |
Networkx 1.7+. | |
Usage | |
------- | |
python2.7 get_internal_links.py my_edgelist.gz internal_links.edgelist | |
""" | |
def read_edge_list(filepath): | |
"""Read an edgelist file in plain text of GZIP compressed format.""" | |
try: | |
import csv | |
import gzip | |
except ImportError: | |
raise ImportError('get_internal_links() requires csv and gzip') | |
try: | |
f = gzip.GzipFile(filepath, 'rb').read(1) #raise exception if not a GZ | |
f = gzip.open(filepath, 'rb') | |
except IOError: | |
f = open(filepath, 'r') | |
reader = csv.reader(f, delimiter=' ', quotechar='"') | |
edgelist = [(row[0], row[1]) for row in reader if row] # (top node, bottom node) | |
return edgelist | |
def get_internal_links(B, nodes): | |
"""Return the set of internal links of the bipartite graph B, | |
projected onto one of its node sets. | |
Parameters | |
---------- | |
B : NetworkX graph | |
nodes : | |
Returns | |
------- | |
I : set | |
A set of tuples (u,v) which are the internal links detected. | |
Raises | |
------ | |
AssertionError if the graph is not bipartite. | |
Examples | |
-------- | |
B = nx.Graph() | |
B.add_nodes_from([1,2,3,4], bipartite=0) | |
B.add_nodes_from(['a','b','c'], bipartite=1) | |
B.add_edges_from([(1,'a'), (1,'b'), (2,'b'), (2,'c'), (3,'c'), (4,'a')]) | |
bottomNodes, topNodes = bipartite.sets(B) | |
print(get_internal_links(B, topNodes)) | |
""" | |
try: | |
from collections import Counter | |
except ImportError: | |
raise ImportError('get_internal_links() requires collections.Counter ') | |
try: | |
import itertools | |
except ImportError: | |
raise ImportError('get_internal_links() requires itertools ') | |
try: | |
import networkx as nx | |
from networkx.algorithms import bipartite | |
except ImportError: | |
raise ImportError('get_internal_links() requires networkx ') | |
assert bipartite.is_bipartite(B) | |
I = set() | |
for u in nodes: | |
m = set(B[u]) | |
cnt = Counter() | |
Nv = (w for v in m for w in B[v]) # iterator on w in B[v], for each v in m | |
for w in Nv: | |
cnt[w] += 1 | |
cnt[u] = 0 | |
r = set(itertools.chain(*[B[w] for w, times in cnt.iteritems() if times == 1])) | |
m -= r | |
I.update((u,v) for v in m) | |
return I | |
# NAIVE IMPLEMENTATION: | |
# I = set() | |
# for u in nodes: | |
# d = {} | |
# m = B[u] | |
# for v in m: | |
# Nv = B[v] | |
# for w in Nv: | |
# d[w] = d[w] + 1 if w in d else 1 | |
# d[u] = 0 | |
# r = itertools.chain(*[B[w] for w, times in d.iteritems() if times == 1]) | |
# Iu = set((u,v) for v in m if v not in r) | |
# I |= Iu | |
if __name__ == "__main__": | |
import argparse | |
import networkx as nx | |
#read command line arguments | |
parser = argparse.ArgumentParser() | |
parser.add_argument("inputFile", help="Path to the input file.") | |
parser.add_argument("outputFile", help="Path to the output file.") | |
parser.add_argument("-r", "--reverse", | |
help="Use the bottom set of nodes to detect the internal links.", | |
action="store_true") | |
parser.add_argument("-q", "--quiet", | |
help="No information on the standard output but critical errors.", | |
action="store_true") | |
args = parser.parse_args() | |
#load the bipartite graph | |
if not args.quiet: | |
print 'load graph' | |
G = nx.Graph() | |
links = read_edge_list(args.inputFile) | |
for (u,v) in links: | |
G.add_node(u, bipartite=0) #top node | |
G.add_node(v, bipartite=1) #bottom node | |
G.add_edge(u, v) | |
topNodes = set(n for n,d in G.nodes(data=True) if d['bipartite']==0) | |
bottomNodes = set(G) - topNodes | |
if not args.quiet: | |
print "nb top nodes: ", len(topNodes) | |
print "nb bottom nodes: ", len(bottomNodes) | |
print "nb links: ", G.number_of_edges() | |
print 'detect internal links' | |
#detect internal links | |
nodes = bottomNodes if args.reverse else topNodes | |
I = get_internal_links(G, nodes) | |
if not args.quiet: | |
print 'nb internal links: ', len(I) | |
print 'write output file' | |
#write output file | |
f = open(args.outputFile, 'w') | |
for (u, v) in I: | |
#assert G.has_edge(u,v), 'link not in graph' | |
f.write(str(u)+' '+str(v)+'\n') # top bottom | |
f.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment