-
-
Save conradlee/1331132 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python | |
import os | |
import sys | |
import subprocess | |
import optparse | |
import tempfile | |
# Special feature: can convert files so large that they | |
# don't fit in memory. Works for weighted/unweighted, | |
# directed/undirected edges. | |
def edgelist_to_pajek(input_filename, output_filename="", directed=False, weighted=False, buffer_size=500):
    """
    Convert a whitespace-separated edgelist file into Pajek format.

    Input filename is the name of an edgelist file with the following format

        node1ID node2ID [weight]
        node1ID node3ID [weight]
        ...
        nodeiID nodejID [weight]

    where nodeIDs are separated by whitespace and must parse as integers.

    The set of unique node ids is computed by an external
    ``awk | sort | uniq`` pipeline (and counted with ``wc -l``) rather than
    in Python; only the map from original node id to Pajek index is held
    in memory.

    Arguments:
        input_filename: path of the edgelist file to read.
        output_filename: path of the Pajek file to write. If unspecified
            (empty), output goes to stdout.
        directed: write an "*Arcs" section if True, otherwise "*Edges".
        weighted: edge weights are only read and written if this is True,
            in which case every line must have exactly three fields.
        buffer_size: buffer size handed to ``sort``, in megabytes.

    Raises:
        ValueError: if a line of the input file cannot be parsed.
        Exception: if one of the external shell commands fails
            (raised by run_command).
    """
    # Function-scope import keeps this block self-contained.
    # shlex.quote is the Python 3 name; pipes.quote is the Python 2 one.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # Sort out I/O
    if output_filename:
        output_file = open(output_filename, "w")
    else:
        output_file = sys.stdout

    try:
        node_idx_map = {}

        # Write vertices section and produce map from original nodeIDs to
        # contiguous integer ids that start from one.
        with Tempfile() as unique_nodes_file:
            # Filenames are quoted so that spaces or shell metacharacters
            # in a path cannot break (or hijack) the pipeline.
            unique_nodes_command = (
                "<%s awk '{ print $1; print $2; }' | sort -n --buffer-size=%dM | uniq>%s"
                % (quote(input_filename), buffer_size, quote(unique_nodes_file.name)))
            run_command(unique_nodes_command)

            # "wc -l" prints "<count> <filename>"; the count is the first field.
            wc_output = run_command("wc -l %s" % quote(unique_nodes_file.name))
            num_nodes = int(wc_output.split()[0])
            output_file.write("*Vertices\t%d\n" % num_nodes)

            with open(unique_nodes_file.name) as nodes_file:
                for idx, line in enumerate(nodes_file):
                    node_id = int(line.rstrip("\n"))
                    pajek_idx = idx + 1  # Pajek indexing starts with 1
                    output_file.write('\t%d "%d"\n' % (pajek_idx, node_id))
                    node_idx_map[node_id] = pajek_idx

        # Now write edges
        if directed:
            output_file.write("*Arcs\n")
        else:
            output_file.write("*Edges\n")

        with open(input_filename) as input_file:
            for i, line in enumerate(input_file):
                try:
                    if weighted:
                        n1, n2, weight = line.strip().split()
                        output_file.write("\t%d\t%d\t%0.6f\n" % (node_idx_map[int(n1)],
                                                                 node_idx_map[int(n2)],
                                                                 float(weight)))
                    else:
                        n1, n2 = map(int, line.strip().split()[:2])
                        output_file.write("\t%d\t%d\n" % (node_idx_map[n1],
                                                          node_idx_map[n2]))
                except ValueError:
                    raise ValueError(
                        "Problem parsing input file on line %d, which reads: \n\t%s\n"
                        "If you selected the -w option for weighted edegs, "
                        "make sure this line has an edge weight" % (i + 1, line))
    finally:
        # Only close a file this function opened itself; stdout belongs to
        # the caller and is merely flushed.
        if output_file is sys.stdout:
            output_file.flush()
        else:
            output_file.close()
def run_command(command):
    """
    Run ``command`` through the shell and return its captured stdout.

    subprocess.Popen is used directly (rather than a newer convenience
    function) for compatibility with python 2.6.

    Arguments:
        command: the shell command line to execute, as a single string.

    Returns:
        Whatever the command wrote to standard output, as returned by
        Popen.communicate().

    Raises:
        Exception: if the command exits with a non-zero return code.
    """
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # communicate() drains both pipes while it waits for the child to exit.
    # Calling wait() first would deadlock as soon as the child filled one of
    # the pipe buffers.
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise Exception("Problem running command: " + command)
    return stdout
class Tempfile(object):
    """
    Context manager that yields a named temporary file and removes it on exit.

    The file is created with delete=False, so closing the handle does not
    remove it; __exit__ closes the handle and then deletes the file by name.
    """

    def __enter__(self):
        """Create the temporary file and return the open file object."""
        self.file = tempfile.NamedTemporaryFile(delete=False)
        return self.file

    def __exit__(self, type, value, traceback):
        """Close the handle and delete the file; exceptions are not suppressed."""
        try:
            # Release the descriptor; previously only the path was unlinked
            # and the open handle leaked.
            self.file.close()
        finally:
            try:
                os.remove(self.file.name)
            except OSError:
                # Best effort: the file may already have been removed.
                pass
# Command-line entry point: parse the options and convert the given edgelist.
if __name__ == "__main__":
    parser = optparse.OptionParser(usage="Usage: %prog input_filename <options>")
    parser.add_option('-d',
                      help="specifies that edges are directed.",
                      dest="directed",
                      default=False,
                      action="store_true")
    parser.add_option('-w',
                      help="specifies that edges are weighted.",
                      dest="weighted",
                      default=False,
                      action="store_true")
    parser.add_option('-o',
                      "--out_filename",
                      help="Filename for output, which is in pajek format. Default [stdout]",
                      dest="out_filename",
                      type="string",
                      default="")
    parser.add_option('-b',
                      "--buffer_size",
                      help="Size of buffer for sort command to use (in megabytes) Default [%default]",
                      dest="buffer_size",
                      type="int",
                      default=500)
    (opts, args) = parser.parse_args()

    # Take the filename from the parsed positional arguments rather than
    # sys.argv[1], so that options may come before the filename
    # (e.g. "prog -d -o out.net edges.txt").
    if not args:
        parser.error("missing input_filename")
    input_filename = args[0]

    edgelist_to_pajek(input_filename,
                      output_filename=opts.out_filename,
                      directed=opts.directed,
                      weighted=opts.weighted,
                      buffer_size=opts.buffer_size)
Yes, only 3 GB is available to a 32-bit Windows application.
I'll remind you about the script if I don't find any alternative in a week.
Cheers,
Alex.
Thanks for the great program! I could finally do the neighbour detection for all Swedish Twitter accounts :) (see the result at http://twittercensus.se/graph2014 )
Hi
I am a newbie in Pajek. I am importing data from Gephi into Pajek via txt-to-Pajek conversion, but Pajek gives the error "Error: this seems unix file or corrupted etc."
Can anyone help me with this issue?
Thanks in advance!
Is there any code available to create a temporal network using Pajek? I want to convert a network into a temporal network by using dates as time stamps. Basically, I am asking whether there is any code that can turn a CSV file into a Pajek file that includes time stamps.
@semenoffalex
That network shouldn't be a problem on your machine (although perhaps with the 32-bit operating system, Windows can only access 3GB, not four---I'm not sure about this).
So it should indeed be possible to write a relatively simple native python script that converts your edgelist file into the pajek format. I don't have time at the moment, but if you remind me later this week, I'll write it up for you.
Conrad