Created
October 17, 2011 12:26
-
-
Save psychemedia/1292500 to your computer and use it in GitHub Desktop.
Generate a GraphML file representing meetings between UK Government Ministers and lobbyists
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script to generate a 'meetings graph' from data compiled by the Guardian Datastore
# Original data/story: http://www.guardian.co.uk/news/datablog/2011/oct/16/links-government-data-business-data | |
import networkx as nx | |
import csv,md5 | |
# Note that I used a text editor to remove the apostrophes, em-dashes and accented chars that caused this script to choke
# Ideally, the Datastore folk would deal with quirky characters, or at least clearly describe somewhere what | |
# character sets were used in the spreadsheet | |
fn='meetingsandinterests-cleaner.csv' | |
#---- UTILITY ROUTINES | |
#from http://stackoverflow.com/questions/1059559/python-strings-split-with-multiple-separators | |
def multisplit(s, seps):
    """Split *s* on every separator in *seps*, applied one after another.

    Each separator is applied to all fragments produced by the previous
    ones, and every resulting fragment has surrounding whitespace removed.
    Empty fragments are kept. If *seps* is empty the result is ``[s]``,
    untouched.
    """
    fragments = [s]
    for separator in seps:
        # Re-split every fragment obtained so far on the current separator.
        fragments = [
            piece.strip()
            for fragment in fragments
            for piece in fragment.split(separator)
        ]
    return fragments
#---- | |
#http://code.activestate.com/recipes/205377-strip-whitespace-from-a-list-of-strings/ | |
# This function strips whitespace characters from the start and end of each item in a list | |
def striplist(l):
    """Return a new list holding each item of *l* with leading and trailing
    whitespace removed; *l* itself is not modified."""
    stripped = []
    for item in l:
        stripped.append(item.strip())
    return stripped
#---- | |
# Label the nodes in the graph | |
def labelNodes2(G, names):
    """Set the 'label' attribute of each node listed in *names*.

    G     -- graph whose nodes are keyed by the IDs used in *names*
    names -- dict mapping node ID -> label text

    Every ID in *names* must already be a node of G. Returns G (which is
    modified in place).
    """
    # Older NetworkX releases expose the per-node attribute dicts as G.node;
    # that alias was removed in NetworkX 2.4, where only G.nodes[...] remains.
    node_attrs = getattr(G, 'node', None)
    if node_attrs is None:
        node_attrs = G.nodes
    # 'node_id' rather than 'id' so the builtin id() is not shadowed.
    for node_id, label in names.items():
        node_attrs[node_id]['label'] = label
    return G
# Add nodes to the graph; for convenience, create a unique MD5 hashed ID to identify each node | |
def addDirectedEdges(DG, fromNode, toSet, names=None):
    """Add a directed edge from *fromNode* to every entry of *toSet*.

    Nodes are identified by the MD5 hex digest of their name. *names* is an
    index mapping node ID -> ASCII-only display name; it is updated in place
    with any node not seen before.

    DG       -- graph object providing add_edge(u, v)
    fromNode -- name of the source node (the Minister)
    toSet    -- iterable of target node names (the organisations)
    names    -- existing ID -> name index, or None to start a fresh one

    Returns the tuple (DG, names).
    """
    # Local import: hashlib supersedes the deprecated standalone md5 module.
    import hashlib

    # A fresh dict per call; a mutable default would be shared between calls.
    if names is None:
        names = {}

    def _node_id(name):
        # hashlib hashes bytes, so text is encoded (UTF-8) before hashing;
        # byte strings are hashed unchanged.
        raw = name if isinstance(name, bytes) else name.encode('utf-8')
        return hashlib.md5(raw).hexdigest()

    def _ascii_label(name):
        # Drop every non-ASCII character. Decoding with 'ignore' as well as
        # encoding means byte strings holding non-ASCII bytes no longer
        # raise, so every node receives a label.
        if not isinstance(name, bytes):
            name = name.encode('ascii', 'ignore')
        return str(name.decode('ascii', 'ignore'))

    fromID = _node_id(fromNode)
    if fromID not in names:
        names[fromID] = _ascii_label(fromNode)

    # For each org, add a directed edge from the Minister to the org.
    for toNode in toSet:
        toID = _node_id(toNode)
        DG.add_edge(fromID, toID)
        if toID not in names:
            names[toID] = _ascii_label(toNode)
    return DG, names
# ---- Build the meetings graph from the CSV file and export it as GraphML
DG = nx.DiGraph()

# names is used to build an index that maps node ID onto the name of the
# corresponding Minister/lobbyist
names = {}

# Open the CSV in a with-block so the file handle is closed once all rows
# have been read.
with open(fn) as meetings_file:
    reader = csv.DictReader(meetings_file, skipinitialspace=True)

    # For convenience - just check we have the right column names...
    headers = reader.fieldnames
    print(headers)

    for row in reader:
        # get Minister
        govrep = row['Minister']

        # for each org, add edge connecting it to minister
        orgs = row["Name of organisation"].strip(';')
        # The multisplit function tries to separate out the different
        # organisations listed together in a meeting. This is a bit hit and
        # miss - different separators are used in different rows (eg ',' and
        # ';'), and some orgs contain a ',' in their name (which as a result
        # gets split into several false positive org names).
        # Ideally, the Datastore folk would use a consistent, unique
        # separator to separate org names.
        orgs = multisplit(orgs, [';', ','])
        # Doubled or trailing separators yield empty fragments; skip them so
        # that no edge is added to a node with an empty name.
        orgs = [org for org in orgs if org]

        DG, names = addDirectedEdges(DG, govrep, orgs, names)

DG = labelNodes2(DG, names)

# Dump a graphml representation of the graph that we can load into Gephi
nx.write_graphml(DG, '/'.join(['reports', 'mtgs.graphml']))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment