# Generate a GraphML file representing meetings between UK Government Ministers and lobbyists
# Script to generate a 'meetings graph' from data compiled by the Guardian Datastore
# Original data/story: http://www.guardian.co.uk/news/datablog/2011/oct/16/links-government-data-business-data
import csv
import hashlib
import md5

import networkx as nx
# The input spreadsheet was cleaned by hand in a text editor before use:
# apostrophes, em-dashes and accented characters made this script choke, so
# they were removed. Ideally the Datastore folk would deal with quirky
# characters themselves, or at least clearly describe somewhere which
# character sets were used in the spreadsheet.
fn = 'meetingsandinterests-cleaner.csv'
#---- UTILITY ROUTINES | |
#from http://stackoverflow.com/questions/1059559/python-strings-split-with-multiple-separators | |
def multisplit(s, seps):
    """Split *s* on every separator in *seps*, applied one after another.

    Each separator is applied to all pieces produced by the previous one, and
    every resulting piece has its leading/trailing whitespace stripped.

    Args:
        s: the string to split.
        seps: iterable of separator strings, applied in order.

    Returns:
        A list of stripped pieces. Empty pieces are kept. When *seps* is
        empty the result is ``[s]``, unstripped.
    """
    pieces = [s]
    for sep in seps:
        # Re-split everything produced so far on the next separator.
        pieces = [
            fragment.strip()
            for piece in pieces
            for fragment in piece.split(sep)
        ]
    return pieces
#---- | |
#http://code.activestate.com/recipes/205377-strip-whitespace-from-a-list-of-strings/ | |
# This function strips whitespace characters from the start and end of each item in a list | |
def striplist(l):
    """Return a new list holding each string of *l* with surrounding whitespace removed."""
    return [item.strip() for item in l]
#---- | |
# Label the nodes in the graph | |
def labelNodes2(G, names):
    """Set the 'label' attribute of each node listed in *names*.

    Args:
        G: graph whose per-node attribute dicts are reachable as ``G.node``
            (networkx before 2.4) or ``G.nodes`` (networkx 2.4 and later,
            where ``G.node`` no longer exists).
        names: dict mapping node ID -> label text.

    Returns:
        The same graph object *G*, modified in place.

    Raises:
        KeyError: presumably, if an ID in *names* is not a node of *G*
            (the node mapping is indexed directly) - confirm against the
            networkx version in use.
    """
    # Prefer the attribute the script was written against; fall back to the
    # newer name so the function keeps working once ``G.node`` is gone.
    try:
        nodeData = G.node
    except AttributeError:
        nodeData = G.nodes
    for nodeID, label in names.items():
        nodeData[nodeID]['label'] = label
    return G
# Add directed edges (and hence their end nodes) to the graph; for convenience, each node is identified by the MD5 hash of its name
def _nodeID(name):
    """Return the hex MD5 digest used as the graph ID of the node called *name*."""
    # hashlib hashes bytes only: text is encoded as UTF-8 first, while byte
    # strings are hashed unchanged (same digest the old md5.new() call gave).
    raw = name if isinstance(name, bytes) else name.encode('utf-8')
    return hashlib.md5(raw).hexdigest()


def _asciiLabel(name):
    """Return *name* as a native string with every non-ASCII character dropped."""
    # Decode byte strings explicitly: calling .encode() on a non-ASCII byte
    # string raises instead of ignoring the offending bytes.
    if isinstance(name, bytes):
        name = name.decode('ascii', 'ignore')
    return str(name.encode('ascii', 'ignore').decode('ascii'))


def addDirectedEdges(DG, fromNode, toSet, names=None):
    """Add a directed edge from *fromNode* to every entry of *toSet*.

    Nodes are identified in the graph by the MD5 hex digest of their name;
    the readable (ASCII-only) name of each node is recorded in *names*.

    Args:
        DG: graph object providing ``add_edge(fromID, toID)``.
        fromNode: name of the source node (the Minister).
        toSet: iterable of target node names (the organisations).
        names: optional dict mapping node ID -> label; updated in place.
            A fresh dict is created when omitted, so nothing is shared
            between separate calls.

    Returns:
        A ``(DG, names)`` tuple: the graph passed in and the label index.
    """
    if names is None:
        names = {}

    fromID = _nodeID(fromNode)
    if fromID not in names:
        names[fromID] = _asciiLabel(fromNode)

    # For each org, add a directed edge from the Minister to the org.
    for toNode in toSet:
        toID = _nodeID(toNode)
        DG.add_edge(fromID, toID)
        if toID not in names:
            names[toID] = _asciiLabel(toNode)

    return DG, names
DG = nx.DiGraph()

# names is used to build an index that maps node ID onto the name of the
# corresponding Minister/lobbyist
names = {}

# Open the CSV inside 'with' so the file is closed even if a row fails.
with open(fn) as csvFile:
    reader = csv.DictReader(csvFile, skipinitialspace=True)

    # For convenience - just check we have the right column names...
    headers = reader.fieldnames
    print(headers)

    for row in reader:
        # get Minister
        govrep = row['Minister']

        # for each org, add edge connecting it to minister
        orgs = row["Name of organisation"].strip(';')
        # The multisplit function tries to separate out the different
        # organisations listed together in a meeting. This is a bit hit and
        # miss - different separators are used in different rows (eg ',' and
        # ';'), and some orgs contain a ',' in their name (which as a result
        # gets split into several false positive org names).
        # Ideally, the Datastore folk would use a consistent, unique separator
        # to separate org names.
        # Empty fragments (from doubled or trailing separators) are dropped,
        # otherwise they would all collapse into one nameless node shared by
        # unrelated Ministers.
        orgs = [org for org in multisplit(orgs, [';', ',']) if org]
        if not orgs:
            # Nothing to connect; skipping also avoids registering a name for
            # a Minister that never becomes a node in the graph.
            continue
        DG, names = addDirectedEdges(DG, govrep, orgs, names)

DG = labelNodes2(DG, names)

# Dump a graphml representation of the graph that we can load into Gephi
nx.write_graphml(DG, '/'.join(['reports', 'mtgs.graphml']))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment