Generate a GraphML file representing meetings between UK Government Ministers and lobbyists
# Script to generate a 'meetings graph' from data compiled by the Guardian Datastore
# Original data/story:
import networkx as nx
import csv,md5
# Note that I used a text editor to remove apostrophes, em-dashes and accented chars that caused this script to choke
# Ideally, the Datastore folk would deal with quirky characters, or at least clearly describe somewhere what
# character sets were used in the spreadsheet
def multisplit(s, seps):
    """Split a string on several separators, one after another.

    The string is first split on ``seps[0]``; every resulting piece is then
    split on ``seps[1]``, and so on. Leading and trailing whitespace is
    removed from each piece after every split.

    s    -- the string to split
    seps -- iterable of separator strings, applied in order

    Returns a flat list of the stripped pieces. If ``seps`` is empty the
    result is ``[s]`` with ``s`` left untouched. An empty-string separator
    raises ValueError (from ``str.split``).
    """
    pieces = [s]
    for sep in seps:
        # Re-split everything produced so far on the next separator.
        previous, pieces = pieces, []
        for chunk in previous:
            # Trim each fragment (the same clean-up striplist() performs),
            # so "a , b" yields "a" and "b" rather than "a " and " b".
            pieces += [part.strip() for part in chunk.split(sep)]
    return pieces
# This function strips whitespace characters from the start and end of each item in a list
def striplist(l):
    """Return a new list with whitespace stripped from both ends of each item.

    l -- iterable of strings

    The input is not modified; an empty input gives an empty list.
    """
    return [item.strip() for item in l]
# Label the nodes in the graph
def labelNodes2(G,names):
# Iterates over every key of `names` and then returns G.
#   G     -- the graph object; it is handed back to the caller
#   names -- mapping whose keys are iterated as node IDs
# NOTE(review): the loop below has no body in this copy of the file, so the
# statement that attaches names[id] to the matching node of G is missing.
# Presumably it set a label attribute on the node -- restore it from the
# original script before running.
# NOTE(review): the loop variable `id` shadows the builtin id().
for id in names:
return G
# Add nodes to the graph; for convenience, create a unique MD5 hashed ID to identify each node
def addDirectedEdges(DG,fromNode,toSet,names={}):
# Records fromNode and each member of toSet in the `names` index, echoing every
# newly seen ID/name pair to stdout, and returns the tuple (DG, names).
#   DG       -- graph object; returned to the caller
#   fromNode -- name of the source node (the Minister, per the comment below)
#   toSet    -- iterable of target node names (the organisations)
#   names    -- dict mapping node ID -> ASCII-encoded node name. The default {}
#               is created once, when the def is evaluated, so every call that
#               omits `names` shares and keeps extending the same dict.
# NOTE(review): this copy of the function looks truncated -- fromID and toID are
# read but never assigned, both `try:` statements have no except/finally
# clause, and no edge is ever added to DG. Presumably the IDs are MD5 hashes of
# the node names (see the comment above the def) -- confirm against the
# original script.
# Only index the source node the first time it is seen
if fromID not in names:
# errors='ignore' drops any character that cannot be encoded as ASCII
try: names[fromID]=fromNode.encode('ascii','ignore')
print fromID,fromNode
#For each org, add a directed edge from the Minister to the org.
for toNode in toSet:
# Only index the target node the first time it is seen
if toID not in names:
try: names[toID]=toNode.encode('ascii','ignore')
print toID,toNode
return DG,names
# Read the CSV as one dict per row, keyed by the column headers;
# skipinitialspace=True ignores whitespace that directly follows a delimiter.
# NOTE(review): `fn` is not defined in the visible part of the file, and the
# file object returned by open() is never explicitly closed.
reader = csv.DictReader(open(fn),skipinitialspace=True)
# For convenience - just check we have the right column names...
headers = reader.fieldnames
print headers
#names is used to build an index that maps node ID onto the name of the corresponding Minister/lobbyist
for row in reader:
#get Minister
#for each org, add edge connecting it to minister
# Trim any leading/trailing ';' from the raw organisations cell
orgs=row["Name of organisation"].strip(';')
#the multisplit function tries to separate out the different organisations listed together in a meeting
#this is a bit hit and miss - different separators are used in different rows (eg ',' and ';'), and some
# orgs contain a ',' in their name (which as a result gets split into several false positive org names)
# Ideally, the Datastore folk would use a consistent, unique separator to separate org names.
# NOTE(review): the statements that read the Minister, split `orgs` and add the
# edges are missing from this copy, as is the creation of DG and names.
#Dump a graphml representation of the graph that we can load into Gephi
# The output path evaluates to 'reports/mtgs.graphml'
nx.write_graphml(DG, '/'.join(['reports','mtgs.graphml']))
