Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Generate a GraphML file representing meetings between UK Government Ministers and lobbyists
# Script to generate a 'meetings graph' from data compiled by the Guardian Datastore
# Original data/story: http://www.guardian.co.uk/news/datablog/2011/oct/16/links-government-data-business-data
import networkx as nx
import csv,md5
# Note that I used a text editor to remove apostrophes, em-dashes and accented chars from the CSV, because
# they caused this script to choke. Ideally, the Datastore folk would deal with quirky characters, or at
# least clearly describe somewhere what character sets were used in the spreadsheet
fn='meetingsandinterests-cleaner.csv'
#---- UTILITY ROUTINES
#from http://stackoverflow.com/questions/1059559/python-strings-split-with-multiple-separators
def multisplit(s, seps):
    """Split `s` on each separator in `seps`, one separator after another.

    Every fragment produced by a split is stripped of leading/trailing
    whitespace before the next separator is applied. Empty fragments
    (e.g. from adjacent or trailing separators) are kept.

    s    -- the string to split
    seps -- iterable of separator strings, applied in order

    Returns a flat list of fragments. If `seps` is empty the result is
    `[s]`, untouched (no stripping happens because no split happens).
    """
    pieces = [s]
    for sep in seps:
        # Re-split every piece found so far on the current separator and
        # flatten the results back into a single list.
        pieces = [
            fragment.strip()
            for piece in pieces
            for fragment in piece.split(sep)
        ]
    return pieces
#----
#http://code.activestate.com/recipes/205377-strip-whitespace-from-a-list-of-strings/
# This function strips whitespace characters from the start and end of each item in a list
def striplist(l):
    """Return a new list holding each item of `l` with surrounding whitespace removed.

    The input is not modified; every item must provide a `.strip()` method.
    """
    return [entry.strip() for entry in l]
#----
# Label the nodes in the graph
def labelNodes2(G,names):
    """Attach a 'label' attribute to the nodes of `G`.

    G     -- graph object exposing `add_node(node, **attributes)`
    names -- dict mapping node ID -> label text

    Returns `G` (the same object, modified in place).

    The attribute is set through `add_node()` rather than the `G.node[...]`
    mapping: `G.node` was removed from NetworkX in 2.4, whereas calling
    `add_node()` for an existing node simply updates its attributes.
    A side effect is that an ID present in `names` but missing from the
    graph is added as a labelled, unconnected node instead of raising
    KeyError.
    """
    for node_id, label in names.items():
        G.add_node(node_id, label=label)
    return G
# Add nodes to the graph; for convenience, create a unique MD5 hashed ID to identify each node
def _node_id(name):
    """Return the hex MD5 digest used as the graph node ID for `name`."""
    # Function-scope import: hashlib replaces the `md5` module, which only
    # exists on Python 2.
    import hashlib
    # Byte strings are hashed as-is; text is hashed via its UTF-8 encoding.
    raw = name if isinstance(name, bytes) else name.encode('utf-8')
    return hashlib.md5(raw).hexdigest()


def _ascii_label(name):
    """Return `name` as text with every non-ASCII character dropped."""
    if isinstance(name, bytes):
        return name.decode('ascii', 'ignore')
    # Round-trip through ASCII so the result is text, not bytes.
    return name.encode('ascii', 'ignore').decode('ascii')


def addDirectedEdges(DG,fromNode,toSet,names=None):
    """Add a directed edge from `fromNode` to every node named in `toSet`.

    Nodes are identified in the graph by the MD5 hex digest of their name;
    `names` collects the ID -> readable label mapping for every node seen.

    DG       -- graph object exposing `add_node(n)` and `add_edge(u, v)`
    fromNode -- name of the source node (the Minister)
    toSet    -- iterable of target node names (the organisations)
    names    -- optional dict of node ID -> label to extend; a fresh dict
                is created when omitted

    Returns the tuple `(DG, names)`; both are also modified in place.
    """
    # A `{}` default would be created once and silently shared by every
    # call that omits `names`.
    if names is None:
        names = {}

    fromID = _node_id(fromNode)
    if fromID not in names:
        names[fromID] = _ascii_label(fromNode)
    # Register the source explicitly so that it exists in the graph even
    # when `toSet` is empty; otherwise `names` would hold an ID that has no
    # matching node.
    DG.add_node(fromID)

    # For each org, add a directed edge from the Minister to the org.
    for toNode in toSet:
        toID = _node_id(toNode)
        DG.add_edge(fromID, toID)
        if toID not in names:
            names[toID] = _ascii_label(toNode)
    return DG, names
#---- MAIN SCRIPT
# Build a directed graph holding one edge per (Minister, organisation) pair
# found in the CSV, then label the nodes and export the graph as GraphML.
DG=nx.DiGraph()
#names is used to build an index that maps node ID onto the name of the corresponding Minister/lobbyist
names={}
# Read the CSV inside a `with` block so the file handle is closed as soon as
# all rows have been consumed.
with open(fn) as csvfile:
    reader = csv.DictReader(csvfile,skipinitialspace=True)
    # For convenience - just check we have the right column names...
    headers = reader.fieldnames
    print(headers)
    for row in reader:
        #get Minister
        govrep=row['Minister']
        # DictReader fills columns missing from a short row with None, hence the `or ''`
        raw_orgs=(row["Name of organisation"] or '').strip(';')
        #the multisplit function tries to separate out the different organisations listed together in a meeting
        #this is a bit hit and miss - different separators are used in different rows (eg ',' and ';'), and some
        # orgs contain a ',' in their name (which as a result gets split into several false positive org names)
        # Ideally, the Datastore folk would use a consistent, unique separator to separate org names.
        # Empty fragments (eg from a trailing '; ' or a blank cell) are dropped so that they do not
        # end up in the graph as a single bogus, unnamed organisation node.
        orgs=[org for org in multisplit(raw_orgs,[';',',']) if org]
        # A row that lacks either the Minister or any organisation describes no edge - skip it.
        if not govrep or not orgs:
            continue
        #for each org, add edge connecting it to minister
        DG,names=addDirectedEdges(DG,govrep,orgs,names)
DG=labelNodes2(DG,names)
#Dump a graphml representation of the graph that we can load into Gephi
# NOTE: nothing here creates the 'reports' directory - it has to exist before the script is run.
nx.write_graphml(DG, '/'.join(['reports','mtgs.graphml']))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment