# psychemedia/meetingsGrapher.py

## meetingsGrapher.py
# Script to generate a 'meetings graph' from data compiled by the Guardian Datastore
# Original data/story: http://www.guardian.co.uk/news/datablog/2011/oct/16/links-government-data-business-data
import csv
import hashlib
import md5

import networkx as nx

# Note that I removed apostrophes, em-dashes, accented chars that caused this script to choke using a text editor
# Ideally, the Datastore folk would deal with quirky characters, or at least clearly describe somewhere what
# character sets were used in the spreadsheet
# Name of the hand-cleaned CSV file the script reads; it has no directory part, so it is looked up
# relative to the current working directory.
fn='meetingsandinterests-cleaner.csv'

#---- UTILITY ROUTINES
#from http://stackoverflow.com/questions/1059559/python-strings-split-with-multiple-separators
def multisplit(s, seps):
    """Split a string on several separators, applied one after another.

    The string is split on the first separator; every resulting piece is
    then split on the second separator, and so on.  Each piece has its
    leading and trailing whitespace removed as soon as it is produced.

    s    -- the string to split
    seps -- iterable of separator strings, applied in the order given

    Returns a flat list of the pieces in their original order.  Empty
    pieces (e.g. from a trailing or doubled separator) are kept.  When seps
    is empty the result is [s], exactly as passed in (no stripping).
    """
    pieces = [s]
    for sep in seps:
        # Re-split every piece produced so far on the next separator.
        pieces = [part.strip() for piece in pieces for part in piece.split(sep)]
    return pieces
#----
#http://code.activestate.com/recipes/205377-strip-whitespace-from-a-list-of-strings/
# This function strips whitespace characters from the start and end of each item in a list
def striplist(l):
    """Return a new list with surrounding whitespace removed from every item of l."""
    stripped = []
    for item in l:
        stripped.append(item.strip())
    return stripped
#----

# Label the nodes in the graph
def labelNodes2(G,names):
    """Store each entry of names on the graph as that node's 'label' attribute.

    G     -- graph object; G.node must already hold an attribute dict for
             every key of names
    names -- dict mapping node ID to the label text for that node

    Returns G, which is modified in place.
    """
    for nodeID, label in names.items():
        G.node[nodeID]['label'] = label
    return G

# Add nodes to the graph; for convenience, create a unique MD5 hashed ID to identify each node
def addDirectedEdges(DG,fromNode,toSet,names=None):
    """Add a directed edge from fromNode to every entry of toSet.

    Nodes are keyed in the graph by the MD5 hex digest of their name; the
    names dict records which (ASCII-only) name each digest stands for.

    DG       -- graph to update in place; must provide add_node and add_edge
    fromNode -- name of the source node (the Minister)
    toSet    -- iterable of target node names (the organisations)
    names    -- dict mapping node ID to name, extended in place; a fresh
                dict is created when it is omitted

    Returns the tuple (DG, names).
    """
    # A literal {} default is created once and would be shared by every call that omits names.
    if names is None:
        names = {}

    def register(nodeName):
        # hashlib only accepts bytes, so text is encoded as UTF-8 before hashing.
        raw = nodeName if isinstance(nodeName, bytes) else nodeName.encode('utf-8')
        nodeID = hashlib.md5(raw).hexdigest()
        if nodeID not in names:
            # Decoding with 'ignore' drops every non-ASCII byte instead of raising on it.
            label = raw.decode('ascii', 'ignore')
            # Keep the label a native str: a byte string on Python 2, text on Python 3.
            names[nodeID] = label.encode('ascii') if str is bytes else label
        return nodeID

    fromID = register(fromNode)
    # Add the source explicitly so every ID recorded in names also exists in the graph,
    # even when toSet is empty and no edge is created.
    DG.add_node(fromID)
    #For each org, add a directed edge from the Minister to the org.
    for toNode in toSet:
        DG.add_edge(fromID, register(toNode))
    return DG, names


# Directed graph that receives one edge per (Minister, organisation) pair found in the CSV
DG=nx.DiGraph()

#names is used to build an index that maps node ID onto the name of the corresponding Minister/lobbyist
names={}

# Read the CSV inside a with-block so the file handle is closed once every row has been consumed
with open(fn) as csvFile:
    reader = csv.DictReader(csvFile,skipinitialspace=True)

    # For convenience - just check we have the right column names...
    headers = reader.fieldnames
    # A single parenthesised argument prints identically under Python 2 and Python 3
    print(headers)

    for row in reader:
        #get Minister
        govrep=row['Minister']

        #for each org, add edge connecting it to minister
        orgs=row["Name of organisation"].strip(';')
        #the multisplit function tries to separate out the different organisations listed together in a meeting
        #this is a bit hit and miss - different separators are used in different rows (eg ',' and ';'), and some
        # orgs contain a ',' in their name (which as a result gets split into several false positive org names)
        # Ideally, the Datastore folk would use a consistent, unique separator to separate org names.
        orgs=multisplit(orgs,[';',','])
        # Trailing or doubled separators leave empty fragments behind; drop them so they do not
        # turn into a single blank-named node shared by unrelated Ministers.
        orgs=[org for org in orgs if org]
        if not orgs:
            continue
        DG,names=addDirectedEdges(DG,govrep,orgs,names)

DG=labelNodes2(DG,names)

#Dump a graphml representation of the graph that we can load into Gephi
nx.write_graphml(DG, '/'.join(['reports','mtgs.graphml']))
	# Script to generate a 'meetings graph' from data compiled by the Guardian Datastore
	# Original data/story: http://www.guardian.co.uk/news/datablog/2011/oct/16/links-government-data-business-data
	import networkx as nx
	import csv,md5

	# Note that I removed apostrophes, em-dashes, accented chars that caused this script to choke using a text editor
	# Ideally, the Datastore folk would deal with quirky characters, or at least clearly describe somewhere what
	# character sets were used in the spreadsheet
	# Name of the hand-cleaned CSV file the script reads; it has no directory part, so it is looked up
	# relative to the current working directory.
	fn='meetingsandinterests-cleaner.csv'

	#---- UTILITY ROUTINES
	#from http://stackoverflow.com/questions/1059559/python-strings-split-with-multiple-separators
	def multisplit(s, seps):
		"""Split a string on several separators, applied one after another.

		The string is split on the first separator; every resulting piece is
		then split on the second separator, and so on.  Each piece has its
		leading and trailing whitespace removed as soon as it is produced.

		s    -- the string to split
		seps -- iterable of separator strings, applied in the order given

		Returns a flat list of the pieces in their original order.  Empty
		pieces (e.g. from a trailing or doubled separator) are kept.  When
		seps is empty the result is [s], exactly as passed in (no stripping).
		"""
		pieces = [s]
		for sep in seps:
			# Re-split every piece produced so far on the next separator.
			pieces = [part.strip() for piece in pieces for part in piece.split(sep)]
		return pieces
	#----
	#http://code.activestate.com/recipes/205377-strip-whitespace-from-a-list-of-strings/
	# This function strips whitespace characters from the start and end of each item in a list
	def striplist(l):
		"""Return a new list with surrounding whitespace removed from every item of l."""
		return [x.strip() for x in l]
	#----

	# Label the nodes in the graph
	def labelNodes2(G,names):
		"""Store each entry of names on the graph as that node's 'label' attribute.

		G     -- graph object; G.node must already hold an attribute dict for
		         every key of names
		names -- dict mapping node ID to the label text for that node

		Returns G, which is modified in place.
		"""
		for nodeID in names:
			G.node[nodeID]['label']=names[nodeID]
		return G

	# Add nodes to the graph; for convenience, create a unique MD5 hashed ID to identify each node
	def addDirectedEdges(DG,fromNode,toSet,names=None):
		"""Add a directed edge from fromNode to every entry of toSet.

		Nodes are keyed in the graph by the MD5 hex digest of their name; the
		names dict records which (ASCII-only) name each digest stands for.

		DG       -- graph to update in place; must provide add_node and add_edge
		fromNode -- name of the source node (the Minister)
		toSet    -- iterable of target node names (the organisations)
		names    -- dict mapping node ID to name, extended in place; a fresh
		            dict is created when it is omitted

		Returns the tuple (DG, names).
		"""
		# A literal {} default is created once and would be shared by every call that omits names.
		if names is None:
			names = {}

		def register(nodeName):
			# hashlib only accepts bytes, so text is encoded as UTF-8 before hashing.
			raw = nodeName if isinstance(nodeName, bytes) else nodeName.encode('utf-8')
			nodeID = hashlib.md5(raw).hexdigest()
			if nodeID not in names:
				# Decoding with 'ignore' drops every non-ASCII byte instead of raising on it.
				label = raw.decode('ascii', 'ignore')
				# Keep the label a native str: a byte string on Python 2, text on Python 3.
				names[nodeID] = label.encode('ascii') if str is bytes else label
			return nodeID

		fromID = register(fromNode)
		# Add the source explicitly so every ID recorded in names also exists in the graph,
		# even when toSet is empty and no edge is created.
		DG.add_node(fromID)
		#For each org, add a directed edge from the Minister to the org.
		for toNode in toSet:
			DG.add_edge(fromID, register(toNode))
		return DG, names


	# Directed graph that receives one edge per (Minister, organisation) pair found in the CSV
	DG=nx.DiGraph()

	#names is used to build an index that maps node ID onto the name of the corresponding Minister/lobbyist
	names={}

	# Read the CSV inside a with-block so the file handle is closed once every row has been consumed
	with open(fn) as csvFile:
		reader = csv.DictReader(csvFile,skipinitialspace=True)

		# For convenience - just check we have the right column names...
		headers = reader.fieldnames
		# A single parenthesised argument prints identically under Python 2 and Python 3
		print(headers)

		for row in reader:
			#get Minister
			govrep=row['Minister']

			#for each org, add edge connecting it to minister
			orgs=row["Name of organisation"].strip(';')
			#the multisplit function tries to separate out the different organisations listed together in a meeting
			#this is a bit hit and miss - different separators are used in different rows (eg ',' and ';'), and some
			# orgs contain a ',' in their name (which as a result gets split into several false positive org names)
			# Ideally, the Datastore folk would use a consistent, unique separator to separate org names.
			orgs=multisplit(orgs,[';',','])
			# Trailing or doubled separators leave empty fragments behind; drop them so they do not
			# turn into a single blank-named node shared by unrelated Ministers.
			orgs=[org for org in orgs if org]
			if not orgs:
				continue
			DG,names=addDirectedEdges(DG,govrep,orgs,names)

	DG=labelNodes2(DG,names)

	#Dump a graphml representation of the graph that we can load into Gephi
	nx.write_graphml(DG, '/'.join(['reports','mtgs.graphml']))