Skip to content

Instantly share code, notes, and snippets.

@danstowell danstowell/crosstab.py
Last active Aug 29, 2015

Embed
What would you like to do?
Cross-tabulation analysis of the data comparing HOT OSM building damage annotations against damage annotations from the American Red Cross, at https://github.com/AmericanRedCross/OSM-Assessment/tree/gh-pages/data
#!/usr/bin/env python
import os, json
import numpy as np
# crosstabulate the geojson records for OSM damage indicators vs "observed" (by American Red Cross analysts) damage
datadir = '.'
fpaths = [
'osm_damage_no',
'osm_damage_damaged',
'osm_damage_destroyed',
'observed_damage_no',
'observed_damage_partial',
'observed_damage_major',
'observed_damage_total',
]
cross = {}
keys = {'osm':[], 'observed':[]}
# Here we load the data into a form suitable for crosstabulating
for fpath in fpaths:
print("Loading %s" % fpath)
sourcetype = fpath.split('_')[0]
damagetype = fpath.split('_')[2]
keys[sourcetype].append(damagetype)
with open("%s/%s.geojson" % (datadir, fpath), 'rb') as fp:
data = json.load(fp)
print(" contains %i items" % len(data["features"]))
for item in data["features"]:
#print item["properties"]
osmid = int(item["properties"]["osm_id"])
if osmid not in cross:
cross[osmid] = {}
cross[osmid][sourcetype] = damagetype
# Now we calc the crosstab counts
crosstab = {osmkey:{obskey:0 for obskey in keys['observed']} for osmkey in keys['osm']}
unmatched = {sourcetype:{dkey:0 for dkey in dkeys} for sourcetype, dkeys in keys.items()}
marginals = {sourcetype:{dkey:0 for dkey in dkeys} for sourcetype, dkeys in keys.items()}
for item in cross.values():
if ('observed' in item) and ('osm' in item):
crosstab[item['osm']][item['observed']] += 1
elif ('observed' in item):
unmatched['observed'][item['observed']] += 1
else:
unmatched['osm'][item['osm']] += 1
if ('observed' in item):
marginals['observed'][item['observed']] += 1
if ('osm' in item):
marginals['osm'][item['osm']] += 1
print('-------------------------------------------')
print('MARGINALS:')
print marginals
print('')
print('-------------------------------------------')
print('CROSSTABULATION:')
print('\t' + '\t'.join(keys['osm']))
for obskey in keys['observed']:
print(obskey + '\t' + '\t'.join([str(crosstab[osmkey][obskey]) for osmkey in keys['osm']]))
print('')
print('-------------------------------------------')
print('INFORMATION ANALYSIS:')
def entropy(counts):
tot = np.sum(counts, dtype=np.float32)
val = 0.
for count in counts:
prob = float(count)/tot
val -= prob * np.log2(prob)
return val
jointbincounts = [] # this is needed to simplify calcing the joint entropy
for subset in crosstab.values():
jointbincounts.extend(subset.values())
h_obs = entropy(marginals['observed'].values())
h_osm = entropy(marginals['osm'].values())
h_joint = entropy(jointbincounts)
mutualinfo = h_obs + h_osm - h_joint
print('Entropy of observed: %g bits' % (h_obs))
print('Entropy of osm: %g bits' % (h_osm))
print('Entropy of joint: %g bits' % (h_joint))
print('Mutual information: %g bits' % (mutualinfo))
print('Mutual information as a proportion of the observed entropy: %g %%' % (100 * mutualinfo/h_obs))
print('')
print('-------------------------------------------')
print('UNMATCHED:')
print unmatched
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.