Skip to content

Instantly share code, notes, and snippets.

@flyinghyrax
Created November 30, 2014 02:14
Show Gist options
  • Save flyinghyrax/3b42f79ed3402cd6f314 to your computer and use it in GitHub Desktop.
Save flyinghyrax/3b42f79ed3402cd6f314 to your computer and use it in GitHub Desktop.
Determining one-way associations between categorical attributes in CFPB data
[
{
"name": "Issues by product",
"key_col": "Product",
"val_col": "Issue"
},
{
"name": "Products by issue",
"key_col": "Issue",
"val_col": "Product"
},
{
"name": "Sub-products by product",
"key_col": "Product",
"val_col": "Sub-product"
},
{
"name": "Sub-issues by issue",
"key_col": "Issue",
"val_col": "Sub-issue"
}
]
import csv
import json
import argparse
import sys
# define our map names, descriptions, columns for set keys and columns for
# set values all in one place so we can re-use them without hardcoding
def def_get(table, key, default='(nil)'):
    """Return the value stored at ``key`` in ``table``, or ``default``.

    ``default`` is returned when the key is absent, when the value is the
    empty string, or when the value is ``None``.  The ``None`` case matters
    for rows read with ``csv.DictReader``: a row with fewer fields than the
    header gets ``None`` for the missing columns, which should count as
    blank rather than leak into the results.

    Args:
        table: Dictionary to look the key up in.
        key: Key to fetch.
        default: Value to return for a missing or blank entry.

    Returns:
        The stored value, or ``default`` if it is missing, ``''`` or ``None``.
    """
    value = table.get(key)
    # Only None and the empty string are "blank"; other falsy values such
    # as 0 are real data and are returned unchanged.
    if value is None or value == '':
        return default
    return value
def add_to_set(table, key, value):
    """Insert ``value`` into the set stored under ``key`` in ``table``.

    A new empty set is created for ``key`` first if the dictionary has no
    entry for it yet.  The dictionary is modified in place and also
    returned.
    """
    table.setdefault(key, set()).add(value)
    return table
def get_maps(reader):
    """Collect the associations described by ``map_meta`` in a single pass.

    Returns a dictionary keyed by association name; each value is a
    dictionary mapping a key-column value to the set of value-column
    values seen alongside it in the rows of ``reader``.
    """
    # NOTE(review): this reads a module-level ``map_meta`` (entries with
    # 'name', 'key' and 'set' fields) that is not defined in the visible
    # part of this file, and the ``get_maps(reader, meta)`` defined further
    # down rebinds the same name -- this version looks superseded; confirm.
    maps = {}
    for spec in map_meta:
        maps[spec['name']] = dict()
    # Each data row is visited exactly once.
    for row in reader:
        for spec in map_meta:
            group = def_get(row, spec['key'])
            member = def_get(row, spec['set'])
            add_to_set(maps[spec['name']], group, member)
    return maps
def get_maps(reader, meta):
    """Build one association table per entry of ``meta`` from ``reader``.

    ``reader`` yields dictionary rows (e.g. a CSV dictionary reader) and
    ``meta`` is a list of dictionaries with 'name', 'key_col' and 'val_col'
    entries.  The result maps each 'name' to a dictionary that, for every
    value found in the 'key_col' column, holds the set of 'val_col' values
    that appeared in the same row.  Missing or blank cells are replaced by
    the default of ``def_get``.
    """
    # One empty association table per named entry in the metadata.
    results = {}
    for spec in meta:
        results[spec['name']] = dict()
    # Single pass over the data; every row feeds every association.
    for record in reader:
        for spec in meta:
            group = def_get(record, spec['key_col'])
            member = def_get(record, spec['val_col'])
            add_to_set(results[spec['name']], group, member)
    return results
def print_map(m):
    """Print each entry of ``m`` on its own line as ``key: v1; v2; ...``.

    The values of ``m`` must be iterables of strings, since they are
    joined with ``"; "``.
    """
    for label, members in m.items():
        joined = "; ".join(members)
        print("{0!s}: {1!s}".format(label, joined))
def process(dataFilename, metaFilename):
    """Compute the configured associations and return them as JSON text.

    Args:
        dataFilename: Path of the CSV file to read.  An empty value means
            the CSV data is read from standard input instead.
        metaFilename: Path of the JSON file listing the associations to
            look for (passed on to ``get_maps`` as its ``meta`` argument).

    Returns:
        A JSON string (indented by 4 spaces) mapping each association name
        to a dictionary of ``key value -> list of associated values``.
    """
    # open and parse json metadata
    with open(metaFilename, 'r') as jsonFile:
        meta = json.load(jsonFile)

    # open and process data file
    if dataFilename:
        # newline='' lets the csv module do its own newline handling, so
        # quoted fields containing line breaks are parsed correctly.
        with open(dataFilename, newline='') as csvFile:
            resultMap = get_maps(csv.DictReader(csvFile), meta)
    else:
        resultMap = get_maps(csv.DictReader(sys.stdin), meta)

    # Sets cannot be serialized to JSON, so convert each one to a list.
    # Sorting makes the output reproducible from run to run (set iteration
    # order is arbitrary); key=str keeps the sort from failing if a
    # non-string value ever ends up in a set.
    serializable = {
        name: {key: sorted(values, key=str) for key, values in table.items()}
        for name, table in resultMap.items()
    }

    # hit the mediocre middle-ground between human and machine readable
    return json.dumps(serializable, indent=4)
def getArgParser():
    """Create the command-line parser for this script.

    The parser takes three positional arguments: an optional ``data_file``
    (default ``''``), a required ``meta_file`` and an optional
    ``output_file`` (default ``''``).
    """
    parser = argparse.ArgumentParser(
        description='Processes simple category associations')
    # Settings shared by the two positionals that may be left out.
    may_be_omitted = {'nargs': '?', 'default': ''}
    parser.add_argument(
        'data_file',
        help='CSV file to process (defaults to stdin)',
        **may_be_omitted)
    parser.add_argument(
        'meta_file',
        help='JSON file describing category associations to look for')
    parser.add_argument(
        'output_file',
        help='Name of file to write output to (defaults to stdout)',
        **may_be_omitted)
    return parser
if __name__ == "__main__":
    # Parse the command line, run the analysis, then emit the JSON text
    # either to standard output or to the requested file.
    cli = getArgParser().parse_args()
    report = process(cli.data_file, cli.meta_file)
    if cli.output_file == '':
        sys.stdout.write(report)
    else:
        with open(cli.output_file, 'w') as outfile:
            outfile.write(report)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment