Created
November 30, 2014 02:14
-
-
Save flyinghyrax/3b42f79ed3402cd6f314 to your computer and use it in GitHub Desktop.
Determining one-way associations between categorical attributes in CFPB data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[ | |
{ | |
"name": "Issues by product", | |
"key_col": "Product", | |
"val_col": "Issue" | |
}, | |
{ | |
"name": "Products by issue", | |
"key_col": "Issue", | |
"val_col": "Product" | |
}, | |
{ | |
"name": "Sub-products by product", | |
"key_col": "Product", | |
"val_col": "Sub-product" | |
}, | |
{ | |
"name": "Sub-issues by issue", | |
"key_col": "Issue", | |
"val_col": "Sub-issue" | |
} | |
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import json | |
import argparse | |
import sys | |
# define our map names, descriptions, column for set keys and column for
# set values all in one place so we can re-use them without hardcoding
def def_get(table, key, default='(nil)'):
    """Look up ``key`` in ``table``, substituting ``default`` for blank values.

    Args:
        table: Dictionary to read from (for example a ``csv.DictReader`` row).
        key: Key to look up.
        default: Value returned when the lookup yields nothing usable.

    Returns:
        ``table[key]`` when the key is present and its value is neither the
        empty string nor ``None``; otherwise ``default``.
    """
    value = table.get(key)
    # None covers both a missing key and the filler that csv.DictReader
    # assigns to columns absent from a short row
    if value is None or value == '':
        return default
    return value
def add_to_set(table, key, value):
    """Record ``value`` in the set stored under ``key`` in ``table``.

    A fresh set is created the first time ``key`` is seen. ``table`` is
    modified in place and is also returned.
    """
    bucket = table.setdefault(key, set())
    bucket.add(value)
    return table
def get_maps(reader, meta):
    """Collect one-way associations between columns of tabular data.

    Args:
        reader: Iterable of row dictionaries, e.g. a ``csv.DictReader``.
        meta: List of association descriptions. Each entry is a dictionary
            with a ``name`` (title of the association), a ``key_col`` (column
            whose values become the keys) and a ``val_col`` (column whose
            values are collected for each key).

    Returns:
        A dictionary mapping each association's ``name`` to a nested
        dictionary, which maps every value seen in ``key_col`` to the set of
        ``val_col`` values that appeared in the same row.
    """
    # one nested dictionary per association, keyed by its title
    results = {m['name']: {} for m in meta}
    # touch each row of data once, updating every association from it
    for record in reader:
        for m in meta:
            # def_get substitutes a placeholder for missing or blank cells
            dict_key = def_get(record, m['key_col'])
            set_value = def_get(record, m['val_col'])
            add_to_set(results[m['name']], dict_key, set_value)
    # return mega collection
    return results
def print_map(m):
    """Print each entry of ``m`` on its own line as ``key: v1; v2; ...``.

    Every value of ``m`` must be an iterable of strings, since its items are
    joined with ``"; "``.
    """
    for label, members in m.items():
        joined = "; ".join(members)
        print("{0!s}: {1!s}".format(label, joined))
def process(dataFilename, metaFilename):
    """Build the association maps for a CSV data set and render them as JSON.

    Args:
        dataFilename: Path of the CSV file to read; the empty string means
            the data is read from standard input instead.
        metaFilename: Path of the JSON file listing the associations to
            compute (its parsed content is handed to ``get_maps``).

    Returns:
        A JSON string, indented by four spaces, mapping each association name
        to a dictionary of key -> list of associated values.
    """
    # open and parse json metadata
    with open(metaFilename, 'r') as jsonFile:
        meta = json.load(jsonFile)
    # open and process data file
    if dataFilename != '':
        # newline='' lets the csv module do its own line-ending handling, so
        # quoted fields containing line breaks are parsed correctly
        with open(dataFilename, newline='') as csvFile:
            resultMap = get_maps(csv.DictReader(csvFile), meta)
    else:
        resultMap = get_maps(csv.DictReader(sys.stdin), meta)
    # sets cannot be serialized, so convert each one to a list; sorting keeps
    # the output identical from run to run (set iteration order is not)
    serializable = {
        name: {key: sorted(values, key=str) for key, values in mapping.items()}
        for name, mapping in resultMap.items()
    }
    # hit the mediocre middle-ground between human and machine readable
    return json.dumps(serializable, indent=4)
def getArgParser():
    """Create the command-line parser for this script.

    The parser accepts three positionals, in order: an optional ``data_file``,
    a required ``meta_file`` and an optional ``output_file``. Both optional
    positionals default to the empty string when omitted.
    """
    # settings shared by the two positionals that may be left out
    may_omit = {'nargs': '?', 'default': ''}

    arg_parser = argparse.ArgumentParser(
        description='Processes simple category associations')
    arg_parser.add_argument(
        'data_file',
        help='CSV file to process (defaults to stdin)',
        **may_omit)
    arg_parser.add_argument(
        'meta_file',
        help='JSON file describing category associations to look for')
    arg_parser.add_argument(
        'output_file',
        help='Name of file to write output to (defaults to stdout)',
        **may_omit)
    return arg_parser
if __name__ == "__main__":
    # parse the command line, run the analysis, then emit the JSON either to
    # stdout (no output file given) or to the named output file
    args = getArgParser().parse_args()
    output = process(args.data_file, args.meta_file)
    if args.output_file == '':
        sys.stdout.write(output)
    else:
        with open(args.output_file, 'w') as outfile:
            outfile.write(output)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment