Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Cleans up the data set specific to a re-analysis of '"What Went Right and What Went Wrong": An Analysis of 155 Postmortems from Game Development'
#!/usr/bin/env python
################################################################################
# Copyright (c) 2017 Nathan Hwang, "thenoviceoof"
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from argparse import ArgumentParser
from collections import defaultdict
import csv
################################################################################
# Handle file arguments
# Handle file arguments.
# NOTE: argparse ignores `default=` on a plain positional argument (the
# argument is simply required); `nargs='?'` is needed to make the
# positionals optional so the defaults actually apply.
parser = ArgumentParser(
    description='Clean up the raw postmortem coding data into one row per case.')
parser.add_argument('input_path', nargs='?', default='raw_codes.csv',
                    help='CSV of raw coded rows (default: raw_codes.csv)')
parser.add_argument('output_path', nargs='?', default='clean_rows.csv',
                    help='CSV to write cleaned rows to (default: clean_rows.csv)')
args = parser.parse_args()
################################################################################
# Data intake/cleanup
# Data intake/cleanup: group the raw coded rows by case number, keeping
# only completed, non-ignored rows and only the columns the analysis uses.
cases = defaultdict(list)
with open(args.input_path) as raw_codes:
    csv_reader = csv.DictReader(raw_codes)
    for row in csv_reader:
        # Remove bad data: rows not marked complete, or explicitly ignored.
        if row['COMPLETED'] == 'TRUE' and row['Ignore'] != 'TRUE':
            # Clean up the rows a bit, remove data we don't use.
            del row['Case']
            del row['Release Date']
            del row['Text']
            del row['Notes']
            # Encode Right/Wrong as a boolean: True means "What went right?".
            row['Right/Wrong'] = row['Right/Wrong'] == 'What went right?'
            del row['COMPLETED']
            del row['Ignore']
            # *Probably* doesn't mean anything, but uncertain.
            del row['Category']
            cases[row['Case #']].append(row)
# Sanity check: we expect exactly 845 surviving rows in the data set
# (previously 734 when rows with both DP vars false were dropped).
assert sum(len(rows) for rows in cases.values()) == 845
################################################################################
# Collapse to one row per case
# Get the list of variables from the data.
# Get the list of variables (codes) present in the data.
# Use .values() (not the Python-2-only .iteritems()): the case key is unused.
codes = set()
for rows in cases.values():
    codes.update(case['Code'] for case in rows)
# Sort alphabetically for a stable column order.
codes = sorted(codes)
# 'dv' stands for dependent variable, 'iv' stands for independent variable.
assert len(codes) == 22
codes = ['dv_' + code.lower().replace(' ', '_') for code in codes]
def most_common(key, list_of_dicts):
    '''
    Find the most frequent value of `key` across a list of dicts.

    Returns (most_common_value, number_of_times_value_shows_up, total_items).
    On a tie, which value wins is arbitrary (first max encountered).
    '''
    var_count = defaultdict(int)
    for d in list_of_dicts:
        var_count[d[key]] += 1
    # .items() works on both Python 2 and 3, unlike the 2-only .iteritems().
    value, count = max(var_count.items(), key=lambda x: x[1])
    return value, count, len(list_of_dicts)
# Convert case lists to a single row.
out_cases = []
# Map each independent variable to its source column(s): a pair of
# [positive, negative] boolean columns, or a single numeric column.
iv_mapping = {
    'iv_selfpublished': ['Self Published', 'Used Publisher'],
    'iv_smallcompany': 'NUMDEVS',
    'iv_singleplatform': ['Single Platform', 'Multiplatform'],
}
for k, v in cases.items():
    initcase = v[0]
    casenum = initcase['Case #']
    outcase = {
        'case': casenum,
    }
    # Independent variables.
    # Have to choose the most common value, because the data is dark
    # and full of terrors. Each row for a case does not match every
    # other case row.
    for iv, params in iv_mapping.items():
        if not isinstance(params, list):
            # Single numeric column: only iv_smallcompany uses this shape.
            assert iv == 'iv_smallcompany'
            # Just pass the number through.
            value, vcount, tcount = most_common(params, v)
            # Make sure there's at most one different value.
            assert tcount - vcount <= 1, 'Too much variance %s' % casenum
            # Do small vs big company checks: 0 means "no data",
            # <= 20 developers counts as a small company.
            if value == '0':
                value = ''
            elif int(value) <= 20:
                value = True
            else:
                value = False
            outcase[iv] = value
        else:
            # Paired positive/negative columns; both false means "no data".
            pvalue, pcount, t1count = most_common(params[0], v)
            nvalue, ncount, t2count = most_common(params[1], v)
            # Do sanity checks on data.
            assert t1count == t2count, 'Totals not matched %s' % casenum
            tcount = t1count
            assert tcount - pcount <= 1, 'Too much variance %s' % casenum
            assert tcount - ncount <= 1, 'Too much variance %s' % casenum
            value = None
            if pvalue == 'FALSE' and nvalue == 'FALSE':
                value = ''
            elif pvalue == 'TRUE' and nvalue == 'FALSE':
                value = True
            elif pvalue == 'FALSE' and nvalue == 'TRUE':
                value = False
            else:
                # Both TRUE would be contradictory data.
                assert False, 'Should be impossible %s' % casenum
            outcase[iv] = value
    # Dependent variables: one boolean column per code.
    for caserow in v:
        casecode = caserow['Code'].lower().replace(' ', '_')
        outcase['dv_' + casecode] = caserow['Right/Wrong']
    # If all the dependent variables aren't there, add dummy variables.
    for code in codes:
        if code not in outcase:
            outcase[code] = ''
    out_cases.append(outcase)
# Sort cases by case 'number' (stored as a string, so compare numerically).
out_cases = sorted(out_cases, key=lambda x: int(x['case']))
################################################################################
# Output
# Define an ordering for the keys.
# Define an ordering for the output columns: case id, the three
# independent variables, then the alphabetized dependent-variable codes.
ordered_keys = (['case',
                 'iv_selfpublished', 'iv_smallcompany', 'iv_singleplatform'] +
                codes)
# NOTE(review): on Python 3, csv output files should be opened with
# newline='' to avoid blank lines on Windows — not applied here to keep
# Python 2 compatibility; confirm the target interpreter.
with open(args.output_path, 'w') as out_csv:
    csv_writer = csv.DictWriter(out_csv, ordered_keys)
    csv_writer.writeheader()
    csv_writer.writerows(out_cases)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment