Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Cleans up the data set specific to a re-analysis of '"What Went Right and What Went Wrong": An Analysis of 155 Postmortems from Game Development'
#!/usr/bin/env python
################################################################################
# Copyright (c) 2017 Nathan Hwang, "thenoviceoof"
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from argparse import ArgumentParser
from collections import defaultdict
import csv
################################################################################
# Handle file arguments
# Handle file arguments.
# NOTE: argparse ignores `default=` on a plain positional argument (the
# argument is simply required); `nargs='?'` is needed to make the
# positionals optional so the defaults actually apply.
parser = ArgumentParser(
    description='Clean up the raw postmortem coding data into one row per case.')
parser.add_argument('input_path', nargs='?', default='raw_codes.csv',
                    help='CSV of raw coded rows (default: raw_codes.csv)')
parser.add_argument('output_path', nargs='?', default='clean_rows.csv',
                    help='CSV to write cleaned rows to (default: clean_rows.csv)')
args = parser.parse_args()
################################################################################
# Data intake/cleanup
# Data intake/cleanup: group the raw coded rows by case number, keeping
# only completed, non-ignored rows and only the columns the analysis uses.
cases = defaultdict(list)
with open(args.input_path) as raw_codes:
    csv_reader = csv.DictReader(raw_codes)
    for row in csv_reader:
        # Remove bad data: rows not marked complete, or explicitly ignored.
        if row['COMPLETED'] == 'TRUE' and row['Ignore'] != 'TRUE':
            # Clean up the rows a bit, remove data we don't use.
            del row['Case']
            del row['Release Date']
            del row['Text']
            del row['Notes']
            # Encode Right/Wrong as a boolean: True means "What went right?".
            row['Right/Wrong'] = row['Right/Wrong'] == 'What went right?'
            del row['COMPLETED']
            del row['Ignore']
            # *Probably* doesn't mean anything, but uncertain.
            del row['Category']
            cases[row['Case #']].append(row)
# Sanity check: we expect exactly 845 surviving rows in the data set
# (previously 734 when rows with both DP vars false were dropped).
assert sum(len(rows) for rows in cases.values()) == 845
################################################################################
# Collapse to one row per case
# Get the list of variables from the data.
# Get the list of variables (codes) present in the data.
# Use .values() (not the Python-2-only .iteritems()): the case key is unused.
codes = set()
for rows in cases.values():
    codes.update(case['Code'] for case in rows)
# Sort alphabetically for a stable column order.
codes = sorted(codes)
# 'dv' stands for dependent variable, 'iv' stands for independent variable.
assert len(codes) == 22
codes = ['dv_' + code.lower().replace(' ', '_') for code in codes]
def most_common(key, list_of_dicts):
    '''
    Find the most frequent value of `key` across a list of dicts.

    Returns (most_common_value, number_of_times_value_shows_up, total_items).
    On a tie, which value wins is arbitrary (first max encountered).
    '''
    var_count = defaultdict(int)
    for d in list_of_dicts:
        var_count[d[key]] += 1
    # .items() works on both Python 2 and 3, unlike the 2-only .iteritems().
    value, count = max(var_count.items(), key=lambda x: x[1])
    return value, count, len(list_of_dicts)
# Convert case lists to a single row.
out_cases = []
# Map each independent variable to its source column(s): a pair of
# [positive, negative] boolean columns, or a single numeric column.
iv_mapping = {
    'iv_selfpublished': ['Self Published', 'Used Publisher'],
    'iv_smallcompany': 'NUMDEVS',
    'iv_singleplatform': ['Single Platform', 'Multiplatform'],
}
for k, v in cases.items():
    initcase = v[0]
    casenum = initcase['Case #']
    outcase = {
        'case': casenum,
    }
    # Independent variables.
    # Have to choose the most common value, because the data is dark
    # and full of terrors. Each row for a case does not match every
    # other case row.
    for iv, params in iv_mapping.items():
        if not isinstance(params, list):
            # Single numeric column: only iv_smallcompany uses this shape.
            assert iv == 'iv_smallcompany'
            # Just pass the number through.
            value, vcount, tcount = most_common(params, v)
            # Make sure there's at most one different value.
            assert tcount - vcount <= 1, 'Too much variance %s' % casenum
            # Do small vs big company checks: 0 means "no data",
            # <= 20 developers counts as a small company.
            if value == '0':
                value = ''
            elif int(value) <= 20:
                value = True
            else:
                value = False
            outcase[iv] = value
        else:
            # Paired positive/negative columns; both false means "no data".
            pvalue, pcount, t1count = most_common(params[0], v)
            nvalue, ncount, t2count = most_common(params[1], v)
            # Do sanity checks on data.
            assert t1count == t2count, 'Totals not matched %s' % casenum
            tcount = t1count
            assert tcount - pcount <= 1, 'Too much variance %s' % casenum
            assert tcount - ncount <= 1, 'Too much variance %s' % casenum
            value = None
            if pvalue == 'FALSE' and nvalue == 'FALSE':
                value = ''
            elif pvalue == 'TRUE' and nvalue == 'FALSE':
                value = True
            elif pvalue == 'FALSE' and nvalue == 'TRUE':
                value = False
            else:
                # Both TRUE would be contradictory data.
                assert False, 'Should be impossible %s' % casenum
            outcase[iv] = value
    # Dependent variables: one boolean column per code.
    for caserow in v:
        casecode = caserow['Code'].lower().replace(' ', '_')
        outcase['dv_' + casecode] = caserow['Right/Wrong']
    # If all the dependent variables aren't there, add dummy variables.
    for code in codes:
        if code not in outcase:
            outcase[code] = ''
    out_cases.append(outcase)
# Sort cases by case 'number' (stored as a string, so compare numerically).
out_cases = sorted(out_cases, key=lambda x: int(x['case']))
################################################################################
# Output
# Define an ordering for the keys.
# Define an ordering for the output columns: case id, the three
# independent variables, then the alphabetized dependent-variable codes.
ordered_keys = (['case',
                 'iv_selfpublished', 'iv_smallcompany', 'iv_singleplatform'] +
                codes)
# NOTE(review): on Python 3, csv output files should be opened with
# newline='' to avoid blank lines on Windows — not applied here to keep
# Python 2 compatibility; confirm the target interpreter.
with open(args.output_path, 'w') as out_csv:
    csv_writer = csv.DictWriter(out_csv, ordered_keys)
    csv_writer.writeheader()
    csv_writer.writerows(out_cases)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment