# johnmaxwelliv/lw_survey_dk_analysis.py

## lw_survey_dk_analysis.py
from scipy.stats.stats import pearsonr
import csv

def show_correlation(data, var1, var2):
    """Print the Pearson correlation between two survey columns.

    Rows where either column holds the blank marker ``' '`` are ignored;
    every remaining value is converted with ``float`` before being handed
    to ``pearsonr``.

    Args:
        data: Iterable of dict-like rows keyed by column name.
        var1: Name of the first column.
        var2: Name of the second column.
    """
    complete_rows = []
    for row in data:
        if row[var1] == ' ' or row[var2] == ' ':
            continue
        complete_rows.append(row)

    first_values = [float(row[var1]) for row in complete_rows]
    second_values = [float(row[var2]) for row in complete_rows]
    result = pearsonr(first_values, second_values)
    # Element 0 of the result is the correlation coefficient.
    coefficient = result[0]

    print('Correlation between "%s" and "%s": %.4f' % (var1, var2, coefficient))
    print(' ')

# Answer options that must be reported in a fixed display order rather than
# in whatever order they happen to occur in the data.
_ORDERED_CATEGORIES = {
    'LessWrongUse': [
        "I lurk, but never registered an account",
        "I've registered an account, but never posted",
        "I've posted a comment, but never a top-level post",
        "I've posted in Discussion, but not Main",
        "I've posted in Main",
    ],
    'Sequences': [
        "Never even knew they existed until this moment",
        "Know they existed, but never looked at them",
        "Some, but less than 25%",
        "About 25% of the Sequences",
        "About 50% of the Sequences",
        "About 75% of the Sequences",
        "Nearly all of the Sequences",
    ],
}


def show_breakdown(data, attribute, var):
    """Print the mean of a numeric column for each value of a category column.

    For ``'LessWrongUse'`` and ``'Sequences'`` the categories and their
    order come from ``_ORDERED_CATEGORIES``. For any other attribute the
    categories are the distinct non-blank values found in ``data``, sorted
    so the output is the same on every run.

    Rows whose ``var`` is the blank marker ``' '`` are left out of the
    mean. A category with no numeric answers is printed as ``n/a`` instead
    of raising ``ZeroDivisionError``.

    Args:
        data: Sequence of dict-like rows keyed by column name (iterated
            more than once).
        attribute: Name of the category column to group by.
        var: Name of the numeric column to average.
    """
    print('Breakdown of average "%s" by "%s":' % (var, attribute))
    if attribute in _ORDERED_CATEGORIES:
        categories = _ORDERED_CATEGORIES[attribute]
    else:
        # Set difference (unlike set.remove) is fine when no row is blank.
        # key=str keeps sorting safe if a short CSV row yielded None.
        categories = sorted({row[attribute] for row in data} - {' '}, key=str)
    for category in categories:
        numbers = [
            float(row[var])
            for row in data
            if row[attribute] == category and row[var] != ' '
        ]
        if numbers:
            print("%10.4f    \"%s\"" % (sum(numbers) / len(numbers), category))
        else:
            print("%10s    \"%s\"" % ('n/a', category))
    print(' ')

def _passes_cleaning(row):
    """Return True unless the row has an over-limit SAT score or a junk entry."""
    for column, ceiling in (('SATscoresoutof2400', 2400),
                            ('SATscoresoutof1600', 1600)):
        score = row[column]
        # A blank score is acceptable; a filled-in one must not exceed the cap.
        if not (score == ' ' or float(score) <= ceiling):
            return False
    answers = row.values()
    return '1`8' not in answers and '~5' not in answers


# Load the survey export, keep the rows that pass cleaning, then print each
# correlation and breakdown report in turn.
with open('for_public.csv', 'r') as infile:
    lines = [row for row in list(csv.DictReader(infile)) if _passes_cleaning(row)]

    for column in ('IQ', 'SATscoresoutof1600', 'SATscoresoutof2400',
                   'TimeinCommunity'):
        show_correlation(lines, column, 'KarmaScore')

    for attribute, measure in (
        ('LessWrongUse', 'IQ'),
        ('LessWrongUse', 'SATscoresoutof1600'),
        ('LessWrongUse', 'SATscoresoutof2400'),
        ('Sequences', 'KarmaScore'),
        ('LessWrongUse', 'TimeinCommunity'),
        ('LessWrongUse', 'AutismScore'),
        ('Profession', 'KarmaScore'),
        ('Degree', 'KarmaScore'),
    ):
        show_breakdown(lines, attribute, measure)
	from scipy.stats.stats import pearsonr
	import csv

	def show_correlation(data, var1, var2):
	    """Print the Pearson correlation between two survey columns.

	    Rows where either column holds the blank marker ``' '`` are ignored;
	    the remaining values are converted with ``float``.

	    Args:
	        data: Iterable of dict-like rows keyed by column name.
	        var1: Name of the first column.
	        var2: Name of the second column.
	    """
	    has_both_vars = [
	        line for line in data if line[var1] != ' ' and line[var2] != ' '
	    ]

	    # Element 0 of pearsonr's result is the correlation coefficient.
	    r = pearsonr(
	        [float(line[var1]) for line in has_both_vars],
	        [float(line[var2]) for line in has_both_vars],
	    )[0]
	    print('Correlation between "%s" and "%s": %.4f' % (var1, var2, r))
	    print(' ')

	# Answer options that must be reported in a fixed display order rather than
	# in whatever order they happen to occur in the data.
	_ORDERED_CATEGORIES = {
	    'LessWrongUse': [
	        "I lurk, but never registered an account",
	        "I've registered an account, but never posted",
	        "I've posted a comment, but never a top-level post",
	        "I've posted in Discussion, but not Main",
	        "I've posted in Main",
	    ],
	    'Sequences': [
	        "Never even knew they existed until this moment",
	        "Know they existed, but never looked at them",
	        "Some, but less than 25%",
	        "About 25% of the Sequences",
	        "About 50% of the Sequences",
	        "About 75% of the Sequences",
	        "Nearly all of the Sequences",
	    ],
	}


	def show_breakdown(data, attribute, var):
	    """Print the mean of a numeric column for each value of a category column.

	    For ``'LessWrongUse'`` and ``'Sequences'`` the categories and their
	    order come from ``_ORDERED_CATEGORIES``. For any other attribute the
	    categories are the distinct non-blank values found in ``data``, sorted
	    so the output is the same on every run.

	    Rows whose ``var`` is the blank marker ``' '`` are left out of the
	    mean. A category with no numeric answers is printed as ``n/a`` instead
	    of raising ``ZeroDivisionError``.

	    Args:
	        data: Sequence of dict-like rows keyed by column name (iterated
	            more than once).
	        attribute: Name of the category column to group by.
	        var: Name of the numeric column to average.
	    """
	    print('Breakdown of average "%s" by "%s":' % (var, attribute))
	    if attribute in _ORDERED_CATEGORIES:
	        categories = _ORDERED_CATEGORIES[attribute]
	    else:
	        # Set difference (unlike set.remove) is fine when no row is blank.
	        # key=str keeps sorting safe if a short CSV row yielded None.
	        categories = sorted({row[attribute] for row in data} - {' '}, key=str)
	    for category in categories:
	        numbers = [
	            float(row[var])
	            for row in data
	            if row[attribute] == category and row[var] != ' '
	        ]
	        if numbers:
	            print("%10.4f    \"%s\"" % (sum(numbers) / len(numbers), category))
	        else:
	            print("%10s    \"%s\"" % ('n/a', category))
	    print(' ')

	# Load the survey export, keep the rows that pass the cleaning checks, then
	# print each correlation and breakdown report in turn.
	with open('for_public.csv', 'r') as infile:
	    lines = [
	        line for line in csv.DictReader(infile)
	        # Data cleaning: drop over-limit SAT scores and known junk entries.
	        if (line['SATscoresoutof2400'] == ' ' or float(line['SATscoresoutof2400']) <= 2400)
	        and (line['SATscoresoutof1600'] == ' ' or float(line['SATscoresoutof1600']) <= 1600)
	        and '1`8' not in line.values()
	        and '~5' not in line.values()
	    ]
	    show_correlation(lines, 'IQ', 'KarmaScore')
	    show_correlation(lines, 'SATscoresoutof1600', 'KarmaScore')
	    show_correlation(lines, 'SATscoresoutof2400', 'KarmaScore')
	    show_correlation(lines, 'TimeinCommunity', 'KarmaScore')
	    show_breakdown(lines, 'LessWrongUse', 'IQ')
	    show_breakdown(lines, 'LessWrongUse', 'SATscoresoutof1600')
	    show_breakdown(lines, 'LessWrongUse', 'SATscoresoutof2400')
	    show_breakdown(lines, 'Sequences', 'KarmaScore')
	    show_breakdown(lines, 'LessWrongUse', 'TimeinCommunity')
	    show_breakdown(lines, 'LessWrongUse', 'AutismScore')
	    show_breakdown(lines, 'Profession', 'KarmaScore')
	    show_breakdown(lines, 'Degree', 'KarmaScore')