Last active
December 10, 2015 01:19
-
-
Save johnmaxwelliv/4357424 to your computer and use it in GitHub Desktop.
Analysis of data here (http://lesswrong.com/lw/fp5/2012_survey_results/) for possible Dunning-Kruger effects.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scipy.stats.stats import pearsonr | |
import csv | |
def show_correlation(data, var1, var2):
    """Print the Pearson correlation between two columns of *data*.

    Rows in which either column holds the blank marker ``' '`` are skipped;
    every remaining value is converted with ``float``. The result line is
    followed by a separator line containing a single space.

    Args:
        data: Iterable of row mappings keyed by column name.
        var1: Name of the first column.
        var2: Name of the second column.

    Raises:
        KeyError: If a row lacks ``var1`` or ``var2``.
        ValueError: If a non-blank value cannot be parsed as a float.
    """
    missing = ' '
    pairs = [
        (float(line[var1]), float(line[var2]))
        for line in data
        if line[var1] != missing and line[var2] != missing
    ]
    if len(pairs) < 2:
        # A correlation needs at least two observations; report that rather
        # than letting pearsonr fail on a sparsely answered column.
        print('Correlation between "%s" and "%s": n/a (fewer than 2 complete rows)'
              % (var1, var2))
    else:
        xs, ys = zip(*pairs)
        r = pearsonr(xs, ys)[0]
        print('Correlation between "%s" and "%s": %.4f' % (var1, var2, r))
    print(' ')
def show_breakdown(data, attribute, var):
    """Print the mean of column *var* for each category of *attribute*.

    For ``'LessWrongUse'`` and ``'Sequences'`` the categories and their
    order come from the fixed lists below. For any other attribute the
    categories are the distinct values found in *data*, excluding the blank
    marker ``' '``, printed in sorted order so the output is reproducible.
    Rows whose *var* is blank are left out of the mean; a category with no
    usable values is printed as ``n/a`` instead of dividing by zero. The
    report ends with a separator line containing a single space.

    Args:
        data: Sequence of row mappings keyed by column name. It is iterated
            once per category, so it must be re-iterable (e.g. a list).
        attribute: Name of the categorical column to group by.
        var: Name of the numeric column to average.

    Raises:
        KeyError: If a row lacks ``attribute`` or ``var``.
        ValueError: If a non-blank *var* value cannot be parsed as a float.
    """
    missing = ' '
    ordered_categories = {
        'LessWrongUse': [
            "I lurk, but never registered an account",
            "I've registered an account, but never posted",
            "I've posted a comment, but never a top-level post",
            "I've posted in Discussion, but not Main",
            "I've posted in Main",
        ],
        'Sequences': [
            "Never even knew they existed until this moment",
            "Know they existed, but never looked at them",
            "Some, but less than 25%",
            "About 25% of the Sequences",
            "About 50% of the Sequences",
            "About 75% of the Sequences",
            "Nearly all of the Sequences",
        ],
    }

    print('Breakdown of average "%s" by "%s":' % (var, attribute))

    if attribute in ordered_categories:
        chars = ordered_categories[attribute]
    else:
        found = {line[attribute] for line in data}
        # discard, not remove: there may be no blank answers at all.
        found.discard(missing)
        # Set order is arbitrary; sort for stable output. key=str keeps the
        # sort from failing should a non-string value be present.
        chars = sorted(found, key=str)

    for char in chars:
        numbers = [
            float(line[var])
            for line in data
            if line[attribute] == char and line[var] != missing
        ]
        if numbers:
            print("%10.4f \"%s\"" % (sum(numbers) / len(numbers), char))
        else:
            # No row in this category has a value for var.
            print("%10s \"%s\"" % ('n/a', char))
    print(' ')
def _is_clean_row(row):
    """Return True if *row* passes the data-cleaning filters.

    A row is rejected when any of its values is exactly ``'1`8'`` or
    ``'~5'``, or when a non-blank SAT score exceeds the maximum for its
    scale (2400 or 1600). The junk markers are tested first so that
    ``float`` is never applied to a row that is going to be dropped anyway.
    """
    values = row.values()
    if '1`8' in values or '~5' in values:
        return False
    sat_limits = (('SATscoresoutof2400', 2400), ('SATscoresoutof1600', 1600))
    return all(
        row[column] == ' ' or float(row[column]) <= limit
        for column, limit in sat_limits
    )


# newline='' lets the csv module handle line endings itself, as its
# documentation asks for file objects passed to a reader.
with open('for_public.csv', 'r', newline='') as infile:
    lines = [line for line in csv.DictReader(infile) if _is_clean_row(line)]

# Correlations of each measure with karma.
for first, second in (
    ('IQ', 'KarmaScore'),
    ('SATscoresoutof1600', 'KarmaScore'),
    ('SATscoresoutof2400', 'KarmaScore'),
    ('TimeinCommunity', 'KarmaScore'),
):
    show_correlation(lines, first, second)

# Per-category averages, printed in this order.
for group_by, measure in (
    ('LessWrongUse', 'IQ'),
    ('LessWrongUse', 'SATscoresoutof1600'),
    ('LessWrongUse', 'SATscoresoutof2400'),
    ('Sequences', 'KarmaScore'),
    ('LessWrongUse', 'TimeinCommunity'),
    ('LessWrongUse', 'AutismScore'),
    ('Profession', 'KarmaScore'),
    ('Degree', 'KarmaScore'),
):
    show_breakdown(lines, group_by, measure)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment