| from scipy.stats.stats import pearsonr | |
| import csv | |
def show_correlation(data, var1, var2):
    """Print the Pearson correlation between two columns of ``data``.

    Rows in which either column holds the blank marker ``' '`` are left out;
    the remaining values are converted with ``float`` and handed to
    ``pearsonr``. The coefficient is printed with four decimals, followed by
    a separator line.

    Args:
        data: Iterable of mapping rows keyed by column name.
        var1: Name of the first column.
        var2: Name of the second column.

    Raises:
        KeyError: If a row lacks ``var1`` or ``var2``.
        ValueError: If a non-blank value cannot be converted to ``float``.
    """
    pairs = [
        (float(row[var1]), float(row[var2]))
        for row in data
        if row[var1] != ' ' and row[var2] != ' '
    ]
    if len(pairs) < 2:
        # pearsonr rejects inputs with fewer than two observations; report
        # the gap instead of aborting the remaining reports.
        print('Correlation between "%s" and "%s": n/a' % (var1, var2))
    else:
        xs, ys = zip(*pairs)
        r = pearsonr(xs, ys)[0]
        print('Correlation between "%s" and "%s": %.4f' % (var1, var2, r))
    print(' ')
def show_breakdown(data, attribute, var):
    """Print the mean of column ``var`` for each category of ``attribute``.

    For ``'LessWrongUse'`` and ``'Sequences'`` the categories come from a
    fixed, hand-ordered list. For any other attribute they are the distinct
    non-blank values found in ``data``, sorted so the output order is the
    same on every run. Values of ``var`` equal to the blank marker ``' '``
    are ignored; a category with no usable values is reported as ``n/a``.

    Args:
        data: Re-iterable collection of mapping rows keyed by column name.
        attribute: Name of the categorical column to group by.
        var: Name of the numeric column to average.

    Raises:
        KeyError: If a row lacks ``attribute`` or ``var``.
        ValueError: If a non-blank ``var`` value cannot be converted to
            ``float``.
    """
    print('Breakdown of average "%s" by "%s":' % (var, attribute))
    if attribute == 'LessWrongUse':
        chars = [
            "I lurk, but never registered an account",
            "I've registered an account, but never posted",
            "I've posted a comment, but never a top-level post",
            "I've posted in Discussion, but not Main",
            "I've posted in Main",
        ]
    elif attribute == 'Sequences':
        chars = [
            "Never even knew they existed until this moment",
            "Know they existed, but never looked at them",
            "Some, but less than 25%",
            "About 25% of the Sequences",
            "About 50% of the Sequences",
            "About 75% of the Sequences",
            "Nearly all of the Sequences",
        ]
    else:
        # Set difference tolerates a column without any blank entry, unlike
        # set.remove. key=str keeps the sort working if a value is not a
        # string (e.g. None).
        chars = sorted({row[attribute] for row in data} - {' '}, key=str)
    for char in chars:
        numbers = [
            float(row[var])
            for row in data
            if row[attribute] == char and row[var] != ' '
        ]
        if numbers:
            print("%10.4f \"%s\"" % (sum(numbers) / len(numbers), char))
        else:
            # No respondent in this category supplied a value; avoid
            # dividing by zero.
            print("%10s \"%s\"" % ('n/a', char))
    print(' ')
def _passes_cleaning(row):
    """Data cleaning: keep a row only if its SAT scores are blank or within
    range and none of its values is one of the known junk entries."""
    for column, ceiling in (('SATscoresoutof2400', 2400),
                            ('SATscoresoutof1600', 1600)):
        score = row[column]
        if not (score == ' ' or float(score) <= ceiling):
            return False
    return '1`8' not in row.values() and '~5' not in row.values()


# Read the survey export and keep only the rows that pass cleaning.
with open('for_public.csv', 'r') as infile:
    lines = [row for row in csv.DictReader(infile) if _passes_cleaning(row)]

# Pairwise correlations, reported in this order.
for first, second in (
    ('IQ', 'KarmaScore'),
    ('SATscoresoutof1600', 'KarmaScore'),
    ('SATscoresoutof2400', 'KarmaScore'),
    ('TimeinCommunity', 'KarmaScore'),
):
    show_correlation(lines, first, second)

# Per-category averages as (grouping column, averaged column), in this order.
for grouping, measured in (
    ('LessWrongUse', 'IQ'),
    ('LessWrongUse', 'SATscoresoutof1600'),
    ('LessWrongUse', 'SATscoresoutof2400'),
    ('Sequences', 'KarmaScore'),
    ('LessWrongUse', 'TimeinCommunity'),
    ('LessWrongUse', 'AutismScore'),
    ('Profession', 'KarmaScore'),
    ('Degree', 'KarmaScore'),
):
    show_breakdown(lines, grouping, measured)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment