Last active
February 11, 2016 21:55
-
-
Save kmod/ee6ac3c029641b39d0b6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
class ThisDialect(csv.excel):
    """csv.excel dialect variant that declares a bare carriage return as its line terminator."""

    lineterminator = '\r'
# Load both input tables fully into memory.  The files are opened in a
# `with` block so the handles are closed as soon as the rows have been read
# (the previous version left both files open for the life of the process).
#
# DictReader takes its field names from the first line of each file; the
# trailing [1:] then drops the first parsed record as well.
# NOTE(review): presumably that record is a second, descriptive header row
# in these exports -- confirm against the CSV files.
with open("job-name.csv", 'U') as job_file:
    jobnames = list(csv.DictReader(job_file, dialect=ThisDialect()))[1:]
with open("pair-count.csv", 'U') as pair_file:
    paircounts = list(csv.DictReader(pair_file, dialect=ThisDialect()))[1:]

# Map occupation code -> occupation name, both taken verbatim from job-name.csv.
code_to_name = {}
for j in jobnames:
    code_to_name[j['code']] = j['occupation']
    # Debugging aid for looking up an occupation code by name:
    # if "Nurse" in j['occupation']:
    #     print j['occupation'], j['code']
    #     1/0
# Sexes: 1 is male, 2 is female
#
# married_counts maps (spouse sex, spouse occupation code) to the summed
# 'total' column over all rows where the two partners have different sexes.
# t1 / t2 are the grand totals of those same rows for spouse sex '1' / '2'.
married_counts = {}
t1 = 0
t2 = 0
for p in paircounts:
    # Rows without a usable spouse occupation code are ignored.
    if p['occ_sp'] in ('0', ''):
        continue
    # Same-sex pairs are not handled by this script.
    if p['sex_sp'] == p['sex']:
        continue
    k = (p['sex_sp'], p['occ_sp'])
    # Parse the count once and reuse it for the per-key and overall sums.
    weight = float(p['total'])
    married_counts[k] = married_counts.get(k, 0) + weight
    if p['sex_sp'] == '1':
        t1 += weight
    elif p['sex_sp'] == '2':
        t2 += weight
# Print the ten spouse occupations with the largest summed totals among
# spouses of sex '2', each as a percentage of t2.
print("Most commonly-married female professions:")
most_common_female = sorted(
    ((tally, occ_code)
     for (spouse_sex, occ_code), tally in married_counts.items()
     if spouse_sex == '2'),
    reverse=True)
for count, code in most_common_female[:10]:
    # int() truncates the summed count before the percentage is taken.
    this_perc = 100.0 * int(count) / t2
    print("%s: %.1f%%" % (code_to_name[code], this_perc))
# Occupation code to analyse.  Exactly one assignment is active; the
# commented alternatives are other codes that can be swapped in.
# investigate_code = '10' # CEOs
investigate_code = '1010' # Programmers
# investigate_code = '3255' # Registered Nurses
# investigate_code = '4220' # Janitors
# investigate_code = '5700' # Secretaries

# MARRYING_SEX is matched against each row's 'sex' column and MARRIED_SEX
# against its 'sex_sp' column when the pairs are filtered further down.
MARRYING_SEX = '1'
MARRIED_SEX = '2'
assert MARRIED_SEX != MARRYING_SEX, "sorry the script needs to be updated to look at same-sex marriages"

# tx is the grand total that matches the chosen spouse sex.
if MARRIED_SEX == '2':
    tx = t2
else:
    tx = t1
# Gather (count, spouse occupation code) for every row where a person of
# MARRYING_SEX with occupation investigate_code has a spouse of MARRIED_SEX
# with a usable occupation code.
by_count = [
    (float(row['total']), row['occ_sp'])
    for row in paircounts
    if row['occ_sp'] not in ('0', '')
    and row['sex'] == MARRYING_SEX
    and row['sex_sp'] == MARRIED_SEX
    and row['occ'] == investigate_code
]
# Sum in file order, before sorting, so the float accumulation order is
# the same as a running total kept while reading the rows.
total = sum(pair[0] for pair in by_count)
by_count.sort(reverse=True)
# For each spouse occupation found for investigate_code, compare its share
# among these pairs with its share among all spouses of MARRIED_SEX.
#   normalized    -> (ratio of the two shares, count, code)
#   over_expected -> (share difference as a fraction of total, count, code)
# seen records every spouse occupation code encountered here.
seen = set()
normalized = []
over_expected = []
for count, code in by_count:
    this_perc = 100.0 * int(count) / total
    global_perc = married_counts[(MARRIED_SEX, code)] / tx * 100
    seen.add(code)
    # Count that would be expected if this group matched the overall share.
    expected_count = total * global_perc / 100
    over_expected.append(((count - expected_count) / total, count, code))
    ratio = this_perc / global_perc
    normalized.append((ratio, count, code))
# Top five spouse occupations by raw count: name, truncated count, share
# among these pairs, and share among all spouses of MARRIED_SEX.
print("Most common for %s to marry by count:" % code_to_name[investigate_code])
for count, code in by_count[:5]:
    this_perc = 100.0 * int(count) / total
    global_perc = married_counts[(MARRIED_SEX, code)] / tx * 100
    fields = (code_to_name[code], int(count), this_perc, global_perc)
    print("%s: %d, %.1f%%, %.1f%%" % fields)
# Top five spouse occupations ranked by how many times more common they are
# among these pairs than among all spouses of MARRIED_SEX.
print("Most common for %s to marry, normalized vs the population:" % code_to_name[investigate_code])
normalized.sort(reverse=True)
for mult, count, code in normalized[:5]:
    this_perc = 100.0 * int(count) / total
    # Divide by tx (the total for MARRIED_SEX), as the ranking itself was
    # computed, rather than by a hard-coded t2; the two only coincide while
    # MARRIED_SEX is '2'.
    global_perc = married_counts[(MARRIED_SEX, code)] / tx * 100
    print("%s: %.1fx (%.1f%% vs %.1f%%; %d found)" % (code_to_name[code], mult, this_perc, global_perc, count))
# Top five spouse occupations ranked by how far their share among these
# pairs exceeds their share among all spouses of MARRIED_SEX.
print("Most common for %s to marry, over expected:" % code_to_name[investigate_code])
over_expected.sort(reverse=True)
for over, count, code in over_expected[:5]:
    this_perc = 100.0 * int(count) / total
    # Divide by tx (the total for MARRIED_SEX), as the ranking itself was
    # computed, rather than by a hard-coded t2; the two only coincide while
    # MARRIED_SEX is '2'.
    global_perc = married_counts[(MARRIED_SEX, code)] / tx * 100
    print("%s: %.1f%% more than expected (%.1f%% vs %.1f%%)" % (code_to_name[code], 100 * over, this_perc, global_perc))
# Disabled diagnostic, kept inside a bare string literal so it never runs:
# lists spouse occupations of MARRIED_SEX that never appear for
# investigate_code, with their overall share.  It divides by tx so that it
# agrees with the other reports if it is re-enabled.
'''
for sex, code in married_counts:
    if sex != MARRIED_SEX:
        continue
    if code not in seen:
        print "didn't see:", code_to_name[code], married_counts[(MARRIED_SEX, code)] / tx * 100
'''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment