Skip to content

Instantly share code, notes, and snippets.

@kmod
Last active February 11, 2016 21:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kmod/ee6ac3c029641b39d0b6 to your computer and use it in GitHub Desktop.
Save kmod/ee6ac3c029641b39d0b6 to your computer and use it in GitHub Desktop.
import csv
class ThisDialect(csv.excel):
    """Excel-style CSV dialect that declares a bare carriage return as line end."""

    # NOTE(review): the csv module documentation says readers ignore
    # `lineterminator` (it only influences writers) -- confirm whether this
    # override is actually needed for the input files.
    lineterminator = '\r'
def _read_rows(path):
    """Return the rows of the CSV file at *path* as dicts, minus the first row.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the rows have been read (the original left both files open).

    NOTE(review): ``csv.DictReader`` already consumes the header line, so the
    ``[1:]`` additionally drops the first data row -- presumably a second
    header/description line in these files; confirm against the data.
    NOTE(review): mode ``'U'`` (universal newlines) is a Python 2 era flag and
    was removed in Python 3.11; revisit if this script is ever ported.
    """
    with open(path, 'U') as handle:
        return list(csv.DictReader(handle, dialect=ThisDialect()))[1:]


# One dict per row, keyed by the column names of each file.
jobnames = _read_rows("job-name.csv")
paircounts = _read_rows("pair-count.csv")

# Occupation code -> occupation name (a later duplicate code overrides an
# earlier one, exactly as repeated assignment would).
code_to_name = {job['code']: job['occupation'] for job in jobnames}
# Debugging aid: to find the code for an occupation by (part of) its name, e.g.
#     [(name, code) for code, name in code_to_name.items() if "Nurse" in name]
# Sexes: 1 is male, 2 is female
#
# married_counts maps (spouse sex, spouse occupation code) to the summed
# `total` column over all opposite-sex pairs.  t1 and t2 are the grand totals
# of those sums for spouse sex '1' and '2' respectively.
married_counts = {}
t1 = 0
t2 = 0
for p in paircounts:
    # Skip rows whose spouse occupation code is '0' or blank (presumably
    # "no occupation recorded" -- confirm against the data dictionary).
    # This single test also covers the old, unreachable `k[1] == '0'` check.
    if p['occ_sp'] in ('0', ''):
        continue
    # Only opposite-sex pairs are tallied.
    if p['sex_sp'] == p['sex']:
        continue
    # Parse the count once instead of once per use.
    weight = float(p['total'])
    k = (p['sex_sp'], p['occ_sp'])
    married_counts[k] = married_counts.get(k, 0) + weight
    if p['sex_sp'] == '1':
        t1 += weight
    elif p['sex_sp'] == '2':
        t2 += weight
# Report the ten spouse occupations with the largest totals among female
# spouses (sex code '2'), each as a percentage of the female-spouse total t2.
# print(...) with a single argument produces the same output as the Python 2
# print statement.
print("Most commonly-married female professions:")
most_common_female = sorted(
    ((tally, occ)
     for (spouse_sex, occ), tally in married_counts.items()
     if spouse_sex == '2'),
    reverse=True)
for tally, occ in most_common_female[:10]:
    # The tally is truncated to a whole number before the share is computed.
    share = 100.0 * int(tally) / t2
    print("%s: %.1f%%" % (code_to_name[occ], share))
print("")
# Occupation code under investigation; uncomment one alternative to switch.
# investigate_code = '10' # CEOs
investigate_code = '1010' # Programmers
# investigate_code = '3255' # Registered Nurses
# investigate_code = '4220' # Janitors
# investigate_code = '5700' # Secretaries

# Sex code of the person holding `investigate_code` and of their spouse.
MARRYING_SEX = '1'
MARRIED_SEX = '2'
assert MARRIED_SEX != MARRYING_SEX, "sorry the script needs to be updated to look at same-sex marriages"

# Grand total for the spouse sex, used as the denominator of global shares.
tx = t1 if MARRIED_SEX != '2' else t2

# Collect (count, spouse occupation) for every pair in which a person of sex
# MARRYING_SEX with the investigated occupation has a spouse of sex
# MARRIED_SEX with a usable occupation code; `total` is the sum of the counts.
by_count = []
total = 0
for row in paircounts:
    if row['occ_sp'] in ('0', ''):
        continue
    if not (row['sex'] == MARRYING_SEX and row['sex_sp'] == MARRIED_SEX):
        continue
    if row['occ'] != investigate_code:
        continue
    weight = float(row['total'])
    by_count.append((weight, row['occ_sp']))
    total += weight
by_count.sort(reverse=True)

# For each spouse occupation derive two rankings:
#   normalized    - ratio of its share within this group to its global share
#   over_expected - (count - expected count at the global share) / total
seen = set()
normalized = []
over_expected = []
for weight, spouse_occ in by_count:
    local_share = 100.0 * int(weight) / total
    global_share = married_counts[(MARRIED_SEX, spouse_occ)] / tx * 100
    seen.add(spouse_occ)
    excess = (weight - total * global_share / 100) / total
    over_expected.append((excess, weight, spouse_occ))
    normalized.append((local_share / global_share, weight, spouse_occ))
# Top five spouse occupations by raw count.  Each line shows the occupation
# name, the truncated count, its share within the investigated group and its
# share among all spouses of sex MARRIED_SEX.
print("Most common for %s to marry by count:" % code_to_name[investigate_code])
for weight, spouse_occ in by_count[:5]:
    whole = int(weight)
    local_share = 100.0 * whole / total
    global_share = married_counts[(MARRIED_SEX, spouse_occ)] / tx * 100
    print("%s: %d, %.1f%%, %.1f%%" % (code_to_name[spouse_occ], whole, local_share, global_share))
print("")
# Both reports below show two shares for each spouse occupation:
#   this_perc   - share among spouses of people with `investigate_code`
#   global_perc - share among all spouses of sex MARRIED_SEX
# global_perc is divided by `tx` (the grand total for MARRIED_SEX), the same
# denominator used when `normalized` and `over_expected` were computed.  The
# previous hard-coded `t2` gave a wrong percentage whenever MARRIED_SEX was
# not '2'.

# Top five by ratio of in-group share to global share.
print("Most common for %s to marry, normalized vs the population:" % code_to_name[investigate_code])
normalized.sort(reverse=True)
for mult, count, code in normalized[:5]:
    this_perc = 100.0 * int(count) / total
    global_perc = married_counts[(MARRIED_SEX, code)] / tx * 100
    print("%s: %.1fx (%.1f%% vs %.1f%%; %d found)" % (code_to_name[code], mult, this_perc, global_perc, count))
print("")

# Top five by excess over the count expected from the global share,
# expressed as a fraction of the group's total.
print("Most common for %s to marry, over expected:" % code_to_name[investigate_code])
over_expected.sort(reverse=True)
for over, count, code in over_expected[:5]:
    this_perc = 100.0 * int(count) / total
    global_perc = married_counts[(MARRIED_SEX, code)] / tx * 100
    print("%s: %.1f%% more than expected (%.1f%% vs %.1f%%)" % (code_to_name[code], 100 * over, this_perc, global_perc))
print("")
# Disabled diagnostic: the code is wrapped in a bare string literal, so it is
# never executed.  It would list spouse occupations of sex MARRIED_SEX that
# never appeared in `seen`, with their global share.
# NOTE(review): the text below has lost its indentation and divides by `t2`
# rather than `tx`; both would need fixing before re-enabling it.
'''
for sex, code in married_counts:
if sex != MARRIED_SEX:
continue
if code not in seen:
print "didn't see:", code_to_name[code], married_counts[(MARRIED_SEX, code)] / t2 * 100
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment