Optimal alphabetical breakdown of registration lines
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This script ingests a CSV of last names and builds JSON output on the | |
percentage of last names that begin with each letter of the alphabet. | |
I initially ran it on a CSV-ified version of the US Census's list of all last | |
names with more than 100 occurrences, and used the frequency field within that | |
file to weigh the output, but absent that field, it assigns equal weight to | |
each name, allowing us to also process our TCamp 2012 attendance list.
Using the script on the Census data available in: | |
http://www.census.gov/genealogy/www/data/2000surnames/names.zip | |
produces this output: | |
{ | |
"A": 0.03559596939643605, | |
"C": 0.07694824074741784, | |
"B": 0.08791477424094518, | |
"E": 0.018669123556424614, | |
"D": 0.04579860597727158, | |
"G": 0.05439868577642196, | |
"F": 0.034505834128967044, | |
"I": 0.003955862892091837, | |
"H": 0.07268753217954918, | |
"K": 0.03294327720020791, | |
"J": 0.03025983302791189, | |
"M": 0.09608053808826617, | |
"L": 0.04848650695769241, | |
"O": 0.014720280137130485, | |
"N": 0.018547007013788936, | |
"Q": 0.002206565702878153, | |
"P": 0.049319930077139015, | |
"S": 0.09580978699464698, | |
"R": 0.05763521983707609, | |
"U": 0.002210911090800405, | |
"T": 0.035309842314786705, | |
"W": 0.05861438058222256, | |
"V": 0.015875707643636012, | |
"Y": 0.006166216881876424, | |
"X": 0.00024144758019273616, | |
"Z": 0.005097919974221793 | |
} | |
""" | |
from csv import DictReader | |
from collections import defaultdict | |
import json | |
import sys | |
def getitem(l, p, default=None):
    """Return ``l[p]``, or ``default`` when ``p`` is out of range.

    This script uses it to read optional positional arguments from
    ``sys.argv`` without first checking how many were supplied.

    l -- the sequence to index
    p -- the position to look up
    default -- value returned when indexing raises IndexError
    """
    try:
        return l[p]
    except IndexError:
        return default
# Command line: [csv_path] [name_column]; both arguments are optional.
csv_path = getitem(sys.argv, 1, "app_c.csv")
field = getitem(sys.argv, 2, "name")

# Accumulated weight of all names sharing each initial letter.
data = defaultdict(float)
with open(csv_path, "r") as csv_file:
    c = DictReader(csv_file)
    for row in c:
        name = row[field]
        if not name:
            # A blank cell has no first letter; indexing it would raise.
            continue
        first_letter = name[0].upper()
        if first_letter.isalpha():
            # Weigh by the 'prop100k' column when the CSV has one;
            # otherwise every name counts once.
            data[first_letter] += float(row.get('prop100k', 1))

# Normalize so that the per-letter shares sum to 1.
total = sum(data.values())
if total:
    out = {letter: (count / total) for letter, count in data.items()}
else:
    # No usable rows, or every weight was zero: there are no shares to report.
    out = {}
print(json.dumps(out, indent=4))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This file consumes the output of the letter breakdown script and explores all | |
of the possible partitions of the alphabet, selecting the partition that | |
minimizes the standard deviation of percentages of the alphabet of each | |
partition. Its output is the winning standard deviation, the positions in the | |
alphabet where each partition occurs, and the ranges for each partition, as | |
well as the total percentage for each range. Using the US Census alphabet data | |
and partitioning it in three places (i.e., into four lines) looks
like this:
$ python partition.py 3 census_letters.json | |
0.0171285985132 | |
[4, 11, 17] | |
A-D (24.6%) | |
E-K (24.7%) | |
L-Q (22.9%) | |
R-Z (27.7%) | |
""" | |
import json | |
import sys | |
try: | |
import numpy | |
except ImportError: | |
import numpypy as numpy | |
# Per-letter shares, loaded from the JSON file named by the second argument.
with open(sys.argv[2]) as scores_file:
    scores = json.load(scores_file)

letters = [chr(num) for num in range(ord('A'), ord('Z') + 1)]

# Letters missing from the input contribute nothing to any range.
for letter in letters:
    scores.setdefault(letter, 0)
def get_partitions(start, num_partitions):
    """Yield every way to cut ``letters[start:]`` at ``num_partitions`` points.

    Reads the module-level ``letters`` list and ``scores`` mapping.

    start -- index into ``letters`` where the first range begins
    num_partitions -- number of cut points to place (expected to be >= 1)

    Each yielded item is a ``(cuts, totals)`` pair. ``cuts`` is the ascending
    list of indexes into ``letters`` at which a new range begins. ``totals``
    lists the summed ``scores`` of the ``num_partitions + 1`` resulting ranges
    in alphabetical order. Every range holds at least one letter, so nothing
    is yielded when too few letters remain after ``start``.
    """
    # Use the real alphabet length rather than a literal 26, so the cut
    # positions stay valid for whatever ``letters`` actually contains.
    end = len(letters)

    def range_score(lo, hi):
        # Combined score of letters[lo:hi].
        return sum(scores[letter] for letter in letters[lo:hi])

    for i in range(start + 1, end):
        front_score = range_score(start, i)
        if num_partitions == 1:
            # Last cut: whatever follows position i forms the final range.
            yield [i], [front_score, range_score(i, end)]
        else:
            # Place the remaining cuts to the right of position i.
            for cuts, totals in get_partitions(i, num_partitions - 1):
                yield [i] + cuts, [front_score] + totals
# Score every candidate split and keep the one whose range totals are the
# most even, i.e. have the smallest standard deviation. Ties keep the first
# candidate found.
min_std = float('inf')
solution = None
for partitions, totals in get_partitions(0, int(sys.argv[1])):
    std = numpy.std(totals)
    if std < min_std:
        min_std = std
        solution = partitions

if solution is None:
    # get_partitions yielded nothing, e.g. more cut points than letters allow.
    sys.exit("no way to split %d letters at %s points"
             % (len(letters), sys.argv[1]))

print(min_std)
print(solution)

# Add the two outer boundaries so consecutive pairs delimit each range.
ext_solution = [0] + solution + [len(letters)]
for first, last in zip(ext_solution, ext_solution[1:]):
    alpha = letters[first:last]
    score = sum(scores[letter] for letter in alpha)
    print('%s-%s (%s%%)' % (alpha[0], alpha[-1], round(score * 100, 1)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment