Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Optimal alphabetical breakdown of registration lines
"""
This script ingests a CSV of last names and builds JSON output on the
percentage of last names that begin with each letter of the alphabet.
I initially ran it on a CSV-ified version of the US Census's list of all last
names with more than 100 occurrences, and used the frequency field within that
file to weigh the output, but absent that field, it assigns equal weight to
each name, allowing us to also process our TCamp 2012 attendance list.
Using the script on the Census data available in:
http://www.census.gov/genealogy/www/data/2000surnames/names.zip
produces this output:
{
"A": 0.03559596939643605,
"C": 0.07694824074741784,
"B": 0.08791477424094518,
"E": 0.018669123556424614,
"D": 0.04579860597727158,
"G": 0.05439868577642196,
"F": 0.034505834128967044,
"I": 0.003955862892091837,
"H": 0.07268753217954918,
"K": 0.03294327720020791,
"J": 0.03025983302791189,
"M": 0.09608053808826617,
"L": 0.04848650695769241,
"O": 0.014720280137130485,
"N": 0.018547007013788936,
"Q": 0.002206565702878153,
"P": 0.049319930077139015,
"S": 0.09580978699464698,
"R": 0.05763521983707609,
"U": 0.002210911090800405,
"T": 0.035309842314786705,
"W": 0.05861438058222256,
"V": 0.015875707643636012,
"Y": 0.006166216881876424,
"X": 0.00024144758019273616,
"Z": 0.005097919974221793
}
"""
from csv import DictReader
from collections import defaultdict
import json
import sys
def getitem(l, p, default=None):
    """Return ``l[p]``, falling back to ``default`` when ``p`` is out of range.

    Only ``IndexError`` is absorbed; any other lookup failure propagates to
    the caller.
    """
    try:
        value = l[p]
    except IndexError:
        value = default
    return value
# Usage: python <script> [csv_path] [name_field]
# csv_path defaults to "app_c.csv" and name_field defaults to "name".
field = getitem(sys.argv, 2, "name")

# Weighted tally of names keyed by their upper-cased first letter.
data = defaultdict(float)

# The with-block guarantees the CSV is closed once every row is consumed.
with open(getitem(sys.argv, 1, "app_c.csv"), "r") as csv_file:
    c = DictReader(csv_file)
    for row in c:
        name = row[field]
        if not name:
            # Blank or missing name cell: there is no first letter to index,
            # so skip the row instead of crashing on name[0].
            continue
        first_letter = name[0].upper()
        if first_letter.isalpha():
            # Rows are weighted by 'prop100k' when that column is present;
            # otherwise every name counts once.
            data[first_letter] += float(row.get('prop100k', 1))

# Normalise the tallies so each value is that letter's share of the total.
total = sum(data.values())
out = {letter: (count / total) for letter, count in data.items()}
# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(json.dumps(out, indent=4))
"""
This file consumes the output of the letter breakdown script and explores all
of the possible partitions of the alphabet, selecting the partition that
minimizes the standard deviation of percentages of the alphabet of each
partition. Its output is the winning standard deviation, the positions in the
alphabet where each partition occurs, and the ranges for each partition, as
well as the total percentage for each range. Using the US Census alphabet data
and partitioning it in three places (i.e., into four lines) looks
like this:
$ python partition.py 3 census_letters.json
0.0171285985132
[4, 11, 17]
A-D (24.6%)
E-K (24.7%)
L-Q (22.9%)
R-Z (27.7%)
"""
import json
import sys
try:
import numpy
except ImportError:
import numpypy as numpy
# Load the per-letter shares from the JSON file named by the second
# command-line argument; the with-block closes the file after parsing.
with open(sys.argv[2]) as scores_file:
    scores = json.load(scores_file)

# 'A'..'Z' in alphabetical order.
letters = [chr(num) for num in range(ord('A'), ord('Z') + 1)]

# Letters absent from the input get a share of zero so every lookup succeeds.
for letter in letters:
    scores.setdefault(letter, 0)
def get_partitions(start, num_partitions):
    """Yield every way to place ``num_partitions`` cuts after index ``start``.

    Reads the module-level ``letters`` sequence and ``scores`` mapping.

    Each yielded item is a ``(cuts, totals)`` pair: ``cuts`` is the strictly
    increasing list of cut indices into ``letters`` and ``totals`` is the
    list of summed scores for the ``num_partitions + 1`` ranges those cuts
    create between ``start`` and the end of ``letters``.  Candidates are
    produced in lexicographic order of ``cuts``.

    Nothing is yielded when ``num_partitions`` is less than 1 or when there
    are not enough letters after ``start`` to place that many cuts.
    """
    if num_partitions < 1:
        # No cut to place; stop here rather than recursing to no effect.
        return

    # A cut at index i must leave at least one letter for each of the
    # remaining (num_partitions - 1) cuts, so i can be at most
    # len(letters) - num_partitions.
    for i in range(start + 1, len(letters) - num_partitions + 1):
        front_score = sum(scores[letter] for letter in letters[start:i])
        if num_partitions == 1:
            back_score = sum(scores[letter] for letter in letters[i:])
            yield [i], [front_score, back_score]
        else:
            for sub_cuts, sub_totals in get_partitions(i, num_partitions - 1):
                yield [i] + sub_cuts, [front_score] + sub_totals
# Score every candidate partition and keep the one whose per-range totals
# have the smallest standard deviation.  Starting from infinity means any
# finite deviation can win, whatever the scale of the input scores.
min_std = float('inf')
solution = None
for partitions, totals in get_partitions(0, int(sys.argv[1])):
    std = numpy.std(totals)
    # Strict comparison: on ties the first partition generated is kept.
    if std < min_std:
        min_std = std
        solution = partitions

if solution is None:
    # No candidate was produced for the requested number of cuts; stop with
    # a clear message instead of failing on None further down.
    sys.exit('No valid partition for %s cut(s)' % sys.argv[1])

# Parenthesised single-argument prints behave the same on Python 2 and 3.
print(min_std)
print(solution)

# Add the outer edges so each consecutive pair of positions bounds one range.
ext_solution = [0] + solution + [len(letters)]
for begin, end in zip(ext_solution, ext_solution[1:]):
    alpha = letters[begin:end]
    score = sum(scores[letter] for letter in alpha)
    print('%s-%s (%s%%)' % (alpha[0], alpha[-1], round(score * 100, 1)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment