Skip to content

Instantly share code, notes, and snippets.

@apendleton
Created May 8, 2012 19:57
Show Gist options
  • Save apendleton/2638865 to your computer and use it in GitHub Desktop.
Save apendleton/2638865 to your computer and use it in GitHub Desktop.
Optimal alphabetical breakdown of registration lines
"""
This script ingests a CSV of last names and builds JSON output on the
percentage of last names that begin with each letter of the alphabet.
I initially ran it on a CSV-ified version of the US Census's list of all last
names with more than 100 occurrences, and used the frequency field within that
file to weigh the output, but absent that field, it assigns equal weight to
each name, allowing us to also process our TCamp 2012 attendance list.
Using the script on the Census data available in:
http://www.census.gov/genealogy/www/data/2000surnames/names.zip
produces this output:
{
"A": 0.03559596939643605,
"C": 0.07694824074741784,
"B": 0.08791477424094518,
"E": 0.018669123556424614,
"D": 0.04579860597727158,
"G": 0.05439868577642196,
"F": 0.034505834128967044,
"I": 0.003955862892091837,
"H": 0.07268753217954918,
"K": 0.03294327720020791,
"J": 0.03025983302791189,
"M": 0.09608053808826617,
"L": 0.04848650695769241,
"O": 0.014720280137130485,
"N": 0.018547007013788936,
"Q": 0.002206565702878153,
"P": 0.049319930077139015,
"S": 0.09580978699464698,
"R": 0.05763521983707609,
"U": 0.002210911090800405,
"T": 0.035309842314786705,
"W": 0.05861438058222256,
"V": 0.015875707643636012,
"Y": 0.006166216881876424,
"X": 0.00024144758019273616,
"Z": 0.005097919974221793
}
"""
from csv import DictReader
from collections import defaultdict
import json
import sys
def getitem(l, p, default=None):
    """Safe positional lookup: return l[p], or *default* when the index
    is out of range.

    Mirrors dict.get() for sequences, used here to read optional
    command-line arguments with fallbacks.
    """
    # Negative indices are valid in Python, so accept the full [-len, len) range.
    if -len(l) <= p < len(l):
        return l[p]
    return default
# Read the CSV of last names. Input path and name-column are optional
# positional arguments; defaults match the Census "app_c.csv" layout.
# `with` ensures the file handle is closed (the original leaked it).
with open(getitem(sys.argv, 1, "app_c.csv"), "r") as infile:
    field = getitem(sys.argv, 2, "name")
    data = defaultdict(float)
    for row in DictReader(infile):
        first_letter = row[field][0].upper()
        if first_letter.isalpha():
            # Weight by the Census per-100k frequency when the column is
            # present; otherwise every name counts equally.
            data[first_letter] += float(row.get('prop100k', 1))

# Normalize counts into fractions of the total so the values sum to 1.
# .items() / print(...) instead of .iteritems() / print statement so the
# script also runs under Python 3.
total = sum(data.values())
out = {letter: (count / total) for letter, count in data.items()}
print(json.dumps(out, indent=4))
"""
This file consumes the output of the letter breakdown script and explores all
of the possible partitions of the alphabet, selecting the partition that
minimizes the standard deviation of percentages of the alphabet of each
partition. Its output is the winning standard deviation, the positions in the
alphabet where each partition occurs, and the ranges for each partition, as
well as the total percentage for each range. Using the US Census alphabet data
and partitioning it in three places (i.e., into four lines) looks
like this:
$ python partition.py 3 census_letters.json
0.0171285985132
[4, 11, 17]
A-D (24.6%)
E-K (24.7%)
L-Q (22.9%)
R-Z (27.7%)
"""
import json
import sys
try:
import numpy
except ImportError:
import numpypy as numpy
# Load the per-letter percentage JSON produced by the breakdown script.
# `with` ensures the file handle is closed (json.load(open(...)) leaked it).
with open(sys.argv[2]) as score_file:
    scores = json.load(score_file)

letters = [chr(num) for num in range(ord('A'), ord('Z') + 1)]
# Guarantee every letter has an entry so the partition sums never KeyError
# on letters absent from the input data.
for letter in letters:
    scores.setdefault(letter, 0)
def get_partitions(start, num_partitions):
    """Yield every way of placing `num_partitions` cuts after index `start`.

    Each yielded pair is (cut_positions, segment_totals): the indices into
    the module-level `letters` list where each new line begins, and the
    summed `scores` of every resulting segment (including the final tail).
    """
    for cut in range(start + 1, 26):
        head_total = sum(scores[ch] for ch in letters[start:cut])
        if num_partitions == 1:
            # Base case: one cut splits the remainder into head + tail.
            tail_total = sum(scores[ch] for ch in letters[cut:])
            yield [cut], [head_total, tail_total]
        else:
            # Recurse to place the remaining cuts after this one.
            for tail_cuts, tail_totals in get_partitions(cut, num_partitions - 1):
                yield [cut] + tail_cuts, [head_total] + tail_totals
# Exhaustively search every partitioning for the one whose per-line
# percentage totals are most evenly balanced (lowest standard deviation).
# float('inf') replaces the arbitrary 1000000 sentinel.
min_std = float('inf')
solution = None
for partitions, totals in get_partitions(0, int(sys.argv[1])):
    std = numpy.std(totals)
    if std < min_std:
        min_std = std
        solution = partitions

# Single-argument print(...) works identically under Python 2 and 3.
print(min_std)
print(solution)

# Pad the cut list with the alphabet's endpoints, then report each
# resulting letter range with its share of the total.
ext_solution = [0] + solution + [26]
for i in range(len(ext_solution) - 1):
    alpha = letters[ext_solution[i]:ext_solution[i + 1]]
    # Original computed this sum twice and ignored the first result;
    # compute once and reuse it in the report line.
    score = sum(scores[letter] for letter in alpha)
    print('%s-%s (%s%%)' % (alpha[0], alpha[-1], round(score * 100, 1)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment