Skip to content

Instantly share code, notes, and snippets.

@bsidhom
Created November 4, 2023 06:50
Show Gist options
  • Save bsidhom/986e6a3de408a98a99ca5015e9c01822 to your computer and use it in GitHub Desktop.
Save bsidhom/986e6a3de408a98a99ca5015e9c01822 to your computer and use it in GitHub Desktop.
Merge SSA baby names dataset into a flat CSV
#!/usr/bin/env python3
# Baby names zip can be downloaded from https://www.ssa.gov/oact/babynames/names.zip
import argparse
import csv
import io
import re
import sys
import zipfile
# Matches the per-year data files inside the archive (e.g. "yob1999.txt") and
# captures the four-digit year in the named group "year".
FILE_PATTERN = re.compile(r"^yob(?P<year>\d{4})\.txt$")
def main():
    """Parse command-line arguments and write the merged names CSV to stdout.

    Writes a ``year,name,sex,count`` header row, then delegates to
    ``dump_names`` to emit one row per record found in the zip archive
    given by the required ``--file`` argument.
    """
    parser = argparse.ArgumentParser(
        prog="babynames",
        description=
        "Merge US SSA baby names dataset into a flat CSV. See https://www.ssa.gov/oact/babynames/limits.html"
    )
    parser.add_argument("--file",
                        help="SSA zip file of baby name data",
                        required=True)
    args = parser.parse_args()

    # csv.writer emits its own "\r\n" row terminators. If stdout also performs
    # newline translation (the default on Windows), every row would end in
    # "\r\r\n". Disable translation when the stream supports it; streams
    # without ``reconfigure`` (e.g. a replaced sys.stdout) are left untouched.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(newline="")

    writer = csv.writer(sys.stdout)
    writer.writerow(("year", "name", "sex", "count"))
    dump_names(args.file, writer)
    sys.stdout.flush()
def dump_names(fname, writer):
    """Write every record from the per-year files of a zip archive to ``writer``.

    Archive members whose names do not match ``FILE_PATTERN`` are skipped.
    Each matching member is parsed as CSV whose first three fields are
    name, sex and count; a row ``(year, name, sex, count)`` is written for
    each record, with ``year`` taken from the member's file name and
    ``count`` converted to ``int``. Members are processed in the order
    reported by the archive.

    Args:
        fname: The zip archive, passed directly to ``zipfile.ZipFile``.
        writer: An object with a ``writerow`` method, such as ``csv.writer``.

    Raises:
        IndexError: If a non-empty record has fewer than three fields.
        ValueError: If a record's count field is not an integer.
    """
    with zipfile.ZipFile(fname) as archive:
        for member in archive.namelist():
            match = FILE_PATTERN.match(member)
            if not match:
                continue
            year = int(match.group("year"))
            # Decode explicitly instead of relying on the locale's default
            # encoding, and pass newline="" as the csv module recommends so
            # the reader handles line endings itself.
            with io.TextIOWrapper(archive.open(member),
                                  encoding="utf-8",
                                  newline="") as f:
                for record in csv.reader(f):
                    # csv.reader yields [] for blank lines; skip them rather
                    # than failing on record[0].
                    if not record:
                        continue
                    name = record[0]
                    sex = record[1]
                    count = int(record[2])
                    writer.writerow((year, name, sex, count))
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment