Created
November 4, 2023 06:50
-
-
Save bsidhom/986e6a3de408a98a99ca5015e9c01822 to your computer and use it in GitHub Desktop.
Merge SSA baby names dataset into a flat CSV
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Baby names zip can be downloaded from https://www.ssa.gov/oact/babynames/names.zip
import argparse
import csv
import io
import re
import sys
import zipfile
# Matches archive members named "yobYYYY.txt"; the named group "year"
# captures the four digits.
FILE_PATTERN = re.compile(r"^yob(?P<year>\d{4})\.txt$")
def main(argv=None):
    """Parse the command line and print the merged dataset as CSV on stdout.

    A header row ``year,name,sex,count`` is written first, followed by the
    rows produced by ``dump_names`` for the archive given via ``--file``.

    Args:
        argv: Optional list of command-line arguments. When ``None``,
            argparse reads ``sys.argv[1:]`` as before.

    Raises:
        SystemExit: Raised by argparse when ``--file`` is missing or when
            ``--help`` is requested.
    """
    parser = argparse.ArgumentParser(
        prog="babynames",
        description=
        "Merge US SSA baby names dataset into a flat CSV. See https://www.ssa.gov/oact/babynames/limits.html"
    )
    parser.add_argument(
        "--file",
        help="SSA zip file of baby name data",
        required=True,
    )
    args = parser.parse_args(argv)

    writer = csv.writer(sys.stdout)
    writer.writerow(("year", "name", "sex", "count"))
    dump_names(args.file, writer)
    sys.stdout.flush()
def dump_names(fname, writer):
    """Write every record of the baby-names archive to ``writer``.

    Each zip member whose name matches ``FILE_PATTERN`` is read as CSV. The
    first three fields of every record are taken as name, sex and count and
    emitted as a ``(year, name, sex, count)`` row, where the year comes from
    the member's file name. Members that do not match are skipped, and so
    are blank lines inside a member.

    Args:
        fname: Zip archive location, passed straight to ``zipfile.ZipFile``.
        writer: Object with a ``writerow`` method, e.g. a ``csv.writer``.

    Raises:
        ValueError: If a record's third field is not an integer.
        IndexError: If a non-empty record has fewer than three fields.
    """
    with zipfile.ZipFile(fname) as archive:
        for member in archive.namelist():
            matched = FILE_PATTERN.match(member)
            if not matched:
                continue
            year = int(matched.group("year"))
            # Decode explicitly instead of relying on the locale default, and
            # pass newline="" as the csv module asks for its input files.
            # NOTE(review): the data is assumed to be ASCII/UTF-8 - confirm.
            with io.TextIOWrapper(
                archive.open(member), encoding="utf-8", newline=""
            ) as text:
                for record in csv.reader(text):
                    if not record:
                        # csv.reader yields an empty list for a blank line.
                        continue
                    name, sex = record[0], record[1]
                    count = int(record[2])
                    writer.writerow((year, name, sex, count))
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment