@NickCrews
Created March 12, 2024 17:27
This script takes the Federal Election Commission's weekly PostgreSQL dump file and converts it to a directory of parquet files, using an ephemeral PostgreSQL instance in a Docker container together with DuckDB.
#!/bin/bash
# This script takes the FEC's PostgreSQL dump file and converts it to a directory
# of parquet files.
# See https://cg-519a459a-0ea3-42c2-b7bc-fa1143481f74.s3-us-gov-west-1.amazonaws.com/bulk-downloads/index.html?prefix=bulk-downloads/data-dump/schedules/
# for the PostgreSQL dump file and more info.
#
# This requires:
# 1. Docker installed and running
# 2. The `duckdb` command line tool installed
# 3. Several hundred GB of free disk space
# 4. Either patience, or a decent amount of CPU and RAM
#
# Currently this only deals with schedule A filings. Modify this if you need others.
# Ways to improve this:
# - Containerize the `duckdb` command line tool. Then users only need docker.
# - Parallelize (everything?): for every table, spin up a new container
# and restore just that table, then write the parquet file.
# Currently it looks like only one CPU is maxed out at a time.
# - Convert to a python script and expose a python API to do conversion/filtering
# on the fly. This would probably speed up the writing of the parquet files.
# - Benchmark writing the tables to CSV directly with postgres, then converting to parquet
# with `duckdb`. This might be faster than streaming the records pg->duckdb,
# which I think happens one-by-one.
# - Include the step for downloading the pgdump file from the FEC's website.
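#
# Example invocation (the script filename and paths here are illustrative, not
# the official FEC file names):
#   ./fec_pgdump_to_parquet.sh schedules.pgdump ./fec_parquet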
# Check the arguments
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <pgdump_file> <output_directory>"
exit 1
fi
pgdump_file=$1
output_directory=$2
container_name=my_pg_container
username=myuser
ensure_pg_running() {
    # Check if the container is already running
    if [ "$(docker ps -q -f name=$container_name)" ]; then
        echo "PostgreSQL container is already running"
        return
    fi
    # Check if the container is stopped
    if [ "$(docker ps -aq -f status=exited -f name=$container_name)" ]; then
        echo "Starting the stopped PostgreSQL container"
        docker start $container_name
    else
        echo "PostgreSQL container is not running, starting a new container"
        docker run \
            --name $container_name \
            -e POSTGRES_USER=$username \
            -e POSTGRES_PASSWORD=mysecretpassword \
            -p 5432:5432 \
            -d \
            postgres
        echo "Creating schema and extensions in the database"
        # We need to run the following SQL commands to create the schema and extensions.
        # Ignore any errors you see here; they are expected.
        docker exec $container_name psql -U $username -d postgres -c "CREATE SCHEMA disclosure;"
        docker exec $container_name psql -U $username -d postgres -c "grant all on schema disclosure to public;"
        docker exec $container_name psql -U $username -d postgres -c "CREATE EXTENSION pg_trgm;"
        docker exec $container_name psql -U $username -d postgres -c "CREATE EXTENSION btree_gin;"
    fi
}
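
# Right after `docker run`, postgres may not be accepting connections yet, so the
# first psql/pg_restore call can race against server startup. A small wait loop
# like this sketch (not part of the original flow, and never called below) could
# be slotted in after `docker run`; pg_isready ships with the official postgres image.
wait_for_pg() {
    until docker exec $container_name pg_isready -U $username > /dev/null 2>&1; do
        echo "Waiting for PostgreSQL to accept connections..."
        sleep 1
    done
}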
restore_table() {
    if [ "$#" -ne 2 ]; then
        echo "Usage: $0 <dump_file> <table_name>"
        exit 1
    fi
    dump_file=$1
    table_name=$2
    if [ ! -f "$dump_file" ]; then
        echo "File $dump_file does not exist"
        exit 1
    fi
    ensure_pg_running
    echo "Restoring table $table_name from $dump_file"
    docker exec -i $container_name pg_restore \
        --verbose \
        --no-acl \
        --no-owner \
        -U $username \
        -d postgres \
        --table $table_name \
        < "$dump_file"
}
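
# For example, to restore just one yearly partition by hand (table names are
# listed near the bottom of this script):
#   restore_table "$pgdump_file" fec_fitem_sched_a_2021_2022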
parquet_path() {
    if [ "$#" -ne 1 ]; then
        echo "Usage: $0 <table_name>"
        exit 1
    fi
    echo "$output_directory/$1.parquet"
}
write_parquet() {
    if [ "$#" -ne 2 ]; then
        echo "Usage: $0 <table_name> <output_path>"
        exit 1
    fi
    table_name=$1
    out_path=$2
    directory=$(dirname "$out_path")
    if [ ! -d "$directory" ]; then
        echo "Directory $directory does not exist, creating"
        mkdir -p "$directory"
    fi
    # TODO: also containerize this using eg:
    # docker run -it --rm \
    #   -v $output_directory:/output \
    #   davidgasquez/duckdb \
    echo "Writing table $table_name to $out_path"
    duckdb -c "
    SET temp_directory='.duckdb_tmp/';
    -- Without this, duckdb has to load the whole thing to keep the order.
    -- This was running me out of memory.
    -- https://duckdb.org/docs/guides/performance/how_to_tune_workloads.html#the-preserve_insertion_order-option
    SET preserve_insertion_order = false;
    INSTALL postgres;
    ATTACH 'postgres://myuser:mysecretpassword@localhost:5432/postgres' AS p (TYPE postgres);
    COPY (SELECT * FROM p.disclosure.$table_name) TO '$out_path';
    "
}
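
# Note: DuckDB infers the Parquet format from the .parquet extension in the COPY
# above. To be explicit, or to pick a compression codec (ZSTD here is just a
# suggestion, not what the original used), the COPY line could instead read:
#   COPY (SELECT * FROM p.disclosure.$table_name)
#     TO '$out_path' (FORMAT PARQUET, COMPRESSION ZSTD);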
pgdump_to_parquets() {
    # $1 : pgdump file
    # $2 : output directory
    # rest: table names
    if [ "$#" -lt 3 ]; then
        echo "Usage: $0 <pgdump_file> <output_directory> <table1> <table2> ..."
        exit 1
    fi
    pgdump_file=$1
    output_directory=$2
    shift 2
    tables=("$@")
    ensure_pg_running
    for table in "${tables[@]}"; do
        p=$(parquet_path "$table")
        if [ -f "$p" ]; then
            echo "Parquet file $p already exists, skipping"
            continue
        fi
        restore_table "$pgdump_file" "$table"
        write_parquet "$table" "$p"
    done
}
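
# Optional cleanup once every parquet file has been written: the ephemeral
# postgres container (and the data restored into it) is no longer needed.
# This is not part of the original flow, so it is left commented out:
#   docker stop $container_name && docker rm $container_name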
tables=(
"fec_fitem_sched_a"
"fec_fitem_sched_a_1975_1976"
"fec_fitem_sched_a_1977_1978"
"fec_fitem_sched_a_1979_1980"
"fec_fitem_sched_a_1981_1982"
"fec_fitem_sched_a_1983_1984"
"fec_fitem_sched_a_1985_1986"
"fec_fitem_sched_a_1987_1988"
"fec_fitem_sched_a_1989_1990"
"fec_fitem_sched_a_1991_1992"
"fec_fitem_sched_a_1993_1994"
"fec_fitem_sched_a_1995_1996"
"fec_fitem_sched_a_1997_1998"
"fec_fitem_sched_a_1999_2000"
"fec_fitem_sched_a_2001_2002"
"fec_fitem_sched_a_2003_2004"
"fec_fitem_sched_a_2005_2006"
"fec_fitem_sched_a_2007_2008"
"fec_fitem_sched_a_2009_2010"
"fec_fitem_sched_a_2011_2012"
"fec_fitem_sched_a_2013_2014"
"fec_fitem_sched_a_2015_2016"
"fec_fitem_sched_a_2017_2018"
"fec_fitem_sched_a_2019_2020"
"fec_fitem_sched_a_2021_2022"
"fec_fitem_sched_a_2023_2024"
)
pgdump_to_parquets "$pgdump_file" "$output_directory" "${tables[@]}"
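
# Quick sanity check of the output, run after the script finishes (the glob
# assumes the default file names written by parquet_path above):
#   duckdb -c "SELECT count(*) FROM read_parquet('$output_directory/fec_fitem_sched_a_*.parquet');"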