Postgres Dump Sanitizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# We'd like to have a sanitized/elided copy of the database suitable | |
# for letting various people run reporting against. Thus, we need to | |
# strip out sensitive or noisy data. | |
# | |
# One "obvious" way to do that is with a table by table | |
# pg_dump. Unfortunatly, that ends up omiting the non-table data (like | |
# views) which we do need. thus, we have this awkward script. | |
# | |
# It's designed to take a full pg_dump, the same that the backups use, | |
# and transform it to being suitable for the reporting database. | |
# | |
# depending on where the bottle necks are, we can run this in a bigger | |
# pipeline with tee. | |
# | |
# We ended up with perl, because the column order changes, and | |
# accounting for that in awk was just getting too hairy. Of course, | |
# this is pretty ugly perl. | |
use strict; | |
use warnings; | |
use Data::Dumper; | |
my $debug = 0; | |
# state variables. | |
my %CLEAN_ACTIONS = (); | |
# The logic: | |
# The data comes in blocks. The end with \. and start with COPY <table name> | |
# When we get the header for a table we want to elide do 2 things: | |
# a. figure out what column numbers we're eliding | |
# b. set a flag saying to elide stuff | |
# Then when we get to outputting, pay attention to said flag | |
# | |
# We need to get a little clever so the clean action only takes effect *next* loop. | |
while(<>) { | |
# if this matches \., then it's an end of data block. | |
# reset CLEAN_ACTIONS | |
if( m{^\s*\\\.\s$} ) { | |
print STDERR "CLEANER: line matches \\. end of block\n" if($debug); | |
%CLEAN_ACTIONS = (); | |
print $_; | |
next; | |
} | |
# What to do for the ccs table. Elide the full number, and 1234 the last4 column | |
elsif( /^COPY credit_cards \((.*)\)/ ) { | |
print STDERR "CLEANER: line matches ccs table.\n" if($debug); | |
my @columns = split(/,\s*/, $1); | |
for my $i (0..$#columns) { | |
push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^fulldata$/); | |
push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^fulldata_hash$/); | |
push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^fulldata_salt$/); | |
push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^source_id$/); | |
} | |
print $_; | |
next; | |
} | |
# don't output anything for the log_entries table | |
elsif( /^COPY logs / ) { | |
print STDERR "CLEANER: line matches logs\n" if($debug); | |
$CLEAN_ACTIONS{"skip_lines"} = 1; | |
print $_; | |
next; | |
} | |
# don't output anything for the secrets table | |
elsif( /^COPY secrets / ) { | |
print STDERR "CLEANER: line matches secrets\n" if($debug); | |
$CLEAN_ACTIONS{"skip_lines"} = 1; | |
print $_; | |
next; | |
} | |
# elide a couple columns off payment_transactions | |
elsif( /^COPY txn_logs \((.*)\)/ ) { | |
print STDERR "CLEANER: line matches txn_logs.\n" if($debug); | |
my @columns = split(/,\s*/, $1); | |
for my $i (0..$#columns) { | |
push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^request_string$/); | |
push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^response_string$/); | |
} | |
print $_; | |
next; | |
} | |
# Now, take action based on the CLEAN_ACTIONS hash | |
# this is an elsif, since we only want to run when the others do not. | |
elsif($CLEAN_ACTIONS{"skip_lines"}){ | |
# don't ouput | |
next; | |
} | |
elsif(defined $CLEAN_ACTIONS{"null"} || defined $CLEAN_ACTIONS{"1234"}) { | |
# mod fields | |
chomp; | |
my @data = split(/\t/); | |
foreach my $i ( @{$CLEAN_ACTIONS{"null"}} ) { | |
$data[$i] = '\N'; | |
} | |
foreach my $i ( @{$CLEAN_ACTIONS{"1234"}} ) { | |
$data[$i] = '1234'; | |
} | |
print join("\t", @data), "\n"; | |
} | |
else { | |
# normal line, output normally | |
print $_; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment