directionless/pg_dump_cleaner.pl

## pg_dump_cleaner.pl
#!/usr/bin/perl

# We'd like to have a sanitized/elided copy of the database suitable
# for letting various people run reporting against.  Thus, we need to
# strip out sensitive or noisy data.
#
# One "obvious" way to do that is with a table by table
# pg_dump. Unfortunatly, that ends up omiting the non-table data (like
# views) which we do need. thus, we have this awkward script.
#
# It's designed to take a full pg_dump, the same that the backups use,
# and transform it to being suitable for the reporting database.
#
# depending on where the bottle necks are, we can run this in a bigger
# pipeline with tee.
#
# We ended up with perl, because the column order changes, and
# accounting for that in awk was just getting too hairy. Of course,
# this is pretty ugly perl.

use strict;
use warnings;
use Data::Dumper;

my $debug = 0;

# state variables.
my %CLEAN_ACTIONS = ();

# The logic:
# The data comes in blocks. The end with \. and start with COPY <table name>
# When we get the header for a table we want to elide do 2 things:
#  a. figure out what column numbers we're eliding
#  b. set a flag saying to elide stuff
# Then when we get to outputting, pay attention to said flag
#
# We need to get a little clever so the clean action only takes effect *next* loop.

while(<>) {

  # if this matches \., then it's an end of data block.
  # reset CLEAN_ACTIONS
  if( m{^\s*\\\.\s$} ) {
    print STDERR "CLEANER: line matches \\. end of block\n" if($debug);
    %CLEAN_ACTIONS = ();
    print $_;
    next;
  }

  # What to do for the ccs table. Elide the full number, and 1234 the last4 column
  elsif( /^COPY credit_cards \((.*)\)/ ) {
    print STDERR "CLEANER: line matches ccs table.\n" if($debug);
    my @columns = split(/,\s*/, $1);
    for my $i (0..$#columns) {
      push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^fulldata$/);
      push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^fulldata_hash$/);
      push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^fulldata_salt$/);
      push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^source_id$/);
    }
    print $_;
    next;
  }

  # don't output anything for the log_entries table
  elsif( /^COPY logs / ) {
    print STDERR "CLEANER: line matches logs\n" if($debug);
    $CLEAN_ACTIONS{"skip_lines"} = 1;
    print $_;
    next;
  }

  # don't output anything for the secrets table
  elsif( /^COPY secrets / ) {
    print STDERR "CLEANER: line matches secrets\n" if($debug);
    $CLEAN_ACTIONS{"skip_lines"} = 1;
    print $_;
    next;
  }


  # elide a couple columns off payment_transactions
  elsif( /^COPY txn_logs \((.*)\)/ ) {
    print STDERR "CLEANER: line matches txn_logs.\n" if($debug);
    my @columns = split(/,\s*/, $1);
    for my $i (0..$#columns) {
      push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^request_string$/);
      push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^response_string$/);
    }
    print $_;
    next;
  }


  # Now, take action based on the CLEAN_ACTIONS hash
  # this is an elsif, since we only want to run when the others do not.
  elsif($CLEAN_ACTIONS{"skip_lines"}){
    # don't ouput
    next;
  }
  elsif(defined $CLEAN_ACTIONS{"null"} || defined $CLEAN_ACTIONS{"1234"}) {
    # mod fields
    chomp;
    my @data = split(/\t/);
    foreach my $i ( @{$CLEAN_ACTIONS{"null"}} ) {
      $data[$i] = '\N';
    }
    foreach my $i ( @{$CLEAN_ACTIONS{"1234"}} ) {
      $data[$i] = '1234';
    }
    print join("\t", @data), "\n";
  }
  else {
    # normal line, output normally
    print $_;
  }
}
	#!/usr/bin/perl

	# We'd like to have a sanitized/elided copy of the database suitable
	# for letting various people run reporting against. Thus, we need to
	# strip out sensitive or noisy data.
	#
	# One "obvious" way to do that is with a table by table
	# pg_dump. Unfortunatly, that ends up omiting the non-table data (like
	# views) which we do need. thus, we have this awkward script.
	#
	# It's designed to take a full pg_dump, the same that the backups use,
	# and transform it to being suitable for the reporting database.
	#
	# depending on where the bottle necks are, we can run this in a bigger
	# pipeline with tee.
	#
	# We ended up with perl, because the column order changes, and
	# accounting for that in awk was just getting too hairy. Of course,
	# this is pretty ugly perl.

	use strict;
	use warnings;
	use Data::Dumper;

	my $debug = 0;

	# state variables.
	my %CLEAN_ACTIONS = ();

	# The logic:
	# The data comes in blocks. The end with \. and start with COPY <table name>
	# When we get the header for a table we want to elide do 2 things:
	# a. figure out what column numbers we're eliding
	# b. set a flag saying to elide stuff
	# Then when we get to outputting, pay attention to said flag
	#
	# We need to get a little clever so the clean action only takes effect next loop.

	while(<>) {

	# if this matches \., then it's an end of data block.
	# reset CLEAN_ACTIONS
	if( m{^\s*\\\.\s$} ) {
	print STDERR "CLEANER: line matches \\. end of block\n" if($debug);
	%CLEAN_ACTIONS = ();
	print $_;
	next;
	}

	# What to do for the ccs table. Elide the full number, and 1234 the last4 column
	elsif( /^COPY credit_cards \((.*)\)/ ) {
	print STDERR "CLEANER: line matches ccs table.\n" if($debug);
	my @columns = split(/,\s*/, $1);
	for my $i (0..$#columns) {
	push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^fulldata$/);
	push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^fulldata_hash$/);
	push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^fulldata_salt$/);
	push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^source_id$/);
	}
	print $_;
	next;
	}

	# don't output anything for the log_entries table
	elsif( /^COPY logs / ) {
	print STDERR "CLEANER: line matches logs\n" if($debug);
	$CLEAN_ACTIONS{"skip_lines"} = 1;
	print $_;
	next;
	}

	# don't output anything for the secrets table
	elsif( /^COPY secrets / ) {
	print STDERR "CLEANER: line matches secrets\n" if($debug);
	$CLEAN_ACTIONS{"skip_lines"} = 1;
	print $_;
	next;
	}


	# elide a couple columns off payment_transactions
	elsif( /^COPY txn_logs \((.*)\)/ ) {
	print STDERR "CLEANER: line matches txn_logs.\n" if($debug);
	my @columns = split(/,\s*/, $1);
	for my $i (0..$#columns) {
	push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^request_string$/);
	push( @{$CLEAN_ACTIONS{"null"}}, $i) if($columns[$i] =~ /^response_string$/);
	}
	print $_;
	next;
	}


	# Now, take action based on the CLEAN_ACTIONS hash
	# this is an elsif, since we only want to run when the others do not.
	elsif($CLEAN_ACTIONS{"skip_lines"}){
	# don't ouput
	next;
	}
	elsif(defined $CLEAN_ACTIONS{"null"} \|\| defined $CLEAN_ACTIONS{"1234"}) {
	# mod fields
	chomp;
	my @data = split(/\t/);
	foreach my $i ( @{$CLEAN_ACTIONS{"null"}} ) {
	$data[$i] = '\N';
	}
	foreach my $i ( @{$CLEAN_ACTIONS{"1234"}} ) {
	$data[$i] = '1234';
	}
	print join("\t", @data), "\n";
	}
	else {
	# normal line, output normally
	print $_;
	}
	}