Skip to content

Instantly share code, notes, and snippets.

@gonter
Last active May 14, 2016 02:37
Show Gist options
  • Save gonter/c386c81fdf2deb3660d2ccc545d374dd to your computer and use it in GitHub Desktop.
Save gonter/c386c81fdf2deb3660d2ccc545d374dd to your computer and use it in GitHub Desktop.
postgres aligned output files

process postgres aligned output files

this files are created by using \a to enable aligned output and \o filename

\a
\o output.csv
select * from my_table;
#!/usr/bin/perl
use strict;
use Data::Dumper;
$Data::Dumper::Indent= 1;
use Util::Matrix;
my $fnm1= shift (@ARGV);
my $fnm2= shift (@ARGV);
my $tbl1= pga::read_table ($fnm1);
$tbl1->mk_idx(0);
my $tbl2= pga::read_table ($fnm2);
$tbl2->mk_idx(0);
my $tbl_x= pga::read_table ('sj_dump.csv');
my $tbl_y= $tbl_x->filter(2, 1251);
my $idx_y= $tbl_y->mk_idx(0);
my $missing= pga::diff ($tbl1, $tbl2);
my @column_names= @{$tbl1->{'columns'}};
# print "columns: ", Dumper (\@column_names);
# print "missing: ", Dumper ($missing);
push (@column_names, 'hit');
my $cn_hit= $#column_names;
my @out_rows;
foreach my $row (@$missing)
{
# print "row: ", Dumper ($row);
$row->[$cn_hit]= '';
if (exists ($idx_y->{$row->[4]}))
{
$row->[$cn_hit]= 'name';
push (@out_rows, $row);
}
}
# Util::Matrix::print(\@column_names, $missing);
Util::Matrix::print(\@column_names, \@out_rows);
exit (0);
sub x1
{
my $fnm1= 'sj_current.csv';
my $fnm2= 'sj_dump.csv';
my $tbl1= pga::read_table ($fnm1);
$tbl1->mk_idx(0);
# print "tbl1: ", Dumper ($tbl1);
my $tbl2= pga::read_table ($fnm2);
$tbl2->mk_idx(0);
my $missing= pga::diff ($tbl1, $tbl2);
# print "columns: ", Dumper ($tbl1->{'columns'});
# print "missing: ", Dumper ($missing);
check_1251 ($missing);
write_inserts ('SchlagwortJelinek', $tbl1->{'columns'}, $missing);
}
sub write_inserts
{
my $tbl_name= shift;
my $col_names= shift;
my $rows= shift;
my $fnm_sql= 'insert_missing.sql';
open (SQL, '>:utf8', $fnm_sql) or die;
my @col_names= @$col_names;
my $col_count= @col_names;
my $cnt= 0;
foreach my $row (@$rows)
{
my @col;
my @val;
for (my $i= 0; $i < $col_count; $i++)
{
if (defined ($row->[$i]) && $row->[$i] ne '')
{
push (@col, $col_names[$i]);
push (@val, $row->[$i]);
}
}
print SQL "INSERT INTO \"$tbl_name\" (",
join (',', map { $_ } @col),
") VALUES (",
join (',', map { "'".$_."'" } @val),");\n";
$cnt++;
}
close (SQL);
print "written $cnt rows to $fnm_sql\n";
$cnt;
}
=head2 check_1251
find out if either column 0 or column 2 have the value 1251;
complain otherwise
=cut
sub check_1251
{
my $rows= shift;
foreach my $row (@$rows)
{
next if ($row->[0] == 1251 || $row->[2] == 1251);
print "check row: ", Dumper($row);
}
}
package pga;
# postgres aligned output
sub mk_idx
{
my $tbl= shift;
my $col= shift;
my $idx;
foreach my $row (@{$tbl->{rows}})
{
# print "row: ", main::Dumper ($row);
my $v= $row->[$col];
push (@{$idx->{$v}}, $row);
}
$tbl->{index}= $idx;
}
=head1 filter($table, $column_number, $value)
return a new table consisting only of those rows from $table that contain the given $value in given $column_number.
=cut
sub filter
{
my $tbl= shift;
my $col_num= shift;
my $val= shift;
my @out_rows;
foreach my $row (@{$tbl->{rows}})
{
push (@out_rows, $row) if ($row->[$col_num] == $val);
}
my $res= { columns => $tbl->{columns}, rows => \@out_rows };
bless ($res);
$res;
}
=head2 diff ($tbl1, $tbl2)
look for items from $tbl2 which are not present in $tbl1
=cut
sub diff
{
my $tbl1= shift;
my $tbl2= shift;
my $idx1= $tbl1->{index};
my $idx2= $tbl2->{index};
my @missing;
ROW2: foreach my $id (sort { $a <=> $b } keys %$idx2)
{
if (exists ($idx1->{$id}))
{
# TODO: check if both rows are identical or report otherwise;
next ROW2;
}
push (@missing, @{$idx2->{$id}});
}
(wantarray) ? @missing : \@missing;
}
sub read_table
{
my $fnm= shift;
unless (open (FI, '<:utf8', $fnm))
{
print "cant read $fnm\n";
return undef;
}
my @lines= <FI>;
close (FI);
# TODO: check for empty files or other stuff...
chop (@lines);
my $columns= shift (@lines);
my $rows= pop (@lines);
my @columns= split(/\|/, $columns);
# print "columns: ", main::Dumper(\@columns);
print "rows=[$rows]\n"; # TODO: check if $rows is
push (@lines, $rows) unless ($rows =~ m#^\(\d+ rows\)$#);
my @rows;
my $res=
{
'columns' => \@columns,
'rows' => \@rows,
};
bless ($res);
foreach my $line (@lines)
{
my @l= split (/\|/, $line);
push (@rows, \@l);
}
$res;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment