Skip to content

Instantly share code, notes, and snippets.

@neevek
Last active November 3, 2020 17:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save neevek/eb221743edf3e93974a85cba089c248e to your computer and use it in GitHub Desktop.
Save neevek/eb221743edf3e93974a85cba089c248e to your computer and use it in GitHub Desktop.
A script that parses a CSV file and outputs a report of its data, with support for specifying dimensions and indices.
#!/usr/bin/perl
use strict;
use warnings;
# Take a private copy of the command-line arguments and leave @ARGV empty,
# so the later `<>` read loop consumes STDIN rather than treating the
# arguments as input file names.
my @args = @ARGV;
@ARGV = ();

# Parsed configuration.
my @dimens;            # list of [dimenName, alias] pairs taken from the d: arguments
my @dimenNameArray;    # header names of the dimension columns, in first-seen order
my %indexNameHash;     # index names that actually occurred in the input
my %indexInfos;        # index name/pattern => list of [operator, alias] pairs
my $uid;               # key given with uid:, used to compute UV

# Reserved keys inside each per-group statistics hash.
my $uvKey = "__uv__";  # => hash whose keys are the distinct uid values seen
my $pvKey = "__pv__";  # => number of input lines that fell into the group

# Operators that may prefix an index argument:
#   cnt:     = count
#   sum:     = sum
#   unq:     = unique count
#   pv_cavg: = count / PV
#   pv_savg: = sum / PV
#   uv_cavg: = count / UV
#   uv_savg: = sum / UV
#   uv_uavg: = unique count / UV

# Legacy fixed-width formatting, kept for reference:
#my $colWidth = 10;
#my $fmtStr = "%-${colWidth}s\t";

# Output state.
my @table;             # a 2-dimensional array representing the report (row 0 is the header)
my @maxColWidths;      # max width of each column
my $gCounter;          # column position of the next cell while a row is being built
# Classify every command-line argument. Recognised forms:
#   d:<name>[:<alias>]              dimension to group rows by
#   uid:<key>                       key whose value identifies a user (enables the UV column)
#   colwidth:<n>                    accepted but currently ignored
#   [<operator>:]<index>[:<alias>]  index to aggregate
#
# The d:, uid: and colwidth: prefixes are anchored at the start of the
# argument. Without the anchor an index spec that merely contains one of
# them (e.g. "sum:guid:Users" contains "uid:Users") would be mistaken for
# the option and silently change $uid.
foreach my $arg (@args) {
    if ($arg =~ /^d:([^: \n\t]+)(:([^ \n\t]+))?/) {
        push(@dimens, [$1, $3]);    # the tuple is (dimenName, alias)
    } elsif ($arg =~ /^uid:([^ \n\t]+)/) {
        $uid = $1;
    } elsif ($arg =~ /^colwidth:([1-9]\d*)/) {
        # Column widths are computed from the data; the option is a no-op.
        #$colWidth = $1;
        #$fmtStr = "%-${colWidth}s\t";
    } elsif ($arg =~ /(cnt:|sum:|unq:|[up]v_[ucs]avg:)?([^: \n\t]+)(:([^ \n\t]+))?/) {
        # The tuple is (operator, alias); operator and alias may be undef.
        # $indexInfos{indexName} = [['sum:', 'alias1'], ['sum:', 'alias2'], [undef, 'alias3']]
        # The push autovivifies the list on first use.
        push(@{ $indexInfos{$2} }, [$1, $4]);
    }
}

# At least one dimension and one index are required to build a report.
if (scalar(@dimens) == 0 || scalar(keys %indexInfos) == 0) {
    print "Usage: analyze_csv.pl <d:dimen1> <d:dimen2> ... <indexRegex1|indexRegex2|...> <operator:index:alias>\n";
    exit(1);
}
# Unicode::GCString is optional: when it loads, getColWidth() uses it to
# measure cells; otherwise plain length() is used.
my $useGcString = tryLoad("Unicode::GCString") ? 1 : 0;

# A single alternation of every requested index name/pattern, matched
# against each input line by parseIndex().
my $indexRegex = join("|", keys %indexInfos);

# Root of the dimension tree; every input line is folded into it.
my $root = TreeNode->new();
while (my $line = <>) {
    parse($root, $line);
}
# Render the report, but only when at least one requested index was
# actually found in the input.
if (scalar(keys %indexNameHash)) {
    print("\n");
    printHeader();
    printIndex($root);

    for my $rowIndex (0 .. $#table) {
        # A dashed rule separates the header (row 0) from the data rows.
        if ($rowIndex == 1) {
            my $totalWidth = 0;
            $totalWidth += $_ for @maxColWidths;
            print(('-' x ($totalWidth + 4 * scalar @maxColWidths)) . "\n");
        }

        # Every cell is padded to its column's widest entry and followed
        # by a tab.
        my $cells = $table[$rowIndex];
        my $rendered = '';
        for my $col (0 .. $#$cells) {
            $rendered .= padStrWithSpace($cells->[$col], $maxColWidths[$col]) . "\t";
        }
        print($rendered . "\n");
    }
}
# Right-pad $str with spaces until its measured width (see getColWidth)
# reaches $maxColWidth, and return the padded string.
sub padStrWithSpace {
    my $str         = shift;
    my $maxColWidth = shift;
    my $padding     = ' ' x ($maxColWidth - getColWidth($str));
    return $str . $padding;
}
# Return the width of $str: the column count reported by Unicode::GCString
# when that module was loaded ($useGcString is true), otherwise the
# character length of the string.
sub getColWidth {
    my ($str) = @_;
    return $useGcString
        ? Unicode::GCString->new($str)->columns
        : length($str);
}
# Fold one input line into the dimension tree rooted at $node.
#
# For each configured dimension, in order, the line is searched for
# "<dimension>=<value>" and the walk descends into (or creates) the child
# node for that value; a line without the dimension goes under "-". The
# node reached after the last dimension owns a single statistics node
# (child 0), which parseIndex() then updates from the line.
#
# NOTE(review): the key pattern is not anchored to a field boundary, so a
# dimension named "id" would also match inside "uid=..." — confirm whether
# that is intended.
sub parse {
    my ($node, $line) = @_;

    for my $dimen (@dimens) {
        my ($pattern, $alias) = @$dimen;
        my $children = $node->{nodes};

        my ($label, $cell);
        if ($line =~ /($pattern)=([^,"\n]+)/) {
            # The alias, when given, replaces the matched key in the header.
            ($label, $cell) = ($alias ? $alias : $1, $2);
        } else {
            ($label, $cell) = ($alias ? $alias : $pattern, "-");
        }
        $node = addNewDimenNode($children, $label, $cell);
    }

    my $leaf = $node->{nodes}->[0];
    unless ($leaf) {
        $leaf = TreeNode->new();
        $leaf->{value} = {};
        $node->{nodes}->[0] = $leaf;
    }
    parseIndex($leaf, $line);
}
# Return the child in @$nodes whose value equals $nodeName, creating and
# appending a new TreeNode when there is none. When a node is created,
# $dimenName is also recorded in @dimenNameArray (once) so that the header
# lists the dimension.
sub addNewDimenNode {
    my ($nodes, $dimenName, $nodeName) = @_;

    my ($existing) = grep { $_->{value} eq $nodeName } @$nodes;
    return $existing if $existing;

    my $created = TreeNode->new();
    $created->{value} = $nodeName;
    push(@$nodes, $created);

    unless (grep { $_ eq $dimenName } @dimenNameArray) {
        push(@dimenNameArray, $dimenName);
    }
    return $created;
}
# Update the statistics hash stored in $node->{value} from one input line.
#
# The hash holds:
#   $pvKey    => number of lines seen for this group
#   $uvKey    => { uidValue => 0, ... } (only when a uid: key is configured
#                and present on the line)
#   indexName => [count, sum, { uidValue => 0, ... }] for every
#                "<index>=<value>" on the line that matches $indexRegex
#
# Every matched index name is also recorded in %indexNameHash, and is given
# an entry (possibly empty) in %indexInfos, so the printing code can
# dereference it for names that were matched through a regex rather than
# named literally on the command line.
sub parseIndex {
    my ($node, $line) = @_;
    my $indexMap = $node->{value} ||= {};

    my $uv;
    if ($uid && $line =~ /$uid=([^,"\n]+)/) {
        $uv = $1;
        $indexMap->{$uvKey}{$uv} = 0;
    }
    $indexMap->{$pvKey} += 1;

    # Remembers which index names have already triggered the missing-uid
    # message, so it is reported once instead of once per input line.
    our %missingUidWarned;

    while ($line =~ /($indexRegex)=([^,"\n]+)/g) {
        my ($indexName, $indexValue) = ($1, $2);

        # [count, sum, {uid1, uid2, uid3...}]
        my $index = $indexMap->{$indexName} ||= [0, 0, {}];
        $index->[0] += 1;    # count
        {
            # cnt:/unq: are routinely applied to non-numeric values; those
            # still contribute their numeric prefix (usually 0) to the sum,
            # but without a warning for every line.
            no warnings 'numeric';
            $index->[1] += $indexValue;    # sum
        }

        my $infoItems = $indexInfos{$indexName} ||= [];
        if (!defined $uid) {
            my $wantsUnique = grep { $_->[0] && $_->[0] eq "unq:" } @$infoItems;
            if ($wantsUnique && !$missingUidWarned{$indexName}++) {
                print STDERR ("must specify uid: key to use unq:\n");
            }
        } elsif (defined $uv && $uv ne "" && @$infoItems) {
            $index->[2]{$uv} = 0;
        }

        $indexNameHash{$indexName} = 0;
    }
}
# Append $value to the row @$colsArr and widen the recorded maximum width of
# the current column ($gCounter) if this cell is wider. A false value
# (undef, "", 0) is counted as one column wide. Advances $gCounter to the
# next column.
sub pushIntoColsArr {
    my ($colsArr, $value) = @_;
    push(@$colsArr, $value);

    my $width = $value ? getColWidth($value) : 1;
    my $known = $maxColWidths[$gCounter];
    $maxColWidths[$gCounter] = ($known && $known >= $width) ? $known : $width;

    ++$gCounter;
}
# Build the header row and append it to @table (nothing is printed here).
#
# Columns, in order: every dimension name, "UV" when a uid: key is
# configured, "PV", then one column per [operator, alias] entry of every
# index seen in the input (sorted by index name). An index column is titled
# with its alias when one was given, otherwise with "<operator><name>", or
# just the name when there is no operator. An index without any entry gets a
# single column titled with its name.
sub printHeader {
    my @headerCols;
    $gCounter = 0;

    for my $title (@dimenNameArray, ($uid ? ("UV") : ()), "PV") {
        pushIntoColsArr(\@headerCols, $title);
    }

    for my $indexName (sort keys %indexNameHash) {
        my @infoItems = @{ $indexInfos{$indexName} };
        if (!@infoItems) {
            pushIntoColsArr(\@headerCols, $indexName);
            next;
        }
        for my $infoItem (@infoItems) {
            my ($operator, $alias) = @$infoItem;
            my $title = $alias    ? $alias
                      : $operator ? $operator . $indexName
                      :             $indexName;
            pushIntoColsArr(\@headerCols, $title);
        }
    }

    push(@table, \@headerCols);
}
# Entry point for turning the tree under $node into table rows: starts the
# recursive walk with an empty list of dimension values.
sub printIndex {
    my ($node) = @_;
    my @path;
    return printIndex2($node, \@path);
}
# Depth-first walk of the dimension tree that appends one data row to @table
# for every statistics node (a child whose value is a hash).
#
# $dimens accumulates the dimension values on the path from the root; this
# node's own value is pushed on entry and the last element is popped again
# on exit. Each row consists of the path values, the UV count (when a uid:
# key is configured), the PV count, and one cell per configured operator of
# every index seen in the input, in sorted index-name order.
sub printIndex2 {
    my ($node, $dimens) = @_;

    my $own = $node->{value};
    push(@$dimens, $own) if defined $own && ref($own) ne "HASH";

    for my $child (@{ $node->{nodes} }) {
        my $stats = $child->{value};

        # Not a statistics node: keep descending.
        if (ref($stats) ne "HASH") {
            printIndex2($child, $dimens);
            next;
        }

        my @row;
        $gCounter = 0;
        for my $dimenValue (@$dimens) {
            pushIntoColsArr(\@row, $dimenValue);
        }

        my $uv;
        my $pv = $stats->{$pvKey};
        if ($uid) {
            my $uvKeyHash = $stats->{$uvKey};
            $uv = scalar(keys %$uvKeyHash);
            pushIntoColsArr(\@row, $uv);
        }
        pushIntoColsArr(\@row, $pv);

        for my $indexName (sort keys %indexNameHash) {
            # (count, sum, {uid1, uid2...}); all zero when this group never
            # saw the index.
            my $tuple = $stats->{$indexName} || [0, 0, {}];

            # One cell per configured operator; an index without any
            # configuration gets a single cell with an empty operator.
            my @operators = map { $_->[0] } @{ $indexInfos{$indexName} };
            @operators = ("") unless @operators;
            for my $operator (@operators) {
                printIndexItem(\@row, $uv, $pv, $operator, $tuple);
            }
        }
        push(@table, \@row);
    }

    pop @$dimens;
}
# Compute one index cell and append it to the row @$indexCols.
#
#   $uv              number of distinct users in the group, or undef when no
#                    uid: key is configured
#   $pv              number of input lines in the group
#   $operator        "cnt:", "sum:", "unq:", one of the "*_?avg:" operators,
#                    or empty/undef (treated as sum)
#   $indexDataTuple  [count, sum, { uidValue => 0, ... }]
#
# Averages are formatted with four decimals. A uv_* average with a UV of 0
# divides by 1 instead. A uv_* average without a uid: key reports an error
# on STDERR and yields "0". Any other "...avg:" operator not handled below
# (e.g. pv_uavg:) also yields "0".
sub printIndexItem {
    my ($indexCols, $uv, $pv, $operator, $indexDataTuple) = @_;
    my ($count, $sum, $uidSet) = @$indexDataTuple;
    my $v = "0";

    if (!defined $operator || $operator !~ /avg:/) {
        if ($operator && $operator eq "cnt:") {
            $v = $count;
        } elsif ($operator && $operator eq "unq:") {
            $v = scalar(keys %$uidSet);
        } else {
            $v = $sum;
        }
    } elsif ($operator eq "pv_cavg:") {
        $v = sprintf("%.4f", $count / $pv);
    } elsif ($operator eq "pv_savg:") {
        $v = sprintf("%.4f", $sum / $pv);
    } elsif ($operator =~ /uv_[ucs]avg:/) {
        if (defined $uv) {
            my $users = $uv || "1";    # avoid dividing by zero
            if ($operator eq "uv_cavg:") {
                $v = sprintf("%.4f", $count / $users);
            } elsif ($operator eq "uv_uavg:") {
                $v = sprintf("%.4f", scalar(keys %$uidSet) / $users);
            } else {
                $v = sprintf("%.4f", $sum / $users);
            }
        } else {
            # $uv is undefined on this path, so it must not be interpolated
            # into the message.
            print STDERR ("must specify uid: key to compute UV for using uv_*avg\n");
        }
    }

    pushIntoColsArr($indexCols, $v);
}
# Attempt to load module $mod with `use` inside a string eval.
# Returns 1 when the module loaded, 0 when loading failed.
sub tryLoad {
    my ($mod) = @_;
    eval("use $mod");
    return $@ ? 0 : 1;
}
package BaseObject;

# Generic constructor: blesses the given hash reference into $class and
# returns it. When no reference is passed, a fresh empty hash is used, so
# that `BaseObject->new()` returns an empty object instead of dying with
# "Can't bless non-reference value".
sub new {
    my ($class, $args) = @_;
    $args = {} unless defined $args;
    return bless $args, $class;
}
package TreeNode;

# BaseObject lives in this same file, hence -norequire. The inheritance is
# set up at compile time, which matters because the main program creates
# TreeNode objects before execution reaches this point in the file.
use parent -norequire, qw(BaseObject);

# Create an empty tree node: no value yet and an empty list of child nodes.
sub new {
    my $class  = shift;
    my %fields = (
        value => undef,
        nodes => [],
    );
    return $class->SUPER::new(\%fields);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment