Last active
November 3, 2020 17:07
-
-
Save neevek/eb221743edf3e93974a85cba089c248e to your computer and use it in GitHub Desktop.
A script that parses a CSV file and prints a report of its data, with support for specifying dimensions and indices.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
use strict;
use warnings;

# Take every command-line argument out of @ARGV. With @ARGV left empty, the
# later `<>` read takes its data from standard input.
my @args = splice(@ARGV, 0);

# Dimension state.
my @dimens;            # list of [dimenName, alias] tuples, in command-line order
my @dimenNameArray;    # dimension column titles, in the order they were first seen

# Index state.
my %indexNameHash;     # set of index names that actually occurred in the input
my %indexInfos;        # index name => list of [operator, alias] tuples

my $uid;               # name of the key whose value identifies a user (optional)

# Reserved keys inside each leaf's index hash: the set of distinct uid values
# (reported as "UV") and the number of input lines (reported as "PV").
my ($uvKey, $pvKey) = ("__uv__", "__pv__");

# Operators that may prefix an index argument:
#   cnt:      number of occurrences of the index
#   sum:      sum of the index values (also what is reported without an operator)
#   unq:      number of distinct uid values seen together with the index
#   pv_cavg:  count / PV
#   pv_savg:  sum / PV
#   uv_cavg:  count / UV
#   uv_savg:  sum / UV
#   uv_uavg:  unique count / UV

# Output table state. Column widths are derived from the data, not configured.
my @table;           # two-dimensional array: one array ref of cells per row
my @maxColWidths;    # widest cell seen so far in each column
my $gCounter;        # index of the column currently being filled in a row
# Classify each command-line argument:
#   d:NAME[:ALIAS]            dimension; NAME is later interpolated into a regex
#   uid:KEY                   key whose value identifies a user
#   colwidth:N                recognised but currently has no effect
#   [OPERATOR]INDEX[:ALIAS]   index to report; INDEX is later used as a regex
#
# The `uid:` and `colwidth:` prefixes are anchored at the start of the
# argument. Unanchored, an index argument such as `sum:guid:total` would be
# mistaken for `uid:total`.
foreach my $arg (@args) {
    if ($arg =~ /^d:([^: \n\t]+)(:([^ \n\t]+))?/) {
        # the tuple is (dimenName, alias)
        push(@dimens, [$1, $3]);
    } elsif ($arg =~ /^uid:([^ \n\t]+)/) {
        $uid = $1;
    } elsif ($arg =~ /^colwidth:([1-9]\d*)/) {
        # Column widths are computed from the data; the value is ignored.
    } elsif ($arg =~ /(cnt:|sum:|unq:|[up]v_[ucs]avg:)?([^: \n\t]+)(:([^ \n\t]+))?/) {
        # the tuple is (operator, alias); one index may be listed several times:
        # $indexInfos{indexName} = [['sum:', 'alias1'], ['cnt:', 'alias2'], [undef, 'alias3']]
        push(@{ $indexInfos{$2} }, [$1, $4]);
    }
}
# At least one dimension and one index are required; otherwise show the usage
# line and stop with a non-zero status.
unless (@dimens && scalar(keys %indexInfos)) {
    print "Usage: analyze_csv.pl <d:dimen1> <d:dimen2> ... <indexRegex1|indexRegex2|...> <operator:index:alias>\n";
    exit(1);
}

# Unicode::GCString is optional; when it loads, getColWidth uses it to measure
# cell widths.
my $useGcString = tryLoad("Unicode::GCString") ? 1 : 0;

# Every index name given on the command line becomes one regex alternative.
my $indexRegex = join("|", keys %indexInfos);

# Feed every input line into the dimension tree.
my $root = TreeNode->new();
while (my $line = <>) {
    parse($root, $line);
}
# Render the report, but only when at least one index occurred in the input.
if (%indexNameHash) {
    print("\n");
    printHeader();
    printIndex($root);

    for my $rowIndex (0 .. $#table) {
        # A dashed rule separates the header (row 0) from the first data row.
        if ($rowIndex == 1) {
            my $totalWidth = 0;
            $totalWidth += $_ for @maxColWidths;
            print(('-' x ($totalWidth + 4 * scalar @maxColWidths)) . "\n");
        }

        # Each cell is padded to its column's width and followed by a tab.
        my $cells = $table[$rowIndex];
        for my $col (0 .. $#$cells) {
            print(padStrWithSpace($cells->[$col], $maxColWidths[$col]) . "\t");
        }
        print("\n");
    }
}
# Right-pad $str with spaces until it occupies $maxColWidth display columns.
# A string that is already that wide (or wider) is returned unchanged, because
# a non-positive repeat count yields an empty string.
sub padStrWithSpace {
    my ($str, $maxColWidth) = @_;
    my $missing = $maxColWidth - getColWidth($str);
    return $str . (' ' x $missing);
}
# Return the display width of $str in columns.
#
# The script never decodes its input, so cell text arrives as raw octets. The
# width is therefore measured on a copy decoded as UTF-8; the caller's string is
# left untouched. If the octets are not valid UTF-8, utf8::decode leaves the
# copy as it was and the byte length is used, as before.
#
# Unicode::GCString is used when it was loaded, since its documentation asks for
# a Unicode string. If it rejects the text, the character count is used instead
# of dying.
#
# Returns 0 for an undefined argument.
sub getColWidth {
    my $str = shift;
    return 0 unless defined $str;

    my $text = $str;
    utf8::decode($text);    # in place; a no-op when $text is not valid UTF-8

    if ($useGcString) {
        my $columns = eval { Unicode::GCString->new($text)->columns };
        return $columns if defined $columns;
    }
    return length($text);
}
# Account for one input line in the tree rooted at $node.
#
# The line is walked through the configured dimensions in order. At each level
# the child matching the dimension's value on this line is reused or created;
# "-" stands in when the line has no `name=value` pair for that dimension.
# The node reached after the last dimension carries one index node (child 0,
# whose value is a hash); the line's indices are accumulated there.
sub parse {
    my ($node, $line) = @_;

    for my $dimen (@dimens) {
        my ($pattern, $alias) = @$dimen;
        my $children = $node->{nodes};
        my ($title, $label);

        # NOTE(review): $pattern is interpolated as a regex and is not anchored,
        # so it can also match the tail of a longer key.
        if ($line =~ /($pattern)=([^,"\n]+)/) {
            # The matched key text is the column title unless an alias was given.
            ($title, $label) = ($alias ? $alias : $1, $2);
        } else {
            ($title, $label) = ($alias ? $alias : $pattern, "-");
        }
        $node = addNewDimenNode($children, $title, $label);
    }

    my $indexNode = $node->{nodes}->[0];
    unless ($indexNode) {
        $indexNode = TreeNode->new();
        $indexNode->{value} = {};
        $node->{nodes}->[0] = $indexNode;
    }
    parseIndex($indexNode, $line);
}
# Return the node in @$nodes whose value equals $nodeName, creating and
# appending it when there is none.
#
# When a node is created, $dimenName is also recorded in @dimenNameArray unless
# it is already listed there; that array later supplies the header titles.
sub addNewDimenNode {
    my ($nodes, $dimenName, $nodeName) = @_;

    my ($existing) = grep { $_->{value} eq $nodeName } @$nodes;
    return $existing if $existing;

    my $created = TreeNode->new();
    $created->{value} = $nodeName;
    push(@$nodes, $created);

    unless (grep { $_ eq $dimenName } @dimenNameArray) {
        push(@dimenNameArray, $dimenName);
    }
    return $created;
}
# Accumulate the indices found on $line into the index node $node.
#
# $node->{value} is a hash with these entries:
#   $uvKey     => { uid value => 0, ... }   distinct uid values seen on any line
#   $pvKey     => number of lines accounted for in this node
#   indexName  => [count, sum, { uid value => 0, ... }]
#
# Every index name that matches is also recorded in %indexNameHash, which
# determines the index columns of the report.
sub parseIndex {
    my ($node, $line) = @_;
    my $indexMap = $node->{value} ||= {};

    # Remember the user of this line, if a uid key is configured and present.
    my $uv;
    if ($uid && $line =~ /$uid=([^,"\n]+)/) {
        $uv = $1;
        $indexMap->{$uvKey}{$uv} = 0;
    }
    $indexMap->{$pvKey} += 1;

    while ($line =~ /($indexRegex)=([^,"\n]+)/g) {
        my ($indexName, $indexValue) = ($1, $2);

        # [count, sum, {uid1, uid2, uid3...}]
        my $index = $indexMap->{$indexName} ||= [0, 0, {}];
        $index->[0] += 1;    # count
        {
            # Values are often non-numeric when only cnt:/unq: is wanted for
            # the index. They add 0 to the sum as before, but no longer emit a
            # "isn't numeric" warning for every line.
            no warnings 'numeric';
            $index->[1] += $indexValue;    # sum
        }

        # An index argument may be a regex, in which case the matched name has
        # no entry of its own yet. printHeader and printIndex2 dereference this
        # entry, so it must exist (possibly empty) for every reported name.
        my $infoItems = $indexInfos{$indexName} ||= [];

        for my $indexInfoItem (@$infoItems) {
            my $operator = $indexInfoItem->[0];
            if (!defined $uid) {
                if ($operator && $operator eq "unq:") {
                    print STDERR ("must specify uid: key to use unq:\n");
                }
            } elsif (defined $uv && $uv ne "") {
                $index->[2]->{$uv} = 0;
            }
        }
        $indexNameHash{$indexName} = 0;
    }
}
# Append $value to the row @$colsArr and widen the current column if needed.
#
# $gCounter names the column being filled. It is advanced here, so callers
# reset it to 0 before starting a new row. A false value ("", "0", undef)
# counts as one column wide.
sub pushIntoColsArr {
    my ($colsArr, $value) = @_;
    push(@$colsArr, $value);

    my $width  = $value ? getColWidth($value) : 1;
    my $widest = $maxColWidths[$gCounter];
    if (!$widest || $width > $widest) {
        $maxColWidths[$gCounter] = $width;
    }
    ++$gCounter;
}
# Build the header row and append it to @table.
#
# Columns, in order: one per dimension title, "UV" when a uid key is
# configured, "PV", then the index columns sorted by index name. An index
# without operator/alias entries gets a single column titled with its name;
# otherwise each entry gets a column titled with its alias, or with
# operator . name, or with the bare name.
sub printHeader {
    $gCounter = 0;
    my @headerCols;

    for my $dimenName (@dimenNameArray) {
        pushIntoColsArr(\@headerCols, $dimenName);
    }
    pushIntoColsArr(\@headerCols, "UV") if $uid;
    pushIntoColsArr(\@headerCols, "PV");

    for my $indexName (sort keys %indexNameHash) {
        my @infoItems = @{$indexInfos{$indexName}};
        if (!@infoItems) {
            pushIntoColsArr(\@headerCols, $indexName);
            next;
        }
        for my $item (@infoItems) {
            my $title = $indexName;
            if ($item) {
                my ($operator, $alias) = @$item;
                if ($alias) {
                    $title = $alias;    # use alias
                } elsif ($operator) {
                    $title = $operator . $indexName;
                }
            }
            pushIntoColsArr(\@headerCols, $title);
        }
    }

    push(@table, \@headerCols);
}
# Entry point for emitting the data rows: walk the tree under $node, starting
# with an empty stack of dimension values.
sub printIndex {
    my ($node) = @_;
    my @dimenStack;
    return printIndex2($node, \@dimenStack);
}
# Recursive worker for printIndex.
#
# $dimens is a shared stack holding the dimension values on the path from the
# root to $node. A node whose value is a plain scalar pushes that value on
# entry; the stack is popped again on exit. For every child whose value is a
# hash (an index node), one data row is appended to @table: the stacked
# dimension values, UV (only when a uid key is configured), PV, and one cell
# per index column in the same order printHeader uses.
sub printIndex2 {
    my ($node, $dimens) = @_;

    my $label = $node->{value};
    push(@$dimens, $label) if defined $label && ref($label) ne "HASH";

    for my $child (@{ $node->{nodes} }) {
        my $stats = $child->{value};

        # Anything that is not an index node is another dimension level.
        if (ref($stats) ne "HASH") {
            printIndex2($child, $dimens);
            next;
        }

        my @row;
        $gCounter = 0;
        for my $dimenValue (@$dimens) {
            pushIntoColsArr(\@row, $dimenValue);
        }

        my $pv = $stats->{$pvKey};
        my $uv;
        if ($uid) {
            my $uvKeyHash = $stats->{$uvKey};
            $uv = scalar(keys %$uvKeyHash);
            pushIntoColsArr(\@row, $uv);
        }
        pushIntoColsArr(\@row, $pv);

        for my $indexName (sort keys %indexNameHash) {
            # (count, sum, {uid1, uid2...}) tuple; zeros when this leaf never saw the index
            my $tuple = $stats->{$indexName} || [0, 0, {}];

            # One cell per configured operator, or a single operator-less cell.
            my @operators = map { $_->[0] } @{$indexInfos{$indexName}};
            @operators = ("") unless @operators;
            for my $operator (@operators) {
                printIndexItem(\@row, $uv, $pv, $operator, $tuple);
            }
        }
        push(@table, \@row);
    }

    pop @$dimens;
}
# Compute one index cell and append it to the row @$indexCols.
#
# Parameters:
#   $indexCols       array ref of the row being built
#   $uv              number of distinct uid values for the row; undef when no
#                    uid key is configured
#   $pv              number of input lines behind the row
#   $operator        "cnt:", "sum:", "unq:", one of the [up]v_[ucs]avg: operators,
#                    or empty/undef (treated as sum)
#   $indexDataTuple  [count, sum, { uid value => 0, ... }]
#
# Averages are formatted with four decimals. A uv_* average without a uid key
# reports an error on STDERR and leaves the cell at "0".
sub printIndexItem {
    my ($indexCols, $uv, $pv, $operator, $indexDataTuple) = @_;
    my ($count, $sum, $uniqueSet) = @$indexDataTuple;
    my $unique = scalar(keys %$uniqueSet);

    my $v = "0";
    if (!defined $operator || $operator !~ /avg:/) {
        if ($operator && $operator eq "cnt:") {
            $v = $count;     # use count
        } elsif ($operator && $operator eq "unq:") {
            $v = $unique;    # use unique count
        } else {
            $v = $sum;       # use sum
        }
    } elsif ($operator =~ /\A([up])v_([ucs])avg:\z/) {
        my ($per, $kind) = ($1, $2);
        my $numerator = $kind eq "c" ? $count
                      : $kind eq "u" ? $unique
                      :                $sum;

        if ($per eq "p") {
            # Covers pv_cavg:, pv_savg: and pv_uavg:. The last one is accepted
            # by the argument parser but previously always produced "0".
            $v = sprintf("%.4f", $numerator / ($pv || 1));
        } elsif (defined $uv) {
            # A row without any uid value divides by 1 instead of 0.
            $v = sprintf("%.4f", $numerator / ($uv || 1));
        } else {
            # $uv is undefined on this path, so it must not be interpolated
            # into the message.
            print STDERR ("must specify uid: key to compute UV for using uv_*avg\n");
        }
    }
    pushIntoColsArr($indexCols, $v);
}
# Try to load the module named $mod and call its import method.
#
# Returns 1 when the module loaded, 0 otherwise. Anything that is not a plain
# package name (Foo or Foo::Bar) is rejected without being evaluated. The
# earlier string eval would have executed any code appended to the name.
sub tryLoad {
    my $mod = shift;
    return 0 unless defined $mod && $mod =~ /\A[A-Za-z_]\w*(?:::\w+)*\z/;

    # require with a string needs the file form: Foo::Bar -> Foo/Bar.pm
    (my $file = "$mod.pm") =~ s{::}{/}g;

    my $loaded = eval {
        require $file;
        $mod->import;
        1;
    };
    return $loaded ? 1 : 0;
}
package BaseObject;

# Minimal base class: bless the given hash reference into $class.
#
# $args is blessed as-is (not copied), so the caller's reference becomes the
# object. When $args is omitted, a fresh empty hash is used instead of dying
# in bless.
sub new {
    my ($class, $args) = @_;
    $args = {} unless defined $args;
    return bless $args, $class;
}
package TreeNode;
use parent -norequire, 'BaseObject';

# A tree node: `value` starts undefined and `nodes` is an empty list of child
# nodes. Callers set `value` to a dimension value (scalar) or to an index hash.
sub new {
    my $class = shift;
    my %fields = (
        value => undef,
        nodes => [],
    );
    return $class->SUPER::new(\%fields);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment