neevek/analyze_csv.pl

## analyze_csv.pl
#!/usr/bin/perl

use strict;
use warnings;

# Move every command-line argument into @args; splice() empties @ARGV, so the
# later `while (<>)` loop has no file names left in @ARGV to open.
my @args = splice(@ARGV, 0);

# [dimenName, alias] pairs, one per d: argument, in command-line order.
my @dimens;
# Distinct dimension column titles in first-seen order (filled by addNewDimenNode).
my @dimenNameArray;
# Names of the indexes that actually matched in the input (keys only; filled by parseIndex).
my %indexNameHash;
# Index spec => list of [operator, alias] pairs collected from the command line.
my %indexInfos;
# Key given with the uid: argument; stays undef when no uid: argument is passed.
my $uid;

# Reserved keys inside a leaf's statistics hash: the set of distinct uid values
# and the number of input lines counted for that leaf.
my $uvKey= "__uv__";
my $pvKey= "__pv__";

# operators (optional prefix of an index argument):
# cnt:     = count
# sum:     = sum
# unq:     = unique count
# pv_cavg: = count / PV
# pv_savg: = sum / PV
# uv_cavg: = count / UV
# uv_savg: = sum / UV
# uv_uavg: = unique_count / UV

#my $colWidth = 10;
#my $fmtStr = "%-${colWidth}s\t";

my @table; # a 2-dimensional array (array of row array refs) representing the output table
my @maxColWidths; # widest cell seen so far in each column, indexed by column position
# Column cursor used by pushIntoColsArr; reset to 0 at the start of every row.
my $gCounter;

# Classify each command-line argument:
#   d:<dimen>[:<alias>]             dimension to group by
#   uid:<key>                       key whose value identifies a unique visitor
#   colwidth:<n>                    recognised but ignored (see below)
#   [<operator>]<index>[:<alias>]   index to aggregate (everything else)
# The uid: and colwidth: patterns are anchored at the start of the argument so
# that an index spec which merely contains that text (e.g. "cnt:fluid:alias")
# is still treated as an index instead of silently replacing the uid key.
foreach my $arg (@args) {
  if ($arg =~ /^d:([^: \n\t]+)(:([^ \n\t]+))?/) {
    push(@dimens, [$1, $3]);  # the tuple is (dimenName, alias)

  } elsif ($arg =~ /^uid:([^ \n\t]+)/) {
    $uid = $1;

  } elsif ($arg =~ /^colwidth:([1-9]\d*)/) {
    # Recognised so it is not mistaken for an index, but the value is unused:
    # the fixed-width formatting it used to configure is disabled.

  } elsif ($arg =~ /(cnt:|sum:|unq:|[up]v_[ucs]avg:)?([^: \n\t]+)(:([^ \n\t]+))?/) {
    # the tuple is (operator, alias); both are undef when not given, e.g.
    # $indexInfos{indexName} = [['sum:', 'alias1'], ['cnt:', 'alias2'], [undef, 'alias3']]
    push(@{ $indexInfos{$2} }, [$1, $4]);
  }
}

# At least one dimension and one index are required.
if (!@dimens || !keys %indexInfos) {
  print "Usage: analyze_csv.pl <d:dimen1> <d:dimen2> ... <indexRegex1|indexRegex2|...> <operator:index:alias>\n";
  exit(1);
}

# 1 when Unicode::GCString could be loaded (getColWidth then uses it), else 0.
my $useGcString = tryLoad("Unicode::GCString") ? 1 : 0;

# Every index spec from the command line becomes one alternative of one regex.
my $indexRegex = join("|", keys %indexInfos);

# Feed every input line into the dimension tree rooted at $root.
my $root = TreeNode->new();
while (my $line = <>) {
  parse($root, $line);
}

# Build and print the table, but only when at least one index matched.
if (keys %indexNameHash) {
  print("\n");
  printHeader();
  printIndex($root);

  # Length of the dashed rule under the header row: the sum of all column
  # widths plus 4 extra characters per column.
  my $ruleWidth = 4 * @maxColWidths;
  $ruleWidth += $_ for @maxColWidths;

  for my $rowNo (0 .. $#table) {
    # The rule is printed right before the first data row.
    print(('-' x $ruleWidth) . "\n") if $rowNo == 1;

    my $cells = $table[$rowNo];
    for my $colNo (0 .. $#$cells) {
      print(padStrWithSpace($cells->[$colNo], $maxColWidths[$colNo]) . "\t");
    }
    print("\n");
  }
}

# Return $str followed by as many spaces as its width (per getColWidth) falls
# short of $maxColWidth.
sub padStrWithSpace {
  my $str = shift;
  my $maxColWidth = shift;

  my $missing = $maxColWidth - getColWidth($str);
  return $str . (' ' x $missing);
}

# Width of $str: the column count reported by Unicode::GCString when that
# module was loaded, otherwise the plain length() of the string.
sub getColWidth {
  my ($str) = @_;

  return length($str) unless $useGcString;
  return Unicode::GCString->new($str)->columns;
}

# Account one input line in the tree below $node.
# Walks one tree level per configured dimension (creating nodes as needed),
# then hands the line to parseIndex for the leaf reached.
sub parse {
  my ($node, $line) = @_;

  for my $spec (@dimens) {
    my ($pattern, $alias) = @$spec;
    my $children = $node->{nodes};

    # Column title and bucket value for this level; a line that does not
    # carry the dimension is filed under the "-" bucket.
    my ($column, $bucket);
    if ($line =~ /($pattern)=([^,"\n]+)/) {
      ($column, $bucket) = ($1, $2);
    } else {
      ($column, $bucket) = ($pattern, "-");
    }
    $column = $alias if $alias;  # use alias if it is not empty

    $node = addNewDimenNode($children, $column, $bucket);
  }

  # The first child of the last dimension node is the leaf whose value is
  # the statistics hash.
  my $leaf = $node->{nodes}->[0];
  unless ($leaf) {
    $leaf = TreeNode->new();
    $leaf->{value} = {};
    $node->{nodes}->[0] = $leaf;
  }

  parseIndex($leaf, $line);
}

# Return the node in @$nodes whose value equals $nodeName, creating and
# appending it when absent. When a node is created, $dimenName is also
# recorded in @dimenNameArray unless it is already listed there.
sub addNewDimenNode {
  my ($nodes, $dimenName, $nodeName) = @_;

  my ($existing) = grep { $_->{value} eq $nodeName } @$nodes;
  return $existing if $existing;

  my $created = TreeNode->new();
  $created->{value} = $nodeName;
  push(@$nodes, $created);

  unless (grep { $_ eq $dimenName } @dimenNameArray) {
    push(@dimenNameArray, $dimenName);
  }

  return $created;
}

# Fold one input line into the statistics hash stored in $node->{value}.
# Per line: records the uid value (when a uid key is configured and present)
# and bumps the PV counter. Per "<index>=<value>" match: bumps that index's
# count, adds the value to its sum and, for indexes that have command-line
# entries, remembers the uid value in the index's unique set.
sub parseIndex {
  my ($node, $line) = @_;

  my $stats = ($node->{value} ||= {});

  my $uv;
  if ($uid && $line =~ /$uid=([^,"\n]+)/) {
    $uv = $1;
    ($stats->{$uvKey} ||= {})->{$uv} = 0;
  }
  $stats->{$pvKey} += 1;

  while ($line =~ /($indexRegex)=([^,"\n]+)/g) {
    my ($name, $amount) = ($1, $2);

    # [count, sum, {uid1, uid2, uid3...}]
    my $tuple = ($stats->{$name} ||= [0, 0, {}]);
    $tuple->[0] += 1;
    $tuple->[1] += $amount;

    # Looping over the entry also creates an empty list in %indexInfos for a
    # name that only matched through a regex spec.
    for my $info (@{$indexInfos{$name}}) {
      if (defined $uid) {
        $tuple->[2]->{$uv} = 0 if defined $uv && $uv ne "";
      } elsif ($info->[0] && $info->[0] eq "unq:") {
        print STDERR ("must specify uid: key to use unq:\n");
      }
    }

    $indexNameHash{$name} = 0;
  }
}

# Append $value to the row @$colsArr, widen the running maximum for the
# column at $gCounter if needed, then advance $gCounter to the next column.
sub pushIntoColsArr {
  my ($colsArr, $value) = @_;

  push(@$colsArr, $value);

  # A false value ("", 0, undef) is counted as one column wide.
  my $width = $value ? getColWidth($value) : 1;
  my $widest = $maxColWidths[$gCounter];

  $maxColWidths[$gCounter] = ($widest && $widest >= $width) ? $widest : $width;
  ++$gCounter;
}

# Build the header row and append it to @table.
# Columns, in order: one per dimension name, "UV" (only when a uid key is
# configured), "PV", then for every matched index (sorted by name) one column
# per command-line entry of that index, or a single column titled with the
# index name when it has no entries.
sub printHeader {
  my @headerCols;
  $gCounter = 0;

  foreach my $dimen (@dimenNameArray) {
    pushIntoColsArr(\@headerCols, $dimen);
  }
  if ($uid) {
    pushIntoColsArr(\@headerCols, "UV");
  }
  pushIntoColsArr(\@headerCols, "PV");

  for my $indexName (sort keys %indexNameHash) {
    # "|| []" keeps the dereference legal under strict even if no entry
    # exists for this name.
    my @indexInfoItems = @{ $indexInfos{$indexName} || [] };

    if (!@indexInfoItems) {
      pushIntoColsArr(\@headerCols, $indexName);
      next;
    }

    for my $indexInfoItem (@indexInfoItems) {
      my ($operator, $alias) = @$indexInfoItem;

      # Title preference: alias, then operator-prefixed name, then bare name.
      my $title = $alias    ? $alias
                : $operator ? $operator . $indexName
                :             $indexName;

      pushIntoColsArr(\@headerCols, $title);
    }
  }

  push(@table, \@headerCols);
}

# Entry point for emitting data rows: starts the recursive walk at $node
# with an empty list of dimension values.
sub printIndex {
  my ($node) = @_;
  return printIndex2($node, []);
}

# Recursive walk that appends one row to @table per leaf below $node.
# $dimens is the shared list of dimension values on the path from the root;
# this node's own value is pushed on entry and one element is popped on exit.
sub printIndex2 {
  my ($node, $dimens) = @_;

  my $ownValue = $node->{value};
  push(@$dimens, $ownValue) if defined $ownValue && ref($ownValue) ne "HASH";

  for my $child (@{ $node->{nodes} }) {
    my $stats = $child->{value};

    # Anything whose value is not a hash is an inner dimension node.
    if (ref($stats) ne "HASH") {
      printIndex2($child, $dimens);
      next;
    }

    my @row;
    $gCounter = 0;

    for my $dimenValue (@$dimens) {
      pushIntoColsArr(\@row, $dimenValue);
    }

    my $pv = $stats->{$pvKey};
    my $uv;
    if ($uid) {
      my $uvKeyHash = $stats->{$uvKey};
      $uv = scalar (keys %$uvKeyHash);
      pushIntoColsArr(\@row, $uv);
    }
    pushIntoColsArr(\@row, $pv);

    for my $indexName (sort keys %indexNameHash) {
      # (count, sum, {uid1, uid2...}) tuple; zeros when this leaf never saw the index
      my $tuple = $stats->{$indexName} || [0, 0, {}];

      # One cell per command-line entry, or a single operator-less cell.
      my @infos = @{$indexInfos{$indexName}};
      my @operators = @infos ? (map { $_->[0] } @infos) : ("");

      for my $operator (@operators) {
        printIndexItem(\@row, $uv, $pv, $operator, $tuple);
      }
    }

    push(@table, \@row);
  }

  pop @$dimens;
}

# Compute one index cell and append it to the row @$indexCols.
#   $uv, $pv         unique-visitor count (undef without a uid key) and line count
#   $operator        "cnt:", "sum:", "unq:", a "*_?avg:" operator, "" or undef
#   $indexDataTuple  [count, sum, {uid => 0, ...}]
# Without an averaging operator the cell is the count (cnt:), the unique
# count (unq:) or the sum (anything else). Averages are formatted with four
# decimals. An averaging operator with no formula below (pv_uavg:) leaves the
# cell at "0".
sub printIndexItem {
  my ($indexCols, $uv, $pv, $operator, $indexDataTuple) = @_;
  my ($count, $sum, $uniqueSet) = @$indexDataTuple;

  my $v = "0";

  if (!defined $operator || $operator !~ /avg:/) {
    if ($operator && $operator eq "cnt:") {
      $v = $count;
    } elsif ($operator && $operator eq "unq:") {
      $v = scalar (keys %$uniqueSet);
    } else {
      $v = $sum;
    }

  } elsif ($operator eq "pv_cavg:") {
    $v = sprintf("%.4f", $count / $pv);

  } elsif ($operator eq "pv_savg:") {
    $v = sprintf("%.4f", $sum / $pv);

  } elsif ($operator =~ /uv_[ucs]avg:/) {
    if (defined $uv) {
      $uv ||= "1";  # avoid dividing by zero when no uid was seen
      if ($operator eq "uv_cavg:") {
        $v = sprintf("%.4f", $count / $uv);
      } elsif ($operator eq "uv_uavg:") {
        $v = sprintf("%.4f", (scalar (keys %$uniqueSet)) / $uv);
      } else {
        $v = sprintf("%.4f", $sum / $uv);
      }
    } else {
      # $uv is undef on this path, so it is deliberately not interpolated.
      print STDERR ("must specify uid: key to compute UV for using uv_*avg\n");
    }
  }

  pushIntoColsArr($indexCols, $v);
}

# Try to load (and import from) the module named $mod.
# Returns 1 on success, 0 when the module cannot be loaded or when $mod is
# not a plain Package::Name.
sub tryLoad {
  my $mod = shift;

  # The name is spliced into a string eval, so refuse anything that is not a
  # bare package name.
  return 0 unless defined $mod && $mod =~ /\A[A-Za-z_]\w*(?:::\w+)*\z/;

  # Judge success by the eval's own return value; the trailing "1" makes a
  # successful load return true.
  return eval("use $mod; 1") ? 1 : 0;
}

package BaseObject;

# Generic constructor: blesses the reference $args into $class and returns it.
sub new {
  my $class = shift;
  my $args  = shift;
  return bless($args, $class);
}

package TreeNode;
use parent -norequire, qw(BaseObject);

# Create an empty tree node: no value yet and an empty list of child nodes.
sub new {
  my $class = shift;

  my %fields = (
    value => undef,
    nodes => [],
  );
  return $class->SUPER::new(\%fields);
}
	#!/usr/bin/perl

	use strict;
	use warnings;

	# Move every command-line argument into @args; splice() empties @ARGV, so the
	# later `while (<>)` loop has no file names left in @ARGV to open.
	my @args = splice(@ARGV, 0);

	# [dimenName, alias] pairs, one per d: argument, in command-line order.
	my @dimens;
	# Distinct dimension column titles in first-seen order (filled by addNewDimenNode).
	my @dimenNameArray;
	# Names of the indexes that actually matched in the input (keys only; filled by parseIndex).
	my %indexNameHash;
	# Index spec => list of [operator, alias] pairs collected from the command line.
	my %indexInfos;
	# Key given with the uid: argument; stays undef when no uid: argument is passed.
	my $uid;

	# Reserved keys inside a leaf's statistics hash: the set of distinct uid values
	# and the number of input lines counted for that leaf.
	my $uvKey= "__uv__";
	my $pvKey= "__pv__";

	# operators (optional prefix of an index argument):
	# cnt:     = count
	# sum:     = sum
	# unq:     = unique count
	# pv_cavg: = count / PV
	# pv_savg: = sum / PV
	# uv_cavg: = count / UV
	# uv_savg: = sum / UV
	# uv_uavg: = unique_count / UV

	#my $colWidth = 10;
	#my $fmtStr = "%-${colWidth}s\t";

	my @table; # a 2-dimensional array (array of row array refs) representing the output table
	my @maxColWidths; # widest cell seen so far in each column, indexed by column position
	# Column cursor used by pushIntoColsArr; reset to 0 at the start of every row.
	my $gCounter;

	# Classify each command-line argument:
	#   d:<dimen>[:<alias>]             dimension to group by
	#   uid:<key>                       key whose value identifies a unique visitor
	#   colwidth:<n>                    recognised but ignored (see below)
	#   [<operator>]<index>[:<alias>]   index to aggregate (everything else)
	# The uid: and colwidth: patterns are anchored at the start of the argument so
	# that an index spec which merely contains that text (e.g. "cnt:fluid:alias")
	# is still treated as an index instead of silently replacing the uid key.
	foreach my $arg (@args) {
	  if ($arg =~ /^d:([^: \n\t]+)(:([^ \n\t]+))?/) {
	    push(@dimens, [$1, $3]);  # the tuple is (dimenName, alias)

	  } elsif ($arg =~ /^uid:([^ \n\t]+)/) {
	    $uid = $1;

	  } elsif ($arg =~ /^colwidth:([1-9]\d*)/) {
	    # Recognised so it is not mistaken for an index, but the value is unused:
	    # the fixed-width formatting it used to configure is disabled.

	  } elsif ($arg =~ /(cnt:|sum:|unq:|[up]v_[ucs]avg:)?([^: \n\t]+)(:([^ \n\t]+))?/) {
	    # the tuple is (operator, alias); both are undef when not given, e.g.
	    # $indexInfos{indexName} = [['sum:', 'alias1'], ['cnt:', 'alias2'], [undef, 'alias3']]
	    push(@{ $indexInfos{$2} }, [$1, $4]);
	  }
	}

	# At least one dimension and one index are required.
	if (!@dimens || !keys %indexInfos) {
	  print "Usage: analyze_csv.pl <d:dimen1> <d:dimen2> ... <indexRegex1|indexRegex2|...> <operator:index:alias>\n";
	  exit(1);
	}

	# 1 when Unicode::GCString could be loaded (getColWidth then uses it), else 0.
	my $useGcString = tryLoad("Unicode::GCString") ? 1 : 0;

	# Every index spec from the command line becomes one alternative of one regex.
	my $indexRegex = join("|", keys %indexInfos);

	# Feed every input line into the dimension tree rooted at $root.
	my $root = TreeNode->new();
	while (my $line = <>) {
	  parse($root, $line);
	}

	# Build and print the table, but only when at least one index matched.
	if (keys %indexNameHash) {
	  print("\n");
	  printHeader();
	  printIndex($root);

	  # Length of the dashed rule under the header row: the sum of all column
	  # widths plus 4 extra characters per column.
	  my $ruleWidth = 4 * @maxColWidths;
	  $ruleWidth += $_ for @maxColWidths;

	  for my $rowNo (0 .. $#table) {
	    # The rule is printed right before the first data row.
	    print(('-' x $ruleWidth) . "\n") if $rowNo == 1;

	    my $cells = $table[$rowNo];
	    for my $colNo (0 .. $#$cells) {
	      print(padStrWithSpace($cells->[$colNo], $maxColWidths[$colNo]) . "\t");
	    }
	    print("\n");
	  }
	}

	# Return $str followed by as many spaces as its width (per getColWidth) falls
	# short of $maxColWidth.
	sub padStrWithSpace {
	  my $str = shift;
	  my $maxColWidth = shift;

	  my $missing = $maxColWidth - getColWidth($str);
	  return $str . (' ' x $missing);
	}

	# Width of $str: the column count reported by Unicode::GCString when that
	# module was loaded, otherwise the plain length() of the string.
	sub getColWidth {
	  my ($str) = @_;

	  return length($str) unless $useGcString;
	  return Unicode::GCString->new($str)->columns;
	}

	# Account one input line in the tree below $node.
	# Walks one tree level per configured dimension (creating nodes as needed),
	# then hands the line to parseIndex for the leaf reached.
	sub parse {
	  my ($node, $line) = @_;

	  for my $spec (@dimens) {
	    my ($pattern, $alias) = @$spec;
	    my $children = $node->{nodes};

	    # Column title and bucket value for this level; a line that does not
	    # carry the dimension is filed under the "-" bucket.
	    my ($column, $bucket);
	    if ($line =~ /($pattern)=([^,"\n]+)/) {
	      ($column, $bucket) = ($1, $2);
	    } else {
	      ($column, $bucket) = ($pattern, "-");
	    }
	    $column = $alias if $alias;  # use alias if it is not empty

	    $node = addNewDimenNode($children, $column, $bucket);
	  }

	  # The first child of the last dimension node is the leaf whose value is
	  # the statistics hash.
	  my $leaf = $node->{nodes}->[0];
	  unless ($leaf) {
	    $leaf = TreeNode->new();
	    $leaf->{value} = {};
	    $node->{nodes}->[0] = $leaf;
	  }

	  parseIndex($leaf, $line);
	}

	# Return the node in @$nodes whose value equals $nodeName, creating and
	# appending it when absent. When a node is created, $dimenName is also
	# recorded in @dimenNameArray unless it is already listed there.
	sub addNewDimenNode {
	  my ($nodes, $dimenName, $nodeName) = @_;

	  my ($existing) = grep { $_->{value} eq $nodeName } @$nodes;
	  return $existing if $existing;

	  my $created = TreeNode->new();
	  $created->{value} = $nodeName;
	  push(@$nodes, $created);

	  unless (grep { $_ eq $dimenName } @dimenNameArray) {
	    push(@dimenNameArray, $dimenName);
	  }

	  return $created;
	}

	# Fold one input line into the statistics hash stored in $node->{value}.
	# Per line: records the uid value (when a uid key is configured and present)
	# and bumps the PV counter. Per "<index>=<value>" match: bumps that index's
	# count, adds the value to its sum and, for indexes that have command-line
	# entries, remembers the uid value in the index's unique set.
	sub parseIndex {
	  my ($node, $line) = @_;

	  my $stats = ($node->{value} ||= {});

	  my $uv;
	  if ($uid && $line =~ /$uid=([^,"\n]+)/) {
	    $uv = $1;
	    ($stats->{$uvKey} ||= {})->{$uv} = 0;
	  }
	  $stats->{$pvKey} += 1;

	  while ($line =~ /($indexRegex)=([^,"\n]+)/g) {
	    my ($name, $amount) = ($1, $2);

	    # [count, sum, {uid1, uid2, uid3...}]
	    my $tuple = ($stats->{$name} ||= [0, 0, {}]);
	    $tuple->[0] += 1;
	    $tuple->[1] += $amount;

	    # Looping over the entry also creates an empty list in %indexInfos for a
	    # name that only matched through a regex spec.
	    for my $info (@{$indexInfos{$name}}) {
	      if (defined $uid) {
	        $tuple->[2]->{$uv} = 0 if defined $uv && $uv ne "";
	      } elsif ($info->[0] && $info->[0] eq "unq:") {
	        print STDERR ("must specify uid: key to use unq:\n");
	      }
	    }

	    $indexNameHash{$name} = 0;
	  }
	}

	# Append $value to the row @$colsArr, widen the running maximum for the
	# column at $gCounter if needed, then advance $gCounter to the next column.
	sub pushIntoColsArr {
	  my ($colsArr, $value) = @_;

	  push(@$colsArr, $value);

	  # A false value ("", 0, undef) is counted as one column wide.
	  my $width = $value ? getColWidth($value) : 1;
	  my $widest = $maxColWidths[$gCounter];

	  $maxColWidths[$gCounter] = ($widest && $widest >= $width) ? $widest : $width;
	  ++$gCounter;
	}

	# Build the header row and append it to @table.
	# Columns, in order: one per dimension name, "UV" (only when a uid key is
	# configured), "PV", then for every matched index (sorted by name) one column
	# per command-line entry of that index, or a single column titled with the
	# index name when it has no entries.
	sub printHeader {
	  my @headerCols;
	  $gCounter = 0;

	  foreach my $dimen (@dimenNameArray) {
	    pushIntoColsArr(\@headerCols, $dimen);
	  }
	  if ($uid) {
	    pushIntoColsArr(\@headerCols, "UV");
	  }
	  pushIntoColsArr(\@headerCols, "PV");

	  for my $indexName (sort keys %indexNameHash) {
	    # "|| []" keeps the dereference legal under strict even if no entry
	    # exists for this name.
	    my @indexInfoItems = @{ $indexInfos{$indexName} || [] };

	    if (!@indexInfoItems) {
	      pushIntoColsArr(\@headerCols, $indexName);
	      next;
	    }

	    for my $indexInfoItem (@indexInfoItems) {
	      my ($operator, $alias) = @$indexInfoItem;

	      # Title preference: alias, then operator-prefixed name, then bare name.
	      my $title = $alias    ? $alias
	                : $operator ? $operator . $indexName
	                :             $indexName;

	      pushIntoColsArr(\@headerCols, $title);
	    }
	  }

	  push(@table, \@headerCols);
	}

	# Entry point for emitting data rows: starts the recursive walk at $node
	# with an empty list of dimension values.
	sub printIndex {
	  my ($node) = @_;
	  return printIndex2($node, []);
	}

	# Recursive walk that appends one row to @table per leaf below $node.
	# $dimens is the shared list of dimension values on the path from the root;
	# this node's own value is pushed on entry and one element is popped on exit.
	sub printIndex2 {
	  my ($node, $dimens) = @_;

	  my $ownValue = $node->{value};
	  push(@$dimens, $ownValue) if defined $ownValue && ref($ownValue) ne "HASH";

	  for my $child (@{ $node->{nodes} }) {
	    my $stats = $child->{value};

	    # Anything whose value is not a hash is an inner dimension node.
	    if (ref($stats) ne "HASH") {
	      printIndex2($child, $dimens);
	      next;
	    }

	    my @row;
	    $gCounter = 0;

	    for my $dimenValue (@$dimens) {
	      pushIntoColsArr(\@row, $dimenValue);
	    }

	    my $pv = $stats->{$pvKey};
	    my $uv;
	    if ($uid) {
	      my $uvKeyHash = $stats->{$uvKey};
	      $uv = scalar (keys %$uvKeyHash);
	      pushIntoColsArr(\@row, $uv);
	    }
	    pushIntoColsArr(\@row, $pv);

	    for my $indexName (sort keys %indexNameHash) {
	      # (count, sum, {uid1, uid2...}) tuple; zeros when this leaf never saw the index
	      my $tuple = $stats->{$indexName} || [0, 0, {}];

	      # One cell per command-line entry, or a single operator-less cell.
	      my @infos = @{$indexInfos{$indexName}};
	      my @operators = @infos ? (map { $_->[0] } @infos) : ("");

	      for my $operator (@operators) {
	        printIndexItem(\@row, $uv, $pv, $operator, $tuple);
	      }
	    }

	    push(@table, \@row);
	  }

	  pop @$dimens;
	}

	# Compute one index cell and append it to the row @$indexCols.
	#   $uv, $pv         unique-visitor count (undef without a uid key) and line count
	#   $operator        "cnt:", "sum:", "unq:", a "*_?avg:" operator, "" or undef
	#   $indexDataTuple  [count, sum, {uid => 0, ...}]
	# Without an averaging operator the cell is the count (cnt:), the unique
	# count (unq:) or the sum (anything else). Averages are formatted with four
	# decimals. An averaging operator with no formula below (pv_uavg:) leaves the
	# cell at "0".
	sub printIndexItem {
	  my ($indexCols, $uv, $pv, $operator, $indexDataTuple) = @_;
	  my ($count, $sum, $uniqueSet) = @$indexDataTuple;

	  my $v = "0";

	  if (!defined $operator || $operator !~ /avg:/) {
	    if ($operator && $operator eq "cnt:") {
	      $v = $count;
	    } elsif ($operator && $operator eq "unq:") {
	      $v = scalar (keys %$uniqueSet);
	    } else {
	      $v = $sum;
	    }

	  } elsif ($operator eq "pv_cavg:") {
	    $v = sprintf("%.4f", $count / $pv);

	  } elsif ($operator eq "pv_savg:") {
	    $v = sprintf("%.4f", $sum / $pv);

	  } elsif ($operator =~ /uv_[ucs]avg:/) {
	    if (defined $uv) {
	      $uv ||= "1";  # avoid dividing by zero when no uid was seen
	      if ($operator eq "uv_cavg:") {
	        $v = sprintf("%.4f", $count / $uv);
	      } elsif ($operator eq "uv_uavg:") {
	        $v = sprintf("%.4f", (scalar (keys %$uniqueSet)) / $uv);
	      } else {
	        $v = sprintf("%.4f", $sum / $uv);
	      }
	    } else {
	      # $uv is undef on this path, so it is deliberately not interpolated.
	      print STDERR ("must specify uid: key to compute UV for using uv_*avg\n");
	    }
	  }

	  pushIntoColsArr($indexCols, $v);
	}

	# Try to load (and import from) the module named $mod.
	# Returns 1 on success, 0 when the module cannot be loaded or when $mod is
	# not a plain Package::Name.
	sub tryLoad {
	  my $mod = shift;

	  # The name is spliced into a string eval, so refuse anything that is not a
	  # bare package name.
	  return 0 unless defined $mod && $mod =~ /\A[A-Za-z_]\w*(?:::\w+)*\z/;

	  # Judge success by the eval's own return value; the trailing "1" makes a
	  # successful load return true.
	  return eval("use $mod; 1") ? 1 : 0;
	}

	package BaseObject;

	# Generic constructor: blesses the reference $args into $class and returns it.
	sub new {
	  my $class = shift;
	  my $args  = shift;
	  return bless($args, $class);
	}

	package TreeNode;
	use parent -norequire, qw(BaseObject);

	# Create an empty tree node: no value yet and an empty list of child nodes.
	sub new {
	  my $class = shift;

	  my %fields = (
	    value => undef,
	    nodes => [],
	  );
	  return $class->SUPER::new(\%fields);
	}