johan/grep-count.pl

## grep-count.pl
#!/usr/bin/perl

use strict;
use warnings;
use Getopt::Std;
use File::Basename;
use JSON;
use utf8;

# Name the script was invoked under; shown in the usage text.
my $prog = basename($0);

# Parsed command-line switches: -h -e -0 -i are flags, -f and -n take a value.
my %opt;

# getopts() returns false on an unknown switch or a missing option argument;
# show the usage text instead of carrying on with half-parsed options.
getopts('he0f:n:i', \%opt) or usage();

# The JSON spec (or "-") is mandatory, so an empty @ARGV is a usage error too.
usage() if $opt{h} || !@ARGV;
# Show the command-line synopsis and stop: the text is handed to die(), so it
# is written to STDERR and the script ends there. The text ends in a newline,
# which keeps die() from appending a "at FILE line N" suffix.
sub usage {
  my $synopsis = <<"EOU";
usage: $prog [-e0i] [-f col] [-n col] <spec.json> [files]
  given a JSON {"match pattern": "column name", [...]} spec in arg 1 and a list
  of files to match you get a TSV counting matching lines, one row per filename
       -e: egrep style - count by regexp match instead of by substring match
       -0: print "0" for non-matched lines ("" by default)
       -i: interactive (updates counts live every second)
       -f: the name of the filename column ("file", by default)
       -n: count non-matched lines and put them in a column <non-matched-name>
EOU
  die $synopsis;
}

# Read the spec: arg 1 holds the JSON text itself, or "-" to slurp it from STDIN.
my $jsonspec = shift;
my $ate_stdin = $jsonspec eq '-';
$jsonspec = do { local $/; <STDIN> } if ($ate_stdin);

# The spec has to decode to an object mapping match pattern => column name.
my $spec = from_json $jsonspec;
die "$prog: spec must be a JSON object like {\"match pattern\": \"column name\"}\n"
  unless ref $spec eq 'HASH';
my %colheads = %$spec;

# Sorted, because hash key order changes from run to run: the order decides both
# the column layout and which pattern claims a line that several would match.
my @matchers = sort keys %colheads;

# make options more source code readable
my $byregexp = $opt{e} || $prog =~ /^e/; # -e, or run under a name starting with "e"
my $filename = $opt{f} || 'file';
my $nonmatch = $opt{n};
my $notfound = $opt{0} ? '0' : '';
my $live_tsv = $opt{i};
$| = 1 if $live_tsv; # unbuffered output, so live rows show up immediately

# Compile every pattern once, up front: a broken regexp is reported before any
# output is written, and an empty pattern matches every line instead of falling
# into the match operator's "reuse the last successful pattern" special case.
my %regexp_for;
if ($byregexp) {
  $regexp_for{$_} = qr/$_/ for @matchers;
}

# print column headers
print $filename;
print "\t$colheads{$_}" for (@matchers);
print "\t$nonmatch" if $nonmatch;
print "\n";

my %count;     # matches per column name, for the file currently being read
my $noncount;  # lines of the current file that matched no pattern
my $last = 0;  # epoch second of the most recent live refresh

# print one row per file on ARGV, one match count per column
@ARGV = ('-') unless @ARGV; # assume STDIN, iff empty
for my $fn (@ARGV) {
  # Start every file from zero so that each row reports that file alone.
  %count = ();
  $noncount = 0;

  # "-" means STDIN; anything else is opened strictly for reading, so a file
  # name can never be interpreted as an open mode or a command pipe.
  my $fh;
  if ($fn eq '-') {
    $fh = \*STDIN;
  }
  elsif (!open($fh, '<', $fn)) {
    warn "Can't open $fn: $!\n";
    next;
  }

  while (my $line = <$fh>) {
    my $found;
    # A line is credited to the first pattern (in @matchers order) that hits.
    for my $find (@matchers) {
      $found = $byregexp ? $line =~ $regexp_for{$find}
                         : index($line, $find) > -1;
      if ($found) {
        $count{$colheads{$find}}++;
        last;
      }
    }
    $noncount++ unless $found;

    # Interactive mode: redraw the row in place, at most once per second.
    if ($live_tsv) {
      my $now = time();
      counts($fn, "\r") unless $now == $last;
      $last = $now;
    }
  }
  close($fh) unless $fn eq '-';

  # print match counts
  counts($fn, "\n");
}

# Write one TSV row for $fn: the file name, one cell per entry of @matchers
# (the count recorded under that pattern's column name, or $notfound when the
# count is zero or missing), and the non-matched line count when a non-match
# column was requested. $terminator ends the row ("\n", or "\r" for a redraw).
sub counts {
  my ($fn, $terminator) = @_;
  my @cells = map { $count{$colheads{$_}} || $notfound } @matchers;
  push @cells, $noncount if $nonmatch;
  print join("\t", $fn, @cells), $terminator;
}
	#!/usr/bin/perl

	use strict;
	use warnings;
	use Getopt::Std;
	use File::Basename;
	use JSON;
	use utf8;

	# Name the script was invoked under; shown in the usage text.
	my $prog = basename($0);

	# Parsed command-line switches: -h -e -0 -i are flags, -f and -n take a value.
	my %opt;

	# getopts() returns false on an unknown switch or a missing option argument;
	# show the usage text instead of carrying on with half-parsed options.
	getopts('he0f:n:i', \%opt) or usage();

	# The JSON spec (or "-") is mandatory, so an empty @ARGV is a usage error too.
	usage() if $opt{h} || !@ARGV;
	# Hands the command-line synopsis below to die(), which ends the script.
	# NOTE(review): the closing EOU marker is tab-indented in this copy, while a
	# plain <<EOU heredoc looks for the marker at the very start of a line --
	# confirm whether this listing is meant to be run as-is.
	sub usage {
	die <<EOU;
	usage: $prog [-e0i] [-f col] [-n col] <spec.json> [files]
	given a JSON {"match pattern": "column name", [...]} spec in arg 1 and a list
	of files to match you get a TSV counting matching lines, one row per filename
	-e: egrep style - count by regexp match instead of by substring match
	-0: print "0" for non-matched lines ("" by default)
	-i: interactive (updates counts live every second)
	-f: the name of the filename column ("file", by default)
	-n: count non-matched lines and put them in a column <non-matched-name>
	EOU
	}

	# Read the spec: arg 1 holds the JSON text itself, or "-" to slurp it from STDIN.
	my $jsonspec = shift;
	my $ate_stdin = $jsonspec eq '-';
	$jsonspec = do { local $/; <STDIN> } if ($ate_stdin);

	# The spec has to decode to an object mapping match pattern => column name.
	my $spec = from_json $jsonspec;
	die "$prog: spec must be a JSON object like {\"match pattern\": \"column name\"}\n"
	  unless ref $spec eq 'HASH';
	my %colheads = %$spec;

	# Sorted, because hash key order changes from run to run: the order decides both
	# the column layout and which pattern claims a line that several would match.
	my @matchers = sort keys %colheads;

	# make options more source code readable
	my $byregexp = $opt{e} || $prog =~ /^e/; # -e, or run under a name starting with "e"
	my $filename = $opt{f} || 'file';
	my $nonmatch = $opt{n};
	my $notfound = $opt{0} ? '0' : '';
	my $live_tsv = $opt{i};
	$| = 1 if $live_tsv; # unbuffered output, so live rows show up immediately

	# Compile every pattern once, up front: a broken regexp is reported before any
	# output is written, and an empty pattern matches every line instead of falling
	# into the match operator's "reuse the last successful pattern" special case.
	my %regexp_for;
	if ($byregexp) {
	  $regexp_for{$_} = qr/$_/ for @matchers;
	}

	# print column headers
	print $filename;
	print "\t$colheads{$_}" for (@matchers);
	print "\t$nonmatch" if $nonmatch;
	print "\n";

	my %count;     # matches per column name, for the file currently being read
	my $noncount;  # lines of the current file that matched no pattern
	my $last = 0;  # epoch second of the most recent live refresh

	# print one row per file on ARGV, one match count per column
	@ARGV = ('-') unless @ARGV; # assume STDIN, iff empty
	for my $fn (@ARGV) {
	  # Start every file from zero so that each row reports that file alone.
	  %count = ();
	  $noncount = 0;

	  # "-" means STDIN; anything else is opened strictly for reading, so a file
	  # name can never be interpreted as an open mode or a command pipe.
	  my $fh;
	  if ($fn eq '-') {
	    $fh = \*STDIN;
	  }
	  elsif (!open($fh, '<', $fn)) {
	    warn "Can't open $fn: $!\n";
	    next;
	  }

	  while (my $line = <$fh>) {
	    my $found;
	    # A line is credited to the first pattern (in @matchers order) that hits.
	    for my $find (@matchers) {
	      $found = $byregexp ? $line =~ $regexp_for{$find}
	                         : index($line, $find) > -1;
	      if ($found) {
	        $count{$colheads{$find}}++;
	        last;
	      }
	    }
	    $noncount++ unless $found;

	    # Interactive mode: redraw the row in place, at most once per second.
	    if ($live_tsv) {
	      my $now = time();
	      counts($fn, "\r") unless $now == $last;
	      $last = $now;
	    }
	  }
	  close($fh) unless $fn eq '-';

	  # print match counts
	  counts($fn, "\n");
	}

	# Write one TSV row for $fn: the file name, one cell per entry of @matchers
	# (the count recorded under that pattern's column name, or $notfound when the
	# count is zero or missing), and the non-matched line count when a non-match
	# column was requested. $terminator ends the row ("\n", or "\r" for a redraw).
	sub counts {
	  my ($fn, $terminator) = @_;
	  my @cells = map { $count{$colheads{$_}} || $notfound } @matchers;
	  push @cells, $noncount if $nonmatch;
	  print join("\t", $fn, @cells), $terminator;
	}