Created
March 26, 2011 10:25
-
-
Save johan/888186 to your computer and use it in GitHub Desktop.
Like fgrep / egrep (symlink it as egrep-count to regexp match by default), but for more than one pattern at a time, generating tsv output, and where you list all the patterns and what you want the count columns to be named as a JSON object on stdin.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use Getopt::Std; | |
use File::Basename; | |
use JSON; | |
use utf8; | |
# Name this script was invoked as; shown in the usage text.
my $prog = basename($0);

# Parse the command-line switches into %opt.  getopts() returns false (after
# printing its own warning) on an unknown switch or when -f / -n is missing
# its argument; show the usage text instead of running on with a half-parsed
# command line.
my %opt;
getopts('he0f:n:i', \%opt) or usage();

# -h, or no JSON spec argument at all, also gets the usage text.
usage() if $opt{h} || !@ARGV;
# Print the usage text on stderr and exit with a non-zero status (via die).
# Takes no arguments and does not return.
sub usage {
    die <<"EOU";
usage: $prog [-he0i] [-f col] [-n col] <json-spec|-> [files]

given a JSON {"match pattern": "column name", [...]} spec in arg 1 and a list
of files to match you get a TSV counting matching lines, one row per filename
(pass "-" as the spec to read the JSON from stdin; with no files listed, the
lines to count are read from stdin)

  -h: show this help
  -e: egrep style - count by regexp match instead of by substring match
  -0: print "0" for patterns that matched no lines ("" by default)
  -i: interactive (updates counts live every second)
  -f: the name of the filename column ("file", by default)
  -n: count non-matched lines and put them in a column <non-matched-name>
EOU
}
# The spec is literal JSON text in arg 1, or "-" to slurp the JSON from stdin.
my $jsonspec  = shift;
my $ate_stdin = $jsonspec eq '-';
$jsonspec = do { local $/; <STDIN> } if $ate_stdin;

# Once the spec has consumed stdin there is nothing left to count from it, so
# at least one real file must be listed in that case.
die "$prog: the spec was read from stdin, so list at least one file to count\n"
    if $ate_stdin && !@ARGV;

# The spec must be a JSON object mapping match pattern => column name.
my $spec = from_json($jsonspec);
die "$prog: the JSON spec must be an object of {\"pattern\": \"column name\"} pairs\n"
    unless ref $spec eq 'HASH';
my %colheads = %{$spec};

# Sorted, because hash key order is randomised per run and this order decides
# both the column order and which pattern wins when a line matches several.
my @matchers = sort keys %colheads;

# make options more source code readable
my $byregexp = $opt{e} || $prog =~ /^e/;    # -e, or invoked under a name starting with "e"
my $filename = $opt{f} || 'file';
my $nonmatch = $opt{n};
my $notfound = $opt{0} ? '0' : '';
my $live_tsv = $opt{i};
$| = 1 if $live_tsv;    # don't line buffer when interactive

# Compile each pattern once up front instead of once per input line; a broken
# regexp is therefore reported before any output is produced.
my %regexp_for;
if ($byregexp) {
    $regexp_for{$_} = qr/$_/ for @matchers;
}

# print column headers
my @header = ($filename, @colheads{@matchers});
push @header, $nonmatch if $nonmatch;
print join("\t", @header), "\n";

my %count;       # column name => matching lines in the current file
my $noncount;    # lines in the current file that matched no pattern
my $fh;
my $last = 0;    # time() of the most recent live update

# print one row per file on ARGV, one match count per column
@ARGV = ('-') unless @ARGV;    # assume STDIN, iff empty
FILE:
for my $fn (@ARGV) {
    # Start every file from zero; without this each row would also include
    # the counts of all the files before it.
    %count    = ();
    $noncount = 0;

    # "-" means stdin.  Everything else is opened read-only with three-arg
    # open so that a file name can never be interpreted as a mode or a pipe.
    my $is_stdin = $fn eq '-';
    if ($is_stdin) {
        $fh = \*STDIN;
    }
    elsif (!open($fh, '<', $fn)) {
        warn "Can't open $fn: $!\n";
        next FILE;
    }

    while (my $line = <$fh>) {
        my $found = 0;

        # A line is counted for the first pattern that matches it only.
        for my $find (@matchers) {
            if ($byregexp) {
                $found = $line =~ $regexp_for{$find};
            }
            else {
                $found = index($line, $find) >= 0;
            }
            if ($found) {
                $count{ $colheads{$find} }++;
                last;
            }
        }
        $noncount++ unless $found;

        # Interactive mode: redraw the row in place at most once per second.
        if ($live_tsv) {
            my $now = time();
            counts($fn, "\r") unless $now == $last;
            $last = $now;
        }
    }
    close($fh) unless $is_stdin;

    # print match counts
    counts($fn, "\n");
}
# Print one TSV row for a file.
#   $fn         - text for the first (file name) column
#   $terminator - string printed after the row ("\n", or "\r" for live updates)
# The row holds one cell per entry of @matchers, looked up in %count by column
# name, with $notfound standing in for a zero or missing count.  When a
# non-match column name is set in $nonmatch, $noncount is appended as the
# final cell.
sub counts {
    my ($fn, $terminator) = @_;

    my @row = ($fn);
    for my $pattern (@matchers) {
        push @row, $count{ $colheads{$pattern} } || $notfound;
    }
    push @row, $noncount if $nonmatch;

    print join("\t", @row), $terminator;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example usage: