command line data science: UNIX filters for grouping, counting, CSVing text files

UNIX pipe fittings for statistics

In the quest for command line data science, this kit contains three command line utilities intended to be used in UNIX pipes.

counter.py

A counting filter driven by regular expressions. See the docstring below for full help.
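A quick, illustrative one-liner (no regexp groups here, so the whole match is counted):

ls -l | ./counter.py '\.[a-z0-9]+$'

counts how often each file extension occurs in the listing and prints a CSV (group, match, count, percent), most frequent first.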

groupby.py

Concatenates lines from stdin that share the same match for a regular expression. See the docstring.
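For example, assuming a log file in which every line starts with an IP address:

./groupby.py '^\d+\.\d+\.\d+\.\d+' < access.log

emits one tab-separated line per distinct address: the address, the number of lines, and the lines themselves (lines without a match are grouped under None).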

extract.py

In the spirit of RegExSerDe, this tool uses regular expressions to generate a CSV file from a free-form text file.
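A sketch, assuming an Apache combined-format log (one column per regexp, named after the regexp's named group):

cat access.log | ./extract.py '^(?P<ip>\S+)' '"(?P<method>[A-Z]+) ' '" (?P<status>\d{3}) '

yields a CSV with the columns ip, method and status.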

Other Useful Tools

If you've got CSV files, you should definitely check out q.
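q runs SQL directly against CSV files, so the output of counter.py or extract.py can be queried further. Something like this should work (-H reads the header row, -d, sets the comma delimiter):

./extract.py '^(?P<ip>\S+)' '" (?P<status>\d{3}) ' < access.log > requests.csv
q -H -d, "SELECT status, COUNT(*) FROM requests.csv GROUP BY status ORDER BY COUNT(*) DESC"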

To Do

A cookbook showing how to analyze log files and the like would be nice; two hedged starters are sketched below.
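Both starters assume an Apache combined-format access log at the usual Debian path:

# requests per hour, ordered by hour (the _k suffix sorts by key)
cat /var/log/apache2/access.log | ./counter.py '(?P<hour_k>\d\d/\w\w\w/\d{4}:\d\d)'

# frequency of HTTP status codes, plus descriptive statistics of the
# response sizes on STDERR (the _float suffix needs pandas)
cat /var/log/apache2/access.log | ./counter.py '" (?P<status>\d{3}) (?P<bytes_float>\d+)'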

#! /usr/bin/env python3
"""
counter.py -- command line data science in python
USAGE: counter.py [--sample] [--join=field1,field2] regexp [regexp2] [regexp3] [regexpn] <FILE
counter.py will match the regular expression against every line on STDIN and count the lines
in which each matching string is found. The counts are output as a CSV file.
Subexpressions (regular expression groups and named groups) are counted separately.
(See https://docs.python.org/3/howto/regex.html for those).
EXAMPLES
LC_ALL=C ls ~ -l | python3 counter.py 'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec'
will give you a count of the months in which the files in your home directory were last modified.
LC_ALL=C ls ~ -l | python3 counter.py '(?P<month>Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) ?\d\d ?(?P<year>2\d\d\d)'
will give you the same stats broken down into the named regular expression groups "month" and "year".
SORTING BY _SUFFIX
The stats are sorted by value - the string that was found most frequently at the top. If you want to sort
by the string found (=key) instead, append a "_k" to the name of the named group.
Example:
cat /var/log/apache2/access.log | ./counter.py '(?P<time_k>03/Jan/2017:\d\d:)'
will give you a per-hour count of the requests logged in your webserver log on Jan. 3rd 2017, ordered by hour.
(You will need a server log at the specified location containing requests for Jan. 3rd 2017 for this to work).
You can use the suffix "_kn" if you want to sort numerically. This works by converting the strings to the python
type float first. "_kd" would convert them to datetime values using pandas.to_datetime if pandas is installed.
JOINING COUNTERS
Normally, every named group is counted independently. You can count co-occurrences using the --join parameter:
LC_ALL=C ls ~ -l | python3 counter.py --join=year,month '(?P<month>Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) ?\d\d ?(?P<year>2\d\d\d)'
will separate November 2016 from November 2015 etc. In the first example, all the November files were counted together irrespective of the year. Lines
without a match for one of the joined values are counted as "None".
DISPLAYING SAMPLE LINES
LC_ALL=C ls ~ -l | python3 counter.py --sample --join=year,month '(?P<month>Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) ?\d\d ?(?P<year>2\d\d\d)'
will add a field named "sample" to the output csv. It contains the first line that matched this counter as an example.
DESCRIPTIVE STATISTICS
If you have pandas and numpy installed, the matches of named regular expression groups with names ending in "_float" or "_int"
will be converted to the corresponding pandas Series and a series.describe() will be written to STDERR.
printf "10 \n3 \n12 \n16 \n" | counter.py '(?P<n_float>\d+)'
will print mean, max, min, standard deviation and quartiles for the array [10, 3, 12, 16] to STDERR
This was inspired by the great O'Reilly book Data Science at the Command Line; the GitHub repo is here:
https://github.com/jeroenjanssens/data-science-at-the-command-line
"""
import sys
import re
import csv
from collections import Counter, defaultdict, OrderedDict

# pandas/numpy are optional: they enable the _kd sort and descriptive statistics
try:
    import pandas as pd
    import numpy as np
    has_pandas = True
except ImportError:
    has_pandas = False


def float_or_zero(v):
    try:
        return float(v)
    except ValueError:
        return 0.0
def process(*args, join=None, sample=False):
    rex = [re.compile(rx) for rx in args]
    c = defaultdict(Counter)
    if sample:
        samples = defaultdict(dict)
    for line in sys.stdin.readlines():
        processed = set()
        valuedict = OrderedDict()
        line = line[:-1]  # strip the trailing newline
        for (rxc, rx) in enumerate(rex):
            m = rx.search(line)
            # unnamed groups get a "<n>_" prefix when more than one
            # regexp is given, so their counters stay distinguishable
            if len(rex) == 1:
                rxc_key = ""
            else:
                rxc_key = "%d_" % (rxc + 1,)
            if m:
                # named groups are counted under their name
                for (k, v) in m.groupdict().items():
                    c[k].update({v: 1})
                    valuedict[k] = v
                    processed.add(v)
                    if sample and v not in samples[k]:
                        samples[k][v] = line
                # unnamed groups are counted under their position
                for g in enumerate(m.groups()):
                    if g[1] not in processed:
                        gkey = "%s%s" % (rxc_key, g[0] + 1)
                        c[gkey].update({g[1]: 1})
                        processed.add(g[1])
                        valuedict[gkey] = g[1]
                        if sample and g[1] not in samples[gkey]:
                            samples[gkey][g[1]] = line
                # no groups at all: count the whole match
                if len(processed) == 0:
                    c[0].update({m.group(): 1})
                    if sample and m.group() not in samples[0]:
                        samples[0][m.group()] = line
        if join:
            jk = "join"
            jv = ",".join([str(valuedict.get(a, None)) for a in join])
            c[jk].update({jv: 1})
            if sample and jv not in samples[jk]:
                samples[jk][jv] = line
            for k in join:
                if k not in valuedict:
                    c[k].update({None: 1})
                    if sample and None not in samples[k]:
                        samples[k][None] = line
    f = csv.writer(sys.stdout)
    lookup = dict()
    for (k, v) in c.items():
        nt = str(k).split("_")
        sorter = lambda a: a[1]
        realkey = k
        # group name ends in _k, _kn or _kd - sort by key: as string,
        # numerically, or as a date
        if len(nt) == 2:
            realkey = nt[0]
            if nt[1] == "k":
                sorter = lambda a: a[0]
            elif nt[1] == "kn":
                sorter = lambda a: float_or_zero(a[0])
            elif nt[1] == "kd" and has_pandas:
                sorter = lambda a: pd.to_datetime(a[0])
            elif has_pandas and nt[1] in np.sctypeDict.keys():
                pass
            else:
                sys.stderr.write("key suffix _{} not recognized. Possible values: _k (sort by key), _kn (sort numerically), _kd (sort as date)\n".format(nt[1]))
                realkey = k
        table = sorted(list(v.items()), key=sorter, reverse=True)
        # group name ends in _int, _float or similar: descriptive stats
        # via pandas go to STDERR
        if has_pandas and len(nt) > 1 and nt[1] in np.sctypeDict.keys():
            cf = getattr(np, nt[1])
            se = pd.Series([a for a in convert_or_na(table, cf)], dtype=cf, name=nt[0])
            sys.stderr.write(str(se.describe()) + "\n")
        # group name is "join" - the joined key is split into columns below
        if k == "join":
            jointable = table
            continue
        else:
            # normal behaviour: group, match, count, percent (+ sample)
            total = 0
            for r in table:
                total += int(r[-1])
            headers = ['group', 'match', 'count', 'percent']
            if sample:
                headers.append("sample")
            f.writerow(headers)
            for r in table:
                rr = [realkey]
                rr.extend(r)
                rr.append("%.2f%%" % ((100.0 * r[-1]) / total))
                if sample:
                    # samples are keyed by the full group name, including any suffix
                    rr.append(samples.get(k, defaultdict(lambda: '-'))[r[0]])
                f.writerow(rr)
            rr = [realkey, 'total', total, '100.00%']
            f.writerow(rr)
        if join is not None and k in join:
            lookup[k] = dict([(r[0], r[1]) for r in table])
    if join:
        joincolumns = ["join"]
        joincolumns.extend(join)
        joincolumns.append("count")
        for k in join:
            if k in lookup:
                joincolumns.append("%s_sum" % k)
        if sample:
            joincolumns.append("sample")
        f.writerow(joincolumns)
        for row in jointable:
            trow = ["join"]
            trow.extend(row[0].split(","))
            trow.append(row[1])
            for i, k in enumerate(join):
                if k in lookup:
                    trow.append(lookup[k].get(trow[i + 1], None))
            if sample:
                trow.append(samples["join"][row[0]])
            f.writerow(trow)
def convert_or_na(table, conv):
    # yield each converted value `count` times so the pandas Series
    # reflects the frequencies; unconvertible values become NaN, or are
    # dropped entirely for types without a NaN
    try:
        nanv = conv(np.NaN)
    except ValueError:
        # no integer NaN :-(
        nanv = None
    for r in table:
        try:
            v = conv(r[-2])
        except Exception:
            v = nanv
        if v is not None:
            for t in range(0, r[-1]):
                yield v
if __name__ == '__main__':
    if len(sys.argv) < 2 or sys.argv[1].find("-h") > -1:
        print(__doc__)
    else:
        # separate the --switches from the regexp arguments; sys.argv[0]
        # (the script name) must not end up in the regexp list
        args = sys.argv[1:]
        switches = set()
        join = None
        sample = False
        for a in args:
            join_test = re.match("--join=(?P<fields>[^ ]+)", a)
            if join_test:
                join = join_test.groupdict()["fields"].split(",")
                switches.add(a)
            if re.match("--sample", a):
                sample = True
                switches.add(a)
        for s in switches:
            args.remove(s)
        process(*args, join=join, sample=sample)
#! /usr/bin/env python3
"""
extract.py -- command line log file processor
USAGE: extract.py regexp1 regexp2 regexp3 <FILE
will generate a CSV with header with the matches (or None if no match) for each regexp as fields. If the regexp contain
a named group, the name of this group will be used as the header for the column.
BUG: The Regexps have to match every line of the file, otherwise extract.py will fail. Use grep if you need to.
Inspired by [RegExSerDe](https://github.com/apache/hive/blob/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/RegexSerDe.java).
"""
import sys
import re
import csv


def process(argl):
    rex = [re.compile(a) for a in argl]
    w = csv.writer(sys.stdout)
    headerrow = []
    headerwritten = False
    for line in sys.stdin.readlines():
        row = []
        for i, r in enumerate(rex):
            m = r.search(line)
            if m:
                if m.groupdict():
                    # named group: take the first named group's value
                    row.append(list(m.groupdict().values())[0])
                elif m.groups():
                    # unnamed group: take the first group's value
                    row.append(m.groups()[0])
                else:
                    # no groups at all: take the whole match
                    row.append(m.group())
            else:
                row.append(None)
            if not headerwritten:
                # column names come from the first line; fall back to a
                # generic name when there is no match or no named group
                if m and m.groupdict():
                    headerrow.append(list(m.groupdict().keys())[0])
                else:
                    headerrow.append("f%s" % i)
        if not headerwritten:
            w.writerow(headerrow)
            headerwritten = True
        w.writerow(row)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(__doc__)
    else:
        process(sys.argv[1:])
#! /usr/bin/env python3
"""
groupby.py -- command line data science
USAGE: groupy.py regexp <FILE
groupby.py will concatenate lines matching certain criteria from STDIN to a tab-separated single line on stdout
Criteria can be: matching regexp
"""
import sys
import re
from collections import OrderedDict

# maximum number of groups kept in memory before the oldest is flushed
maxbuffer = 10000


def writeobj(d, k):
    sys.stdout.write(str(k) + "\t" + str(len(d[k][1])) + "\t" + "\t".join(d[k][1]) + "\n")


def process(arg):
    buff = OrderedDict()
    rex = re.compile(arg)
    linecounter = 0
    seen = set()
    for line in sys.stdin.readlines():
        linecounter += 1
        line = line[:-1]  # strip the trailing newline
        m = rex.search(line)
        if m:
            key = m.group()
        else:
            key = None
        if key in buff:
            buff[key][0] = linecounter
            buff[key][1].append(line)
        else:
            buff[key] = [linecounter, [line]]
            if key in seen:
                sys.stderr.write("Please increase buffer size from current value of {}. Key {} was found after being flushed\n".format(maxbuffer, key))
        if len(buff.keys()) > maxbuffer:
            # flush the group that was least recently updated
            sk = sorted([(k, v[0]) for (k, v) in buff.items()], key=lambda a: a[1])[0]
            writeobj(buff, sk[0])
            del buff[sk[0]]
            seen.add(sk[0])
    for k in buff.keys():
        writeobj(buff, k)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(__doc__)
    else:
        process(sys.argv[1])
REMOTE := mvirtel@vm-hetzner:/home/mvirtel/projekte/

.PHONY: deploy
deploy:
	rsync -v ./*.py $(REMOTE)