Skip to content

Instantly share code, notes, and snippets.

@esoergel
Last active March 4, 2019 05:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save esoergel/6dc0e8b5b42b7c735a54d9741bcd5b08 to your computer and use it in GitHub Desktop.
Save esoergel/6dc0e8b5b42b7c735a54d9741bcd5b08 to your computer and use it in GitHub Desktop.
Calculate statistics on the per-file line counts of a set of matching files
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Usage:
$ find . | grep py$ | python line_counts.py
"""
from __future__ import print_function, unicode_literals

import subprocess

import numpy
def parse(line):
    """Parse one line of `wc -l` output into a (count, filename) tuple.

    Returns None for blank lines, malformed lines, and the trailing
    "total" summary line that `wc` appends, so callers can drop them
    with filter(None, ...).
    """
    if not line:
        return None
    # Split on the first run of whitespace only: the count never contains
    # spaces, but the filename may (the old bare split() crashed on those).
    fields = line.split(None, 1)
    if len(fields) != 2:
        # e.g. the bare "0" that `xargs wc -l` emits on empty input
        return None
    count, filename = fields
    if filename == 'total':
        return None
    return (int(count), filename)
def main():
    """Read a file list from stdin, run `wc -l` over it, and print stats."""
    # xargs inherits our stdin, so the piped-in filename list feeds `wc -l`.
    res = subprocess.check_output(['xargs', 'wc', '-l'])
    # check_output returns bytes on Python 3 (str on Python 2); decode first.
    lines = res.decode('utf-8').split('\n')
    # Materialize with list(): filter() is lazy on Python 3 and we iterate
    # counts_by_file twice (top-5 listing and the line_counts comprehension).
    counts_by_file = list(filter(None, map(parse, lines)))

    if not counts_by_file:
        # Without this guard numpy dies with a cryptic error on empty input.
        print("no matching files")
        return

    print("Biggest 5 files:")
    for count, filename in sorted(counts_by_file)[-5:]:
        print(" ", count, filename)

    line_counts = [count for count, _ in counts_by_file]
    median = int(numpy.median(line_counts))
    std = int(numpy.round(numpy.std(line_counts)))
    print("matching files:", len(line_counts))
    print("total lines:", numpy.sum(line_counts))
    print("min:", numpy.min(line_counts))
    print("median:", median)
    print("+1σ:", median + std)
    print("max:", numpy.max(line_counts))
    # I like weighted average best - you're more likely to be working in a file
    # with longer lines, so that gives you the best idea how long files will be
    # in your day-to-day.
    print("weighted avg:", int(numpy.round(numpy.average(line_counts, weights=line_counts))))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment