Skip to content

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Count all function lengths under a directory
from pandas import DataFrame
from pandas.util.testing import set_trace
import os
import numpy as np
import matplotlib.pyplot as plt
# Accumulators filled in by doit(): one entry per counted source file.
dirs = []
names = []
lengths = []
# Materialise the directory walk up front.  os.walk() returns a one-shot
# generator, so a second consumer of a bare generator would silently see
# no directories at all.
walked = list(os.walk('pandas'))
def _should_count_file(path):
    """Return True if *path* names a Python or Cython source file.

    Only the suffix is inspected: ``.py`` and ``.pyx`` files qualify.
    """
    # str.endswith accepts a tuple, so one call covers both suffixes.
    return path.endswith(('.py', '.pyx'))
def _is_def_line(line):
    """Return True if *line* looks like the header of a function definition.

    A header must end with ``:`` and contain a ``def``, ``cdef`` or ``cpdef``
    keyword, either at the very start of the line or preceded by a space
    (which covers indented definitions).  Comment lines are never headers,
    so commented-out code is not counted as a function.

    The check is purely textual: a signature spread over several lines is
    not recognised, because its first line does not end with ``:``.
    """
    if not line.endswith(':'):
        return False
    # Commented-out definitions such as "# def old():" are not functions.
    if line.lstrip().startswith('#'):
        return False
    keywords = ('def ', 'cdef ', 'cpdef ')
    return (line.startswith(keywords) or
            any(' ' + keyword in line for keyword in keywords))
class LengthCounter(object):
    """
    Measure the length, in lines, of every function found in a source file.

    ``lines`` is the file content as a list of right-stripped lines.
    Function headers are detected with ``_is_def_line``.  A body runs until
    the first line that is non-blank, does not start with ``#`` and is
    indented no deeper than the header.  Nested functions are counted on
    their own and also contribute to the length of the enclosing function.
    Blank lines inside a body, comments and docstrings are all counted;
    only trailing blank lines are dropped.

    TODO: should add option for subtracting nested function lengths??
    """

    def __init__(self, lines):
        self.lines = lines
        self.pos = 0        # index of the next line to examine
        self.counts = []    # one length per function, in completion order
        self.n = len(lines)

    def get_counts(self):
        """Scan every line and return the list of function lengths.

        The scan state is reset first, so the method can be called again.
        Lengths appear in the order the functions *end*, which puts a
        nested function before the function that encloses it.
        """
        self.pos = 0
        self.counts = []
        while self.pos < self.n:
            line = self.lines[self.pos]
            self.pos += 1
            if _is_def_line(line):
                level = _get_indent_level(line)
                self._count_function(indent_level=level)
        return self.counts

    def _count_function(self, indent_level=1):
        """Consume the body that starts at ``self.pos`` and record its length.

        ``indent_level`` is the minimum number of leading spaces a line needs
        in order to belong to the body.  On return ``self.pos`` points at the
        line that ended the body (or at end of file); that line has not been
        consumed, so the caller examines it again.
        """
        indent = ' ' * indent_level

        def _end_of_function(line):
            # Blank lines and lines starting with '#' never end a body.
            return (line != '' and
                    not line.startswith(indent) and
                    not line.startswith('#'))

        start_pos = self.pos
        while self.pos < self.n:
            line = self.lines[self.pos]
            if _end_of_function(line):
                self._push_count(start_pos)
                return
            self.pos += 1
            if _is_def_line(line):
                # Measure the nested function against its OWN indentation.
                # Using just ``indent_level + 1`` would let it run on until
                # the enclosing function ends.  ``max`` keeps the old lower
                # bound for headers that are not indented into this body.
                nested_level = max(indent_level + 1,
                                   _get_indent_level(line))
                self._count_function(indent_level=nested_level)
        # end of file
        self._push_count(start_pos)

    def _push_count(self, start_pos):
        """Record the length of ``lines[start_pos:self.pos]``.

        Trailing blank lines are excluded from the length.
        """
        end_pos = self.pos
        # remove blank lines at end
        while end_pos > start_pos and self.lines[end_pos - 1] == '':
            end_pos -= 1
        self.counts.append(end_pos - start_pos)
def _get_indent_level(line):
    """Return the number of leading spaces in *line*, plus one.

    The result is the smallest run of spaces that *line* does NOT start
    with, i.e. the minimum indentation of anything nested inside *line*.
    Only space characters are counted; a leading tab counts as no
    indentation.  An empty line yields 1.
    """
    # Length difference after stripping leading spaces == number of them.
    return len(line) - len(line.lstrip(' ')) + 1
def _is_triplequote(line):
    """Return True if *line* begins with a triple-quote delimiter.

    Both the double-quoted and the single-quoted forms are recognised.
    The test is on the very start of the string, so callers should strip
    leading whitespace first.
    """
    return line.startswith(('"""', "'''"))
def _get_file_function_lengths(path):
    """Return the list of function lengths found in the file at *path*.

    The file is read as text, each line is right-stripped, and the lines
    are handed to ``LengthCounter``.
    """
    # Use a context manager so the handle is closed immediately; this
    # function is called once per file while walking a whole source tree.
    with open(path) as handle:
        lines = [x.rstrip() for x in handle]
    counter = LengthCounter(lines)
    return counter.get_counts()
def test_get_function_lengths():
    """Check LengthCounter on a small fixture containing a nested function.

    Not invoked automatically; call it by hand or collect it with a test
    runner.

    NOTE(review): the fixture's indentation and blank lines were
    reconstructed so that they agree with the expected lengths -- confirm
    against the original fixture.
    """
    text = """
class Foo:

    def foo():
        def bar():
            a = 1

            b = 2

            c = 3

        foo = 'bar'

    def x():
        a = 1

        b = 3

        c = 7

        pass
"""
    # Lengths are reported in the order the functions end: the nested
    # ``bar`` first, then the enclosing ``foo``, then ``x``.  Blank lines
    # inside a body are counted; trailing blank lines are not.
    expected = [5, 8, 7]
    lines = [x.rstrip() for x in text.splitlines()]
    counter = LengthCounter(lines)
    result = counter.get_counts()
    assert result == expected
def doit():
    """Tabulate the total line count of every .py/.pyx file in ``walked``.

    Each directory and each counted file path is printed as it is visited.
    The module-level ``dirs``, ``names`` and ``lengths`` lists are appended
    to, one entry per counted file.

    Returns a DataFrame with the columns ``dirs``, ``names`` and
    ``lengths`` built from those lists.
    """
    for directory, _, files in walked:
        # Single-argument print() behaves the same on Python 2 and 3.
        print(directory)
        for path in files:
            if not _should_count_file(path):
                continue
            full_path = os.path.join(directory, path)
            print(full_path)
            # Close each handle promptly instead of leaving one open per
            # visited file until garbage collection.
            with open(full_path) as handle:
                n_lines = sum(1 for _line in handle)
            dirs.append(directory)
            names.append(path)
            lengths.append(n_lines)
    result = DataFrame({'dirs': dirs, 'names': names,
                        'lengths': lengths})
    return result
def doit2():
    """Collect per-file function lengths for every source file in ``walked``.

    Files are skipped unless ``_should_count_file`` accepts them, and files
    whose name starts with ``test_`` are skipped as well.  Each visited
    directory is printed.

    Returns a dict mapping the full file path to the list of function
    lengths reported by ``_get_file_function_lengths``.
    """
    counts = {}
    for directory, _, files in walked:
        # Single-argument print() behaves the same on Python 2 and 3.
        print(directory)
        for path in files:
            if not _should_count_file(path) or path.startswith('test_'):
                continue
            full_path = os.path.join(directory, path)
            counts[full_path] = _get_file_function_lengths(full_path)
    return counts
counts = doit2()
# counts = _get_file_function_lengths('pandas/tests/test_series.py')

# Flatten the per-file lists into a single array of function lengths.
all_counts = []
for v in counts.values():
    all_counts.extend(v)
all_counts = np.array(all_counts)

fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(111)
ax.hist(all_counts, bins=100)

n = len(all_counts)
nmore = (all_counts > 50).sum()
# Percentage of functions LONGER than 50 lines, matching the label text.
# Guard the division so an empty result does not raise.
pct_more = 100.0 * nmore / n if n else 0.0

ax.set_title('pandas function lengths, n=%d' % n)
ax.set_ylabel('N functions')
ax.set_xlabel('Function length')
ax.text(100, 300, '%.3f%% with > 50 lines' % pct_more,
        fontsize=18)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.