Scripts for counting how often different operations are used in Python code, used in the discussions around PEP 465.
#!/usr/bin/env python3
# https://gist.github.com/njsmith/9157645
# usage:
#   python3 grep-dot-dot.py path [path path ...]

import ast
import sys
import os
import os.path

DOT_NAMES = ["dot",
             # scikit-learn:
             "fast_dot",
             "safe_sparse_dot",
             ]

# for "dot" this returns "dot", and for "np.dot" this also returns "dot"
def leaf_name(node):
    if isinstance(node, ast.Attribute):
        return node.attr
    elif isinstance(node, ast.Name):
        return node.id
    else:
        return None

def is_dot(node):
    return isinstance(node, ast.Call) and leaf_name(node.func) in DOT_NAMES

# returns the two operands of a dot call, regardless of spelling
def dot_args(node):
    if len(node.args) == 1:
        # method call -- the first operand is the object the method was
        # called on, i.e. node.func.value
        assert isinstance(node.func, ast.Attribute)
        args = [node.func.value] + node.args
    elif len(node.args) == 2:
        # function call -- both operands are explicit arguments
        args = node.args
    else:
        print("weird call: %s" % (ast.dump(node),))
        args = None
    return args

# counts dot calls, and how often one dot call appears as the left or
# right operand of another (i.e., the associativity of nested dots)
class DotDotCounter(ast.NodeVisitor):
    def __init__(self):
        self.left = 0
        self.right = 0
        self.total = 0
        self.method = 0

    def visit_Call(self, node):
        if is_dot(node):
            self.total += 1
            if len(node.args) == 1:
                self.method += 1
            args = dot_args(node)
            if args:
                if is_dot(args[0]):
                    self.left += 1
                if is_dot(args[1]):
                    self.right += 1
        self.generic_visit(node)

def dot_pattern(node):
    if not is_dot(node):
        if isinstance(node, ast.BinOp):
            #return "(_ %s _)" % (node.op.__class__.__name__,)
            return "_"
        else:
            return "_"
    node._PROCESSED_ = True
    args = dot_args(node)
    if not args:
        return None
    sub_patterns = [dot_pattern(arg) for arg in args]
    return "(%s @ %s)" % tuple(sub_patterns)

# collects the shape of each maximal nest of dot calls, e.g.
# dot(dot(a, b), c) becomes "((_ @ _) @ _)"
class DotDotPatterns(ast.NodeVisitor):
    def __init__(self):
        self.patterns = {}

    def visit_Call(self, node):
        if is_dot(node) and not hasattr(node, "_PROCESSED_"):
            pattern = dot_pattern(node)
            if pattern:
                self.patterns.setdefault(pattern, 0)
                self.patterns[pattern] += 1
        self.generic_visit(node)

roots = sys.argv[1:]
counter = DotDotCounter()
patterns = DotDotPatterns()
for root in roots:
    for dirpath, _, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith(".py"):
                path = os.path.join(dirpath, filename)
                try:
                    root_node = ast.parse(open(path).read())
                except SyntaxError:
                    print("Error parsing file: %s" % (path,))
                else:
                    counter.visit(root_node)
                    patterns.visit(root_node)

print("Patterns:")
for pattern, count in sorted(patterns.patterns.items(),
                             key=lambda item: item[1],
                             reverse=True):
    print("  %8i %s" % (count, pattern))
print()
print("Total calls to functions/methods named dot:", counter.total)
print("  (of which this many were methods: %s )" % (counter.method,))
print("Left-associative nestings: ", counter.left)
print("Right-associative nestings:", counter.right)
#!/usr/bin/env python3
# https://gist.github.com/njsmith/9157645
# usage:
#   python3 grep-ops.py token path

import sys
import os
import os.path
import tokenize

target, root = sys.argv[1:]
for dirpath, _, filenames in os.walk(root):
    for filename in filenames:
        if filename.endswith(".py"):
            path = os.path.join(dirpath, filename)
            try:
                for token in tokenize.tokenize(open(path, "rb").readline):
                    if token.string == target:
                        sys.stdout.write("%s:%s:%s"
                                         % (path, token.start[0], token.line))
            except Exception as e:
                sys.stderr.write("Failed to read %s: %s\n" % (path, e))
#!/usr/bin/env python3
# http://legacy.python.org/dev/peps/pep-0465/
# https://gist.github.com/njsmith/9157645
# usage:
#   python3 scan-ops.py stdlib_path sklearn_path nipy_path

import sys
import os
import os.path
import tokenize
from collections import OrderedDict

NON_SOURCE_TOKENS = [
    tokenize.COMMENT, tokenize.NL, tokenize.ENCODING, tokenize.NEWLINE,
    tokenize.INDENT, tokenize.DEDENT,
]

SKIP_OPS = list("(),.:[]{}@;") + ["->", "..."]

class TokenCounts(object):
    def __init__(self, dot_names=[]):
        self.counts = {}
        self.sloc = 0
        self.dot_names = dot_names

    def count(self, path):
        sloc_idxes = set()
        for token in tokenize.tokenize(open(path, "rb").readline):
            if token.type == tokenize.OP:
                self.counts.setdefault(token.string, 0)
                self.counts[token.string] += 1
            if token.string in self.dot_names:
                self.counts.setdefault("dot", 0)
                self.counts["dot"] += 1
            if token.type not in NON_SOURCE_TOKENS:
                sloc_idxes.add(token.start[0])
        self.sloc += len(sloc_idxes)

    @classmethod
    def combine(cls, objs):
        combined = cls()
        for obj in objs:
            for op, count in obj.counts.items():
                combined.counts.setdefault(op, 0)
                combined.counts[op] += count
            combined.sloc += obj.sloc
        return combined

def count_tree(root, **kwargs):
    c = TokenCounts(**kwargs)
    for dirpath, _, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith(".py"):
                path = os.path.join(dirpath, filename)
                try:
                    c.count(path)
                    sys.stderr.write(".")
                    sys.stderr.flush()
                except Exception as e:
                    sys.stderr.write("\nFailed to read %s: %s\n" % (path, e))
    return c

# count_objs is an OrderedDict (name -> TokenCounts)
def summarize(count_objs, out):
    ops = {}
    for count_obj in count_objs.values():
        for op in count_obj.counts:
            ops[op] = []
    for count_obj in count_objs.values():
        for op, row in ops.items():
            count = count_obj.counts.get(op, 0)
            row.append(count / count_obj.sloc)
    titles = ["Op"] + list(count_objs)
    # 4 chars is enough for ops and all numbers.
    column_widths = [max(len(title), 4) for title in titles]
    rows = []
    for op, row in ops.items():
        #rows.append(["``" + op + "``"] + row)
        rows.append([op] + row)
    rows.sort(key=lambda row: row[-1])
    rows.reverse()
    def write_row(entries):
        out.write(" ".join(entries))
        out.write("\n")
    def lines():
        write_row("=" * w for w in column_widths)
    lines()
    write_row(t.rjust(w) for w, t in zip(column_widths, titles))
    lines()
    for row in rows:
        op = row[0]
        if op in SKIP_OPS:
            continue
        # numbers here are the average number of uses per sloc, which is
        # inconveniently small; convert to uses per 1e4 sloc
        numbers = row[1:]
        number_strs = [str(int(round(x * 10000))) for x in numbers]
        formatted_row = [op] + number_strs
        write_row(str(e).rjust(w)
                  for w, e in zip(column_widths, formatted_row))
    lines()

def run_projects(names, dot_names, dirs, out):
    assert len(names) == len(dot_names) == len(dirs)
    count_objs = OrderedDict()
    for name, dot_name, dir in zip(names, dot_names, dirs):
        counts = count_tree(dir, dot_names=dot_name)
        count_objs[name] = counts
        out.write("%s: %s sloc\n" % (name, counts.sloc))
    count_objs["combined"] = TokenCounts.combine(count_objs.values())
    summarize(count_objs, out)

if __name__ == "__main__":
    run_projects(["stdlib", "scikit-learn", "nipy"],
                 [[],
                  # https://github.com/numpy/numpy/pull/4351#discussion_r9977913
                  # scikit-learn's fast_dot exists to patch over some
                  # optimizations missing from older numpys, but on modern
                  # numpy it is exactly the same as dot, so it's fair to
                  # count. safe_sparse_dot has workarounds for some quirks
                  # in scipy.sparse matrices, but those quirks have also
                  # been fixed, so counting these calls is fair as well.
                  ["dot", "fast_dot", "safe_sparse_dot"],
                  ["dot"]],
                 sys.argv[1:],
                 sys.stdout)
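As a small smoke test for TokenCounts (my own sketch, not part of the gist), counting a throwaway file shows which tokens land in counts, and that comment and blank lines do not contribute to the sloc tally:

import os
import tempfile

# two "=" tokens and one "@" token; the comment line and the blank line
# carry only COMMENT/NL tokens, so they are excluded from sloc_idxes
src = b"# a comment\n\nx = 1\ny = x @ x\n"
with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as f:
    f.write(src)
tc = TokenCounts()
tc.count(f.name)
os.unlink(f.name)
print(tc.counts["="], tc.counts["@"])  # -> 2 1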