Last active
December 1, 2016 08:14
-
-
Save njsmith/9157645 to your computer and use it in GitHub Desktop.
Scripts for counting how often different operations are used in Python code; used in the discussions around PEP 465.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# https://gist.github.com/njsmith/9157645 | |
# usage: | |
# python3 grep-dot-dot.py path [path path ...] | |
# Identifiers that we treat as a dot-product call.  "fast_dot" and
# "safe_sparse_dot" are scikit-learn's wrappers around numpy.dot.
DOT_NAMES = [
    "dot",
    # scikit-learn:
    "fast_dot",
    "safe_sparse_dot",
]
import ast | |
import sys | |
import os | |
import os.path | |
def leaf_name(node):
    """Return the trailing identifier of a name or attribute node.

    For ``dot`` this returns "dot", and for ``np.dot`` this also returns
    "dot".  Any other node kind yields None.
    """
    if isinstance(node, ast.Name):
        return node.id
    if isinstance(node, ast.Attribute):
        return node.attr
    return None
def is_dot(node):
    """True when *node* is a call whose callee's trailing name is a dot name."""
    if not isinstance(node, ast.Call):
        return False
    return leaf_name(node.func) in DOT_NAMES
def dot_args(node):
    """Return the two operand expressions of a dot() call.

    For a method call like ``a.dot(b)`` the left operand is the attribute's
    value (``a``); for a function call like ``dot(a, b)`` the operands are
    the two positional arguments.  Returns None (after printing a
    diagnostic) for calls without exactly two operands.
    """
    if len(node.args) == 1:
        # method call -- first arg is .func.value
        assert isinstance(node.func, ast.Attribute)
        args = [node.func.value] + node.args
    elif len(node.args) == 2:
        # function call
        args = node.args
    else:
        # print() form (not the Python-2-only statement) keeps this
        # runnable under Python 3, as the usage comment at the top of the
        # file requires; it is also valid in Python 2.
        print("weird call: %s" % (ast.dump(node),))
        args = None
    return args
class DotDotCounter(ast.NodeVisitor):
    """Tallies dot() calls and how often they nest on the left/right."""

    def __init__(self):
        self.left = 0    # dot(dot(...), _) nestings
        self.right = 0   # dot(_, dot(...)) nestings
        self.total = 0   # every dot call seen
        self.method = 0  # of those, how many were a.dot(b) method calls

    def visit_Call(self, node):
        if is_dot(node):
            self.total += 1
            if len(node.args) == 1:
                self.method += 1
            operands = dot_args(node)
            if operands:
                if is_dot(operands[0]):
                    self.left += 1
                if is_dot(operands[1]):
                    self.right += 1
        self.generic_visit(node)
def dot_pattern(node):
    """Render the nesting structure of a dot() call as a string.

    Non-dot operands collapse to "_", so e.g. dot(dot(a, b), c) renders
    as "((_ @ _) @ _)".  Every dot node visited is tagged _PROCESSED_ so
    that DotDotPatterns records each outermost pattern only once.
    Returns None for malformed calls.
    """
    if not is_dot(node):
        # Any non-dot subexpression (including arbitrary BinOps) becomes
        # a placeholder.
        return "_"
    node._PROCESSED_ = True
    operands = dot_args(node)
    if not operands:
        return None
    left, right = (dot_pattern(operand) for operand in operands)
    return "(%s @ %s)" % (left, right)
class DotDotPatterns(ast.NodeVisitor):
    """Collects frequency counts of dot() nesting patterns."""

    def __init__(self):
        self.patterns = {}

    def visit_Call(self, node):
        # Inner dot calls get tagged _PROCESSED_ by dot_pattern, so only
        # outermost dot calls start a fresh pattern here.
        if is_dot(node) and not hasattr(node, "_PROCESSED_"):
            pattern = dot_pattern(node)
            if pattern:
                self.patterns[pattern] = self.patterns.get(pattern, 0) + 1
        self.generic_visit(node)
# Walk each root directory given on the command line, parse every .py
# file, and accumulate dot-call statistics plus nesting patterns.
roots = sys.argv[1:]
counter = DotDotCounter()
patterns = DotDotPatterns()
for root in roots:
    for dirpath, _, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith(".py"):
                path = os.path.join(dirpath, filename)
                try:
                    # with-block closes the handle instead of leaking it.
                    with open(path) as f:
                        root_node = ast.parse(f.read())
                except SyntaxError:
                    print("Error parsing file: %s" % (path,))
                else:
                    counter.visit(root_node)
                    patterns.visit(root_node)
print("Patterns:")
# .items() (not the Python-2-only .iteritems()) keeps this runnable
# under Python 3, matching the usage note at the top of the file.
for pattern, count in sorted(patterns.patterns.items(),
                             key=lambda item: item[1],
                             reverse=True):
    print("  %8i %s" % (count, pattern))
print("Total calls to function/methods named dot: %s" % (counter.total,))
print("  (of which this many were methods: %s )" % (counter.method,))
print("Left-associative nestings:  %s" % (counter.left,))
print("Right-associative nestings: %s" % (counter.right,))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# https://gist.github.com/njsmith/9157645 | |
# usage: | |
# python3 grep-ops.py token path | |
import sys | |
import os | |
import os.path | |
import tokenize | |
from collections import OrderedDict | |
# Search every .py file under *root* for an exact token match and print
# grep-style "path:line-number:source-line" hits.
target, root = sys.argv[1:]
for dirpath, _, filenames in os.walk(root):
    for filename in filenames:
        if filename.endswith(".py"):
            path = os.path.join(dirpath, filename)
            try:
                # tokenize.tokenize() wants a readline over a binary
                # file; the with-block closes the handle instead of
                # leaking one per file.
                with open(path, "rb") as f:
                    for token in tokenize.tokenize(f.readline):
                        if token.string == target:
                            # token.line already ends with a newline.
                            sys.stdout.write("%s:%s:%s"
                                             % (path, token.start[0],
                                                token.line))
            except Exception as e:
                sys.stderr.write("Failed to read %s: %s\n" % (path, e))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# http://legacy.python.org/dev/peps/pep-0465/ | |
# https://gist.github.com/njsmith/9157645 | |
# usage: | |
# python3 scan-ops.py stdlib_path sklearn_path nipy_path | |
import sys | |
import os | |
import os.path | |
import tokenize | |
from collections import OrderedDict | |
# Token types that do not count toward source-lines-of-code.
NON_SOURCE_TOKENS = [
    tokenize.COMMENT,
    tokenize.NL,
    tokenize.ENCODING,
    tokenize.NEWLINE,
    tokenize.INDENT,
    tokenize.DEDENT,
]
# Punctuation to omit from the operator-frequency table.
SKIP_OPS = list("(),.:[]{}@;") + ["->", "..."]
class TokenCounts(object):
    """Accumulates operator-token frequencies and SLOC across files."""

    def __init__(self, dot_names=None):
        # A mutable default argument ([]) would be shared across every
        # instance; use None as the sentinel instead.
        self.counts = {}
        self.sloc = 0
        self.dot_names = dot_names if dot_names is not None else []

    def count(self, path):
        """Tokenize *path*, updating operator counts and the SLOC total."""
        sloc_idxes = set()
        # with-block closes the handle instead of leaking one per file.
        with open(path, "rb") as f:
            for token in tokenize.tokenize(f.readline):
                if token.type == tokenize.OP:
                    self.counts.setdefault(token.string, 0)
                    self.counts[token.string] += 1
                # Fold project-specific dot-call names into a "dot" pseudo-op.
                if token.string in self.dot_names:
                    self.counts.setdefault("dot", 0)
                    self.counts["dot"] += 1
                if token.type not in NON_SOURCE_TOKENS:
                    sloc_idxes.add(token.start[0])
        self.sloc += len(sloc_idxes)

    @classmethod
    def combine(cls, objs):
        """Merge several TokenCounts into one new instance."""
        combined = cls()
        for obj in objs:
            for op, count in obj.counts.items():
                combined.counts.setdefault(op, 0)
                combined.counts[op] += count
            combined.sloc += obj.sloc
        return combined
def count_tree(root, **kwargs):
    """Count tokens in every .py file beneath *root*.

    Keyword arguments are forwarded to TokenCounts.  A progress dot per
    file goes to stderr; unreadable files are reported and skipped.
    """
    counts = TokenCounts(**kwargs)
    for dirpath, _, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith(".py"):
                continue
            path = os.path.join(dirpath, filename)
            try:
                counts.count(path)
                sys.stderr.write(".")
                sys.stderr.flush()
            except Exception as e:
                sys.stderr.write("\nFailed to read %s: %s\n" % (path, e))
    return counts
# count_objs is OrderedDict (name -> TokenCounts)
def summarize(count_objs, out):
    """Write a reST-style table of operator frequencies to *out*.

    One column per entry in *count_objs*; each cell is that operator's
    uses per 10,000 source lines, with rows sorted descending by the
    last column.  Ops listed in SKIP_OPS are omitted from the table.
    """
    # Collect the union of all operators seen across every project.
    ops = {}
    for count_obj in count_objs.values():
        for op in count_obj.counts:
            ops[op] = []
    # Append one normalized (uses-per-SLOC) entry per project, in the
    # same order as the eventual columns.
    for count_obj in count_objs.values():
        for op, row in ops.items():
            count = count_obj.counts.get(op, 0)
            row.append(count / count_obj.sloc)
    titles = ["Op"] + list(count_objs)
    # 4 chars is enough for ops and all numbers.
    column_widths = [max(len(title), 4) for title in titles]
    rows = []
    for op, row in ops.items():
        #rows.append(["``" + op + "``"] + row)
        rows.append([op] + row)
    # Sort by the final column (presumably the "combined" project --
    # see run_projects), busiest operator first.
    rows.sort(key=lambda row: row[-1])
    rows.reverse()
    def write_row(entries):
        # Emit one table row, columns separated by a space.
        out.write(" ".join(entries))
        out.write("\n")
    def lines():
        # reST table rule: a run of '=' per column.
        write_row("=" * w for w in column_widths)
    lines()
    write_row(t.rjust(w) for w, t in zip(column_widths, titles))
    lines()
    for row in rows:
        op = row[0]
        if op in SKIP_OPS:
            continue
        # numbers here are avg number of uses per sloc, which is
        # inconveniently small. convert to uses/1e4 sloc
        numbers = row[1:]
        number_strs = [str(int(round(x * 10000))) for x in numbers]
        formatted_row = [op] + number_strs
        write_row(str(e).rjust(w)
                  for w, e in zip(column_widths, formatted_row))
    lines()
def run_projects(names, dot_names, dirs, out):
    """Count tokens per project tree and write a summary table to *out*.

    The three list arguments run in parallel: project label, the
    dot-call identifiers to fold into the "dot" pseudo-op, and the
    source root.  A final "combined" column merges every project.
    """
    assert len(names) == len(dot_names) == len(dirs)
    count_objs = OrderedDict()
    for name, project_dots, project_dir in zip(names, dot_names, dirs):
        project_counts = count_tree(project_dir, dot_names=project_dots)
        count_objs[name] = project_counts
        out.write("%s: %s sloc\n" % (name, project_counts.sloc))
    count_objs["combined"] = TokenCounts.combine(count_objs.values())
    summarize(count_objs, out)
if __name__ == "__main__":
    # Project labels and dot-name lists are fixed here; the three source
    # roots come from the command line in the same order as the labels.
    run_projects(["stdlib", "scikit-learn", "nipy"],
                 [[],
                  # https://github.com/numpy/numpy/pull/4351#discussion_r9977913
                  # sklearn fast_dot is used to fix up some optimizations that
                  # are missing from older numpy's, but in modern days is
                  # exactly the same, so it's fair to count. safe_sparse_dot
                  # has hacks to workaround some quirks in scipy.sparse
                  # matrices, but these quirks are also already fixed, so
                  # counting these calls is also fair.
                  ["dot", "fast_dot", "safe_sparse_dot"],
                  ["dot"]],
                 sys.argv[1:],
                 sys.stdout)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment