Last active
May 30, 2016 22:16
-
-
Save kitsu/1184b3caf4c209d67ab3f481de104dc3 to your computer and use it in GitHub Desktop.
Flaky Autolisp parser with stats collection/reporting. See blog series starting at http://kitsu.github.io/2016/05/27/autolisp_parse_01/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Process a lisp file to extract useful data.""" | |
import sys, os | |
from collections import namedtuple | |
substitutions = ( | |
# Expand quote literal to quote fn | |
("'(", "(quote "), | |
# Expand parens with white-space for splitting | |
(')', ' ) '), | |
('(', ' ( '), | |
) | |
Stats = namedtuple('Stats', 'funcs, deps, strs, files') | |
def modify(line): | |
"""Perform replacements listed in global dictionary.""" | |
for k, v in substitutions: | |
if k in line: | |
line = line.replace(k, v) | |
return line | |
def scrub_comment(line): | |
"""Carefully clean comments from line.""" | |
if not line or line[0] == ';': | |
# Just skip blank lines and comments | |
return | |
semi = line.find(';') | |
if semi > 0: | |
if '"' not in line or line.rfind('"') < semi: | |
line = line[:semi] | |
else: | |
# Find left most semi not in a string literal... | |
parts = line.split(';') | |
first, rest = parts[0], parts[1:] | |
accepted = [first] | |
quotes = 0 | |
#import pdb; pdb.set_trace() | |
while rest: | |
quotes += first.count('"') | |
# If we have collected an even number of quotes on left | |
if quotes%2 == 0: | |
break | |
first, rest = rest[0], rest[1:] | |
accepted.append(first) | |
line = ';'.join(accepted) | |
return line | |
def tokenize(infile): | |
"""Cheaty str.split tokenization.""" | |
tokens = list() | |
# Clean and append all lines into one big string | |
for line in infile: | |
line = scrub_comment(line.strip()) | |
if not line: | |
continue | |
# Tokenize the line | |
tokens.extend([token for token in | |
# AutoLisp is case-insensitive | |
modify(line).lower().split() | |
if token]) | |
return tokens | |
# Model states as classes that know what to consume | |
Number = namedtuple('Number', 'value') | |
String = namedtuple('String', 'value') | |
Symbol = namedtuple('Symbol', 'value') | |
def consume_other(tokens): | |
"""Handle Symbol, String, and Number tokens.""" | |
first = tokens[0] | |
try: | |
num = float(first) | |
return Number(num), tokens[1:] | |
except ValueError: | |
pass | |
if '"' == first[0]: | |
return consume_string(tokens) | |
return Symbol(first), tokens[1:] | |
def consume_string(tokens): | |
"""Consume tokens between double-quotes.""" | |
first, rest = tokens[0], tokens[1:] | |
if first.count('"')%2 == 0 and '\\"' not in first: | |
return String(first.strip('"')), tokens[1:] | |
parts = [first] | |
while rest: | |
first, rest = rest[0], rest[1:] | |
parts.append(first) | |
quotes = first.count('"') - first.count('\\"') + first.count('\\\\"') | |
if quotes%2 == 1: | |
break | |
else: | |
print tokens | |
raise SyntaxError("Reached end of input without matching quote.") | |
whole = String(" ".join(parts).strip('"')) | |
return whole, rest | |
def consume_body(tokens): | |
"""Consume all expressions until unpaired closing bracket ')' found.""" | |
children = list() | |
while tokens and not tokens[0] == ')': | |
# These are in order of specificity | |
if Defun.matches(tokens): | |
fn = Defun() | |
children.append(fn) | |
tokens = fn.consume(tokens) | |
elif Apply.matches(tokens): | |
ap = Apply() | |
children.append(ap) | |
tokens = ap.consume(tokens) | |
elif List.matches(tokens): | |
li = List() | |
children.append(li) | |
tokens = li.consume(tokens) | |
else: | |
# First token must be one of Symbol, String, Number | |
other, tokens = consume_other(tokens) | |
children.append(other) | |
# Omit last head if it exists (i.e. ')') | |
return children, tokens[1:] | |
class Program(object): | |
"""Base matcher.""" | |
def __init__(self, tokens): | |
self.children = list() | |
self.consume(tokens) | |
def consume(self, tokens): | |
"""Consume tokens to build children""" | |
#import pdb; pdb.set_trace() | |
body, tokens = consume_body(tokens) | |
self.children = body | |
if tokens: | |
raise SyntaxError("Unmatched ')' in input.") | |
def __repr__(self): | |
return "Program({})".format(self.children) | |
class Defun(object): | |
"""Match function definition.""" | |
no_params_err = "Defun '{}': expected parameter list, not {}" | |
nested_params_err = "Defun '{}': parameters must be simple list, found \n{}" | |
def __init__(self): | |
self.name = None | |
self.params = None | |
self.locals = None | |
self.body = None | |
self.deps = None | |
def __repr__(self): | |
return "Defun({0.name}, {0.params}, {0.locals}):\n{0.body}".format(self) | |
@classmethod | |
def matches(cls, tokens): | |
"""Match '(defun name (/) body).""" | |
if tokens[0] == '(' and tokens[1] == 'defun': | |
return True | |
return False | |
def consume(self, tokens): | |
"""Pull out key elements of fn, then process body.""" | |
self.name = tokens[2] | |
if not tokens[3] == '(': | |
raise SyntaxError(self.no_params_err.format(self.name, tokens[3])) | |
params_close = 3 + tokens[3:].index(')') | |
next_list = 3 + tokens[4:].index('(') | |
if params_close > next_list: | |
raise SyntaxError(self.nested_params_err.format(self.name, | |
tokens[3:params_close + 1])) | |
self.parse_params(tokens[4:params_close]) | |
self.body, tokens = consume_body(tokens[params_close + 1:]) | |
self.discover_deps() | |
return tokens | |
def parse_params(self, params): | |
"""Separate params and locals.""" | |
split = params.index('/') if '/' in params else None | |
if split: | |
self.params = params[:split] | |
self.locals = params[split + 1:] | |
else: | |
self.params = params | |
self.locals = list() | |
def discover_deps(self): | |
"""Walk through body looking for functions not in locals.""" | |
fn_locals = self.locals | |
deps = set() | |
for sub_dep in (item for item in self.body if isinstance(item, Defun)): | |
deps.update(sub_dep.deps) | |
contents = [item for item in self.body if isinstance(item, Apply)] | |
while contents: | |
ap = contents.pop() | |
if ap.func not in fn_locals: | |
deps.add(ap.func) | |
contents.extend(item for item in ap.body | |
if isinstance(item, Apply)) | |
self.deps = deps | |
class Apply(object): | |
"""Match function application.""" | |
def __init__(self): | |
func = None | |
body = None | |
def __repr__(self): | |
return "Apply({0.func}):\n{0.body}".format(self) | |
@classmethod | |
def matches(cls, tokens): | |
"""Match '(symbol body).""" | |
return tokens[0] == '(' and not tokens[1] == '(' | |
def consume(self, tokens): | |
self.func = tokens[1] | |
body, tokens = consume_body(tokens[2:]) | |
self.body = body | |
return tokens | |
class List(object): | |
"""Match list literal (in cond).""" | |
def __init__(self): | |
body = None | |
def __repr__(self): | |
return "List():\n{0.body}".format(self) | |
@classmethod | |
def matches(cls, tokens): | |
"""Match '(body).""" | |
return tokens[0] == '(' | |
def consume(self, tokens): | |
body, tokens = consume_body(tokens[1:]) | |
self.body = body | |
return tokens | |
def parse(tokens): | |
"""Do basic tagging and build syntax tree from tokens.""" | |
return Program(tokens) | |
def analyze(tree): | |
"""Collect info about program tree.""" | |
extensions = set(('.dcl', '.lsp', '.mnr', '.dvb', '.mnl', '.vlx')) | |
stats = Stats(set(), set(), set(), set()) | |
exprs = tree.children[:] | |
while exprs: | |
expr = exprs.pop() | |
body = None | |
if isinstance(expr, Defun): | |
stats.funcs.add(expr.name) | |
stats.deps.update(expr.deps) | |
elif isinstance(expr, String): | |
val = expr.value | |
stats.strs.add(val) | |
if val[-4:] in extensions: | |
stats.files.add(val) | |
if hasattr(expr, 'body'): | |
exprs.extend(item for item in expr.body) | |
return stats | |
def process_file(filename): | |
"""Walk through file maintaining stack of state.""" | |
with open(filename, 'r') as infile: | |
tokens = tokenize(infile) | |
tree = parse(tokens) | |
return analyze(tree) | |
def output_stats(stats, target=None, prefix=''): | |
"""Pretty-print stats, optionally to a filelike.""" | |
if not target: | |
target = sys.stdout | |
target.write("{}Functions:\n".format(prefix)) | |
if stats.funcs: | |
for name in sorted(stats.funcs): | |
target.write("{}\t{}\n".format(prefix, name)) | |
else: | |
target.write("{}None\n".format(prefix)) | |
target.write("{}Dependencies:\n".format(prefix)) | |
if stats.deps: | |
for name in sorted(stats.deps): | |
target.write("{}\t{}\n".format(prefix, name)) | |
else: | |
target.write("{}None\n".format(prefix)) | |
target.write("{}Strings:\n".format(prefix)) | |
if stats.strs: | |
for name in sorted(stats.strs, key=len): | |
target.write("{}\t{}\n".format(prefix, name)) | |
else: | |
target.write("{}None\n".format(prefix)) | |
target.write("{}Possible file dependencies:\n".format(prefix)) | |
if stats.files: | |
for name in stats.files: | |
target.write("{}\t{}\n".format(prefix, name)) | |
else: | |
target.write("{}None\n".format(prefix)) | |
if __name__ == "__main__": | |
filename = sys.argv[-1] | |
stats = process_file(filename) | |
output_stats(stats) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment