kitsu/parse_lisp.py

## parse_lisp.py
"""Process a lisp file to extract useful data."""
import sys, os
from collections import namedtuple

substitutions = (
        # Expand quote literal to quote fn
        ("'(", "(quote "),
        # Expand parens with white-space for splitting
        (')', ' ) '),
        ('(', ' ( '),
        )

Stats = namedtuple('Stats', 'funcs, deps, strs, files')

def modify(line):
    """Perform replacements listed in global dictionary."""
    for k, v in substitutions:
        if k in line:
            line = line.replace(k, v)
    return line

def scrub_comment(line):
    """Carefully clean comments from line."""
    if not line or line[0] == ';':
        # Just skip blank lines and comments
        return
    semi = line.find(';')
    if semi > 0:
        if '"' not in line or line.rfind('"') < semi:
            line = line[:semi]
        else:
            # Find left most semi not in a string literal...
            parts = line.split(';')
            first, rest = parts[0], parts[1:]
            accepted = [first]
            quotes = 0
            #import pdb; pdb.set_trace()
            while rest:
                quotes += first.count('"')
                # If we have collected an even number of quotes on left
                if quotes%2 == 0:
                    break
                first, rest = rest[0], rest[1:]
                accepted.append(first)
            line = ';'.join(accepted)
    return line

def tokenize(infile):
    """Cheaty str.split tokenization."""
    tokens = list()
    # Clean and append all lines into one big string
    for line in infile:
        line = scrub_comment(line.strip())
        if not line:
            continue
        # Tokenize the line
        tokens.extend([token for token in
                       # AutoLisp is case-insensitive
                       modify(line).lower().split()
                       if token])
    return tokens

# Model states as classes that know what to consume
Number = namedtuple('Number', 'value')
String = namedtuple('String', 'value')
Symbol = namedtuple('Symbol', 'value')

def consume_other(tokens):
    """Handle Symbol, String, and Number tokens."""
    first = tokens[0]
    try:
        num = float(first)
        return Number(num), tokens[1:]
    except ValueError:
        pass
    if '"' == first[0]:
        return consume_string(tokens)
    return Symbol(first), tokens[1:]

def consume_string(tokens):
    """Consume tokens between double-quotes."""
    first, rest = tokens[0], tokens[1:]
    if first.count('"')%2 == 0 and '\\"' not in first:
        return String(first.strip('"')), tokens[1:]
    parts = [first]
    while rest:
        first, rest = rest[0], rest[1:]
        parts.append(first)
        quotes = first.count('"') - first.count('\\"') + first.count('\\\\"')
        if quotes%2 == 1:
            break
    else:
        print tokens
        raise SyntaxError("Reached end of input without matching quote.")
    whole = String(" ".join(parts).strip('"'))
    return whole, rest

def consume_body(tokens):
    """Consume all expressions until unpaired closing bracket ')' found."""
    children = list()
    while tokens and not tokens[0] == ')':
        # These are in order of specificity
        if Defun.matches(tokens):
            fn = Defun()
            children.append(fn)
            tokens = fn.consume(tokens)
        elif Apply.matches(tokens):
            ap = Apply()
            children.append(ap)
            tokens = ap.consume(tokens)
        elif List.matches(tokens):
            li = List()
            children.append(li)
            tokens = li.consume(tokens)
        else:
            # First token must be one of Symbol, String, Number
            other, tokens = consume_other(tokens)
            children.append(other)
    # Omit last head if it exists (i.e. ')')
    return children, tokens[1:]

class Program(object):
    """Base matcher."""
    def __init__(self, tokens):
        self.children = list()
        self.consume(tokens)

    def consume(self, tokens):
        """Consume tokens to build children"""
        #import pdb; pdb.set_trace()
        body, tokens = consume_body(tokens)
        self.children = body
        if tokens:
            raise SyntaxError("Unmatched ')' in input.")

    def __repr__(self):
        return "Program({})".format(self.children)

class Defun(object):
    """Match function definition."""
    no_params_err = "Defun '{}': expected parameter list, not {}"
    nested_params_err = "Defun '{}': parameters must be simple list, found \n{}"
    def __init__(self):
        self.name = None
        self.params = None
        self.locals = None
        self.body = None
        self.deps = None

    def __repr__(self):
        return "Defun({0.name}, {0.params}, {0.locals}):\n{0.body}".format(self)

    @classmethod
    def matches(cls, tokens):
        """Match '(defun name (/) body)."""
        if tokens[0] == '(' and tokens[1] == 'defun':
            return True
        return False

    def consume(self, tokens):
        """Pull out key elements of fn, then process body."""
        self.name = tokens[2]
        if not tokens[3] == '(':
            raise SyntaxError(self.no_params_err.format(self.name, tokens[3]))
        params_close = 3 + tokens[3:].index(')')
        next_list = 3 + tokens[4:].index('(')
        if params_close > next_list:
            raise SyntaxError(self.nested_params_err.format(self.name,
                                                            tokens[3:params_close + 1]))
        self.parse_params(tokens[4:params_close])
        self.body, tokens = consume_body(tokens[params_close + 1:])
        self.discover_deps()
        return tokens

    def parse_params(self, params):
        """Separate params and locals."""
        split = params.index('/') if '/' in params else None
        if split:
            self.params = params[:split]
            self.locals = params[split + 1:]
        else:
            self.params = params
            self.locals = list()

    def discover_deps(self):
        """Walk through body looking for functions not in locals."""
        fn_locals = self.locals
        deps = set()
        for sub_dep in (item for item in self.body if isinstance(item, Defun)):
            deps.update(sub_dep.deps)
        contents = [item for item in self.body if isinstance(item, Apply)]
        while contents:
            ap = contents.pop()
            if ap.func not in fn_locals:
                deps.add(ap.func)
                contents.extend(item for item in ap.body
                                if isinstance(item, Apply))
        self.deps = deps

class Apply(object):
    """Match function application."""
    def __init__(self):
        func = None
        body = None

    def __repr__(self):
        return "Apply({0.func}):\n{0.body}".format(self)

    @classmethod
    def matches(cls, tokens):
        """Match '(symbol body)."""
        return tokens[0] == '(' and not tokens[1] == '('

    def consume(self, tokens):
        self.func = tokens[1]
        body, tokens = consume_body(tokens[2:])
        self.body = body
        return tokens

class List(object):
    """Match list literal (in cond)."""
    def __init__(self):
        body = None

    def __repr__(self):
        return "List():\n{0.body}".format(self)

    @classmethod
    def matches(cls, tokens):
        """Match '(body)."""
        return tokens[0] == '('

    def consume(self, tokens):
        body, tokens = consume_body(tokens[1:])
        self.body = body
        return tokens

def parse(tokens):
    """Do basic tagging and build syntax tree from tokens."""
    return Program(tokens)

def analyze(tree):
    """Collect info about program tree."""
    extensions = set(('.dcl', '.lsp', '.mnr', '.dvb', '.mnl', '.vlx'))
    stats = Stats(set(), set(), set(), set())
    exprs = tree.children[:]
    while exprs:
        expr = exprs.pop()
        body = None
        if isinstance(expr, Defun):
            stats.funcs.add(expr.name)
            stats.deps.update(expr.deps)
        elif isinstance(expr, String):
            val = expr.value
            stats.strs.add(val)
            if val[-4:] in extensions:
                stats.files.add(val)
        if hasattr(expr, 'body'):
            exprs.extend(item for item in expr.body)
    return stats

def process_file(filename):
    """Walk through file maintaining stack of state."""
    with open(filename, 'r') as infile:
        tokens = tokenize(infile)
    tree = parse(tokens)
    return analyze(tree)

def output_stats(stats, target=None, prefix=''):
    """Pretty-print stats, optionally to a filelike."""
    if not target:
        target = sys.stdout
    target.write("{}Functions:\n".format(prefix))
    if stats.funcs:
        for name in sorted(stats.funcs):
            target.write("{}\t{}\n".format(prefix, name))
    else:
        target.write("{}None\n".format(prefix))
    target.write("{}Dependencies:\n".format(prefix))
    if stats.deps:
        for name in sorted(stats.deps):
            target.write("{}\t{}\n".format(prefix, name))
    else:
        target.write("{}None\n".format(prefix))
    target.write("{}Strings:\n".format(prefix))
    if stats.strs:
        for name in sorted(stats.strs, key=len):
            target.write("{}\t{}\n".format(prefix, name))
    else:
        target.write("{}None\n".format(prefix))
    target.write("{}Possible file dependencies:\n".format(prefix))
    if stats.files:
        for name in stats.files:
            target.write("{}\t{}\n".format(prefix, name))
    else:
        target.write("{}None\n".format(prefix))

if __name__ == "__main__":
    filename = sys.argv[-1]
    stats = process_file(filename)
    output_stats(stats)
	"""Process a lisp file to extract useful data."""
	import sys, os
	from collections import namedtuple

	substitutions = (
	# Expand quote literal to quote fn
	("'(", "(quote "),
	# Expand parens with white-space for splitting
	(')', ' ) '),
	('(', ' ( '),
	)

	Stats = namedtuple('Stats', 'funcs, deps, strs, files')

	def modify(line):
	"""Perform replacements listed in global dictionary."""
	for k, v in substitutions:
	if k in line:
	line = line.replace(k, v)
	return line

	def scrub_comment(line):
	"""Carefully clean comments from line."""
	if not line or line[0] == ';':
	# Just skip blank lines and comments
	return
	semi = line.find(';')
	if semi > 0:
	if '"' not in line or line.rfind('"') < semi:
	line = line[:semi]
	else:
	# Find left most semi not in a string literal...
	parts = line.split(';')
	first, rest = parts[0], parts[1:]
	accepted = [first]
	quotes = 0
	#import pdb; pdb.set_trace()
	while rest:
	quotes += first.count('"')
	# If we have collected an even number of quotes on left
	if quotes%2 == 0:
	break
	first, rest = rest[0], rest[1:]
	accepted.append(first)
	line = ';'.join(accepted)
	return line

	def tokenize(infile):
	"""Cheaty str.split tokenization."""
	tokens = list()
	# Clean and append all lines into one big string
	for line in infile:
	line = scrub_comment(line.strip())
	if not line:
	continue
	# Tokenize the line
	tokens.extend([token for token in
	# AutoLisp is case-insensitive
	modify(line).lower().split()
	if token])
	return tokens

	# Model states as classes that know what to consume
	Number = namedtuple('Number', 'value')
	String = namedtuple('String', 'value')
	Symbol = namedtuple('Symbol', 'value')

	def consume_other(tokens):
	"""Handle Symbol, String, and Number tokens."""
	first = tokens[0]
	try:
	num = float(first)
	return Number(num), tokens[1:]
	except ValueError:
	pass
	if '"' == first[0]:
	return consume_string(tokens)
	return Symbol(first), tokens[1:]

	def consume_string(tokens):
	"""Consume tokens between double-quotes."""
	first, rest = tokens[0], tokens[1:]
	if first.count('"')%2 == 0 and '\\"' not in first:
	return String(first.strip('"')), tokens[1:]
	parts = [first]
	while rest:
	first, rest = rest[0], rest[1:]
	parts.append(first)
	quotes = first.count('"') - first.count('\\"') + first.count('\\\\"')
	if quotes%2 == 1:
	break
	else:
	print tokens
	raise SyntaxError("Reached end of input without matching quote.")
	whole = String(" ".join(parts).strip('"'))
	return whole, rest

	def consume_body(tokens):
	"""Consume all expressions until unpaired closing bracket ')' found."""
	children = list()
	while tokens and not tokens[0] == ')':
	# These are in order of specificity
	if Defun.matches(tokens):
	fn = Defun()
	children.append(fn)
	tokens = fn.consume(tokens)
	elif Apply.matches(tokens):
	ap = Apply()
	children.append(ap)
	tokens = ap.consume(tokens)
	elif List.matches(tokens):
	li = List()
	children.append(li)
	tokens = li.consume(tokens)
	else:
	# First token must be one of Symbol, String, Number
	other, tokens = consume_other(tokens)
	children.append(other)
	# Omit last head if it exists (i.e. ')')
	return children, tokens[1:]

	class Program(object):
	"""Base matcher."""
	def __init__(self, tokens):
	self.children = list()
	self.consume(tokens)

	def consume(self, tokens):
	"""Consume tokens to build children"""
	#import pdb; pdb.set_trace()
	body, tokens = consume_body(tokens)
	self.children = body
	if tokens:
	raise SyntaxError("Unmatched ')' in input.")

	def __repr__(self):
	return "Program({})".format(self.children)

	class Defun(object):
	"""Match function definition."""
	no_params_err = "Defun '{}': expected parameter list, not {}"
	nested_params_err = "Defun '{}': parameters must be simple list, found \n{}"
	def __init__(self):
	self.name = None
	self.params = None
	self.locals = None
	self.body = None
	self.deps = None

	def __repr__(self):
	return "Defun({0.name}, {0.params}, {0.locals}):\n{0.body}".format(self)

	@classmethod
	def matches(cls, tokens):
	"""Match '(defun name (/) body)."""
	if tokens[0] == '(' and tokens[1] == 'defun':
	return True
	return False

	def consume(self, tokens):
	"""Pull out key elements of fn, then process body."""
	self.name = tokens[2]
	if not tokens[3] == '(':
	raise SyntaxError(self.no_params_err.format(self.name, tokens[3]))
	params_close = 3 + tokens[3:].index(')')
	next_list = 3 + tokens[4:].index('(')
	if params_close > next_list:
	raise SyntaxError(self.nested_params_err.format(self.name,
	tokens[3:params_close + 1]))
	self.parse_params(tokens[4:params_close])
	self.body, tokens = consume_body(tokens[params_close + 1:])
	self.discover_deps()
	return tokens

	def parse_params(self, params):
	"""Separate params and locals."""
	split = params.index('/') if '/' in params else None
	if split:
	self.params = params[:split]
	self.locals = params[split + 1:]
	else:
	self.params = params
	self.locals = list()

	def discover_deps(self):
	"""Walk through body looking for functions not in locals."""
	fn_locals = self.locals
	deps = set()
	for sub_dep in (item for item in self.body if isinstance(item, Defun)):
	deps.update(sub_dep.deps)
	contents = [item for item in self.body if isinstance(item, Apply)]
	while contents:
	ap = contents.pop()
	if ap.func not in fn_locals:
	deps.add(ap.func)
	contents.extend(item for item in ap.body
	if isinstance(item, Apply))
	self.deps = deps

	class Apply(object):
	"""Match function application."""
	def __init__(self):
	func = None
	body = None

	def __repr__(self):
	return "Apply({0.func}):\n{0.body}".format(self)

	@classmethod
	def matches(cls, tokens):
	"""Match '(symbol body)."""
	return tokens[0] == '(' and not tokens[1] == '('

	def consume(self, tokens):
	self.func = tokens[1]
	body, tokens = consume_body(tokens[2:])
	self.body = body
	return tokens

	class List(object):
	"""Match list literal (in cond)."""
	def __init__(self):
	body = None

	def __repr__(self):
	return "List():\n{0.body}".format(self)

	@classmethod
	def matches(cls, tokens):
	"""Match '(body)."""
	return tokens[0] == '('

	def consume(self, tokens):
	body, tokens = consume_body(tokens[1:])
	self.body = body
	return tokens

	def parse(tokens):
	"""Do basic tagging and build syntax tree from tokens."""
	return Program(tokens)

	def analyze(tree):
	"""Collect info about program tree."""
	extensions = set(('.dcl', '.lsp', '.mnr', '.dvb', '.mnl', '.vlx'))
	stats = Stats(set(), set(), set(), set())
	exprs = tree.children[:]
	while exprs:
	expr = exprs.pop()
	body = None
	if isinstance(expr, Defun):
	stats.funcs.add(expr.name)
	stats.deps.update(expr.deps)
	elif isinstance(expr, String):
	val = expr.value
	stats.strs.add(val)
	if val[-4:] in extensions:
	stats.files.add(val)
	if hasattr(expr, 'body'):
	exprs.extend(item for item in expr.body)
	return stats

	def process_file(filename):
	"""Walk through file maintaining stack of state."""
	with open(filename, 'r') as infile:
	tokens = tokenize(infile)
	tree = parse(tokens)
	return analyze(tree)

	def output_stats(stats, target=None, prefix=''):
	"""Pretty-print stats, optionally to a filelike."""
	if not target:
	target = sys.stdout
	target.write("{}Functions:\n".format(prefix))
	if stats.funcs:
	for name in sorted(stats.funcs):
	target.write("{}\t{}\n".format(prefix, name))
	else:
	target.write("{}None\n".format(prefix))
	target.write("{}Dependencies:\n".format(prefix))
	if stats.deps:
	for name in sorted(stats.deps):
	target.write("{}\t{}\n".format(prefix, name))
	else:
	target.write("{}None\n".format(prefix))
	target.write("{}Strings:\n".format(prefix))
	if stats.strs:
	for name in sorted(stats.strs, key=len):
	target.write("{}\t{}\n".format(prefix, name))
	else:
	target.write("{}None\n".format(prefix))
	target.write("{}Possible file dependencies:\n".format(prefix))
	if stats.files:
	for name in stats.files:
	target.write("{}\t{}\n".format(prefix, name))
	else:
	target.write("{}None\n".format(prefix))

	if __name__ == "__main__":
	filename = sys.argv[-1]
	stats = process_file(filename)
	output_stats(stats)