mkowoods/json_parser.py

## json_parser.py
# ---------------
# User Instructions
#
# In this problem, you will be using many of the tools and techniques
# that you developed in unit 3 to write a grammar that will allow
# us to write a parser for the JSON language.
#
# You will have to visit json.org to see the JSON grammar. It is not
# presented in the correct format for our grammar function, so you
# will need to translate it.

# ---------------
# Provided functions
#
# These are all functions that were built in unit 3. They will help
# you as you write the grammar.  Add your code at line 102.

from functools import update_wrapper
from string import split
import re

def grammar(description, whitespace=r'\s*'):
    """Convert a description to a grammar.  Each line is a rule for a
    non-terminal symbol; it looks like this:
        Symbol =>  A1 A2 ... | B1 B2 ... | C1 C2 ...
    where the right-hand side is one or more alternatives, separated by
    the '|' sign.  Each alternative is a sequence of atoms, separated by
    spaces.  An atom is either a symbol on some left-hand side, or it is
    a regular expression that will be passed to re.match to match a token.

    Notation for *, +, or ? not allowed in a rule alternative (but ok
    within a token). Use '\' to continue long lines.  You must include spaces
    or tabs around '=>' and '|'. That's within the grammar description itself.
    The grammar that gets defined allows whitespace between tokens by default;
    specify '' as the second argument to grammar() to disallow this (or supply
    any regular expression to describe allowable whitespace between tokens)."""
    G = {' ': whitespace}
    description = description.replace('\t', ' ') # no tabs!
    for line in split(description, '\n'):
        #print 'line', line
        lhs, rhs = split(line, ' => ', 1)
        alternatives = split(rhs, ' | ')
        G[lhs] = tuple(map(split, alternatives))
    return G

def decorator(d):
    "Make function d a decorator: d wraps a function fn."
    def _d(fn):
        return update_wrapper(d(fn), fn)
    update_wrapper(_d, d)
    return _d

@decorator
def memo(f):
    """Decorator that caches the return value for each call to f(args).
    Then when called again with same args, we can just look it up."""
    cache = {}
    def _f(*args):
        try:
            return cache[args]
        except KeyError:
            cache[args] = result = f(*args)
            return result
        except TypeError:
            # some element of args can't be a dict key
            return f(args)
    return _f

@decorator
def trace(f):
    """produces a human readable version of a recursive funciton"""
    indent = '   '
    def _f(*args):
        signature = '%s(%s)' % (f.__name__, ', '.join(map(repr, args)))
        print '%s--> %s' % (trace.level*indent, signature)
        trace.level += 1
        try:
            result = f(*args)
            print '%s<-- %s == %s' % ((trace.level-1)*indent,
                                      signature, result)
        finally:
            trace.level -= 1
        return result
    trace.level = 0
    return _f

def parse(start_symbol, text, grammar):
    """Example call: parse('Exp', '3*x + b', G).
    Returns a (tree, remainder) pair. If remainder is '', it parsed the whole
    string. Failure iff remainder is None. This is a deterministic PEG parser,
    so rule order (left-to-right) matters. Do 'E => T op E | T', putting the
    longest parse first; don't do 'E => T | T op E'
    Also, no left recursion allowed: don't do 'E => E op T'"""

    tokenizer = grammar[' '] + '(%s)'

    @trace
    def parse_sequence(sequence, text):
        #print 'sequence: ', sequence, 'text: ', text
        result = []
        for atom in sequence:
            tree, text = parse_atom(atom, text)
            if text is None: return Fail
            result.append(tree)
        return result, text

    @memo
    def parse_atom(atom, text):
        #print 'atom:', atom, 'text:', text
        if atom in grammar:  # Non-Terminal: tuple of alternatives
            for alternative in grammar[atom]:
                tree, rem = parse_sequence(alternative, text)
                if rem is not None: return [atom]+tree, rem
            return Fail
        else:  # Terminal: match characters against start of text
            m = re.match(tokenizer % atom, text)
            return Fail if (not m) else (m.group(1), text[m.end():])

    # Body of parse:
    return parse_atom(start_symbol, text)

Fail = (None, None)

JSON = grammar(r"""object => [{] members [}] | [{] [}]
members => pair [,] members | pair
pair => string [:] value
array =>  [[] elements []] | [[] []]
elements => value [,] elements | value
value => string | number | object | array | true | false | null
string => \"[\w\s\b]+\"
number => int frac exp | int exp | int frac | int
int => [-]?[0-9]+
frac => [.][0-9]+
exp => [eE][+-]?[0-9]+""", whitespace='\s*')

def json_parse(text):
    return parse('value', text, JSON)

def test():
    assert json_parse('["testing", 1, 2, 3]') == (
                       ['value', ['array', '[', ['elements', ['value',
                       ['string', '"testing"']], ',', ['elements', ['value', ['number',
                       ['int', '1']]], ',', ['elements', ['value', ['number',
                       ['int', '2']]], ',', ['elements', ['value', ['number',
                       ['int', '3']]]]]]], ']']], '')

    assert json_parse('-123.456e+789') == (
                       ['value', ['number', ['int', '-123'], ['frac', '.456'], ['exp', 'e+789']]], '')

    assert json_parse('{"age": 21, "state":"CO","occupation":"rides the rodeo"}') == (
                      ['value', ['object', '{', ['members', ['pair', ['string', '"age"'],
                       ':', ['value', ['number', ['int', '21']]]], ',', ['members',
                      ['pair', ['string', '"state"'], ':', ['value', ['string', '"CO"']]],
                      ',', ['members', ['pair', ['string', '"occupation"'], ':',
                      ['value', ['string', '"rides the rodeo"']]]]]], '}']], '')
    return 'tests pass'

#print json_parse('"rides the rodeo"')
#print json_parse('{"age": 21, "state":"CO","occupation":"rides the rodeo"}')

print test()
	# ---------------
	# User Instructions
	#
	# In this problem, you will be using many of the tools and techniques
	# that you developed in unit 3 to write a grammar that will allow
	# us to write a parser for the JSON language.
	#
	# You will have to visit json.org to see the JSON grammar. It is not
	# presented in the correct format for our grammar function, so you
	# will need to translate it.

	# ---------------
	# Provided functions
	#
	# These are all functions that were built in unit 3. They will help
	# you as you write the grammar. Add your code at line 102.

	from functools import update_wrapper
	from string import split
	import re

	def grammar(description, whitespace=r'\s*'):
	"""Convert a description to a grammar. Each line is a rule for a
	non-terminal symbol; it looks like this:
	Symbol => A1 A2 ... \| B1 B2 ... \| C1 C2 ...
	where the right-hand side is one or more alternatives, separated by
	the '\|' sign. Each alternative is a sequence of atoms, separated by
	spaces. An atom is either a symbol on some left-hand side, or it is
	a regular expression that will be passed to re.match to match a token.

	Notation for *, +, or ? not allowed in a rule alternative (but ok
	within a token). Use '\' to continue long lines. You must include spaces
	or tabs around '=>' and '\|'. That's within the grammar description itself.
	The grammar that gets defined allows whitespace between tokens by default;
	specify '' as the second argument to grammar() to disallow this (or supply
	any regular expression to describe allowable whitespace between tokens)."""
	G = {' ': whitespace}
	description = description.replace('\t', ' ') # no tabs!
	for line in split(description, '\n'):
	#print 'line', line
	lhs, rhs = split(line, ' => ', 1)
	alternatives = split(rhs, ' \| ')
	G[lhs] = tuple(map(split, alternatives))
	return G

	def decorator(d):
	"Make function d a decorator: d wraps a function fn."
	def _d(fn):
	return update_wrapper(d(fn), fn)
	update_wrapper(_d, d)
	return _d

	@decorator
	def memo(f):
	"""Decorator that caches the return value for each call to f(args).
	Then when called again with same args, we can just look it up."""
	cache = {}
	def _f(*args):
	try:
	return cache[args]
	except KeyError:
	cache[args] = result = f(*args)
	return result
	except TypeError:
	# some element of args can't be a dict key
	return f(args)
	return _f

	@decorator
	def trace(f):
	"""produces a human readable version of a recursive funciton"""
	indent = ' '
	def _f(*args):
	signature = '%s(%s)' % (f.__name__, ', '.join(map(repr, args)))
	print '%s--> %s' % (trace.level*indent, signature)
	trace.level += 1
	try:
	result = f(*args)
	print '%s<-- %s == %s' % ((trace.level-1)*indent,
	signature, result)
	finally:
	trace.level -= 1
	return result
	trace.level = 0
	return _f

	def parse(start_symbol, text, grammar):
	"""Example call: parse('Exp', '3*x + b', G).
	Returns a (tree, remainder) pair. If remainder is '', it parsed the whole
	string. Failure iff remainder is None. This is a deterministic PEG parser,
	so rule order (left-to-right) matters. Do 'E => T op E \| T', putting the
	longest parse first; don't do 'E => T \| T op E'
	Also, no left recursion allowed: don't do 'E => E op T'"""

	tokenizer = grammar[' '] + '(%s)'

	@trace
	def parse_sequence(sequence, text):
	#print 'sequence: ', sequence, 'text: ', text
	result = []
	for atom in sequence:
	tree, text = parse_atom(atom, text)
	if text is None: return Fail
	result.append(tree)
	return result, text

	@memo
	def parse_atom(atom, text):
	#print 'atom:', atom, 'text:', text
	if atom in grammar: # Non-Terminal: tuple of alternatives
	for alternative in grammar[atom]:
	tree, rem = parse_sequence(alternative, text)
	if rem is not None: return [atom]+tree, rem
	return Fail
	else: # Terminal: match characters against start of text
	m = re.match(tokenizer % atom, text)
	return Fail if (not m) else (m.group(1), text[m.end():])

	# Body of parse:
	return parse_atom(start_symbol, text)

	Fail = (None, None)

	JSON = grammar(r"""object => [{] members [}] \| [{] [}]
	members => pair [,] members \| pair
	pair => string [:] value
	array => [[] elements []] \| [[] []]
	elements => value [,] elements \| value
	value => string \| number \| object \| array \| true \| false \| null
	string => \"[\w\s\b]+\"
	number => int frac exp \| int exp \| int frac \| int
	int => [-]?[0-9]+
	frac => [.][0-9]+
	exp => [eE][+-]?[0-9]+""", whitespace='\s*')

	def json_parse(text):
	return parse('value', text, JSON)

	def test():
	assert json_parse('["testing", 1, 2, 3]') == (
	['value', ['array', '[', ['elements', ['value',
	['string', '"testing"']], ',', ['elements', ['value', ['number',
	['int', '1']]], ',', ['elements', ['value', ['number',
	['int', '2']]], ',', ['elements', ['value', ['number',
	['int', '3']]]]]]], ']']], '')

	assert json_parse('-123.456e+789') == (
	['value', ['number', ['int', '-123'], ['frac', '.456'], ['exp', 'e+789']]], '')

	assert json_parse('{"age": 21, "state":"CO","occupation":"rides the rodeo"}') == (
	['value', ['object', '{', ['members', ['pair', ['string', '"age"'],
	':', ['value', ['number', ['int', '21']]]], ',', ['members',
	['pair', ['string', '"state"'], ':', ['value', ['string', '"CO"']]],
	',', ['members', ['pair', ['string', '"occupation"'], ':',
	['value', ['string', '"rides the rodeo"']]]]]], '}']], '')
	return 'tests pass'

	#print json_parse('"rides the rodeo"')
	#print json_parse('{"age": 21, "state":"CO","occupation":"rides the rodeo"}')

	print test()