Skip to content

Instantly share code, notes, and snippets.

@atg
Created March 14, 2016 02:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save atg/7f00ea563b56789724ce to your computer and use it in GitHub Desktop.
Save atg/7f00ea563b56789724ce to your computer and use it in GitHub Desktop.
_ = require 'lodash'
{Tokenizer} = require './tokenize'
last = _.last
# There's two varieties of data structures: 'vals' and 'val'
# 'vals' is a branch node that contains many direct children
# 'val' is a leaf node that has no direct children
makeVals = (kind, vals) -> { kind: kind, vals: vals }
makeLeftsRights = (kind, lefts, rights) -> { kind: kind, lefts: lefts, rights: rights }
makeVal = (kind, val) -> { kind: kind, val: val }
isVal = (obj, kind, val) -> obj.kind == kind and _.isEqual(obj.val, val)
isOfKind = (obj, kind) -> obj.kind == kind
# ------------------------
# --- Utility functions --
makeRoot = (vals=[]) -> { kind: '_root', vals: vals }
isPunct = (tok) -> tok.kind == 'punct'
isIdent = (tok) -> tok.kind == 'ident'
isString = (tok) -> tok.kind == 'string'
isNumber = (tok) -> tok.kind == 'number'
isEOL = (tok) -> tok.kind == 'eol'
replaceArrayInto = (xs, ys) ->
if xs == ys
return
n = ys.length
xs.length = n
i = 0
while i < n
xs[i] = ys[i]
i += 1
printBare = process.stdout.write
printJSON = (j) ->
console.log(JSON.stringify(j, null, 2))
arrayKeys = ['lefts', 'rights', 'vals', 'table']
printTree = (j, indent=0, shouldIndentFirstLine=true) ->
sp = _.repeat(' ', indent * 2)
sp0 = if shouldIndentFirstLine then sp else ''
isObject = _.isObject(j)
if isObject
if 'vals' of j
process.stdout.write(sp0+"#{j.kind}")
printTree(j.vals, indent, false)
return
if 'table' of j
process.stdout.write(sp0+"#{j.kind}")
printTree(j.table, indent, false)
return
if 'lefts' of j and 'rights' of j
console.log(sp0+"#{j.kind}(")
process.stdout.write(sp+" lefts: ")
printTree(j.lefts, indent + 1, false)
process.stdout.write(sp+" rights: ")
printTree(j.rights, indent + 1, false)
console.log(sp+")")
return
if 'left' of j and 'right' of j
console.log(sp0+"#{j.kind}(")
printTree(j.left, indent + 1)
printTree(j.right, indent + 1)
console.log(sp+")")
return
if _.isArray(j)
if j.length
console.log(sp0+"[")
for val in j
printTree(val, indent + 1)
console.log(sp+"]")
else
console.log(sp0+"[]")
return
# console.log(JSON.stringify(j))
console.log(sp+"#{j.kind}(#{JSON.stringify(j.val)})")
# -------------------------
# --- Parsing functions ---
Parsing = {}
# Splits a token stream into a tree of brackets
Parsing._brackets = (toks) ->
stack = [makeRoot()]
output = []
bracketKindsMap = {
'(': 'group', ')': 'group'
'[': 'square', ']': 'square'
'{': 'block', '}': 'block'
}
for tok in toks
if isPunct(tok) and tok.val in ['(', '[', '{']
bracketKind = bracketKindsMap[tok.val]
branch = { 'kind': bracketKind, 'vals': [] }
last(stack).vals.push(branch)
stack.push(branch)
else if isPunct(tok) and tok.val in [')', ']', '}'] and last(stack).kind == bracketKindsMap[tok.val]
stack.pop()
else
last(stack).vals.push(tok)
return stack[0].vals
# Splits a token stream by a given token
Parsing._splitBy = (toks, outerKind, innerKind, splitter) ->
group = makeVals(innerKind, [])
groups = []
hasFound = false
for x in toks
if _.isEqual(x, splitter)
hasFound = true
groups.push(group) if group.vals.length > 0
group = makeVals(innerKind, [])
else
group.vals.push(x)
groups.push(group) if group.vals.length > 0
if hasFound
if outerKind?
return [makeVals(outerKind, groups)]
else
return groups
else
return toks
# Splits a token stream by a given token
Parsing._splitByIntoTable = (toks, kind, splitter) ->
group = []
groups = []
hasFound = false
for x in toks
if _.isEqual(x, splitter)
hasFound = true
groups.push(group) if group.length > 0
group = []
else
group.push(x)
groups.push(group) if group.length > 0
if hasFound
return [{ kind: kind, table: groups }]
else
return toks
# Splits a token stream into lines
Parsing._splitLines = (toks) ->
group = makeVals('line', [])
groups = []
foundNewline = false
for x in toks
if x.kind == 'newline'
foundNewline = true
hasFound = true
groups.push(group) if group.vals.length > 0
group = makeVals('line', [])
else
group.vals.push(x)
groups.push(group) if group.vals.length > 0
if foundNewline
return groups
# If there are any non-lines then wrap in a line
for tok in toks
if _.isObject(tok) and tok.kind != 'line'
return [makeVals('line', toks)]
# Otherwise return the lines
return toks
# Partitions an array of tokens by one token into left and right sides
lpartition = (toks, kind, splitter) ->
for tok, i in toks
if _.isEqual(tok, splitter)
lefts = _.slice(toks, 0, i)
rights = _.slice(toks, i + 1)
return makeLeftsRights(kind, lefts, rights)
return null
Parsing._lpartitionBy = (toks, kind, splitter) ->
for tok, i in toks
if _.isEqual(tok, splitter)
lefts = _.slice(toks, 0, i)
rights = _.slice(toks, i + 1)
return [ makeLeftsRights(kind, lefts, rights) ]
return toks
Parsing._rpartitionBy = (toks, kind, splitter) ->
for tok, i in toks by -1
if _.isEqual(tok, splitter)
lefts = _.slice(toks, 0, i)
rights = _.slice(toks, i + 1)
return [ makeLeftsRights(kind, lefts, rights) ]
return toks
# -------------------------
# --- Walking functions ---
## TODO: I think walk should not only walk through vals, but also 'left' and 'right'. Then we don't have to have trees that are full of ...
## TODO: also I think there should be some support for a .table property that is a 2D array
# { kind, val: Any } -- leaf node
# { kind, vals: [Any] } -- branch node
# { kind, left: Any, right: Any } -- either node
# { kind: lefts: [Any], rights: [Any] }
# { kind, table: [[Any]] } -- either node
# TBH if we're going to do this then _walk should be able to handle arrays
# Walk through the nodes of a tree.
# before(x, isLeaf) walks outside-inwards
# after(x, isLeaf walks inside-outwards
# ... think the order of <begin> tags in XML, vs the order of </end> tags
# this function is a total mess!
# need to simply and generalise! perhaps keep a stack of parents
# and turn before/after into functions that RETURN the new array instead of trying to set it
runNotifs = (obj, stack, callback) ->
if not callback?
return
for ak in arrayKeys
vals = obj[ak]
if vals?
result = callback(stack, vals)
if result?
obj[ak] = result
Parsing._walk = (obj, stack, before, after, leaf=null) ->
if _.isArray(obj)
arr = obj
if before
result = before(stack, arr)
if result?
replaceArrayInto(arr, before(stack, arr))
for subobj in arr
Parsing._walk(subobj, stack, before, after, leaf)
if after
result = after(stack, arr)
if result?
replaceArrayInto(arr, result)
return
# Is this a leaf node (it contains no arrayKeys)
isLeaf = true
for ak in arrayKeys
if obj[ak]?
isLeaf = false
if isLeaf
if before or after
stack.push(obj)
leaf(stack, obj) if leaf
stack.pop(obj)
return
stack.push(obj)
runNotifs(obj, stack, before)
for ak in arrayKeys
vals = obj[ak]
continue if not vals?
for x in vals
Parsing._walk(x, stack, before, after, leaf)
runNotifs(obj, stack, after)
stack.pop()
Parsing._mapTreeInPlace = (tree, mapping) ->
Parsing._walk tree, [], null, (stack, vals) ->
return mapping(vals, last(stack))
###
Parsing._walkOld = (tree, before, after) ->
walkInner = (x) ->
# if x is an array, handle it specifically
if _.isArray(x)
arr = x
before(null, arr, 'table', true) if before
for cell in arr
Parsing._walk(cell, before, after)
after(null, arr, 'table', true) if after
# if x has a table property, it will contain an array of arrays
else if x.table?
if before
for row in x.table
before(x, row, 'table', true)
for row in x.table
for cell in row
Parsing._walk(cell, before, after)
if after
for row in x.table
after(x, row, 'table', true)
# if x is a branch node, then it may contain .vals, .lefts, .rights, or a mixture of the three
else if x.vals? or x.lefts? or x.rights?
if before
before(x, x.vals, 'vals') if x.vals?
before(x, x.lefts, 'lefts') if x.lefts?
before(x, x.rights, 'rights') if x.rights?
Parsing._walk(x, before, after)
if after
after(x, x.vals, 'vals') if x.vals?
after(x, x.lefts, 'lefts') if x.lefts?
after(x, x.rights, 'rights') if x.rights?
# otherwise x is a leaf node
else
before(x, null, null) if before
after(x, null, null) if after
before(tree, tree.vals, 'vals') if before and tree.vals?
before(tree, tree.lefts, 'lefts') if before and tree.lefts?
before(tree, tree.rights, 'rights') if before and tree.rights?
if tree.vals?
for x in tree.vals
walkInner(x)
if tree.lefts?
for x in tree.lefts
walkInner(x)
if tree.rights?
for x in tree.rights
walkInner(x)
after(tree, tree.vals, 'vals') if after and tree.vals?
after(tree, tree.lefts, 'lefts') if after and tree.lefts?
after(tree, tree.rights, 'rights') if after and tree.rights?
###
# --------------------
# --- Tokenization ---
Parsing.tokenize = (txt) ->
return new Tokenizer(txt).tokenize()
# --------------------
# --- Test it out! ---
# txt = 'foo = bar = baz = 1, 2; 3, 4'
txt = '''foo {
(
abc; abc
)
}
'''
# txt = #'a * b + c * d + e'
txt = '''
for x, y in y {
print(foo)
}
'''
#outer( inner(a, b) (a, b) ) (a, b)
toks = Parsing.tokenize(txt)
toks = Parsing._brackets(toks)
root = makeVals('block', toks)
# (lowest precedence)
# - newline stage: parse out newlines from inside {blocks}
Parsing._mapTreeInPlace root, (toks, element) ->
if element.kind == 'block'
# If we are in a {...} block, then each val must be a line
Parsing._splitLines(toks)
else
# Remove extraneous newlines from the output
_.filter(toks, (x) -> (x.kind != 'newline'))
# - predictive stage: prefixes on the starts of lines
predictiveKeywords = ["let", "var", "for", "while"]
predictiveParslets = {}
predictiveParslets.for = (toks) ->
# e.g. for _ in _ {block}
# splits on the in
[firsts..., end] = toks
firsts = toks # todo: we shove the block at the end of 'rights' for now
console.log(firsts, end)
leftsrights = lpartition(firsts, 'for', makeVal('ident', 'in'))
# leftsrights.block = end --- todo: implement the .block property
Parsing._mapTreeInPlace root, (toks, element) ->
if element.kind == 'line' and toks.length
tok = toks[0]
for kw in predictiveKeywords
if isVal(tok, 'ident', kw)
rest = _.tail(toks)
obj = makeVals(kw, rest)
if kw of predictiveParslets
obj = predictiveParslets[kw](rest)
return [obj]
return toks
# - semicolons and commas
Parsing._mapTreeInPlace root, (toks) -> Parsing._splitByIntoTable(toks, 'bindings', makeVal('punct', '='))
Parsing._mapTreeInPlace root, (toks) -> Parsing._splitByIntoTable(toks, 'semicolons', makeVal('punct', ';'))
Parsing._mapTreeInPlace root, (toks) -> Parsing._splitByIntoTable(toks, 'commas', makeVal('punct', ','))
# while true
# before = _.cloneDeep(root)
#
# # - colons make left associative 'pairs'
# Parsing._mapTreeInPlace root, (toks) -> Parsing._lpartitionBy(toks, 'pair', makeVal('punct', ':'))
#
# # now would be good to do operator parsing
# # and actually the below does not work.
# # Parsing._mapTreeInPlace root, (toks) -> Parsing._lpartitionBy(toks, 'add', makeVal('punct', '+'))
# # Parsing._mapTreeInPlace root, (toks) -> Parsing._lpartitionBy(toks, 'sub', makeVal('punct', '-'))
#
# # Parsing._mapTreeInPlace root, (toks) -> Parsing._lpartitionBy(toks, 'mul', makeVal('punct', '*'))
# # Parsing._mapTreeInPlace root, (toks) -> Parsing._lpartitionBy(toks, 'div', makeVal('punct', '/'))
#
# # TODO: need to recursively apply stuff like pairs. best to use a while loop
# break if _.isEqual(root, before)
Specific = {}
Specific.call = (toks, parent) ->
output = []
for tok in toks
if tok.kind == 'group' and output.length
# whatever is immediately before gets slurped up into a call
prev = output.pop()
output.push {
kind: 'call'
# vals: [prev, makeVal('spacer', ' '), tok]
left: prev
right: tok
}
else
output.push(tok)
return output
Parsing._mapTreeInPlace root, (toks, parent) -> Specific.call(toks, parent)
# Parsing._mapTreeInPlace root, (toks, parent) -> Specific.call(toks, parent)
# f = (stack, vals) ->
# # console.log("@INKI", JSON.stringify(stack, null, 2))
# return vals
# g = (stack, vals) ->
# # console.log("@OUTKI", JSON.stringify(stack, null, 2))
# return vals
# Parsing._walk root, [], f, g, null
###
# call (left high) -- <any> <group>
# subscript (left high) -- <any> <square>
# for (left predictive) -- line^ for <^in> in ... <block> $line
# while (left predictive) -- line^ while ... <block> $line
# return
# ~ Literals ~
# array: square of exprs
# object: block
###
printTree(root)
printJSON(root)
splitByAs = (root, outerKind, innerKind, splitter) ->
Parsing._mapTreeInPlace root, (toks) ->
Parsing._splitBy(toks, outerKind, innerKind, splitter)
return
txt = '''def foo(x, y) {
1, 2, 3
}
'''
# printJSON(root)
## todo: partitioning and infix operators, and context sensitive parsing
# e.g. A -> B (right assoc)
# need infix and suffix operator parsing
# need to parse stuff like `* group` (function call) and `* square` (subscripting)
# splitByAs(root, 'bindings', 'binding', makeVal('punct', '='))
# splitByAs(root, 'semicolons', 'semicolon', makeVal('punct', ';'))
# splitByAs(root, 'commas', 'comma', makeVal('punct', ','))
# (high precedence)
# splitByAs(root, 'lines', makeVal('newline', '\n'))
# High precedence ^^^
# printJSON(root)
# XS = [1, 2, 3]
# YS = [10, 20, 30, 40, 50]
#
# replaceArrayInto(XS, YS)
# console.log XS
@reistemhoff
Copy link

masterful, but i invade invitation. well done

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment