Created
March 4, 2023 12:02
-
-
Save alllex/afcaf4dd1d1c4b1a5f2fa825f471e9d3 to your computer and use it in GitHub Desktop.
Kotlin coroutine-based parser combinators implemented on top of DeepRecursiveFunction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
data class Token(val value: String) | |
data class TokenMatch(val token: Token, val offset: Int) | |
sealed class SyntaxTree | |
data class Node(val children: List<SyntaxTree>) : SyntaxTree() { | |
constructor(vararg children: SyntaxTree) : this(children.toList()) | |
override fun toString(): String = "Node($children)" | |
} | |
data class Lexeme(val match: TokenMatch, val text: String) : SyntaxTree() { | |
override fun toString(): String = "Lexeme('$text')" | |
} | |
sealed class ParseResult<out T> | |
data class ParsedValue<T>(val value: T) : ParseResult<T>() | |
abstract class ParseError : ParseResult<Nothing>() | |
inline infix fun <T, R> ParseResult<T>.map(f: (T) -> R): ParseResult<R> = when (this) { | |
is ParsedValue -> ParsedValue(f(this.value)) | |
is ParseError -> this | |
} | |
data class MismatchedToken(val expected: Token, val found: TokenMatch) : ParseError() | |
data class NoMatchingToken(val fromIndex: Int) : ParseError() | |
class ParserState(private val inputTokens: List<String>, private val tokens: List<Token>) { | |
private var position: Int = 0 | |
private fun findMatch(fromIndex: Int): TokenMatch? { | |
val inputTokenValue = inputTokens[fromIndex] | |
val foundToken = tokens.firstOrNull { it.value == inputTokenValue } | |
return foundToken?.let { TokenMatch(it, fromIndex) } | |
} | |
fun parseToken(token: Token): ParseResult<TokenMatch> { | |
val fromIndex = this.position | |
val match = findMatch(fromIndex) | |
?: return NoMatchingToken(fromIndex) | |
if (match.token != token) return MismatchedToken(token, match) | |
this.position = match.offset + 1 // because lexer has already tokenized input | |
return ParsedValue(match) | |
} | |
fun getText(tokenMatch: TokenMatch): String { | |
return inputTokens[tokenMatch.offset] | |
} | |
} | |
typealias Parser<T> = DeepRecursiveFunction<ParserState, ParseResult<T>> | |
typealias ParserScope<T> = DeepRecursiveScope<ParserState, ParseResult<T>> | |
fun tokenParser(token: Token): Parser<TokenMatch> = Parser { state -> | |
state.parseToken(token) | |
} | |
fun lexemeParser(tokenParser: Parser<TokenMatch>): Parser<SyntaxTree> = Parser { state -> | |
val t = tokenParser.callRecursive(state) | |
t.map { Lexeme(it, state.getText(it)) } | |
} | |
suspend inline fun <T, R> ParserScope<T>.combine(state: ParserState, parser1: Parser<T>, next: (T) -> ParseResult<R>): ParseResult<R> { | |
val v = parser1.callRecursive(state) | |
if (v is ParseError) return v | |
v as ParsedValue | |
return next(v.value) | |
} | |
fun <T> runParser(input: List<String>, tokens: List<Token>, parser: Parser<T>): ParseResult<T> { | |
val state = ParserState(input, tokens) | |
return parser(state) | |
} | |
fun main() { | |
val token1 = Token("a") | |
val token2 = Token("b") | |
val tokens = listOf(token1, token2) | |
val parser = object { | |
val parser1 = lexemeParser(tokenParser(token1)) | |
val parser2 = lexemeParser(tokenParser(token2)) | |
val ab = Parser<SyntaxTree> { state -> | |
combine(state, parser1) { t1 -> | |
combine(state, parser2) { t2 -> | |
ParsedValue(Node(t1, t2)) | |
} | |
} | |
} | |
val root = ab | |
} | |
val goodInput = listOf("a", "b") | |
val goodResult = runParser(goodInput, tokens, parser.root) | |
println(goodResult) | |
// ParsedValue(value=Node([Lexeme('a'), Lexeme('b')])) | |
val badInput = listOf("a", "a") | |
val badResult = runParser(badInput, tokens, parser.root) | |
println(badResult) | |
// MismatchedToken(expected=Token(value=b), found=TokenMatch(token=Token(value=a), offset=1)) | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment