Last active: August 17, 2018, 16:36
-
-
Save IARI/91011233658d386f1f1aefd2450537f2 to your computer and use it in GitHub Desktop.
Custom Tokenizer for https://github.com/h0tk3y/better-parse
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package de.imc.placeholder.app | |
import com.github.h0tk3y.betterParse.lexer.* | |
import com.github.h0tk3y.betterParse.utils.cached | |
import java.io.InputStream | |
import java.util.* | |
import java.util.logging.Logger | |
import java.util.regex.MatchResult | |
import kotlin.coroutines.experimental.buildSequence | |
/**
 * Tokenizes input character sequences using the [tokens], prioritized by their order in the list,
 * first matched first.
 *
 * Custom [Tokenizer] for the better-parse library: all token patterns are merged into a single
 * alternation regex which is scanned over the input with [java.util.Scanner], tracking absolute
 * offset, row, and column for every produced [TokenMatch].
 */
class MyTokenizer(override val tokens: List<Token>, vararg options: RegexOption) : Tokenizer {
    init {
        require(tokens.isNotEmpty()) { "The tokens list should not be empty" }
    }

    val logger = Logger.getLogger(javaClass.simpleName)

    // Each token paired with its compiled java.util.regex.Pattern; a token's precompiled
    // regex takes precedence, otherwise its pattern string is compiled with [options].
    val patterns = tokens.map { it to (it.regex?.toPattern() ?: it.pattern.toRegex(setOf(*options)).toPattern()) }

    // All token patterns combined into one alternation, each wrapped in its own capturing
    // group. \G anchors every match to the end of the previous one, so no input is skipped.
    private val allInOnePattern = patterns
            .joinToString("|", prefix = "\\G(?:", postfix = ")") { "(${it.second.pattern()})" }.toPattern()

    // For each token, the index of its outer capturing group within [allInOnePattern];
    // nested groups inside earlier patterns shift the group indices of later ones.
    private val patternGroupIndices =
            buildSequence {
                var groupId = 1 // the zero group is the whole match
                for (p in patterns) {
                    yield(groupId) // the group for the current pattern
                    groupId += p.second.matcher("").groupCount() + 1 // skip all the nested groups in p
                }
            }.toList()

    /** Tokenizes the [input] from a [String] into a [TokenizerMatchesSequence]. */
    override fun tokenize(input: String) = input.scanTokenize(::Scanner)

    /** Tokenizes the [input] from an [InputStream] into a [TokenizerMatchesSequence]. */
    override fun tokenize(input: InputStream) = input.scanTokenize(::Scanner)

    /** Tokenizes the [input] from a [Readable] into a [TokenizerMatchesSequence]. */
    override fun tokenize(input: Readable) = input.scanTokenize(::Scanner)

    // Builds a Scanner from any supported source, logs the scanner's delimiter (debug aid),
    // then delegates to the [Scanner] overload of [tokenize].
    fun <T> T.scanTokenize(const: (T) -> Scanner) = const(this).apply {
        //useDelimiter("""(?=\n)""")
        logger.info("Delimiter:")
        logger.info(delimiter().pattern())
    }.run { tokenize(this) }

    /** Tokenizes the [input] from a [Scanner] into a [TokenizerMatchesSequence]. */
    override fun tokenize(input: Scanner): Sequence<TokenMatch> = buildSequence<TokenMatch> {
        var pos = 0 // absolute character offset from the start of the input
        var row = 1 // 1-based line number of the next match
        var col = 1 // 1-based column number of the next match
        while (input.hasNext()) {
            val matchResult: MatchResult
            val matchedToken: Token =
                    if (input.findWithinHorizon(allInOnePattern, 0) != null) {
                        matchResult = input.match()
                        // The first non-null outer capturing group identifies which token matched.
                        tokens[patternGroupIndices.indexOfFirst { matchResult.group(it) != null }]
                    } else {
                        // No token matched at the current position: log diagnostics, probe whether
                        // any character remains at all, emit a noneMatched token, and stop.
                        logger.warning(allInOnePattern.pattern())
                        logger.warning("could not match")
                        logger.warning(input.toString())
                        val test = input.findWithinHorizon(".".toRegex(setOf(RegexOption.MULTILINE, RegexOption.DOT_MATCHES_ALL)).toPattern()
                                , 0)
                        if (test == null) {
                            logger.warning("DID NOT FIND More;")
                        } else {
                            logger.warning("DID FIND;")
                            logger.warning("'$test'")
                        }
                        yield(TokenMatch(noneMatched, input.next(), pos, row, col))
                        break
                    }
            logger.info("matches: '${matchResult.group()}'")
            val match = matchResult.group()
            val result = TokenMatch(matchedToken, match, pos, row, col)
            pos += match.length
            col += match.length
            // If the match spans newlines, advance the row count and reset the column to the
            // number of characters past the last '\n' (so col stays 1-based on the new line).
            val addRows = match.count { it == '\n' }
            row += addRows
            if (addRows > 0) {
                col = match.length - match.lastIndexOf('\n')
            }
            yield(result)
        }
    }.cached() // cached so the resulting sequence can be iterated more than once
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package de.imc.placeholder.app.parsing | |
import com.github.h0tk3y.betterParse.combinators.map | |
import com.github.h0tk3y.betterParse.combinators.zeroOrMore | |
import com.github.h0tk3y.betterParse.grammar.Grammar | |
import com.github.h0tk3y.betterParse.parser.Parser | |
import de.imc.placeholder.app.MyTokenizer | |
/**
 * Minimal test grammar: swallows the entire input as greedy tokens and renders
 * the parse result as the matches joined with '#'.
 */
object testgrammar : Grammar<String>() {
    // A single greedy token matching any run of characters, newlines included.
    val singleToken by token(""".+""".toRegex(RegexOption.DOT_MATCHES_ALL))

    // Parse zero or more tokens and join their string forms with '#'.
    override val rootParser: Parser<String> by zeroOrMore(singleToken) map { matches ->
        matches.joinToString(separator = "#")
    }

    // Created lazily so the [tokens] list is fully populated by the `by token`
    // delegates before the tokenizer is constructed.
    override val tokenizer by lazy {
        MyTokenizer(tokens, RegexOption.MULTILINE, RegexOption.DOT_MATCHES_ALL)
    }
}
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.