Last active: August 17, 2018, 16:36
-
-
Save IARI/91011233658d386f1f1aefd2450537f2 to your computer and use it in GitHub Desktop.
Custom Tokenizer for https://github.com/h0tk3y/better-parse
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package de.imc.placeholder.app | |
import com.github.h0tk3y.betterParse.lexer.* | |
import com.github.h0tk3y.betterParse.utils.cached | |
import java.io.InputStream | |
import java.util.* | |
import java.util.logging.Logger | |
import java.util.regex.MatchResult | |
import kotlin.coroutines.experimental.buildSequence | |
/**
 * Tokenizes input character sequences using the [tokens], prioritized by their order in the list,
 * first matched first.
 *
 * Custom [Tokenizer] for the better-parse library: all token patterns are merged into a single
 * alternation regex which is scanned over the input with [java.util.Scanner], tracking absolute
 * offset, row, and column for every produced [TokenMatch].
 */
class MyTokenizer(override val tokens: List<Token>, vararg options: RegexOption) : Tokenizer {
    init {
        require(tokens.isNotEmpty()) { "The tokens list should not be empty" }
    }

    val logger = Logger.getLogger(javaClass.simpleName)

    // Each token paired with its compiled java.util.regex.Pattern; a token's precompiled
    // regex takes precedence, otherwise its pattern string is compiled with [options].
    val patterns = tokens.map { it to (it.regex?.toPattern() ?: it.pattern.toRegex(setOf(*options)).toPattern()) }

    // All token patterns combined into one alternation, each wrapped in its own capturing
    // group. \G anchors every match to the end of the previous one, so no input is skipped.
    private val allInOnePattern = patterns
            .joinToString("|", prefix = "\\G(?:", postfix = ")") { "(${it.second.pattern()})" }.toPattern()

    // For each token, the index of its outer capturing group within [allInOnePattern];
    // nested groups inside earlier patterns shift the group indices of later ones.
    private val patternGroupIndices =
            buildSequence {
                var groupId = 1 // the zero group is the whole match
                for (p in patterns) {
                    yield(groupId) // the group for the current pattern
                    groupId += p.second.matcher("").groupCount() + 1 // skip all the nested groups in p
                }
            }.toList()

    /** Tokenizes the [input] from a [String] into a [TokenizerMatchesSequence]. */
    override fun tokenize(input: String) = input.scanTokenize(::Scanner)

    /** Tokenizes the [input] from an [InputStream] into a [TokenizerMatchesSequence]. */
    override fun tokenize(input: InputStream) = input.scanTokenize(::Scanner)

    /** Tokenizes the [input] from a [Readable] into a [TokenizerMatchesSequence]. */
    override fun tokenize(input: Readable) = input.scanTokenize(::Scanner)

    // Builds a Scanner from any supported source, logs the scanner's delimiter (debug aid),
    // then delegates to the [Scanner] overload of [tokenize].
    fun <T> T.scanTokenize(const: (T) -> Scanner) = const(this).apply {
        //useDelimiter("""(?=\n)""")
        logger.info("Delimiter:")
        logger.info(delimiter().pattern())
    }.run { tokenize(this) }

    /** Tokenizes the [input] from a [Scanner] into a [TokenizerMatchesSequence]. */
    override fun tokenize(input: Scanner): Sequence<TokenMatch> = buildSequence<TokenMatch> {
        var pos = 0 // absolute character offset from the start of the input
        var row = 1 // 1-based line number of the next match
        var col = 1 // 1-based column number of the next match
        while (input.hasNext()) {
            val matchResult: MatchResult
            val matchedToken: Token =
                    if (input.findWithinHorizon(allInOnePattern, 0) != null) {
                        matchResult = input.match()
                        // The first non-null outer capturing group identifies which token matched.
                        tokens[patternGroupIndices.indexOfFirst { matchResult.group(it) != null }]
                    } else {
                        // No token matched at the current position: log diagnostics, probe whether
                        // any character remains at all, emit a noneMatched token, and stop.
                        logger.warning(allInOnePattern.pattern())
                        logger.warning("could not match")
                        logger.warning(input.toString())
                        val test = input.findWithinHorizon(".".toRegex(setOf(RegexOption.MULTILINE, RegexOption.DOT_MATCHES_ALL)).toPattern()
                                , 0)
                        if (test == null) {
                            logger.warning("DID NOT FIND More;")
                        } else {
                            logger.warning("DID FIND;")
                            logger.warning("'$test'")
                        }
                        yield(TokenMatch(noneMatched, input.next(), pos, row, col))
                        break
                    }
            logger.info("matches: '${matchResult.group()}'")
            val match = matchResult.group()
            val result = TokenMatch(matchedToken, match, pos, row, col)
            pos += match.length
            col += match.length
            // If the match spans newlines, advance the row count and reset the column to the
            // number of characters past the last '\n' (so col stays 1-based on the new line).
            val addRows = match.count { it == '\n' }
            row += addRows
            if (addRows > 0) {
                col = match.length - match.lastIndexOf('\n')
            }
            yield(result)
        }
    }.cached() // cached so the resulting sequence can be iterated more than once
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package de.imc.placeholder.app.parsing | |
import com.github.h0tk3y.betterParse.combinators.map | |
import com.github.h0tk3y.betterParse.combinators.zeroOrMore | |
import com.github.h0tk3y.betterParse.grammar.Grammar | |
import com.github.h0tk3y.betterParse.parser.Parser | |
import de.imc.placeholder.app.MyTokenizer | |
/**
 * Minimal test grammar: swallows the entire input as greedy tokens and renders
 * the parse result as the matches joined with '#'.
 */
object testgrammar : Grammar<String>() {
    // A single greedy token matching any run of characters, newlines included.
    val singleToken by token(""".+""".toRegex(RegexOption.DOT_MATCHES_ALL))

    // Parse zero or more tokens and join their string forms with '#'.
    override val rootParser: Parser<String> by zeroOrMore(singleToken) map { matches ->
        matches.joinToString(separator = "#")
    }

    // Created lazily so the [tokens] list is fully populated by the `by token`
    // delegates before the tokenizer is constructed.
    override val tokenizer by lazy {
        MyTokenizer(tokens, RegexOption.MULTILINE, RegexOption.DOT_MATCHES_ALL)
    }
}
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.