# okuoku/YuniSexpTokenize-beforeregex.cmake (secret gist)

## YuniSexpTokenize-beforeregex.cmake
#
# YuniSexpTokenize:
#
#  yuni_sexp_tokenize_ctx_start(<CTX> str)
#  yuni_sexp_tokenize_ctx_next(<CTX> out_result out_start out_end)
#    result: Type of the token
#     === Specials ===
#     ( )   -- paren
#     [ ]   -- paren
#     #     -- is for vector      (*)
#     #vu8  -- is for bytevector  (*)
#     #u8   -- is for bytevector  (*)
#     '     -- is for quote
#     `     -- is for quasiquote
#     ,     -- is for unquote
#     ,@    -- is for unquote-splicing
#     #f    -- is for false       (*)
#     #t    -- is for true        (*)
#     #;    -- next-datum-comment
#
#       (*) -- Returned as DATUM
#
#
#     === Datum types ===
#     STRING
#     DATUM
#
#     === Command ===
#     AGAIN
#     EOF
#
#     (We don't have COMMENT type for now)


# internal:
#
#   <ctx>_cur   -- Current position in buffer
#   <ctx>_buf   -- stream buffer
#   <ctx>_tkns  -- Token start location
#   <ctx>_bcdep -- Depth for block-comment
#   <ctx>_acc   -- Accumulator for the state
#   <ctx>_st    -- current state
#   <ctx>_lsc   -- Line Segment Cache
#   <ctx>_dsc   -- Delimited Segment Cache
#   <ctx>_brc   -- Block Comment Region Cache
#
#   States (values of <ctx>_st):
#     NUL -- Space or BoS
#     NRM -- Normal object state
#     STR -- String state
#     BCM -- Block comment state
#     LCM -- Line comment state
#

# yuni_sexp_tokenize_ctx_start(<ctx> str)
#
# Initialise a tokenizer context over the text held in the variable named
# by `str`.
#
#   ctx -- prefix for the context variables (<ctx>_buf, <ctx>_cur, ...)
#   str -- NAME of the variable holding the text (not the text itself)
#
# This is a macro: besides the <ctx>_* variables, the temporaries __end,
# __lscbuf and __lsclen are written into the caller's scope.
macro(yuni_sexp_tokenize_ctx_start ctx str)
    # Buffer length doubles as the end-of-input position.
    string(LENGTH "${${str}}" __end)
    set(${ctx}_end ${__end})
    set(${ctx}_buf "${${str}}")

    # Cursor at the beginning, no token started, idle state.
    set(${ctx}_cur 0)
    set(${ctx}_tkns -1)
    set(${ctx}_st NUL)

    # Accumulator and block-comment depth start out undefined.
    unset(${ctx}_acc)
    unset(${ctx}_bcdep)

    # Line Segment Cache: blank out every character except CR/LF. For now
    # the result is only used for the length report below.
    string(REGEX REPLACE "[^\r\n]" " " __lscbuf "${${str}}")
    string(LENGTH "${__lscbuf}" __lsclen)
    message(STATUS "LEN: ${__end} == ${__lsclen}")
endmacro()

# yuni_sexp_tokenize_ctx_next(<ctx> out_result out_start out_end)
#
# Fetch the next token from the context: runs single tokenizer steps until
# a step reports something other than AGAIN, and leaves that step's
# result/start/end in the caller's variables.
macro(yuni_sexp_tokenize_ctx_next ctx out_result out_start out_end)
    # Always take one step, then keep stepping while the tokenizer asks
    # for more input.
    yuni_sexp_tokenize_ctx__itr(${ctx}
        ${out_result} ${out_start} ${out_end})
    while(${out_result} STREQUAL AGAIN)
        yuni_sexp_tokenize_ctx__itr(${ctx}
            ${out_result} ${out_start} ${out_end})
    endwhile()
endmacro()

# yuni_sexp_tokenize_ctx_token(<ctx> out_result start end)
#
# Copy the text of the half-open range [start, end) of <ctx>_buf into the
# variable named by out_result. The temporary __len is written into the
# caller's scope as well.
macro(yuni_sexp_tokenize_ctx_token ctx out_result start end)
    math(EXPR __len "${end} - ${start}")
    string(SUBSTRING "${${ctx}_buf}"
        ${start} ${__len}
        ${out_result})
endmacro()

# yuni_sexp_tokenize_ctx__itr(<ctx> out_result out_start out_end)
#
# Advance the tokenizer state machine by one step. Looks at the single
# character at <ctx>_cur (if any), updates the <ctx>_* variables and stores
# into the caller's variables:
#
#   out_result -- token type ( "(" ")" "[" "]" "'" "`" "," ",@" STRING
#                 DATUM ), AGAIN (no token yet, call again) or EOF
#   out_start  -- offset of the first character of the token in <ctx>_buf
#   out_end    -- offset one past the last character of the token
#
# out_start/out_end are -9999 when out_result is AGAIN or EOF. For STRING
# the range excludes the surrounding double quotes.
#
# This is a macro, so the __-prefixed temporaries are written into the
# caller's scope as well.
#
# NOTE: the quoted "(" / ")" comparisons below are only treated as plain
# strings by if() when policy CMP0054 is NEW where this file is included.
macro(yuni_sexp_tokenize_ctx__itr ctx out_result out_start out_end)
    # __input: the character under the cursor. __has_input stays unset
    # when the cursor has reached the end of the buffer.
    math(EXPR __next ${${ctx}_cur}+1)
    set(__input)
    set(__has_input)
    if(NOT ${${ctx}_cur} EQUAL ${${ctx}_end})
        string(SUBSTRING "${${ctx}_buf}" ${${ctx}_cur} 1 __input)
        set(__has_input ON)
    endif()

    set(__st ${${ctx}_st})
    set(__result UNKNOWN)
    set(__start -9999)
    set(__end -9999)

    # Per-step trace output.
    message(STATUS "${__st} ${${ctx}_cur} ${__next} ${${ctx}_end} INPUT[${__input}]")

    if(NOT __has_input)
        if("${__st}" STREQUAL "NRM")
            # End of buffer terminates the pending token.
            set(${ctx}_st NUL)
            set(__start ${${ctx}_tkns})
            set(__end ${${ctx}_cur})
            if("${${ctx}_acc}" STREQUAL ",")
                # The pending token is exactly "," -- report it as
                # unquote, the same way it is reported mid-buffer.
                set(__result ",")
            else()
                set(__result DATUM)
            endif()
        else()
            set(__result EOF)
        endif()
    elseif("${__st}" STREQUAL "NUL")
        # NUL: between tokens.
        if(("${__input}" STREQUAL " ")
           OR ("${__input}" STREQUAL "\r")
           OR ("${__input}" STREQUAL "\n")
           OR ("${__input}" STREQUAL "\t"))
            # Whitespace. Skip it.
            set(${ctx}_cur ${__next})
            set(__result AGAIN)
        elseif("${__input}" STREQUAL "\"")
            # DQUOTE. Begin string; the token starts after the quote.
            set(${ctx}_cur ${__next})
            set(${ctx}_st STR)
            set(${ctx}_tkns ${__next})
            set(${ctx}_acc) # Clear ACC (used as the "escaped" flag)
            set(__result AGAIN)
        elseif("${__input}" STREQUAL ";")
            # Semicolon. Begin line-comment
            set(${ctx}_st LCM)
            set(${ctx}_acc)
            set(${ctx}_cur ${__next})
            set(__result AGAIN)
        elseif(("${__input}" STREQUAL "(")
            OR ("${__input}" STREQUAL ")")
            OR ("${__input}" STREQUAL "[")
            OR ("${__input}" STREQUAL "]")
            OR ("${__input}" STREQUAL "'")
            OR ("${__input}" STREQUAL "`"))
            # Single char token. Return it immediately. The range is
            # recorded BEFORE the cursor moves so that it covers the
            # character itself ([cur, cur+1)).
            set(${ctx}_tkns ${${ctx}_cur})
            set(__result "${__input}")
            set(__start ${${ctx}_cur})
            set(__end ${__next})
            set(${ctx}_cur ${__next})
        else()
            # Otherwise. Enter NRM, remembering the first character in
            # ACC so that "," / ",@" / "#|" can be recognised next step.
            set(${ctx}_tkns ${${ctx}_cur})
            set(${ctx}_cur ${__next})
            set(${ctx}_st NRM)
            set(${ctx}_acc "${__input}")
            set(__result AGAIN)
        endif()
    elseif("${__st}" STREQUAL "NRM")
        if("${${ctx}_acc}" STREQUAL ",")
            if("${__input}" STREQUAL "@")
                # Emit unquote-splicing and consume the "@".
                set(__result ",@")
                set(${ctx}_st NUL)
                set(__start ${${ctx}_tkns})
                set(__end ${__next})
                set(${ctx}_cur ${__next})
            else()
                # Emit unquote; the current character is not consumed.
                set(__result ",")
                set(${ctx}_st NUL)
                set(__start ${${ctx}_tkns})
                set(__end ${${ctx}_cur})
            endif()
        elseif(("${__input}" STREQUAL " ")
           OR ("${__input}" STREQUAL "\r")
           OR ("${__input}" STREQUAL "\n")
           OR ("${__input}" STREQUAL "\t")
           OR ("${__input}" STREQUAL "\"")
           OR ("${__input}" STREQUAL ";")
           OR ("${__input}" STREQUAL "(")
           OR ("${__input}" STREQUAL ")")
           OR ("${__input}" STREQUAL "[")
           OR ("${__input}" STREQUAL "]")
           OR ("${__input}" STREQUAL "'")
           OR ("${__input}" STREQUAL "`"))
            # Delimiter: leave it unconsumed, emit the datum and return
            # to NUL.
            set(${ctx}_st NUL)
            set(__result DATUM)
            set(__start ${${ctx}_tkns})
            set(__end ${${ctx}_cur})
        else()
            if("${${ctx}_acc}" STREQUAL "#")
                if("${__input}" STREQUAL "|")
                    # "#|" at the start of a token: enter block comment
                    # with depth 1.
                    set(${ctx}_bcdep x)
                    set(${ctx}_st BCM)
                endif()
            endif()
            # ACC only ever holds the FIRST character of the token.
            set(${ctx}_acc)
            set(__result AGAIN)
            set(${ctx}_cur ${__next})
        endif()
    elseif("${__st}" STREQUAL "STR")
        if(${ctx}_acc)
            # Character following a backslash: skip it unconditionally.
            set(${ctx}_acc)
            set(__result AGAIN)
            set(${ctx}_cur ${__next})
        elseif("${__input}" STREQUAL "\"")
            # Closing quote: the range excludes both quotes.
            set(__start ${${ctx}_tkns})
            set(__end ${${ctx}_cur})
            set(${ctx}_cur ${__next})
            set(__result STRING)
            set(${ctx}_st NUL)
        elseif("${__input}" STREQUAL "\\")
            # Backslash: flag the next character as escaped.
            set(${ctx}_cur ${__next})
            set(${ctx}_acc ON)
            set(__result AGAIN)
        else()
            set(${ctx}_cur ${__next})
            set(__result AGAIN)
        endif()
    elseif("${__st}" STREQUAL "LCM")
        if(${ctx}_acc)
            # The previous character was CR, which ended the comment.
            # Consume the current character only if it is the LF of a
            # CRLF pair; anything else belongs to the next line and is
            # re-read in the NUL state.
            set(${ctx}_acc)
            set(${ctx}_st NUL)
            if("${__input}" STREQUAL "\n")
                set(${ctx}_cur ${__next})
            endif()
        elseif("${__input}" STREQUAL "\r")
            set(${ctx}_acc x)
            set(${ctx}_cur ${__next})
        elseif("${__input}" STREQUAL "\n")
            set(${ctx}_st NUL)
            set(${ctx}_cur ${__next})
        else()
            set(${ctx}_cur ${__next})
        endif()
        set(__result AGAIN)
    elseif("${__st}" STREQUAL "BCM")
        # ACC holds the previous character while it may still form the
        # first half of "#|" or "|#". <ctx>_bcdep holds one list element
        # per open nesting level.
        if(("${${ctx}_acc}" STREQUAL "#") AND ("${__input}" STREQUAL "|"))
            # Nested "#|"
            list(APPEND ${ctx}_bcdep x)
            set(${ctx}_acc)
        elseif(("${${ctx}_acc}" STREQUAL "|") AND ("${__input}" STREQUAL "#"))
            # Closing "|#"
            list(REMOVE_AT ${ctx}_bcdep 0)
            set(${ctx}_acc)
        else()
            # Remember the current character, so that sequences such as
            # "||#" or "##|" are still recognised.
            set(${ctx}_acc "${__input}")
        endif()
        if(NOT ${ctx}_bcdep)
            # Outermost comment closed.
            set(${ctx}_st NUL)
        endif()
        set(${ctx}_cur ${__next})
        set(__result AGAIN)
    else()
        message(FATAL_ERROR "Invalid state: ${__st}")
    endif()
    set(${out_result} "${__result}")
    set(${out_start} ${__start})
    set(${out_end} ${__end})
endmacro()
	#
	# YuniSexpTokenize:
	#
	# yuni_sexp_tokenize_ctx_start(<CTX> str)
	# yuni_sexp_tokenize_ctx_next(<CTX> out_result out_start out_end)
	# result: Type of the token
	# === Specials ===
	# ( ) -- paren
	# [ ] -- paren
	# # -- is for vector (*)
	# #vu8 -- is for bytevector (*)
	# #u8 -- is for bytevector (*)
	# ' -- is for quote
	# ` -- is for quasiquote
	# , -- is for unquote
	# ,@ -- is for unquote-splicing
	# #f -- is for false (*)
	# #t -- is for true (*)
	# #; -- next-datum-comment
	#
	# (*) -- Returned as DATUM
	#
	#
	# === Datum types ===
	# STRING
	# DATUM
	#
	# === Command ===
	# AGAIN
	# EOF
	#
	# (We don't have COMMENT type for now)


	# internal:
	#
	# <ctx>_cur -- Current position in buffer
	# <ctx>_buf -- stream buffer
	# <ctx>_tkns -- Token start location
	# <ctx>_bcdep -- Depth for block-comment
	# <ctx>_acc -- Accumulator for the state
	# <ctx>_st -- current state
	# <ctx>_lsc -- Line Segment Cache
	# <ctx>_dsc -- Delimited Segment Cache
	# <ctx>_brc -- Block Comment Region Cache
	#
	# States (values of <ctx>_st):
	# NUL -- Space or BoS
	# NRM -- Normal object state
	# STR -- String state
	# BCM -- Block comment state
	# LCM -- Line comment state
	#

	# yuni_sexp_tokenize_ctx_start(<ctx> str)
	#
	# Initialise a tokenizer context over the text held in the variable named
	# by `str`.
	#
	#   ctx -- prefix for the context variables (<ctx>_buf, <ctx>_cur, ...)
	#   str -- NAME of the variable holding the text (not the text itself)
	#
	# This is a macro: besides the <ctx>_* variables, the temporaries __end,
	# __lscbuf and __lsclen are written into the caller's scope.
	macro(yuni_sexp_tokenize_ctx_start ctx str)
	    # Buffer length doubles as the end-of-input position.
	    string(LENGTH "${${str}}" __end)
	    set(${ctx}_end ${__end})
	    set(${ctx}_buf "${${str}}")

	    # Cursor at the beginning, no token started, idle state.
	    set(${ctx}_cur 0)
	    set(${ctx}_tkns -1)
	    set(${ctx}_st NUL)

	    # Accumulator and block-comment depth start out undefined.
	    unset(${ctx}_acc)
	    unset(${ctx}_bcdep)

	    # Line Segment Cache: blank out every character except CR/LF. For now
	    # the result is only used for the length report below.
	    string(REGEX REPLACE "[^\r\n]" " " __lscbuf "${${str}}")
	    string(LENGTH "${__lscbuf}" __lsclen)
	    message(STATUS "LEN: ${__end} == ${__lsclen}")
	endmacro()

	# yuni_sexp_tokenize_ctx_next(<ctx> out_result out_start out_end)
	#
	# Fetch the next token from the context: runs single tokenizer steps until
	# a step reports something other than AGAIN, and leaves that step's
	# result/start/end in the caller's variables.
	macro(yuni_sexp_tokenize_ctx_next ctx out_result out_start out_end)
	    # Always take one step, then keep stepping while the tokenizer asks
	    # for more input.
	    yuni_sexp_tokenize_ctx__itr(${ctx}
	        ${out_result} ${out_start} ${out_end})
	    while(${out_result} STREQUAL AGAIN)
	        yuni_sexp_tokenize_ctx__itr(${ctx}
	            ${out_result} ${out_start} ${out_end})
	    endwhile()
	endmacro()

	# yuni_sexp_tokenize_ctx_token(<ctx> out_result start end)
	#
	# Copy the text of the half-open range [start, end) of <ctx>_buf into the
	# variable named by out_result. The temporary __len is written into the
	# caller's scope as well.
	macro(yuni_sexp_tokenize_ctx_token ctx out_result start end)
	    math(EXPR __len "${end} - ${start}")
	    string(SUBSTRING "${${ctx}_buf}"
	        ${start} ${__len}
	        ${out_result})
	endmacro()

	# yuni_sexp_tokenize_ctx__itr(<ctx> out_result out_start out_end)
	#
	# Advance the tokenizer state machine by one step. Looks at the single
	# character at <ctx>_cur (if any), updates the <ctx>_* variables and stores
	# into the caller's variables:
	#
	#   out_result -- token type ( "(" ")" "[" "]" "'" "`" "," ",@" STRING
	#                 DATUM ), AGAIN (no token yet, call again) or EOF
	#   out_start  -- offset of the first character of the token in <ctx>_buf
	#   out_end    -- offset one past the last character of the token
	#
	# out_start/out_end are -9999 when out_result is AGAIN or EOF. For STRING
	# the range excludes the surrounding double quotes.
	#
	# This is a macro, so the __-prefixed temporaries are written into the
	# caller's scope as well.
	#
	# NOTE: the quoted "(" / ")" comparisons below are only treated as plain
	# strings by if() when policy CMP0054 is NEW where this file is included.
	macro(yuni_sexp_tokenize_ctx__itr ctx out_result out_start out_end)
	    # __input: the character under the cursor. __has_input stays unset
	    # when the cursor has reached the end of the buffer.
	    math(EXPR __next ${${ctx}_cur}+1)
	    set(__input)
	    set(__has_input)
	    if(NOT ${${ctx}_cur} EQUAL ${${ctx}_end})
	        string(SUBSTRING "${${ctx}_buf}" ${${ctx}_cur} 1 __input)
	        set(__has_input ON)
	    endif()

	    set(__st ${${ctx}_st})
	    set(__result UNKNOWN)
	    set(__start -9999)
	    set(__end -9999)

	    # Per-step trace output.
	    message(STATUS "${__st} ${${ctx}_cur} ${__next} ${${ctx}_end} INPUT[${__input}]")

	    if(NOT __has_input)
	        if("${__st}" STREQUAL "NRM")
	            # End of buffer terminates the pending token.
	            set(${ctx}_st NUL)
	            set(__start ${${ctx}_tkns})
	            set(__end ${${ctx}_cur})
	            if("${${ctx}_acc}" STREQUAL ",")
	                # The pending token is exactly "," -- report it as
	                # unquote, the same way it is reported mid-buffer.
	                set(__result ",")
	            else()
	                set(__result DATUM)
	            endif()
	        else()
	            set(__result EOF)
	        endif()
	    elseif("${__st}" STREQUAL "NUL")
	        # NUL: between tokens.
	        if(("${__input}" STREQUAL " ")
	           OR ("${__input}" STREQUAL "\r")
	           OR ("${__input}" STREQUAL "\n")
	           OR ("${__input}" STREQUAL "\t"))
	            # Whitespace. Skip it.
	            set(${ctx}_cur ${__next})
	            set(__result AGAIN)
	        elseif("${__input}" STREQUAL "\"")
	            # DQUOTE. Begin string; the token starts after the quote.
	            set(${ctx}_cur ${__next})
	            set(${ctx}_st STR)
	            set(${ctx}_tkns ${__next})
	            set(${ctx}_acc) # Clear ACC (used as the "escaped" flag)
	            set(__result AGAIN)
	        elseif("${__input}" STREQUAL ";")
	            # Semicolon. Begin line-comment
	            set(${ctx}_st LCM)
	            set(${ctx}_acc)
	            set(${ctx}_cur ${__next})
	            set(__result AGAIN)
	        elseif(("${__input}" STREQUAL "(")
	            OR ("${__input}" STREQUAL ")")
	            OR ("${__input}" STREQUAL "[")
	            OR ("${__input}" STREQUAL "]")
	            OR ("${__input}" STREQUAL "'")
	            OR ("${__input}" STREQUAL "`"))
	            # Single char token. Return it immediately. The range is
	            # recorded BEFORE the cursor moves so that it covers the
	            # character itself ([cur, cur+1)).
	            set(${ctx}_tkns ${${ctx}_cur})
	            set(__result "${__input}")
	            set(__start ${${ctx}_cur})
	            set(__end ${__next})
	            set(${ctx}_cur ${__next})
	        else()
	            # Otherwise. Enter NRM, remembering the first character in
	            # ACC so that "," / ",@" / "#|" can be recognised next step.
	            set(${ctx}_tkns ${${ctx}_cur})
	            set(${ctx}_cur ${__next})
	            set(${ctx}_st NRM)
	            set(${ctx}_acc "${__input}")
	            set(__result AGAIN)
	        endif()
	    elseif("${__st}" STREQUAL "NRM")
	        if("${${ctx}_acc}" STREQUAL ",")
	            if("${__input}" STREQUAL "@")
	                # Emit unquote-splicing and consume the "@".
	                set(__result ",@")
	                set(${ctx}_st NUL)
	                set(__start ${${ctx}_tkns})
	                set(__end ${__next})
	                set(${ctx}_cur ${__next})
	            else()
	                # Emit unquote; the current character is not consumed.
	                set(__result ",")
	                set(${ctx}_st NUL)
	                set(__start ${${ctx}_tkns})
	                set(__end ${${ctx}_cur})
	            endif()
	        elseif(("${__input}" STREQUAL " ")
	           OR ("${__input}" STREQUAL "\r")
	           OR ("${__input}" STREQUAL "\n")
	           OR ("${__input}" STREQUAL "\t")
	           OR ("${__input}" STREQUAL "\"")
	           OR ("${__input}" STREQUAL ";")
	           OR ("${__input}" STREQUAL "(")
	           OR ("${__input}" STREQUAL ")")
	           OR ("${__input}" STREQUAL "[")
	           OR ("${__input}" STREQUAL "]")
	           OR ("${__input}" STREQUAL "'")
	           OR ("${__input}" STREQUAL "`"))
	            # Delimiter: leave it unconsumed, emit the datum and return
	            # to NUL.
	            set(${ctx}_st NUL)
	            set(__result DATUM)
	            set(__start ${${ctx}_tkns})
	            set(__end ${${ctx}_cur})
	        else()
	            if("${${ctx}_acc}" STREQUAL "#")
	                if("${__input}" STREQUAL "|")
	                    # "#|" at the start of a token: enter block comment
	                    # with depth 1.
	                    set(${ctx}_bcdep x)
	                    set(${ctx}_st BCM)
	                endif()
	            endif()
	            # ACC only ever holds the FIRST character of the token.
	            set(${ctx}_acc)
	            set(__result AGAIN)
	            set(${ctx}_cur ${__next})
	        endif()
	    elseif("${__st}" STREQUAL "STR")
	        if(${ctx}_acc)
	            # Character following a backslash: skip it unconditionally.
	            set(${ctx}_acc)
	            set(__result AGAIN)
	            set(${ctx}_cur ${__next})
	        elseif("${__input}" STREQUAL "\"")
	            # Closing quote: the range excludes both quotes.
	            set(__start ${${ctx}_tkns})
	            set(__end ${${ctx}_cur})
	            set(${ctx}_cur ${__next})
	            set(__result STRING)
	            set(${ctx}_st NUL)
	        elseif("${__input}" STREQUAL "\\")
	            # Backslash: flag the next character as escaped.
	            set(${ctx}_cur ${__next})
	            set(${ctx}_acc ON)
	            set(__result AGAIN)
	        else()
	            set(${ctx}_cur ${__next})
	            set(__result AGAIN)
	        endif()
	    elseif("${__st}" STREQUAL "LCM")
	        if(${ctx}_acc)
	            # The previous character was CR, which ended the comment.
	            # Consume the current character only if it is the LF of a
	            # CRLF pair; anything else belongs to the next line and is
	            # re-read in the NUL state.
	            set(${ctx}_acc)
	            set(${ctx}_st NUL)
	            if("${__input}" STREQUAL "\n")
	                set(${ctx}_cur ${__next})
	            endif()
	        elseif("${__input}" STREQUAL "\r")
	            set(${ctx}_acc x)
	            set(${ctx}_cur ${__next})
	        elseif("${__input}" STREQUAL "\n")
	            set(${ctx}_st NUL)
	            set(${ctx}_cur ${__next})
	        else()
	            set(${ctx}_cur ${__next})
	        endif()
	        set(__result AGAIN)
	    elseif("${__st}" STREQUAL "BCM")
	        # ACC holds the previous character while it may still form the
	        # first half of "#|" or "|#". <ctx>_bcdep holds one list element
	        # per open nesting level.
	        if(("${${ctx}_acc}" STREQUAL "#") AND ("${__input}" STREQUAL "|"))
	            # Nested "#|"
	            list(APPEND ${ctx}_bcdep x)
	            set(${ctx}_acc)
	        elseif(("${${ctx}_acc}" STREQUAL "|") AND ("${__input}" STREQUAL "#"))
	            # Closing "|#"
	            list(REMOVE_AT ${ctx}_bcdep 0)
	            set(${ctx}_acc)
	        else()
	            # Remember the current character, so that sequences such as
	            # "||#" or "##|" are still recognised.
	            set(${ctx}_acc "${__input}")
	        endif()
	        if(NOT ${ctx}_bcdep)
	            # Outermost comment closed.
	            set(${ctx}_st NUL)
	        endif()
	        set(${ctx}_cur ${__next})
	        set(__result AGAIN)
	    else()
	        message(FATAL_ERROR "Invalid state: ${__st}")
	    endif()
	    set(${out_result} "${__result}")
	    set(${out_start} ${__start})
	    set(${out_end} ${__end})
	endmacro()