# okuoku/YuniSexpTokenize.cmake (secret gist)

## YuniSexpTokenize.cmake
#
# YuniSexpTokenize:
#
#  yuni_sexp_tokenize_ctx_start(<CTX> str)
#  yuni_sexp_tokenize_ctx_next(<CTX> out_result out_start out_end)
#    result: Type of the token
#     === Specials ===
#     ( )   -- paren
#     [ ]   -- paren
#     #     -- is for vector      (*)
#     #vu8  -- is for bytevector  (*)
#     #u8   -- is for bytevector  (*)
#     '     -- is for quote
#     `     -- is for quasiquote
#     ,     -- is for unquote
#     ,@    -- is for unquote-splicing
#     #f    -- is for false       (*)
#     #t    -- is for true        (*)
#     #;    -- next-datum-comment
#
#       (*) -- Returned as DATUM
#
#
#     === Datum types ===
#     STRING
#     DATUM
#
#     === Command ===
#     AGAIN
#     EOF
#
#     (We don't have COMMENT type for now)


# internal:
#
#   <ctx>_cur   -- Current position in buffer
#   <ctx>_buf   -- stream buffer
#   <ctx>_tkns  -- Token start location
#   <ctx>_bcdep -- Depth for block-comment
#   <ctx>_acc   -- Accumulator for the state
#   <ctx>_st    -- current state
#     NUL -- Space or BoS
#     NRM -- Normal object state
#     STR -- String state
#     BCM -- Block comment state
#     LCM -- Line comment state
#

# Placeholder characters. Preprocessing substitutes them for characters
# that would otherwise confuse tokenizing; yuni_sexp_token_unescape()
# converts them back.
string(ASCII 2 _yuni_sexp_esc_sem)  # stands in for a semicolon
string(ASCII 1 _yuni_sexp_esc_dq)   # stands in for a double quote

# yuni_sexp_tokenize(<out> <str>)
#
# Splits S-expression source text into a list of tokens.
#
#   out -- name of the caller-scope variable that receives the token list
#   str -- NAME of the variable holding the source text (the same
#          convention as yuni_sexp_tokenize_preprocess, which this
#          function delegates to)
#
# The text is first run through yuni_sexp_tokenize_preprocess and the
# result is cut into tokens with one regular expression.  Whitespace runs
# are returned as tokens too.  Tokens still carry the placeholder
# characters; use yuni_sexp_token_unescape on a token to restore them.
#
# NOTE: a variable whose name collides with this function's own locals
# (out, str, prep, lis) cannot be passed as <str>.
function(yuni_sexp_tokenize out str)
    # Copy the text into a distinctively named local so that the callee's
    # own local names cannot shadow the variable it is asked to read.
    set(_yuni_sexp_tokenize_src "${${str}}")
    yuni_sexp_tokenize_preprocess(prep _yuni_sexp_tokenize_src)
    # Alternatives are tried in order.  The character-literal alternative
    # is "#", a literal backslash (four backslashes in a CMake quoted
    # argument), any one character, then any following non-delimiter
    # characters; this keeps "#\(" and "#\space" as single tokens.
    string(REGEX MATCHALL
        "\"[^\"]*\"|#vu8|#u8|#t|#f|#\\\\.[^ \r\n\t()`']*|#|,@|,|[()`']|[^ \r\n\t()`']+|[ \r\n\t]+"
        lis
        "${prep}")
    set(${out} "${lis}" PARENT_SCOPE)
endfunction()

# yuni_sexp_token_unescape(<out> <str>)
#
# Restores the source spelling of a token: each _yuni_sexp_esc_dq
# placeholder becomes a double quote and each _yuni_sexp_esc_sem
# placeholder becomes a semicolon.
#
#   out -- name of the caller-scope variable that receives the result
#   str -- the token text itself (a value, not a variable name)
function(yuni_sexp_token_unescape out str)
    set(restored "${str}")
    # The placeholders are single literal characters, so plain
    # replacement is enough; the two substitutions are independent.
    string(REPLACE "${_yuni_sexp_esc_dq}" "\"" restored "${restored}")
    string(REPLACE "${_yuni_sexp_esc_sem}" ";" restored "${restored}")
    set(${out} "${restored}" PARENT_SCOPE)
endfunction()

# yuni_sexp_tokenize_preprocess(<out> <str>)
#
# Rewrites S-expression source text so that it can afterwards be split
# into tokens with a single regular expression:
#
#   * line comments (";" up to the end of the line) are removed; the
#     line break itself is kept so neighbouring tokens stay separated
#   * block comments ("#|" ... "|#", nestable) are removed
#   * the character literals "#\;" and "#\"" have their last character
#     replaced by the _yuni_sexp_esc_sem / _yuni_sexp_esc_dq placeholder
#   * inside string literals, the two-character escape backslash +
#     double quote is replaced by the _yuni_sexp_esc_dq placeholder;
#     other backslash escapes are copied unchanged
#   * every remaining ";" (these can only come from string literals) is
#     replaced by the _yuni_sexp_esc_sem placeholder
#
#   out -- name of the caller-scope variable that receives the result
#   str -- NAME of the variable holding the source text
#
# Stops with FATAL_ERROR on an unterminated block comment or string.
#
# NOTE: "#;" datum comments are not recognised; the ";" starts an
# ordinary line comment.
# NOTE(review): the if() tests compare quoted expansions of input text;
# this presumably relies on policy CMP0054 being NEW in the including
# project -- confirm.
#
# States: NUL (outside strings and comments), BCM (inside a block
# comment), STR (inside a string literal), TERM (input exhausted).
function(yuni_sexp_tokenize_preprocess out str)
    # Read the input before assigning any local, so an input variable
    # that shares a name with one of them is still read correctly.
    set(q "${${str}}")
    set(st NUL)
    set(acc "")
    set(bcdep)
    while(NOT "${st}" STREQUAL "TERM")
        if("${st}" STREQUAL "NUL")
            if("${q}" MATCHES "^([^;#\"]*)#(\\\\|\\|)(.*)")
                # Plain text followed by "#\" or "#|".  Save the groups:
                # the nested MATCHES below overwrite CMAKE_MATCH_<n>.
                set(kind "${CMAKE_MATCH_2}")
                set(rest "${CMAKE_MATCH_3}")
                set(acc "${acc}${CMAKE_MATCH_1}")
                if("${kind}" STREQUAL "\\")
                    # Character constant: consume the character after
                    # "#\" here so it cannot start a comment or string.
                    if("${rest}" MATCHES "^;(.*)")
                        set(acc "${acc}#\\${_yuni_sexp_esc_sem}")
                        set(q "${CMAKE_MATCH_1}")
                    elseif("${rest}" MATCHES "^\"(.*)")
                        set(acc "${acc}#\\${_yuni_sexp_esc_dq}")
                        set(q "${CMAKE_MATCH_1}")
                    elseif("${rest}" MATCHES "^(.)(.*)")
                        set(acc "${acc}#\\${CMAKE_MATCH_1}")
                        set(q "${CMAKE_MATCH_2}")
                    else()
                        set(acc "${acc}#\\")
                        set(q "${rest}")
                    endif()
                elseif("${kind}" STREQUAL "|")
                    # Block comment opens; bcdep holds one element per
                    # nesting level.
                    set(q "${rest}")
                    set(bcdep x)
                    set(st BCM)
                else()
                    message(FATAL_ERROR "Should not happen: ${kind}")
                endif()
            elseif("${q}" MATCHES "^([^;#\"]*#)(.*)")
                # Any other "#" (as in "#t" or "#("): copy it through so
                # that scanning resumes right after it and a later "#|"
                # or "#\" is still seen.
                set(acc "${acc}${CMAKE_MATCH_1}")
                set(q "${CMAKE_MATCH_2}")
            elseif("${q}" MATCHES "^([^;#\"]+)(.*)")
                # Plain text up to the next ";" or double quote.
                set(acc "${acc}${CMAKE_MATCH_1}")
                set(q "${CMAKE_MATCH_2}")
            elseif("${q}" MATCHES "^;[^\r\n]*(.*)")
                # Line comment: drop up to, but not including, the line
                # break.  Also covers a comment that ends the input.
                set(q "${CMAKE_MATCH_1}")
            elseif("${q}" MATCHES "^\"(.*)")
                # String literal opens.
                set(acc "${acc}\"")
                set(q "${CMAKE_MATCH_1}")
                set(st STR)
            else()
                set(st TERM)
                set(acc "${acc}${q}")
            endif()
        elseif("${st}" STREQUAL "BCM")
            # Skip to the next "#" or "|" and look at no more than two
            # characters, so runs such as "||#" are read correctly.
            if("${q}" MATCHES "^[^#|]*(#[|]|[|]#|[#|])(.*)")
                set(mark "${CMAKE_MATCH_1}")
                set(q "${CMAKE_MATCH_2}")
                if("${mark}" STREQUAL "#|")
                    list(APPEND bcdep x)
                elseif("${mark}" STREQUAL "|#")
                    list(REMOVE_AT bcdep 0)
                    if(NOT bcdep)
                        set(st NUL)
                    endif()
                endif()
                # A lone "#" or "|" inside the comment is simply skipped.
            else()
                message(FATAL_ERROR "Open BCM: ${q}")
            endif()
        elseif("${st}" STREQUAL "STR")
            if("${q}" MATCHES "^([^\"\\\\]*)\\\\\"(.*)")
                # Escaped double quote: emit the placeholder instead.
                set(acc "${acc}${CMAKE_MATCH_1}${_yuni_sexp_esc_dq}")
                set(q "${CMAKE_MATCH_2}")
            elseif("${q}" MATCHES "^([^\"\\\\]*\\\\.)(.*)")
                # Any other backslash escape: copy both characters so a
                # following escaped quote is not mistaken for the end of
                # the string.
                set(acc "${acc}${CMAKE_MATCH_1}")
                set(q "${CMAKE_MATCH_2}")
            elseif("${q}" MATCHES "^([^\"]*\")(.*)")
                # Closing double quote.
                set(acc "${acc}${CMAKE_MATCH_1}")
                set(q "${CMAKE_MATCH_2}")
                set(st NUL)
            else()
                message(FATAL_ERROR "Open STR: ${q}")
            endif()
        endif()
    endwhile()
    string(REPLACE ";" "${_yuni_sexp_esc_sem}" escstr "${acc}")
    set(${out} "${escstr}" PARENT_SCOPE)
endfunction()
	#
	# YuniSexpTokenize:
	#
	# yuni_sexp_tokenize_ctx_start(<CTX> str)
	# yuni_sexp_tokenize_ctx_next(<CTX> out_result out_start out_end)
	# result: Type of the token
	# === Specials ===
	# ( ) -- paren
	# [ ] -- paren
	# # -- is for vector (*)
	# #vu8 -- is for bytevector (*)
	# #u8 -- is for bytevector (*)
	# ' -- is for quote
	# ` -- is for quasiquote
	# , -- is for unquote
	# ,@ -- is for unquote-splicing
	# #f -- is for false (*)
	# #t -- is for true (*)
	# #; -- next-datum-comment
	#
	# (*) -- Returned as DATUM
	#
	#
	# === Datum types ===
	# STRING
	# DATUM
	#
	# === Command ===
	# AGAIN
	# EOF
	#
	# (We don't have COMMENT type for now)


	# internal:
	#
	# <ctx>_cur -- Current position in buffer
	# <ctx>_buf -- stream buffer
	# <ctx>_tkns -- Token start location
	# <ctx>_bcdep -- Depth for block-comment
	# <ctx>_acc -- Accumulator for the state
	# <ctx>_st -- current state
	# NUL -- Space or BoS
	# NRM -- Normal object state
	# STR -- String state
	# BCM -- Block comment state
	# LCM -- Line comment state
	#

	# Placeholder characters. Preprocessing substitutes them for characters
	# that would otherwise confuse tokenizing; yuni_sexp_token_unescape()
	# converts them back.
	string(ASCII 2 _yuni_sexp_esc_sem)  # stands in for a semicolon
	string(ASCII 1 _yuni_sexp_esc_dq)   # stands in for a double quote

	# yuni_sexp_tokenize(<out> <str>)
	#
	# Splits S-expression source text into a list of tokens.
	#
	#   out -- name of the caller-scope variable that receives the token list
	#   str -- NAME of the variable holding the source text (the same
	#          convention as yuni_sexp_tokenize_preprocess, which this
	#          function delegates to)
	#
	# The text is first run through yuni_sexp_tokenize_preprocess and the
	# result is cut into tokens with one regular expression.  Whitespace runs
	# are returned as tokens too.  Tokens still carry the placeholder
	# characters; use yuni_sexp_token_unescape on a token to restore them.
	#
	# NOTE: a variable whose name collides with this function's own locals
	# (out, str, prep, lis) cannot be passed as <str>.
	# NOTE(review): this file defines this function twice; being the later
	# definition, this is the one CMake uses.
	function(yuni_sexp_tokenize out str)
		# Copy the text into a distinctively named local so that the callee's
		# own local names cannot shadow the variable it is asked to read.
		set(_yuni_sexp_tokenize_src "${${str}}")
		yuni_sexp_tokenize_preprocess(prep _yuni_sexp_tokenize_src)
		# Alternatives are tried in order.  The character-literal alternative
		# is "#", a literal backslash (four backslashes in a CMake quoted
		# argument), any one character, then any following non-delimiter
		# characters; this keeps "#\(" and "#\space" as single tokens.
		string(REGEX MATCHALL
			"\"[^\"]*\"|#vu8|#u8|#t|#f|#\\\\.[^ \r\n\t()`']*|#|,@|,|[()`']|[^ \r\n\t()`']+|[ \r\n\t]+"
			lis
			"${prep}")
		set(${out} "${lis}" PARENT_SCOPE)
	endfunction()

	# yuni_sexp_token_unescape(<out> <str>)
	#
	# Restores the source spelling of a token: each _yuni_sexp_esc_dq
	# placeholder becomes a double quote and each _yuni_sexp_esc_sem
	# placeholder becomes a semicolon.
	#
	#   out -- name of the caller-scope variable that receives the result
	#   str -- the token text itself (a value, not a variable name)
	function(yuni_sexp_token_unescape out str)
		set(restored "${str}")
		# The placeholders are single literal characters, so plain
		# replacement is enough; the two substitutions are independent.
		string(REPLACE "${_yuni_sexp_esc_dq}" "\"" restored "${restored}")
		string(REPLACE "${_yuni_sexp_esc_sem}" ";" restored "${restored}")
		set(${out} "${restored}" PARENT_SCOPE)
	endfunction()

	# yuni_sexp_tokenize_preprocess(<out> <str>)
	#
	# Rewrites S-expression source text so that it can afterwards be split
	# into tokens with a single regular expression:
	#
	#   * line comments (";" up to the end of the line) are removed; the
	#     line break itself is kept so neighbouring tokens stay separated
	#   * block comments ("#|" ... "|#", nestable) are removed
	#   * the character literals "#\;" and "#\"" have their last character
	#     replaced by the _yuni_sexp_esc_sem / _yuni_sexp_esc_dq placeholder
	#   * inside string literals, the two-character escape backslash +
	#     double quote is replaced by the _yuni_sexp_esc_dq placeholder;
	#     other backslash escapes are copied unchanged
	#   * every remaining ";" (these can only come from string literals) is
	#     replaced by the _yuni_sexp_esc_sem placeholder
	#
	#   out -- name of the caller-scope variable that receives the result
	#   str -- NAME of the variable holding the source text
	#
	# Stops with FATAL_ERROR on an unterminated block comment or string.
	#
	# NOTE: "#;" datum comments are not recognised; the ";" starts an
	# ordinary line comment.
	# NOTE(review): the if() tests compare quoted expansions of input text;
	# this presumably relies on policy CMP0054 being NEW in the including
	# project -- confirm.
	# NOTE(review): this file defines this function twice; being the later
	# definition, this is the one CMake uses.
	#
	# States: NUL (outside strings and comments), BCM (inside a block
	# comment), STR (inside a string literal), TERM (input exhausted).
	function(yuni_sexp_tokenize_preprocess out str)
		# Read the input before assigning any local, so an input variable
		# that shares a name with one of them is still read correctly.
		set(q "${${str}}")
		set(st NUL)
		set(acc "")
		set(bcdep)
		while(NOT "${st}" STREQUAL "TERM")
			if("${st}" STREQUAL "NUL")
				if("${q}" MATCHES "^([^;#\"]*)#(\\\\|\\|)(.*)")
					# Plain text followed by "#\" or "#|".  Save the groups:
					# the nested MATCHES below overwrite CMAKE_MATCH_<n>.
					set(kind "${CMAKE_MATCH_2}")
					set(rest "${CMAKE_MATCH_3}")
					set(acc "${acc}${CMAKE_MATCH_1}")
					if("${kind}" STREQUAL "\\")
						# Character constant: consume the character after
						# "#\" here so it cannot start a comment or string.
						if("${rest}" MATCHES "^;(.*)")
							set(acc "${acc}#\\${_yuni_sexp_esc_sem}")
							set(q "${CMAKE_MATCH_1}")
						elseif("${rest}" MATCHES "^\"(.*)")
							set(acc "${acc}#\\${_yuni_sexp_esc_dq}")
							set(q "${CMAKE_MATCH_1}")
						elseif("${rest}" MATCHES "^(.)(.*)")
							set(acc "${acc}#\\${CMAKE_MATCH_1}")
							set(q "${CMAKE_MATCH_2}")
						else()
							set(acc "${acc}#\\")
							set(q "${rest}")
						endif()
					elseif("${kind}" STREQUAL "|")
						# Block comment opens; bcdep holds one element per
						# nesting level.
						set(q "${rest}")
						set(bcdep x)
						set(st BCM)
					else()
						message(FATAL_ERROR "Should not happen: ${kind}")
					endif()
				elseif("${q}" MATCHES "^([^;#\"]*#)(.*)")
					# Any other "#" (as in "#t" or "#("): copy it through so
					# that scanning resumes right after it and a later "#|"
					# or "#\" is still seen.
					set(acc "${acc}${CMAKE_MATCH_1}")
					set(q "${CMAKE_MATCH_2}")
				elseif("${q}" MATCHES "^([^;#\"]+)(.*)")
					# Plain text up to the next ";" or double quote.
					set(acc "${acc}${CMAKE_MATCH_1}")
					set(q "${CMAKE_MATCH_2}")
				elseif("${q}" MATCHES "^;[^\r\n]*(.*)")
					# Line comment: drop up to, but not including, the line
					# break.  Also covers a comment that ends the input.
					set(q "${CMAKE_MATCH_1}")
				elseif("${q}" MATCHES "^\"(.*)")
					# String literal opens.
					set(acc "${acc}\"")
					set(q "${CMAKE_MATCH_1}")
					set(st STR)
				else()
					set(st TERM)
					set(acc "${acc}${q}")
				endif()
			elseif("${st}" STREQUAL "BCM")
				# Skip to the next "#" or "|" and look at no more than two
				# characters, so runs such as "||#" are read correctly.
				if("${q}" MATCHES "^[^#|]*(#[|]|[|]#|[#|])(.*)")
					set(mark "${CMAKE_MATCH_1}")
					set(q "${CMAKE_MATCH_2}")
					if("${mark}" STREQUAL "#|")
						list(APPEND bcdep x)
					elseif("${mark}" STREQUAL "|#")
						list(REMOVE_AT bcdep 0)
						if(NOT bcdep)
							set(st NUL)
						endif()
					endif()
					# A lone "#" or "|" inside the comment is simply skipped.
				else()
					message(FATAL_ERROR "Open BCM: ${q}")
				endif()
			elseif("${st}" STREQUAL "STR")
				if("${q}" MATCHES "^([^\"\\\\]*)\\\\\"(.*)")
					# Escaped double quote: emit the placeholder instead.
					set(acc "${acc}${CMAKE_MATCH_1}${_yuni_sexp_esc_dq}")
					set(q "${CMAKE_MATCH_2}")
				elseif("${q}" MATCHES "^([^\"\\\\]*\\\\.)(.*)")
					# Any other backslash escape: copy both characters so a
					# following escaped quote is not mistaken for the end of
					# the string.
					set(acc "${acc}${CMAKE_MATCH_1}")
					set(q "${CMAKE_MATCH_2}")
				elseif("${q}" MATCHES "^([^\"]*\")(.*)")
					# Closing double quote.
					set(acc "${acc}${CMAKE_MATCH_1}")
					set(q "${CMAKE_MATCH_2}")
					set(st NUL)
				else()
					message(FATAL_ERROR "Open STR: ${q}")
				endif()
			endif()
		endwhile()
		string(REPLACE ";" "${_yuni_sexp_esc_sem}" escstr "${acc}")
		set(${out} "${escstr}" PARENT_SCOPE)
	endfunction()