Skip to content

Instantly share code, notes, and snippets.

@okuoku
Created June 3, 2018 19:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save okuoku/6a1a4ea859735cdc19779db3314bb63d to your computer and use it in GitHub Desktop.
Save okuoku/6a1a4ea859735cdc19779db3314bb63d to your computer and use it in GitHub Desktop.
#
# YuniSexpTokenize:
#
# yuni_sexp_tokenize_ctx_start(<CTX> str)
# yuni_sexp_tokenize_ctx_next(<CTX> out_result out_start out_end)
# yuni_sexp_tokenize_ctx_token(<CTX> out_result start end)
# result: Type of the token
# === Specials ===
# ( ) -- paren
# [ ] -- paren
# # -- is for vector (*)
# #vu8 -- is for bytevector (*)
# #u8 -- is for bytevector (*)
# ' -- is for quote
# ` -- is for quasiquote
# , -- is for unquote
# ,@ -- is for unquote-splicing
# #f -- is for false (*)
# #t -- is for true (*)
# #; -- next-datum-comment
#
# (*) -- Returned as DATUM
#
#
# === Datum types ===
# STRING
# DATUM
#
# === Command ===
# AGAIN
# EOF
#
# (We don't have COMMENT type for now)
# Internal context variables:
#
# <ctx>_cur -- Current position in buffer
# <ctx>_buf -- Stream buffer
# <ctx>_end -- Length of the stream buffer (end position)
# <ctx>_tkns -- Token start location
# <ctx>_bcdep -- Depth for block-comment
# <ctx>_acc -- Accumulator for the state
# <ctx>_st -- Current state (one of the states below)
# <ctx>_lsc -- Line Segment Cache
# <ctx>_dsc -- Delimited Segment Cache
# <ctx>_brc -- Block Comment Region Cache
#
# States:
# NUL -- Space or BoS (beginning of stream)
# NRM -- Normal object state
# STR -- String state
# BCM -- Block comment state
# LCM -- Line comment state
#
# yuni_sexp_tokenize_ctx_start(<ctx> <str>)
#
# Initialize tokenizer context <ctx> for the text stored in the variable
# named <str>. The context is a family of variables prefixed with "<ctx>_".
# Being a macro, everything (including the __-prefixed scratch variables)
# is set in the caller's scope.
macro(yuni_sexp_tokenize_ctx_start ctx str)
    # Measure the input up front; the length doubles as the end position.
    string(LENGTH "${${str}}" __end)

    # Cursor, buffer and buffer bounds.
    set(${ctx}_cur 0)
    set(${ctx}_buf "${${str}}")
    set(${ctx}_end ${__end})

    # State machine: start in NUL with no token in progress, an empty
    # accumulator and no open block comment.
    set(${ctx}_st NUL)
    set(${ctx}_tkns -1)
    set(${ctx}_acc)
    set(${ctx}_bcdep)

    # Line Segment Cache: blank out every character except CR/LF and report
    # the resulting length next to the input length.
    string(REGEX REPLACE "[^\r\n]" " " __lscbuf "${${str}}")
    string(LENGTH "${__lscbuf}" __lsclen)
    message(STATUS "LEN: ${__end} == ${__lsclen}")
endmacro()
# yuni_sexp_tokenize_ctx_next(<ctx> <out_result> <out_start> <out_end>)
#
# Fetch the next token from tokenizer context <ctx>. Steps the state machine
# until it reports something other than AGAIN, then leaves the token type in
# <out_result> and the token range in <out_start>/<out_end>.
macro(yuni_sexp_tokenize_ctx_next ctx out_result out_start out_end)
    while(1)
        yuni_sexp_tokenize_ctx__itr(${ctx}
            ${out_result} ${out_start} ${out_end})
        # Both operands are quoted so that if() compares the literal text
        # instead of dereferencing a caller variable that happens to be
        # named AGAIN (or named like the result value).
        if(NOT "${${out_result}}" STREQUAL "AGAIN")
            break()
        endif()
    endwhile()
endmacro()
# yuni_sexp_tokenize_ctx_token(<ctx> <out_result> <start> <end>)
#
# Extract the text of a token from the buffer of context <ctx>.
#   <out_result> -- variable that receives the token text
#   <start>      -- offset of the first character of the token
#   <end>        -- offset one past the last character of the token
macro(yuni_sexp_tokenize_ctx_token ctx out_result start end)
    # The range is half-open, so its width is simply end - start.
    math(EXPR __len "${end} - ${start}")
    string(SUBSTRING "${${ctx}_buf}"
        ${start} ${__len}
        ${out_result})
endmacro()
# yuni_sexp_tokenize_ctx__itr(<ctx> <out_result> <out_start> <out_end>)
#
# Internal: advance the state machine of tokenizer context <ctx> by one step.
#
#   <out_result> -- receives the step result: AGAIN (nothing to report yet,
#                   step again), EOF, DATUM, STRING, or the special token
#                   itself: ( ) [ ] ' ` , ,@
#   <out_start>  -- receives the offset of the token's first character
#   <out_end>    -- receives the offset one past the token's last character
#
# For AGAIN and EOF both offsets are the placeholder -9999. For STRING the
# range covers the text between the double quotes, not the quotes themselves.
# Being a macro, the __-prefixed scratch variables are set in the caller's
# scope.
macro(yuni_sexp_tokenize_ctx__itr ctx out_result out_start out_end)
    # __input: Calc an input char. At the end of the buffer __input stays
    # empty and __has_input stays unset.
    math(EXPR __next ${${ctx}_cur}+1)
    set(__input)
    set(__has_input)
    if(NOT ${${ctx}_cur} EQUAL ${${ctx}_end})
        string(SUBSTRING "${${ctx}_buf}" ${${ctx}_cur} 1 __input)
        set(__has_input ON)
    endif()
    set(__st ${${ctx}_st})
    set(__result UNKNOWN)
    set(__start -9999)
    set(__end -9999)
    # Per-step trace of the state machine.
    message(STATUS "${__st} ${${ctx}_cur} ${__next} ${${ctx}_end} INPUT[${__input}]")

    # State names are compared as quoted strings so that if() never
    # dereferences a caller variable that happens to be called NUL, STR, ...
    if(NOT __has_input)
        if("${__st}" STREQUAL "NRM")
            # Terminate current datum
            set(${ctx}_st NUL)
            set(__result DATUM)
            set(__start ${${ctx}_tkns})
            set(__end ${${ctx}_cur})
        else()
            set(__result EOF)
        endif()
    elseif("${__st}" STREQUAL "NUL")
        # NUL:
        if(("${__input}" STREQUAL " ")
                OR ("${__input}" STREQUAL "\r")
                OR ("${__input}" STREQUAL "\n")
                OR ("${__input}" STREQUAL "\t"))
            # Whitespace. Again.
            set(${ctx}_cur ${__next})
            set(__result AGAIN)
        elseif("${__input}" STREQUAL "\"")
            # DQUOTE. Begin string; the token starts after the quote.
            set(${ctx}_cur ${__next})
            set(${ctx}_st STR)
            set(${ctx}_tkns ${__next})
            set(${ctx}_acc) # Clear ACC
            set(__result AGAIN)
        elseif("${__input}" STREQUAL ";")
            # Semicolon. Begin line-comment
            set(${ctx}_st LCM)
            set(${ctx}_acc)
            set(${ctx}_cur ${__next})
            set(__result AGAIN)
        elseif(("${__input}" STREQUAL "(")
                OR ("${__input}" STREQUAL ")")
                OR ("${__input}" STREQUAL "[")
                OR ("${__input}" STREQUAL "]")
                OR ("${__input}" STREQUAL "'")
                OR ("${__input}" STREQUAL "`"))
            # Single char datum. Return it immediately.
            # Record the range BEFORE advancing the cursor so that the token
            # covers exactly this one character.
            set(__result ${__input})
            set(__start ${${ctx}_cur})
            set(__end ${__next})
            set(${ctx}_tkns ${${ctx}_cur})
            set(${ctx}_cur ${__next})
        else()
            # Otherwise. Enter NRM, remembering the first char in ACC so the
            # next step can recognise "," ",@" and "#|".
            set(${ctx}_tkns ${${ctx}_cur})
            set(${ctx}_cur ${__next})
            set(${ctx}_st NRM)
            set(${ctx}_acc ${__input})
            set(__result AGAIN)
        endif()
    elseif("${__st}" STREQUAL "NRM")
        if("${${ctx}_acc}" STREQUAL ",")
            if("${__input}" STREQUAL "@")
                # Emit unquote-splicing and consume the "@".
                set(__result ",@")
                set(${ctx}_st NUL)
                set(__start ${${ctx}_tkns})
                set(__end ${__next})
                set(${ctx}_cur ${__next})
            else()
                # Emit unquote; the current char is left for the next step.
                set(__result ",")
                set(${ctx}_st NUL)
                set(__start ${${ctx}_tkns})
                set(__end ${${ctx}_cur})
            endif()
        elseif(("${__input}" STREQUAL " ")
                OR ("${__input}" STREQUAL "\r")
                OR ("${__input}" STREQUAL "\n")
                OR ("${__input}" STREQUAL "\t")
                OR ("${__input}" STREQUAL "\"")
                OR ("${__input}" STREQUAL ";")
                OR ("${__input}" STREQUAL "(")
                OR ("${__input}" STREQUAL ")")
                OR ("${__input}" STREQUAL "[")
                OR ("${__input}" STREQUAL "]")
                OR ("${__input}" STREQUAL "'")
                OR ("${__input}" STREQUAL "`"))
            # Delimiter: unconsume the char and return to NUL
            set(${ctx}_st NUL)
            set(__result DATUM)
            set(__start ${${ctx}_tkns})
            set(__end ${${ctx}_cur})
        else()
            if(("${${ctx}_acc}" STREQUAL "#") AND ("${__input}" STREQUAL "|"))
                # "#|" at the start of a token: enter block comment, depth 1
                set(${ctx}_bcdep x)
                set(${ctx}_st BCM)
            endif()
            # ACC only ever holds the first char of the token.
            set(${ctx}_acc)
            set(__result AGAIN)
            set(${ctx}_cur ${__next})
        endif()
    elseif("${__st}" STREQUAL "STR")
        if(${ctx}_acc)
            # Previous char was a backslash: ignore this char
            set(${ctx}_acc)
            set(__result AGAIN)
            set(${ctx}_cur ${__next})
        elseif("${__input}" STREQUAL "\"")
            # Closing quote: emit the text between the quotes
            set(__start ${${ctx}_tkns})
            set(__end ${${ctx}_cur})
            set(${ctx}_cur ${__next})
            set(__result STRING)
            set(${ctx}_st NUL)
        elseif("${__input}" STREQUAL "\\")
            # Backslash: flag the next char as escaped
            set(${ctx}_cur ${__next})
            set(${ctx}_acc ON)
            set(__result AGAIN)
        else()
            set(${ctx}_cur ${__next})
            set(__result AGAIN)
        endif()
    elseif("${__st}" STREQUAL "LCM")
        set(__result AGAIN)
        if(${ctx}_acc)
            # Previous char was CR, so the comment line is over. Only swallow
            # the current char when it is the LF of a CRLF pair; after a lone
            # CR it already belongs to the next line.
            set(${ctx}_acc)
            set(${ctx}_st NUL)
            if("${__input}" STREQUAL "\n")
                set(${ctx}_cur ${__next})
            endif()
        else()
            if("${__input}" STREQUAL "\r")
                set(${ctx}_acc x)
            elseif("${__input}" STREQUAL "\n")
                set(${ctx}_st NUL)
            endif()
            set(${ctx}_cur ${__next})
        endif()
    elseif("${__st}" STREQUAL "BCM")
        # <ctx>_bcdep holds one "x" per open block comment; ACC holds the
        # previous char while it may be the first half of "#|" or "|#".
        if(("${${ctx}_acc}" STREQUAL "#") AND ("${__input}" STREQUAL "|"))
            # Nested "#|"
            set(${ctx}_acc)
            list(APPEND ${ctx}_bcdep x)
        elseif(("${${ctx}_acc}" STREQUAL "|") AND ("${__input}" STREQUAL "#"))
            # Closing "|#"
            set(${ctx}_acc)
            list(REMOVE_AT ${ctx}_bcdep 0)
        else()
            # No pair completed. Remember the current char: it may itself
            # start a pair (e.g. the second "|" in "||#").
            set(${ctx}_acc "${__input}")
        endif()
        if(NOT ${ctx}_bcdep)
            # Outermost comment closed
            set(${ctx}_st NUL)
        endif()
        set(${ctx}_cur ${__next})
        set(__result AGAIN)
    else()
        message(FATAL_ERROR "Invalid state: ${__st}")
    endif()

    set(${out_result} ${__result})
    set(${out_start} ${__start})
    set(${out_end} ${__end})
endmacro()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment