Skip to content

Instantly share code, notes, and snippets.

@okuoku
Created June 6, 2018 16:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save okuoku/7fd831c53b9bf7e970fad5bbb4301985 to your computer and use it in GitHub Desktop.
Save okuoku/7fd831c53b9bf7e970fad5bbb4301985 to your computer and use it in GitHub Desktop.
#
# YuniSexpTokenize:
#
# yuni_sexp_tokenize_ctx_start(<CTX> str)
# yuni_sexp_tokenize_ctx_next(<CTX> out_result out_start out_end)
# result: Type of the token
# === Specials ===
# ( ) -- paren
# [ ] -- paren
# # -- is for vector (*)
# #vu8 -- is for bytevector (*)
# #u8 -- is for bytevector (*)
# ' -- is for quote
# ` -- is for quasiquote
# , -- is for unquote
# ,@ -- is for unquote-splicing
# #f -- is for false (*)
# #t -- is for true (*)
# #; -- next-datum-comment
#
# (*) -- Returned as DATUM
#
#
# === Datum types ===
# STRING
# DATUM
#
# === Command ===
# AGAIN
# EOF
#
# (There is no COMMENT token type for now; line and block comments are
#  stripped by the preprocess step before tokenizing.)
# internal:
#
# <ctx>_cur -- Current position in buffer
# <ctx>_buf -- stream buffer
# <ctx>_tkns -- Token start location
# <ctx>_bcdep -- Depth for block-comment
# <ctx>_acc -- Accumulator for the state
# <ctx>_st -- current state
# NUL -- Space or BoS
# NRM -- Normal object state
# STR -- String state
# BCM -- Block comment state
# LCM -- Line comment state
#
# Placeholder characters used while the input is being tokenized.
#
# _yuni_sexp_esc_dq  -- the character with ASCII code 1. The preprocess step
#                       writes it in place of an escaped double quote (`\"`
#                       inside a string, or the character literal `#\"`) so
#                       that a string token can be matched as `"[^"]*"`.
# _yuni_sexp_esc_sem -- the character with ASCII code 2. The preprocess step
#                       writes it in place of every `;` that is kept, so the
#                       text can be stored in a CMake list, where `;` is the
#                       element separator.
#
# yuni_sexp_token_unescape() turns both placeholders back into `"` and `;`.
string(ASCII 1 _yuni_sexp_esc_dq)
string(ASCII 2 _yuni_sexp_esc_sem)
#
# yuni_sexp_tokenize(<out> <str>)
#
# Split S-expression source text into a CMake list of raw tokens.
#
#   out -- name of the variable (set in the caller's scope) that receives
#          the token list
#   str -- NAME of the variable that holds the source text. It is handed on
#          by name to yuni_sexp_tokenize_preprocess(), which dereferences it.
#          The variable must not be called `out` or `str`, because those
#          names are shadowed by this function's own parameters.
#
# The text is first run through yuni_sexp_tokenize_preprocess(), so the
# returned tokens still contain the placeholder characters
# _yuni_sexp_esc_dq / _yuni_sexp_esc_sem; pass a token through
# yuni_sexp_token_unescape() to get its real text. Runs of whitespace are
# returned as tokens as well.
#
function(yuni_sexp_tokenize out str)
  # Forward the caller-supplied variable name. (Passing a fixed name here
  # would make the function depend on a same-named variable existing in the
  # caller's scope and ignore the `str` argument entirely.)
  yuni_sexp_tokenize_preprocess(prep "${str}")
  # Alternatives are tried left to right, so the multi-character `#...` and
  # `,@` forms must come before the bare `#` and `,`.
  # A literal backslash needs four backslashes here: CMake string escaping
  # halves them, and the regex engine halves them again. Character literals
  # are `#\` plus a run of lower-case letters (e.g. `#\space`) or `#\` plus
  # any single character (e.g. `#\(`); other multi-character forms such as
  # `#\x41` are not kept together as one token.
  string(REGEX MATCHALL
    "\"[^\"]*\"|#vu8|#u8|#t|#f|#\\\\[a-z]+|#\\\\.|#|,@|,|[()`']|[^ \r\n\t()`']+|[ \r\n\t]+"
    lis
    "${prep}")
  set(${out} "${lis}" PARENT_SCOPE)
endfunction()
#
# yuni_sexp_token_unescape(<out> <str>)
#
# Restore the real characters in a token.
#
#   out -- name of the variable (set in the caller's scope) that receives
#          the restored text
#   str -- the token text itself (passed by value)
#
# Every _yuni_sexp_esc_dq placeholder becomes a double quote and every
# _yuni_sexp_esc_sem placeholder becomes a semicolon. The two placeholders
# are distinct characters, so the order of the two substitutions does not
# matter.
#
function(yuni_sexp_token_unescape out str)
  string(REGEX REPLACE "${_yuni_sexp_esc_dq}" "\"" restored "${str}")
  string(REGEX REPLACE "${_yuni_sexp_esc_sem}" ";" restored "${restored}")
  set(${out} "${restored}" PARENT_SCOPE)
endfunction()
#
# yuni_sexp_tokenize_preprocess(<out> <str>)
#
# Rewrite S-expression source text so that every token can afterwards be
# matched with a single regular expression.
#
#   out -- name of the variable (set in the caller's scope) that receives
#          the rewritten text
#   str -- NAME of the variable that holds the source text. It must not be
#          called `out` or `str` (shadowed by the parameters).
#
# What is rewritten:
#   * `; ...` line comments are removed up to, but not including, the line
#     break, so the tokens on both sides stay separated.
#   * `#| ... |#` block comments (which may nest) are removed.
#   * `\"` inside a string becomes _yuni_sexp_esc_dq (the backslash is
#     dropped); every other backslash escape in a string is copied as is.
#   * the character literals `#\;` and `#\"` become `#\` followed by the
#     matching placeholder.
#   * finally every remaining `;` becomes _yuni_sexp_esc_sem.
#
# Stops with FATAL_ERROR when the input ends inside a block comment
# ("Open BCM") or inside a string ("Open STR").
#
# Internal state (`st`): NUL = outside string/comment, BCM = in a block
# comment, STR = in a string, TERM = input consumed.
#
function(yuni_sexp_tokenize_preprocess out str)
  # Read the input before creating any local, so that an input variable
  # that happens to share a name with a local is still read correctly.
  set(q "${${str}}")
  set(st NUL)
  set(acc "")
  set(bcdep "")
  while(NOT "${st}" STREQUAL "TERM")
    if("${st}" STREQUAL "NUL")
      # The `[^;#\"]*` prefix can never run across a `#`, so each of the
      # first four patterns only fires when the FIRST `#` in sight starts
      # the construct in question.
      if("${q}" MATCHES "^([^;#\"]*)#\\|(.*)")
        # BCM: block comment opens
        set(acc "${acc}${CMAKE_MATCH_1}")
        set(q "${CMAKE_MATCH_2}")
        set(bcdep x)
        set(st BCM)
      elseif("${q}" MATCHES "^([^;#\"]*)#\\\\;(.*)")
        # Character literal `#\;`
        set(acc "${acc}${CMAKE_MATCH_1}#\\${_yuni_sexp_esc_sem}")
        set(q "${CMAKE_MATCH_2}")
      elseif("${q}" MATCHES "^([^;#\"]*)#\\\\\"(.*)")
        # Character literal `#\"`
        set(acc "${acc}${CMAKE_MATCH_1}#\\${_yuni_sexp_esc_dq}")
        set(q "${CMAKE_MATCH_2}")
      elseif("${q}" MATCHES "^([^;#\"]*#\\\\.?)(.*)")
        # Any other character literal: copy `#\` together with the
        # character that follows, so that character is never mistaken for
        # the start of a comment or another `#` form.
        set(acc "${acc}${CMAKE_MATCH_1}")
        set(q "${CMAKE_MATCH_2}")
      elseif("${q}" MATCHES "^([^;#\"]+)(.*)")
        # Plain text up to the next character that needs a decision
        set(acc "${acc}${CMAKE_MATCH_1}")
        set(q "${CMAKE_MATCH_2}")
      elseif("${q}" MATCHES "^;[^\r\n]*(.*)")
        # LCM: line comment. Also ends at end of input; the line break
        # itself stays in the text.
        set(q "${CMAKE_MATCH_1}")
      elseif("${q}" MATCHES "^\"(.*)")
        # STR: string opens
        set(acc "${acc}\"")
        set(q "${CMAKE_MATCH_1}")
        set(st STR)
      elseif("${q}" MATCHES "^#(.*)")
        # A `#` that starts neither `#\` nor `#|` (e.g. `#t`, `#(`, `#vu8`)
        set(acc "${acc}#")
        set(q "${CMAKE_MATCH_1}")
      else()
        # Nothing left to consume
        set(acc "${acc}${q}")
        set(st TERM)
      endif()
    elseif("${st}" STREQUAL "BCM")
      # Look at one marker at a time so that runs such as `||#` or `##|`
      # are handled.
      if("${q}" MATCHES "^[^#|]*#\\|(.*)")
        # Nested block comment opens
        set(q "${CMAKE_MATCH_1}")
        list(APPEND bcdep x)
      elseif("${q}" MATCHES "^[^#|]*\\|#(.*)")
        # One nesting level closes
        set(q "${CMAKE_MATCH_1}")
        list(REMOVE_AT bcdep 0)
        if(NOT bcdep)
          set(st NUL)
        endif()
      elseif("${q}" MATCHES "^[^#|]*[#|](.*)")
        # Lone `#` or `|` inside the comment: skip it
        set(q "${CMAKE_MATCH_1}")
      else()
        message(FATAL_ERROR "Open BCM: ${q}")
      endif()
    elseif("${st}" STREQUAL "STR")
      if("${q}" MATCHES "^([^\"\\\\]*)\\\\\"(.*)")
        # Escaped double quote: drop the backslash, store the placeholder
        set(acc "${acc}${CMAKE_MATCH_1}${_yuni_sexp_esc_dq}")
        set(q "${CMAKE_MATCH_2}")
      elseif("${q}" MATCHES "^([^\"\\\\]*\\\\.)(.*)")
        # Any other backslash escape: copy both characters, so that the
        # second one is never read as part of a later `\"`.
        set(acc "${acc}${CMAKE_MATCH_1}")
        set(q "${CMAKE_MATCH_2}")
      elseif("${q}" MATCHES "^([^\"]*\")(.*)")
        # Closing double quote
        set(acc "${acc}${CMAKE_MATCH_1}")
        set(q "${CMAKE_MATCH_2}")
        set(st NUL)
      else()
        message(FATAL_ERROR "Open STR: ${q}")
      endif()
    endif()
  endwhile()
  # `;` separates CMake list elements, so hide every one that is left.
  string(REPLACE ";" "${_yuni_sexp_esc_sem}" escstr "${acc}")
  set(${out} "${escstr}" PARENT_SCOPE)
endfunction()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment