-
-
Save okuoku/7fd831c53b9bf7e970fad5bbb4301985 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# YuniSexpTokenize:
#
#  yuni_sexp_tokenize_ctx_start(<CTX> str)
#  yuni_sexp_tokenize_ctx_next(<CTX> out_result out_start out_end)
#    result: Type of the token
#      === Specials ===
#        ( )  -- paren
#        [ ]  -- paren
#        #    -- is for vector (*)
#        #vu8 -- is for bytevector (*)
#        #u8  -- is for bytevector (*)
#        '    -- is for quote
#        `    -- is for quasiquote
#        ,    -- is for unquote
#        ,@   -- is for unquote-splicing
#        #f   -- is for false (*)
#        #t   -- is for true (*)
#        #;   -- next-datum-comment
#
#        (*) -- Returned as DATUM
#
#
#      === Datum types ===
#        STRING
#        DATUM
#
#      === Command ===
#        AGAIN
#        EOF
#
#      (We don't have a COMMENT type for now)
#  internal:
#
#    <ctx>_cur   -- Current position in buffer
#    <ctx>_buf   -- Stream buffer
#    <ctx>_tkns  -- Token start location
#    <ctx>_bcdep -- Depth for block-comment
#    <ctx>_acc   -- Accumulator for the state
#    <ctx>_st    -- Current state
#      NUL -- Space or BoS
#      NRM -- Normal object state
#      STR -- String state
#      BCM -- Block comment state
#      LCM -- Line comment state
#
# Placeholder characters used while tokenizing.
#
# yuni_sexp_tokenize_preprocess() substitutes these single control characters
# for characters that would otherwise confuse the token regex or CMake itself:
#
#   _yuni_sexp_esc_dq  (ASCII 1) -- stands in for a double quote that is NOT a
#                                   string delimiter (`\"` inside a string, or
#                                   the `#\"` character constant).
#   _yuni_sexp_esc_sem (ASCII 2) -- stands in for `;`, which CMake treats as the
#                                   list separator and which would therefore
#                                   split a token when tokens are stored in a
#                                   list.
#
# yuni_sexp_token_unescape() converts both placeholders back.
string(ASCII 1 _yuni_sexp_esc_dq)
string(ASCII 2 _yuni_sexp_esc_sem)
# yuni_sexp_tokenize(<out> <str>)
#
# Split S-expression source text into a CMake list of raw tokens.
#
#   out -- name of the variable (in the caller's scope) that receives the list.
#   str -- NAME of the variable that holds the source text. The text is passed
#          by name, the same convention yuni_sexp_tokenize_preprocess() uses.
#          The variable must not itself be called `out` or `str`, because those
#          names are shadowed by this function's own parameters.
#
# The text is first run through yuni_sexp_tokenize_preprocess(), so tokens may
# still contain the placeholder characters; pass a token through
# yuni_sexp_token_unescape() to recover the original `;` / `"`.
#
# Runs of whitespace are returned as tokens too. Only round parentheses are
# split off as separate tokens by the expression below.
function(yuni_sexp_tokenize out str)
  if(NOT DEFINED "${str}")
    message(FATAL_ERROR
      "yuni_sexp_tokenize: `${str}' is not a defined variable "
      "(pass the NAME of the variable holding the source text)")
  endif()

  # Copy into a prefixed local so the name handed to the preprocessor cannot
  # collide with any of the preprocessor's own local variables.
  set(_yuni_sexp_tokenize_src "${${str}}")
  yuni_sexp_tokenize_preprocess(_yuni_sexp_tokenize_prep _yuni_sexp_tokenize_src)

  # NB: a literal backslash in the regex needs FOUR backslashes here: the
  #     quoted argument halves them, and the regex engine halves them again.
  #     `#\<name>` is tried before `#\<any single char>` so that named
  #     character constants stay in one token.
  string(REGEX MATCHALL
    "\"[^\"]*\"|#vu8|#u8|#t|#f|#\\\\[a-zA-Z0-9]+|#\\\\.|#|,@|,|[()`']|[^ \r\n\t()`']+|[ \r\n\t]+"
    _yuni_sexp_tokenize_tokens
    "${_yuni_sexp_tokenize_prep}")
  set(${out} "${_yuni_sexp_tokenize_tokens}" PARENT_SCOPE)
endfunction()
# yuni_sexp_token_unescape(<out> <str>)
#
# Turn the placeholder characters in a token back into the characters they
# stand for: _yuni_sexp_esc_sem becomes `;` and _yuni_sexp_esc_dq becomes `"`.
#
#   out -- name of the variable (in the caller's scope) that receives the text.
#   str -- the token text itself (a value, not a variable name).
function(yuni_sexp_token_unescape out str)
  # Both placeholders are single literal characters, so a plain substring
  # replacement is enough.
  string(REPLACE "${_yuni_sexp_esc_sem}" ";" restored "${str}")
  string(REPLACE "${_yuni_sexp_esc_dq}" "\"" restored "${restored}")
  set(${out} "${restored}" PARENT_SCOPE)
endfunction()
# yuni_sexp_tokenize_preprocess(<out> <str>)
#
# Strip comments from S-expression source text and neutralise the characters
# that would stop a single regular expression from matching whole tokens.
#
#   out -- name of the variable (in the caller's scope) that receives the
#          processed text.
#   str -- NAME of the variable that holds the source text (passed by name).
#          The variable must not itself be called `out` or `str`, because those
#          names are shadowed by this function's own parameters.
#
# What is done to the text:
#   * `; ...` line comments are removed up to, but not including, the line
#     terminator, so the comment still separates the neighbouring tokens.
#   * `#| ... |#` block comments (which may nest) are replaced by one space.
#   * Inside strings, `\"` becomes _yuni_sexp_esc_dq; every other backslash
#     escape is copied through unchanged.
#   * The character constants `#\;` and `#\"` become `#\` followed by
#     _yuni_sexp_esc_sem / _yuni_sexp_esc_dq.
#   * Every remaining `;` (these can only be inside strings) becomes
#     _yuni_sexp_esc_sem.
#
# Raises FATAL_ERROR for an unterminated block comment ("Open BCM") or an
# unterminated string ("Open STR").
#
# States: NUL = outside strings and comments, BCM = inside a block comment,
#         STR = inside a string, TERM = input exhausted.
function(yuni_sexp_tokenize_preprocess out str)
  # Read the input before touching any other local, so that an input variable
  # that happens to share a name with one of them is not clobbered first.
  set(q "${${str}}")
  set(acc "")
  set(depth 0)
  set(st NUL)

  # Every branch below consumes at least one character of `q`, so the loop
  # always terminates.
  while(NOT "${st}" STREQUAL "TERM")
    if("${st}" STREQUAL "NUL")
      if("${q}" MATCHES "^([^;#\"]+)(.*)")
        # Plain text up to the next character that needs a decision.
        set(acc "${acc}${CMAKE_MATCH_1}")
        set(q "${CMAKE_MATCH_2}")
      elseif("${q}" MATCHES "^#\\\\;(.*)")
        # Character constant for semicolon.
        set(acc "${acc}#\\${_yuni_sexp_esc_sem}")
        set(q "${CMAKE_MATCH_1}")
      elseif("${q}" MATCHES "^#\\\\\"(.*)")
        # Character constant for double quote.
        set(acc "${acc}#\\${_yuni_sexp_esc_dq}")
        set(q "${CMAKE_MATCH_1}")
      elseif("${q}" MATCHES "^(#\\\\.?)(.*)")
        # Any other character constant. The character is consumed together
        # with the `#\` prefix so that e.g. `#\#` or `#\|` cannot be mistaken
        # for the start of a block comment.
        set(acc "${acc}${CMAKE_MATCH_1}")
        set(q "${CMAKE_MATCH_2}")
      elseif("${q}" MATCHES "^#\\|(.*)")
        # Block comment opens.
        set(q "${CMAKE_MATCH_1}")
        set(depth 1)
        set(st BCM)
      elseif("${q}" MATCHES "^#(.*)")
        # Any other use of `#` (#t, #f, #(, #vu8, ...) is ordinary text.
        set(acc "${acc}#")
        set(q "${CMAKE_MATCH_1}")
      elseif("${q}" MATCHES "^;[^\r\n]*(.*)")
        # Line comment: drop it, keep the line terminator (if any).
        set(q "${CMAKE_MATCH_1}")
      elseif("${q}" MATCHES "^\"(.*)")
        # String opens.
        set(acc "${acc}\"")
        set(q "${CMAKE_MATCH_1}")
        set(st STR)
      else()
        # Nothing left.
        set(st TERM)
      endif()
    elseif("${st}" STREQUAL "BCM")
      # Skip to the first `#` or `|`. The alternatives are tried in order, so
      # the two-character delimiters win over a lone `#` or `|`.
      if("${q}" MATCHES "^[^#|]*(#\\||\\|#|#|\\|)(.*)")
        set(delim "${CMAKE_MATCH_1}")
        set(q "${CMAKE_MATCH_2}")
        if("${delim}" STREQUAL "#|")
          math(EXPR depth "${depth} + 1")
        elseif("${delim}" STREQUAL "|#")
          math(EXPR depth "${depth} - 1")
          if(depth EQUAL 0)
            # A comment counts as whitespace between tokens.
            set(acc "${acc} ")
            set(st NUL)
          endif()
        endif()
        # A lone `#` or `|` is just comment text.
      else()
        message(FATAL_ERROR "Open BCM: ${q}")
      endif()
    elseif("${st}" STREQUAL "STR")
      if("${q}" MATCHES "^([^\"\\\\]+)(.*)")
        # Ordinary string content.
        set(acc "${acc}${CMAKE_MATCH_1}")
        set(q "${CMAKE_MATCH_2}")
      elseif("${q}" MATCHES "^\\\\\"(.*)")
        # Escaped double quote: replaced by the placeholder so that the token
        # regex does not see it as the end of the string.
        set(acc "${acc}${_yuni_sexp_esc_dq}")
        set(q "${CMAKE_MATCH_1}")
      elseif("${q}" MATCHES "^(\\\\.)(.*)")
        # Any other escape pair (including `\\`) is copied through verbatim,
        # so the character after the backslash is never examined again.
        set(acc "${acc}${CMAKE_MATCH_1}")
        set(q "${CMAKE_MATCH_2}")
      elseif("${q}" MATCHES "^\"(.*)")
        # String closes.
        set(acc "${acc}\"")
        set(q "${CMAKE_MATCH_1}")
        set(st NUL)
      else()
        message(FATAL_ERROR "Open STR: ${q}")
      endif()
    endif()
  endwhile()

  # Semicolons that survived are string content; hide them from CMake lists.
  string(REPLACE ";" "${_yuni_sexp_esc_sem}" escstr "${acc}")
  set(${out} "${escstr}" PARENT_SCOPE)
endfunction()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment