Created
June 3, 2018 19:12
-
-
Save okuoku/6a1a4ea859735cdc19779db3314bb63d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# YuniSexpTokenize: | |
# | |
# yuni_sexp_tokenize_ctx_start(<CTX> str) | |
# yuni_sexp_tokenize_ctx_next(<CTX> out_result out_start out_end) | |
# result: Type of the token | |
# === Specials === | |
# ( ) -- paren | |
# [ ] -- paren | |
# # -- is for vector (*) | |
# #vu8 -- is for bytevector (*) | |
# #u8 -- is for bytevector (*) | |
# ' -- is for quote | |
# ` -- is for quasiquote | |
# , -- is for unquote | |
# ,@ -- is for unquote-splicing | |
# #f -- is for false (*) | |
# #t -- is for true (*) | |
# #; -- next-datum-comment | |
# | |
# (*) -- Returned as DATUM | |
# | |
# | |
# === Datum types === | |
# STRING | |
# DATUM | |
# | |
# === Command === | |
# AGAIN | |
# EOF | |
# | |
# (We don't have COMMENT type for now) | |
# internal:
#
# Context variables:
#
# <ctx>_cur -- Current position in buffer
# <ctx>_buf -- stream buffer
# <ctx>_end -- Length of the stream buffer (position just past the last char)
# <ctx>_tkns -- Token start location
# <ctx>_bcdep -- Depth for block-comment
# <ctx>_acc -- Accumulator for the state
# <ctx>_st -- current state (one of the states listed below)
# <ctx>_lsc -- Line Segment Cache
# <ctx>_dsc -- Delimited Segment Cache
# <ctx>_brc -- Block Comment Region Cache
#
# States (values of <ctx>_st):
#
# NUL -- Space or BoS (beginning of stream)
# NRM -- Normal object state
# STR -- String state
# BCM -- Block comment state
# LCM -- Line comment state
#
#
# yuni_sexp_tokenize_ctx_start(<CTX> str)
#
#   Initialise the tokenizer context <CTX> over the text held in the variable
#   NAMED by `str` (pass the variable name, not its value).
#
#   Sets:
#     <CTX>_buf   -- copy of the input text
#     <CTX>_end   -- length of the input text
#     <CTX>_cur   -- 0
#     <CTX>_tkns  -- -1 (no token started yet)
#     <CTX>_st    -- NUL
#     <CTX>_acc, <CTX>_bcdep -- unset
#
#   This is a macro, so the `__`-prefixed temporaries are visible in the
#   caller's scope after the call.
#
#   Set YUNI_SEXP_TOKENIZE_DEBUG to a true value to get the length sanity
#   trace; it is silent by default so that it does not flood the configure log.
macro(yuni_sexp_tokenize_ctx_start ctx str)
    set(${ctx}_buf "${${str}}")
    string(LENGTH "${${ctx}_buf}" __end)
    set(${ctx}_end "${__end}")
    set(${ctx}_cur 0)
    set(${ctx}_tkns -1)
    set(${ctx}_st NUL)
    set(${ctx}_bcdep)
    set(${ctx}_acc)
    if(YUNI_SEXP_TOKENIZE_DEBUG)
        # Line Segment Cache (work in progress): blank out everything that is
        # not a line terminator and check that the length is preserved. The
        # result is not stored in the context yet, so only compute it when the
        # trace is actually wanted.
        string(REGEX REPLACE "[^\r\n]" " " __lscbuf "${${str}}")
        string(LENGTH "${__lscbuf}" __lsclen)
        message(STATUS "LEN: ${__end} == ${__lsclen}")
    endif()
endmacro()
#
# yuni_sexp_tokenize_ctx_next(<CTX> out_result out_start out_end)
#
#   Fetch the next token from <CTX>. Repeatedly runs the single-step macro
#   yuni_sexp_tokenize_ctx__itr until it reports something other than AGAIN,
#   then leaves that step's values in the caller's variables:
#
#     out_result -- token type as reported by the last step
#     out_start  -- start offset as reported by the last step
#     out_end    -- end offset as reported by the last step
macro(yuni_sexp_tokenize_ctx_next ctx out_result out_start out_end)
    while(1)
        yuni_sexp_tokenize_ctx__itr(${ctx}
            ${out_result} ${out_start} ${out_end})
        # Both operands are quoted so that if() compares the literal text and
        # does not auto-dereference a caller variable that happens to be
        # called AGAIN.
        if(NOT "${${out_result}}" STREQUAL "AGAIN")
            break()
        endif()
    endwhile()
endmacro()
#
# yuni_sexp_tokenize_ctx_token(<CTX> out_result start end)
#
#   Extract the text of a token: copies the characters of <CTX>_buf from
#   offset `start` up to (not including) offset `end` into `out_result`.
#
#   Being a macro, the temporary `__len` stays visible in the caller's scope.
macro(yuni_sexp_tokenize_ctx_token ctx out_result start end)
    # string(SUBSTRING) wants (begin, length), so turn the end offset into a
    # length first.
    math(EXPR __len "${end}-${start}")
    string(SUBSTRING
        "${${ctx}_buf}"
        "${start}"
        "${__len}"
        ${out_result})
endmacro()
#
# yuni_sexp_tokenize_ctx__itr(<CTX> out_result out_start out_end)
#
#   Internal: run ONE step of the tokenizer state machine. Looks at the
#   character at <CTX>_cur (if there is one), updates the <CTX>_* variables
#   and reports through out_result one of:
#
#     AGAIN             -- nothing to report yet, call again
#     EOF               -- the buffer is exhausted
#     DATUM             -- a bare token, span [out_start, out_end)
#     STRING            -- a string, span [out_start, out_end) excludes the
#                          surrounding double quotes
#     ( ) [ ] ' ` , ,@  -- the special itself, span [out_start, out_end)
#
#   out_start / out_end are -9999 when the result is AGAIN or EOF.
#
#   A character that terminates a DATUM (or follows a plain `,`) is NOT
#   consumed; it is examined again on the next step in state NUL.
#
#   This is a macro, so the `__`-prefixed temporaries are visible in the
#   caller's scope after the call.
#
#   Set YUNI_SEXP_TOKENIZE_DEBUG to a true value to get a per-step trace; it
#   is silent by default because it would print one line per input character.
macro(yuni_sexp_tokenize_ctx__itr ctx out_result out_start out_end)
    # __input: Calc an input char
    math(EXPR __next "${${ctx}_cur}+1")
    set(__input)
    set(__has_input)
    if(NOT "${${ctx}_cur}" EQUAL "${${ctx}_end}")
        string(SUBSTRING "${${ctx}_buf}" "${${ctx}_cur}" 1 __input)
        set(__has_input ON)
    endif()

    set(__st "${${ctx}_st}")
    set(__result UNKNOWN)
    set(__start -9999)
    set(__end -9999)

    if(YUNI_SEXP_TOKENIZE_DEBUG)
        message(STATUS "${__st} ${${ctx}_cur} ${__next} ${${ctx}_end} INPUT[${__input}]")
    endif()

    if(NOT __has_input)
        # End of buffer.
        if("${__st}" STREQUAL "NRM")
            # Terminate current datum
            set(${ctx}_st NUL)
            if("${${ctx}_acc}" STREQUAL ",")
                # acc still holds the first char only when the token is a
                # single char, so this is a lone trailing unquote.
                set(__result ",")
            else()
                set(__result DATUM)
            endif()
            set(__start "${${ctx}_tkns}")
            set(__end "${${ctx}_cur}")
        else()
            set(__result EOF)
        endif()
    elseif("${__st}" STREQUAL "NUL")
        # NUL: between tokens
        if(("${__input}" STREQUAL " ")
                OR ("${__input}" STREQUAL "\r")
                OR ("${__input}" STREQUAL "\n")
                OR ("${__input}" STREQUAL "\t"))
            # Whitespace. Again.
            set(${ctx}_cur "${__next}")
            set(__result AGAIN)
        elseif("${__input}" STREQUAL "\"")
            # DQUOTE. Begin string; the token starts after the quote.
            set(${ctx}_cur "${__next}")
            set(${ctx}_st STR)
            set(${ctx}_tkns "${__next}")
            set(${ctx}_acc) # Clear ACC (escape flag)
            set(__result AGAIN)
        elseif("${__input}" STREQUAL ";")
            # Semicolon. Begin line-comment
            set(${ctx}_st LCM)
            set(${ctx}_acc)
            set(${ctx}_cur "${__next}")
            set(__result AGAIN)
        elseif(("${__input}" STREQUAL "(")
                OR ("${__input}" STREQUAL ")")
                OR ("${__input}" STREQUAL "[")
                OR ("${__input}" STREQUAL "]")
                OR ("${__input}" STREQUAL "'")
                OR ("${__input}" STREQUAL "`"))
            # Single char datum. Return it immediately.
            # Record the span BEFORE advancing the cursor, otherwise the
            # reported token would be empty (start == end).
            set(__result "${__input}")
            set(__start "${${ctx}_cur}")
            set(__end "${__next}")
            set(${ctx}_tkns "${${ctx}_cur}")
            set(${ctx}_cur "${__next}")
        else()
            # Otherwise. Enter NRM, remembering the first char in acc so the
            # next step can recognise `,` `,@` and `#|`.
            set(${ctx}_tkns "${${ctx}_cur}")
            set(${ctx}_cur "${__next}")
            set(${ctx}_st NRM)
            set(${ctx}_acc "${__input}")
            set(__result AGAIN)
        endif()
    elseif("${__st}" STREQUAL "NRM")
        if("${${ctx}_acc}" STREQUAL ",")
            # The token began with a comma: it is either `,@` or `,`.
            set(${ctx}_st NUL)
            set(__start "${${ctx}_tkns}")
            if("${__input}" STREQUAL "@")
                # Emit unquote-splicing and consume the `@`.
                set(__result ",@")
                set(__end "${__next}")
                set(${ctx}_cur "${__next}")
            else()
                # Emit unquote; the current char is left for the next step.
                set(__result ",")
                set(__end "${${ctx}_cur}")
            endif()
        elseif(("${__input}" STREQUAL " ")
                OR ("${__input}" STREQUAL "\r")
                OR ("${__input}" STREQUAL "\n")
                OR ("${__input}" STREQUAL "\t")
                OR ("${__input}" STREQUAL "\"")
                OR ("${__input}" STREQUAL ";")
                OR ("${__input}" STREQUAL "(")
                OR ("${__input}" STREQUAL ")")
                OR ("${__input}" STREQUAL "[")
                OR ("${__input}" STREQUAL "]")
                OR ("${__input}" STREQUAL "'")
                OR ("${__input}" STREQUAL "`"))
            # Delimiter: unconsume the char and return to NUL
            set(${ctx}_st NUL)
            set(__result DATUM)
            set(__start "${${ctx}_tkns}")
            set(__end "${${ctx}_cur}")
        else()
            if(("${${ctx}_acc}" STREQUAL "#") AND ("${__input}" STREQUAL "|"))
                # `#|` at the start of a token: enter block comment, depth 1
                set(${ctx}_bcdep x)
                set(${ctx}_st BCM)
            endif()
            # acc is only meaningful for the first char of a token.
            set(${ctx}_acc)
            set(__result AGAIN)
            set(${ctx}_cur "${__next}")
        endif()
    elseif("${__st}" STREQUAL "STR")
        if(${ctx}_acc)
            # Previous char was a backslash: ignore this char
            set(${ctx}_acc)
            set(__result AGAIN)
            set(${ctx}_cur "${__next}")
        elseif("${__input}" STREQUAL "\"")
            # Closing quote: the span stops before it.
            set(__start "${${ctx}_tkns}")
            set(__end "${${ctx}_cur}")
            set(${ctx}_cur "${__next}")
            set(__result STRING)
            set(${ctx}_st NUL)
        elseif("${__input}" STREQUAL "\\")
            # Escape: flag that the next char must be skipped
            set(${ctx}_cur "${__next}")
            set(${ctx}_acc ON)
            set(__result AGAIN)
        else()
            set(${ctx}_cur "${__next}")
            set(__result AGAIN)
        endif()
    elseif("${__st}" STREQUAL "LCM")
        if(${ctx}_acc)
            # Previous char was CR: the comment line is over. Only swallow
            # the current char when it completes a CRLF pair; anything else
            # is the first char of the next line and must be tokenized.
            set(${ctx}_acc)
            set(${ctx}_st NUL)
            if("${__input}" STREQUAL "\n")
                set(${ctx}_cur "${__next}")
            endif()
        elseif("${__input}" STREQUAL "\r")
            set(${ctx}_acc x)
            set(${ctx}_cur "${__next}")
        elseif("${__input}" STREQUAL "\n")
            set(${ctx}_st NUL)
            set(${ctx}_cur "${__next}")
        else()
            set(${ctx}_cur "${__next}")
        endif()
        set(__result AGAIN)
    elseif("${__st}" STREQUAL "BCM")
        # acc holds the previous char unless it was just used as half of a
        # `#|` / `|#` pair; <ctx>_bcdep holds one element per nesting level.
        if(("${${ctx}_acc}" STREQUAL "#") AND ("${__input}" STREQUAL "|"))
            # Nested block comment opens
            list(APPEND ${ctx}_bcdep x)
            set(${ctx}_acc)
        elseif(("${${ctx}_acc}" STREQUAL "|") AND ("${__input}" STREQUAL "#"))
            # One block comment level closes
            list(REMOVE_AT ${ctx}_bcdep 0)
            set(${ctx}_acc)
        else()
            # Always remember the latest char so that runs such as `||#`
            # or `##|` are still recognised.
            set(${ctx}_acc "${__input}")
        endif()
        if(NOT ${ctx}_bcdep)
            # Outermost comment closed
            set(${ctx}_st NUL)
        endif()
        set(${ctx}_cur "${__next}")
        set(__result AGAIN)
    else()
        message(FATAL_ERROR "Invalid state: ${__st}")
    endif()

    set(${out_result} "${__result}")
    set(${out_start} "${__start}")
    set(${out_end} "${__end}")
endmacro()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment