Skip to content

Instantly share code, notes, and snippets.

@bakpakin
Last active January 19, 2022 11:34
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bakpakin/750b4d782eb0818b7ea0699c13c791e7 to your computer and use it in GitHub Desktop.
Save bakpakin/750b4d782eb0818b7ea0699c13c791e7 to your computer and use it in GitHub Desktop.
Use pegs to parse utf8
###
### utf8.janet
###
### Pure Janet UTF-8 utilities. For performance-sensitive code, prefer a C implementation.
###
(defn utf8-encode
  "Convert a sequence of codepoints to a UTF-8 encoded string.

  `x` is any iterable of integer codepoints. Each codepoint is emitted as
  1 to 4 bytes depending on its magnitude. Raises an error of the form
  \"codepoint N is invalid\" for values that have no UTF-8 encoding:
  negative numbers, UTF-16 surrogates (0xD800-0xDFFF), and anything
  at or above 0x110000."
  [x]
  (def buf @"")
  (each b x
    (cond
      # Negative values would otherwise satisfy (< b 0x80) and be written as a
      # single raw byte; surrogates are forbidden in UTF-8 by RFC 3629.
      (or (< b 0) (<= 0xD800 b 0xDFFF))
      (error (string "codepoint " b " is invalid"))
      # 1 byte: 0xxxxxxx
      (< b 0x80)
      (buffer/format buf "%c" b)
      # 2 bytes: 110xxxxx 10xxxxxx
      (< b 0x800)
      (buffer/format buf "%c%c"
                     (bor 0xC0 (brshift b 6))
                     (bor 0x80 (band b 0x3F)))
      # 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      (< b 0x10000)
      (buffer/format buf "%c%c%c"
                     (bor 0xE0 (brshift b 12))
                     (bor 0x80 (band (brshift b 6) 0x3F))
                     (bor 0x80 (band b 0x3F)))
      # 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      (< b 0x110000)
      (buffer/format buf "%c%c%c%c"
                     (bor 0xF0 (brshift b 18))
                     (bor 0x80 (band (brshift b 12) 0x3F))
                     (bor 0x80 (band (brshift b 6) 0x3F))
                     (bor 0x80 (band b 0x3F)))
      # Beyond the last Unicode codepoint (0x10FFFF).
      (error (string "codepoint " b " is invalid"))))
  # Return an immutable string rather than the working buffer.
  (string buf))
# Compiled PEG that turns a UTF-8 byte string into a sequence of integer
# codepoints (one capture per character). Only well-formed sequences per
# RFC 3629 are accepted: overlong encodings, encoded surrogates
# (0xD800-0xDFFF) and values above 0x10FFFF fall through to the error branch
# in :main, as does any stray or truncated byte.
(def utf8-peg
  (peg/compile
    ~{# Continuation byte (10xxxxxx), captured as a 1-byte string
      :extra '(range "\x80\xBF")
      # 1 byte variant (0xxxxxxx)
      :1byte (/ '(range "\x00\x7F") ,first)
      # 2 byte variant (110xxxxx 10xxxxxx)
      # Leads 0xC0 and 0xC1 could only encode values below 0x80 (overlong).
      :2byte (/ (* '(range "\xC2\xDF") :extra)
                ,(fn [[x] [y]]
                   (+ (* 0x40 (band x 0x1F))
                      (band y 0x3F))))
      # 3 byte variant (1110xxxx 10xxxxxx 10xxxxxx)
      # The second byte is narrowed after 0xE0 (values below 0x800 would be
      # overlong) and after 0xED (0xA0-0xBF would encode a surrogate).
      :3byte (/ (* (+ (* '"\xE0" '(range "\xA0\xBF"))
                      (* '"\xED" '(range "\x80\x9F"))
                      (* '(range "\xE1\xEC" "\xEE\xEF") :extra))
                   :extra)
                ,(fn [[x] [y] [z]]
                   (+ (* 0x1000 (band x 0x0F))
                      (* 0x40 (band y 0x3F))
                      (band z 0x3F))))
      # 4 byte variant (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
      # The second byte is narrowed after 0xF0 (values below 0x10000 would be
      # overlong) and after 0xF4 (values above 0x10FFFF); leads 0xF5-0xF7
      # are rejected entirely for the same upper bound.
      :4byte (/ (* (+ (* '"\xF0" '(range "\x90\xBF"))
                      (* '"\xF4" '(range "\x80\x8F"))
                      (* '(range "\xF1\xF3") :extra))
                   :extra :extra)
                ,(fn [[x] [y] [z] [w]]
                   (+ (* 0x40000 (band x 0x07))
                      (* 0x1000 (band y 0x3F))
                      (* 0x40 (band z 0x3F))
                      (band w 0x3F))))
      # Any sequence of variants. At end of input -1 matches with zero width,
      # which stops the repetition; anything else unmatched raises an error.
      :main (any (+ :1byte :2byte :3byte :4byte -1 (error "")))}))
(defn utf8-decode
  "Decode the UTF-8 bytes in x into codepoints by matching them against
  utf8-peg, returning the resulting captures (one integer per character).
  An error is raised when the grammar meets bytes it cannot match."
  [x]
  # Start matching at offset 0, i.e. from the beginning of x.
  (peg/match utf8-peg x 0))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment