Last active
January 19, 2022 11:34
-
-
Save bakpakin/750b4d782eb0818b7ea0699c13c791e7 to your computer and use it in GitHub Desktop.
Use pegs to parse utf8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###
### utf8.janet
###
### Pure Janet UTF-8 utilities. For production use, a C implementation is
### probably the better choice.
###
(defn utf8-encode
  ``Convert a sequence of Unicode codepoints to a UTF-8 encoded string.

  `x` is any iterable of integer codepoints. Returns a new string; an empty
  sequence yields the empty string.

  Raises an error of the form "codepoint N is invalid" when a codepoint is
  negative, lies in the surrogate range U+D800..U+DFFF, or is greater than
  U+10FFFF, since none of those have a well-formed UTF-8 encoding.``
  [x]
  (def buf @"")
  (each b x
    (cond
      # Negative values would otherwise fall into the 1-byte branch and be
      # truncated to an arbitrary byte; surrogates would be emitted as an
      # ill-formed 3-byte sequence. Reject both up front.
      (or (< b 0) (<= 0xD800 b 0xDFFF))
      (error (string "codepoint " b " is invalid"))
      # 1 byte: 0xxxxxxx
      (< b 0x80)
      (buffer/format buf "%c" b)
      # 2 bytes: 110xxxxx 10xxxxxx
      (< b 0x800)
      (buffer/format buf "%c%c"
                     (bor 0xC0 (brshift b 6))
                     (bor 0x80 (band b 0x3F)))
      # 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      (< b 0x10000)
      (buffer/format buf "%c%c%c"
                     (bor 0xE0 (brshift b 12))
                     (bor 0x80 (band (brshift b 6) 0x3F))
                     (bor 0x80 (band b 0x3F)))
      # 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      (< b 0x110000)
      (buffer/format buf "%c%c%c%c"
                     (bor 0xF0 (brshift b 18))
                     (bor 0x80 (band (brshift b 12) 0x3F))
                     (bor 0x80 (band (brshift b 6) 0x3F))
                     (bor 0x80 (band b 0x3F)))
      # Anything else is above U+10FFFF (or not a usable number).
      (error (string "codepoint " b " is invalid"))))
  (string buf))
# Compiled PEG that decodes a UTF-8 byte string into its codepoints.
#
# Matching with this grammar yields one integer capture per encoded character.
# Only well-formed UTF-8 is accepted: overlong encodings, encoded surrogates
# (U+D800..U+DFFF) and values above U+10FFFF do not match any variant, so the
# grammar raises a match error at the offending position instead of producing
# a bogus codepoint.
(def utf8-peg
  (peg/compile
    ~{# Continuation byte (10xxxxxx)
      :extra '(range "\x80\xBF")
      # 1 byte variant (0xxxxxxx)
      :1byte (/ '(range "\x00\x7F") ,first)
      # 2 byte variant (110xxxxx 10xxxxxx).
      # Leads 0xC0 and 0xC1 could only form overlong encodings of ASCII.
      :2byte (/ (* '(range "\xC2\xDF") :extra)
                ,(fn [[x] [y]]
                   (+ (* 0x40 (band x 0x1F))
                      (band y 0x3F))))
      # 3 byte variant (1110xxxx 10xxxxxx 10xxxxxx).
      # After 0xE0 the second byte must be >= 0xA0 (no overlong forms);
      # after 0xED it must be <= 0x9F (no surrogates).
      :3byte (/ (+ (* '"\xE0" '(range "\xA0\xBF") :extra)
                   (* '"\xED" '(range "\x80\x9F") :extra)
                   (* '(range "\xE1\xEC" "\xEE\xEF") :extra :extra))
                ,(fn [[x] [y] [z]]
                   (+ (* 0x1000 (band x 0x0F))
                      (* 0x40 (band y 0x3F))
                      (band z 0x3F))))
      # 4 byte variant (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx).
      # After 0xF0 the second byte must be >= 0x90 (no overlong forms);
      # after 0xF4 it must be <= 0x8F (nothing above U+10FFFF);
      # leads 0xF5..0xF7 are always out of range.
      :4byte (/ (+ (* '"\xF0" '(range "\x90\xBF") :extra :extra)
                   (* '"\xF4" '(range "\x80\x8F") :extra :extra)
                   (* '(range "\xF1\xF3") :extra :extra :extra))
                ,(fn [[x] [y] [z] [w]]
                   (+ (* 0x40000 (band x 0x07))
                      (* 0x1000 (band y 0x3F))
                      (* 0x40 (band z 0x3F))
                      (band w 0x3F))))
      # Any sequence of variants, then end of input; anything left over is
      # malformed and raises a match error.
      :main (* (any (+ :1byte :2byte :3byte :4byte))
               (+ -1 (error "")))}))
(defn utf8-decode
  ``Decode the UTF-8 byte sequence `x` by matching it against `utf8-peg`.

  Returns whatever `peg/match` produces for that grammar, i.e. the captured
  codepoints in input order. Any error raised by the grammar propagates to
  the caller.``
  [x]
  (->> x (peg/match utf8-peg)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment