Skip to content

Instantly share code, notes, and snippets.

@martialboniou
Created February 11, 2012 18:46
Show Gist options
  • Save martialboniou/1803509 to your computer and use it in GitHub Desktop.
Save martialboniou/1803509 to your computer and use it in GitHub Desktop.
utf8 encoding read-file functions for shen
\* functions to read UTF-8 files & sequences *\
(synonyms bytelist (list number))
(define nlist->utf8
{ (list number) --> bytelist }
[] -> []
X -> (nlist->utf8- X []))
(define nlist->utf8-
{ (list number) --> bytelist --> bytelist }
[] Acc -> (reverse Acc)
[ X | XS ] Acc -> (let Byte1 (+ 192 (div X 64))
Byte2 (+ 128 (mod X 64))
(nlist->utf8- XS [ Byte2 Byte1 | Acc])) where (and (> X 127)
(< X 2048))
[ X | XS ] Acc -> (let Byte1 (+ 224 (div X 4096))
Byte2 (+ 128 (mod (div X 64) 64))
Byte3 (+ 128 (mod X 64))
(nlist->utf8- XS [ Byte3 Byte2 Byte1 | Acc])) where (and (> X 2047)
(< X 65536))
[ X | XS ] Acc -> (let Byte1 (+ 240 (div X 262144))
Byte2 (+ 128 (mod (div X 4096) 64))
Byte3 (+ 128 (mod (div X 64) 64))
Byte4 (+ 128 (mod X 64))
(nlist->utf8- XS [ Byte4 Byte3 Byte2 Byte1 | Acc])) where (and (> X 65535)
(< X 2097152))
[ X | XS ] Acc -> (let Byte1 (+ 248 (div X 16777216))
Byte2 (+ 128 (mod (div X 262144) 64))
Byte3 (+ 128 (mod (div X 4096) 64))
Byte4 (+ 128 (mod (div X 64) 64))
Byte5 (+ 128 (mod X 64))
(nlist->utf8- XS [ Byte5 Byte4 Byte3 Byte2 Byte1 | Acc])) where (and (> X 2097151)
(< X 67108864))
[ X | XS ] Acc -> (let Byte1 (+ 252 (div X 1073741824))
Byte2 (+ 128 (mod (div X 16777216) 64))
Byte3 (+ 128 (mod (div X 262144) 64))
Byte4 (+ 128 (mod (div X 4096) 64))
Byte5 (+ 128 (mod (div X 64) 64))
Byte6 (+ 128 (mod X 64))
(nlist->utf8- XS [ Byte6 Byte5 Byte4 Byte3 Byte2 Byte1 | Acc])) where (and (> X 67108863)
(< X 2147483648))
[ X | XS ] Acc -> (nlist->utf8- XS [ X | Acc ]))
(define utf8->nlist
{ bytelist --> (list number) }
[] -> []
X -> (utf8->nlist- X []))
(define utf8->nlist-
\* convert to UCS-4 *\
{ bytelist --> (list number) --> (list number) }
[] Acc -> (reverse Acc)
[ X | XS ] Acc -> (utf8->nlist- XS Acc) where (or (= X 254)
(= X 255)) \* wrong case *\
[ X Y Z W V U | XS] Acc -> (let UD (+ (* (- X 252) 1073741824)
(* (- Y 128) 16777216)
(* (- Z 128) 262144)
(* (- W 128) 4096)
(* (- V 128) 64) (- U 128))
(utf8->nlist- XS [ UD | Acc])) where (or (= X 252) (= X 253))
[ X Y Z W V | XS ] Acc -> (let UD (+ (* (- X 248) 16777216)
(* (- Y 128) 262144)
(* (- Z 128) 4096)
(* (- W 128) 64) (- V 128))
(utf8->nlist- XS [ UD | Acc])) where (and (> X 247) (< X 252))
[ X Y Z W | XS ] Acc -> (let UD (+ (* (- X 240) 262144)
(* (- Y 128) 4096)
(* (- Z 128) 64) (- W 128))
(utf8->nlist- XS [ UD | Acc])) where (and (> X 239) (< X 248))
[ X Y Z | XS ] Acc -> (let UD (+ (* (- X 224) 4096)
(* (- Y 128) 64) (- Z 128))
(utf8->nlist- XS [ UD | Acc ])) where (and (> X 223) (< X 240))
[ X Y | XS ] Acc -> (let UD (+ (* (- X 192) 64) (- Y 128))
(utf8->nlist- XS [ UD | Acc ])) where (and (> X 191) (< X 224))
[ X | XS ] Acc -> (utf8->nlist- XS [ X | Acc ]))
(define read-file-as-utf8-bytelist
{ string --> (list number) }
File -> (let Contents (read-file-as-bytelist File)
(if (empty? Contents)
(error "empty content")
(utf8->nlist Contents))))
(define read-file-as-utf8-string
{ string --> string }
File -> (let Contents (read-file-as-utf8-bytelist File)
(nlist->string Contents)))
\* utilities for bytelist conversion *\
(define nlist->string
{ (list number) --> string }
N -> (nlist->string- N []))
(define nlist->string-
{ (list number) --> (list string) --> string }
[] [] -> ""
[] StringList -> (if (empty? (tail StringList))
(head StringList)
(let Revert (reverse StringList)
(foldl cn (head Revert) (tail Revert))))
[ N | NS ] StringList -> (nlist->string- NS [(n->string N) | StringList]))
\* utilities for sequence manipulation *\
(define foldl
{(A --> B --> A) --> A --> (list B) --> A}
Fn A [B] -> (Fn A B)
Fn A [B|BS] -> (foldl Fn (Fn A B) BS))
\* floor / div / mod *\
\* REMARK: floor is not defined in my version of Shen 3.0 *\
(define floor
{ number --> number }
0 -> 0
N -> (floor-string- (explode (str N)) []))
(define floor-string-
{ (list string) --> (list string) --> number }
[] [] -> (error "not a number")
[] Str -> (let Result (reverse Str)
TResult (tail Result)
ResultString (if (empty? TResult)
(head Result)
(foldl cn (head Result) TResult))
(integer-string->num ResultString))
["." | Ss] Str -> (floor-string- [] Str)
[S | Ss] Str -> (floor-string- Ss [S | Str]))
(define integer-string->num
{ string --> number }
(@s "-" Ss) -> (- 0 (integer-string->num Ss))
Str -> (integer-string->num- (explode Str) []))
(define integer-string->num-
{ (list string) --> (list number) --> number }
[] Acc -> (list-number->num- Acc 1 0)
["1" | Y] Acc -> (integer-string->num- Y [1 | Acc])
["2" | Y] Acc -> (integer-string->num- Y [2 | Acc])
["3" | Y] Acc -> (integer-string->num- Y [3 | Acc])
["4" | Y] Acc -> (integer-string->num- Y [4 | Acc])
["5" | Y] Acc -> (integer-string->num- Y [5 | Acc])
["6" | Y] Acc -> (integer-string->num- Y [6 | Acc])
["7" | Y] Acc -> (integer-string->num- Y [7 | Acc])
["8" | Y] Acc -> (integer-string->num- Y [8 | Acc])
["9" | Y] Acc -> (integer-string->num- Y [9 | Acc])
["0" | Y] Acc -> (integer-string->num- Y [0 | Acc])
_ Acc -> (error "unknown symbol in an integer-string"))
(define list-number->num-
{ (list number) --> number --> number --> number }
\* The array of number must be reversed *\
[] _ Acc -> Acc
[X | XS ] Range Acc -> (list-number->num- XS (* Range 10) (+ (* X Range) Acc)))
(define mod
{ number --> number --> number }
X 0 -> (error "division by zero")
X Y -> (- X (* Y (div X Y))))
(define div
{ number --> number --> number }
X 0 -> (error "division by zero")
X Y -> (floor (/ X Y)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment