Created
February 11, 2012 18:46
-
-
Save martialboniou/1803509 to your computer and use it in GitHub Desktop.
utf8 encoding read-file functions for shen
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
\* functions to read UTF-8 files & sequences *\ | |
(synonyms bytelist (list number)) | |
(define nlist->utf8 | |
{ (list number) --> bytelist } | |
[] -> [] | |
X -> (nlist->utf8- X [])) | |
(define nlist->utf8- | |
{ (list number) --> bytelist --> bytelist } | |
[] Acc -> (reverse Acc) | |
[ X | XS ] Acc -> (let Byte1 (+ 192 (div X 64)) | |
Byte2 (+ 128 (mod X 64)) | |
(nlist->utf8- XS [ Byte2 Byte1 | Acc])) where (and (> X 127) | |
(< X 2048)) | |
[ X | XS ] Acc -> (let Byte1 (+ 224 (div X 4096)) | |
Byte2 (+ 128 (mod (div X 64) 64)) | |
Byte3 (+ 128 (mod X 64)) | |
(nlist->utf8- XS [ Byte3 Byte2 Byte1 | Acc])) where (and (> X 2047) | |
(< X 65536)) | |
[ X | XS ] Acc -> (let Byte1 (+ 240 (div X 262144)) | |
Byte2 (+ 128 (mod (div X 4096) 64)) | |
Byte3 (+ 128 (mod (div X 64) 64)) | |
Byte4 (+ 128 (mod X 64)) | |
(nlist->utf8- XS [ Byte4 Byte3 Byte2 Byte1 | Acc])) where (and (> X 65535) | |
(< X 2097152)) | |
[ X | XS ] Acc -> (let Byte1 (+ 248 (div X 16777216)) | |
Byte2 (+ 128 (mod (div X 262144) 64)) | |
Byte3 (+ 128 (mod (div X 4096) 64)) | |
Byte4 (+ 128 (mod (div X 64) 64)) | |
Byte5 (+ 128 (mod X 64)) | |
(nlist->utf8- XS [ Byte5 Byte4 Byte3 Byte2 Byte1 | Acc])) where (and (> X 2097151) | |
(< X 67108864)) | |
[ X | XS ] Acc -> (let Byte1 (+ 252 (div X 1073741824)) | |
Byte2 (+ 128 (mod (div X 16777216) 64)) | |
Byte3 (+ 128 (mod (div X 262144) 64)) | |
Byte4 (+ 128 (mod (div X 4096) 64)) | |
Byte5 (+ 128 (mod (div X 64) 64)) | |
Byte6 (+ 128 (mod X 64)) | |
(nlist->utf8- XS [ Byte6 Byte5 Byte4 Byte3 Byte2 Byte1 | Acc])) where (and (> X 67108863) | |
(< X 2147483648)) | |
[ X | XS ] Acc -> (nlist->utf8- XS [ X | Acc ])) | |
(define utf8->nlist | |
{ bytelist --> (list number) } | |
[] -> [] | |
X -> (utf8->nlist- X [])) | |
(define utf8->nlist- | |
\* convert to UCS-4 *\ | |
{ bytelist --> (list number) --> (list number) } | |
[] Acc -> (reverse Acc) | |
[ X | XS ] Acc -> (utf8->nlist- XS Acc) where (or (= X 254) | |
(= X 255)) \* wrong case *\ | |
[ X Y Z W V U | XS] Acc -> (let UD (+ (* (- X 252) 1073741824) | |
(* (- Y 128) 16777216) | |
(* (- Z 128) 262144) | |
(* (- W 128) 4096) | |
(* (- V 128) 64) (- U 128)) | |
(utf8->nlist- XS [ UD | Acc])) where (or (= X 252) (= X 253)) | |
[ X Y Z W V | XS ] Acc -> (let UD (+ (* (- X 248) 16777216) | |
(* (- Y 128) 262144) | |
(* (- Z 128) 4096) | |
(* (- W 128) 64) (- V 128)) | |
(utf8->nlist- XS [ UD | Acc])) where (and (> X 247) (< X 252)) | |
[ X Y Z W | XS ] Acc -> (let UD (+ (* (- X 240) 262144) | |
(* (- Y 128) 4096) | |
(* (- Z 128) 64) (- W 128)) | |
(utf8->nlist- XS [ UD | Acc])) where (and (> X 239) (< X 248)) | |
[ X Y Z | XS ] Acc -> (let UD (+ (* (- X 224) 4096) | |
(* (- Y 128) 64) (- Z 128)) | |
(utf8->nlist- XS [ UD | Acc ])) where (and (> X 223) (< X 240)) | |
[ X Y | XS ] Acc -> (let UD (+ (* (- X 192) 64) (- Y 128)) | |
(utf8->nlist- XS [ UD | Acc ])) where (and (> X 191) (< X 224)) | |
[ X | XS ] Acc -> (utf8->nlist- XS [ X | Acc ])) | |
(define read-file-as-utf8-bytelist | |
{ string --> (list number) } | |
File -> (let Contents (read-file-as-bytelist File) | |
(if (empty? Contents) | |
(error "empty content") | |
(utf8->nlist Contents)))) | |
(define read-file-as-utf8-string | |
{ string --> string } | |
File -> (let Contents (read-file-as-utf8-bytelist File) | |
(nlist->string Contents))) | |
\* utilities for bytelist conversion *\ | |
(define nlist->string | |
{ (list number) --> string } | |
N -> (nlist->string- N [])) | |
(define nlist->string- | |
{ (list number) --> (list string) --> string } | |
[] [] -> "" | |
[] StringList -> (if (empty? (tail StringList)) | |
(head StringList) | |
(let Revert (reverse StringList) | |
(foldl cn (head Revert) (tail Revert)))) | |
[ N | NS ] StringList -> (nlist->string- NS [(n->string N) | StringList])) | |
\* utilities for sequence manipulation *\ | |
(define foldl | |
{(A --> B --> A) --> A --> (list B) --> A} | |
Fn A [B] -> (Fn A B) | |
Fn A [B|BS] -> (foldl Fn (Fn A B) BS)) | |
\* floor / div / mod *\ | |
\* REMARK: floor is not defined in my version of Shen 3.0 *\ | |
(define floor | |
{ number --> number } | |
0 -> 0 | |
N -> (floor-string- (explode (str N)) [])) | |
(define floor-string- | |
{ (list string) --> (list string) --> number } | |
[] [] -> (error "not a number") | |
[] Str -> (let Result (reverse Str) | |
TResult (tail Result) | |
ResultString (if (empty? TResult) | |
(head Result) | |
(foldl cn (head Result) TResult)) | |
(integer-string->num ResultString)) | |
["." | Ss] Str -> (floor-string- [] Str) | |
[S | Ss] Str -> (floor-string- Ss [S | Str])) | |
(define integer-string->num | |
{ string --> number } | |
(@s "-" Ss) -> (- 0 (integer-string->num Ss)) | |
Str -> (integer-string->num- (explode Str) [])) | |
(define integer-string->num- | |
{ (list string) --> (list number) --> number } | |
[] Acc -> (list-number->num- Acc 1 0) | |
["1" | Y] Acc -> (integer-string->num- Y [1 | Acc]) | |
["2" | Y] Acc -> (integer-string->num- Y [2 | Acc]) | |
["3" | Y] Acc -> (integer-string->num- Y [3 | Acc]) | |
["4" | Y] Acc -> (integer-string->num- Y [4 | Acc]) | |
["5" | Y] Acc -> (integer-string->num- Y [5 | Acc]) | |
["6" | Y] Acc -> (integer-string->num- Y [6 | Acc]) | |
["7" | Y] Acc -> (integer-string->num- Y [7 | Acc]) | |
["8" | Y] Acc -> (integer-string->num- Y [8 | Acc]) | |
["9" | Y] Acc -> (integer-string->num- Y [9 | Acc]) | |
["0" | Y] Acc -> (integer-string->num- Y [0 | Acc]) | |
_ Acc -> (error "unknown symbol in an integer-string")) | |
(define list-number->num- | |
{ (list number) --> number --> number --> number } | |
\* The array of number must be reversed *\ | |
[] _ Acc -> Acc | |
[X | XS ] Range Acc -> (list-number->num- XS (* Range 10) (+ (* X Range) Acc))) | |
(define mod | |
{ number --> number --> number } | |
X 0 -> (error "division by zero") | |
X Y -> (- X (* Y (div X Y)))) | |
(define div | |
{ number --> number --> number } | |
X 0 -> (error "division by zero") | |
X Y -> (floor (/ X Y))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment