meijeru/unicode-multiple.reds

## unicode-multiple.reds
Red/System []

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; function utf8-to-cps (cps = codepoints)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; This function decodes UTF-8 information supplied as bytes in argument u
; and uses the argument res to store the array of codepoints
; (integers >= 0 and <= 10FFFFh); the space for this array should have been
; allocated by the caller; an upper limit for the size of the array in bytes
; is 4 times the length of the UTF-8 string. The actual size used is returned.
; For the algorithm, see http://en.wikipedia.org/wiki/UTF-8.

; Remarks about the contents:
; (1) Coding errors are skipped until the first correct UTF-8 combination
;     is encountered. The first byte of the offending combination is
;     replaced by a standard byte according to one of the several options
;     described in the article cited above and/or taken from elsewhere.
; (2) Codepoint U+0000 cannot be represented as UTF-8 single byte 00h
;     since that signifies end-of-c-string for Red/System!
;     However, this code point can be represented by C0h 80h.
; (3) As it stands, the function accepts "overlong" combinations, such as
;     C0h 80h or E0h 80h 80h or even F0h 80h 80h 80h for U+0000.
;     Uncomment the appropriate lines to test for this.
; (4) In any case, it rejects 4-byte sequences coding for 110000h and higher
;     (in fact, up to 1FFFFFh), because codepoints beyond U+10FFFF
;     are not defined in the Unicode standard.
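; (5) Also, it rejects three-byte sequences that decode into the UTF-16
;     surrogate area. (U+D800..U+DBFF are the high and U+DC00..U+DFFF the low
;     surrogate halves; none of them is a valid Unicode scalar value.) The
;     exact range rejected is set by the test in the three-byte branch.
;     It may, however, use some of these as replacement in the output.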

; Remarks about the coding:
; (1) Another choice of type for the input parameter could have been
;     pointer! [byte!], but this would have meant that the length of the
;     input must be passed as an extra parameter. It would have done
;     away with the problem of the null byte, though.
; (2) The choice of type for the output parameter is almost forced upon us,
;     since Red/System does not have native arrays.
; (3) The shift and bitwise or operators are assumed to be more efficient
;     than division and addition. They are also more closely tied to the
;     UTF-8 specs, reducing the possibility of errors.
; (4) The cases of invalid byte sequences include:
;     - an invalid starting byte
;     - an unexpected continuation byte
;     - a start byte not followed by enough continuation bytes (incomplete sequence)
;     - a sequence that decodes to a value that should use a shorter sequence
;      (an "overlong form").
;     The first case is caught by the test: unless b1 < C0h
;     The second case is caught by the tests: b2 >= 80h b2 < C0h etc.
;     The third case is effectively caught by the same tests, since
;     b2 >= 80h etc. will fail if b2 = 0 etc.
;     Thus there is no need for the equivalent of the protective REBOL clause
;                       if not tail? next u
;     etc.
;     The last case is also explicitly caught if the appropriate lines are
;     uncommented.
; (5) With a restriction to the BMP (codepoints up to U+FFFF) the resulting
;     array of codepoints could have elements that are uint16!, which would
;     save half of the space for this array.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


; code-array! is the type of the output parameter of utf8-to-cps: a pointer
; to integer! through which the decoded codepoints are stored one by one.
#define code-array! [pointer! [integer!]]

; replacement is the value utf8-to-cps stores for every byte that does not
; start a correctly decoded sequence. As written, the only option that is
; not commented out is FFFDh; leave exactly one option uncommented.
#define replacement
;	choose one of the following options
	FFFDh			; U+FFFD = replacement character
;	1Ah				; U+001A = control SUB (substitute)
;	241Ah			; U+241A = symbol for substitute
;	2426h			; U+2426 = symbol for substitute form two
;	3Fh				; U+003F = question mark
;	BFh				; U+00BF = inverted question mark
;	DC00h + b1		; U+DCxx where xx = b1 (never a Unicode codepoint)
;					; NOTE(review): this option is an expression of several
;					; tokens and refers to the local b1 of utf8-to-cps; it
;					; presumably has to be wrapped in a block to work as a
;					; #define value - confirm before enabling it.

; utf8-to-cps: decode the null-terminated UTF-8 byte string u into an array
; of codepoints stored through res.
;
; Parameters:
;   u   - input bytes; decoding stops at the first 00h byte
;   res - caller-allocated output array; exactly one integer! is written for
;         every decoded codepoint and for every rejected byte
; Returns:
;   res - res0, i.e. the distance between the final and the initial output
;   pointer. NOTE(review): whether this counts elements or bytes depends on
;   the Red/System rules for subtracting two pointers - confirm.
;
; Error handling: a byte that does not start a correctly decoded sequence
; produces one `replacement` value and only that single byte is consumed, so
; decoding resynchronises on the very next byte.
;
; Each continuation byte is fetched only after the previous one has been
; validated. The terminating 00h fails the continuation test, so a sequence
; cut short by the end of the string is never read past its terminator.
utf8-to-cps: func [
	u [c-string!]
	res [code-array!]
	return: [integer!]
	/local
		b1 b2 b3 b4		; up to four bytes of one sequence, widened to integer!
		cp				; codepoint assembled from the sequence
		res0			; start of result, kept to compute the return value
][
	res0: res
	while [b1: as-integer u/1 b1 <> 0][

		; res and u are both advanced by one at the bottom of the loop, so a
		; branch that consumed an n-byte sequence moves u by n - 1 only

		either b1 < 80h					; single byte (ASCII)
		[
			res/value: b1				; and we are done
		][
			res/value: replacement
			; assume error by default - the slot is overwritten only when a
			; complete and acceptable sequence has been decoded

			unless b1 < C0h [			; 80h - BFh may not start a sequence
				case [
					b1 < E0h [			; start of two-byte sequence
						b2: as-integer u/2
						if all [
							b2 >= 80h b2 < C0h
						][
							cp:	(b1 - C0h << 6) or
								(b2 - 80h)
;							if any [
;								cp > 7Fh	; optional test for overlong
;								cp = 0		; even so, must allow U+0000
;							][
								res/value: cp
								u: u + 1
;							]
						]
					]
					b1 < F0h [			; start of three-byte sequence
						b2: as-integer u/2
						if all [
							b2 >= 80h b2 < C0h
						][
							; u/3 is touched only now: b2 is a continuation
							; byte, hence not the terminator
							b3: as-integer u/3
							if all [
								b3 >= 80h b3 < C0h
							][
								cp:	(b1 - E0h << 12) or
									(b2 - 80h <<  6) or
									(b3 - 80h)
								if all [
									; reject the whole UTF-16 surrogate area
									; U+D800..U+DFFF
									any [cp < D800h cp > DFFFh]
;									cp > 7FFh	; optional test for overlong
								][
									res/value: cp
									u: u + 2
								]
							]
						]
					]
					b1 < F8h [			; start of four-byte sequence
						b2: as-integer u/2
						if all [
							b2 >= 80h b2 < C0h
						][
							b3: as-integer u/3
							if all [
								b3 >= 80h b3 < C0h
							][
								b4: as-integer u/4
								if all [
									b4 >= 80h b4 < C0h
								][
									cp:	(b1 - F0h << 18) or
										(b2 - 80h << 12) or
										(b3 - 80h <<  6) or
										(b4 - 80h)
									if all [
										cp <= 10FFFFh	; beyond Unicode otherwise
;										cp > FFFFh	; optional test for overlong
									][
										res/value: cp
										u: u + 3
									]
								]
							]
						]
					]
					; bytes F8h - FFh match no branch: the replacement
					; stored above stays in place
				]
			]
		]
		res: res + 1
		u: u + 1
	]
	res - res0
]
	Red/System []

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; function utf8-to-cps (cps = codepoints)
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	; This function decodes UTF-8 information supplied as bytes in argument u
	; and uses the argument res to store the array of codepoints
	; (integers >= 0 and <= 10FFFFh); the space for this array should have been
	; allocated by the caller; an upper limit for the size of the array in bytes
	; is 4 times the length of the UTF-8 string. The actual size used is returned.
	; For the algorithm, see http://en.wikipedia.org/wiki/UTF-8.

	; Remarks about the contents:
	; (1) Coding errors are skipped until the first correct UTF-8 combination
	; is encountered. The first byte of the offending combination is
	; replaced by a standard byte according to one of the several options
	; described in the article cited above and/or taken from elsewhere.
	; (2) Codepoint U+0000 cannot be represented as UTF-8 single byte 00h
	; since that signifies end-of-c-string for Red/System!
	; However, this code point can be represented by C0h 80h.
	; (3) As it stands, the function accepts "overlong" combinations, such as
	; C0h 80h or E0h 80h 80h or even F0h 80h 80h 80h for U+0000.
	; Uncomment the appropriate lines to test for this.
	; (4) In any case, it rejects 4-byte sequences coding for 110000h and higher
	; (in fact, up to 1FFFFFh), because codepoints beyond U+10FFFF
	; are not defined in the Unicode standard.
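	; (5) Also, it rejects three-byte sequences that decode into the UTF-16
	; surrogate area. (U+D800..U+DBFF are the high and U+DC00..U+DFFF the low
	; surrogate halves; none of them is a valid Unicode scalar value.) The
	; exact range rejected is set by the test in the three-byte branch.
	; It may, however, use some of these as replacement in the output.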

	; Remarks about the coding:
	; (1) Another choice of type for the input parameter could have been
	; pointer! [byte!], but this would have meant that the length of the
	; input must be passed as an extra parameter. It would have done
	; away with the problem of the null byte, though.
	; (2) The choice of type for the output parameter is almost forced upon us,
	; since Red/System does not have native arrays.
	; (3) The shift and bitwise or operators are assumed to be more efficient
	; than division and addition. They are also more closely tied to the
	; UTF-8 specs, reducing the possibility of errors.
	; (4) The cases of invalid byte sequences include:
	; - an invalid starting byte
	; - an unexpected continuation byte
	; - a start byte not followed by enough continuation bytes (incomplete sequence)
	; - a sequence that decodes to a value that should use a shorter sequence
	; (an "overlong form").
	; The first case is caught by the test: unless b1 < C0h
	; The second case is caught by the tests: b2 >= 80h b2 < C0h etc.
	; The third case is effectively caught by the same tests, since
	; b2 >= 80h etc. will fail if b2 = 0 etc.
	; Thus there is no need for the equivalent of the protective REBOL clause
	; if not tail? next u
	; etc.
	; The last case is also explicitly caught if the appropriate lines are
	; uncommented.
	; (5) With a restriction to the BMP (codepoints up to U+FFFF) the resulting
	; array of codepoints could have elements that are uint16!, which would
	; save half of the space for this array.

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;




	; code-array! is the type of the output parameter of utf8-to-cps: a pointer
	; to integer! through which the decoded codepoints are stored one by one.
	#define code-array! [pointer! [integer!]]

	; replacement is the value utf8-to-cps stores for every byte that does not
	; start a correctly decoded sequence. As written, the only option that is
	; not commented out is FFFDh; leave exactly one option uncommented.
	#define replacement
	; choose one of the following options
	FFFDh ; U+FFFD = replacement character
	; 1Ah ; U+001A = control SUB (substitute)
	; 241Ah ; U+241A = symbol for substitute
	; 2426h ; U+2426 = symbol for substitute form two
	; 3Fh ; U+003F = question mark
	; BFh ; U+00BF = inverted question mark
	; DC00h + b1 ; U+DCxx where xx = b1 (never a Unicode codepoint)
	; NOTE(review): the last option is an expression of several tokens and
	; refers to the local b1 of utf8-to-cps; it presumably has to be wrapped
	; in a block to work as a #define value - confirm before enabling it.

	; utf8-to-cps: decode the null-terminated UTF-8 byte string u into an array
	; of codepoints stored through res.
	;
	; Parameters:
	;   u   - input bytes; decoding stops at the first 00h byte
	;   res - caller-allocated output array; exactly one integer! is written
	;         for every decoded codepoint and for every rejected byte
	; Returns:
	;   res - res0, i.e. the distance between the final and the initial output
	;   pointer. NOTE(review): whether this counts elements or bytes depends on
	;   the Red/System rules for subtracting two pointers - confirm.
	;
	; Error handling: a byte that does not start a correctly decoded sequence
	; produces one `replacement` value and only that single byte is consumed,
	; so decoding resynchronises on the very next byte.
	;
	; Each continuation byte is fetched only after the previous one has been
	; validated. The terminating 00h fails the continuation test, so a sequence
	; cut short by the end of the string is never read past its terminator.
	utf8-to-cps: func [
		u [c-string!]
		res [code-array!]
		return: [integer!]
		/local
			b1 b2 b3 b4		; up to four bytes of one sequence, widened to integer!
			cp				; codepoint assembled from the sequence
			res0			; start of result, kept to compute the return value
	][
		res0: res
		while [b1: as-integer u/1 b1 <> 0][

			; res and u are both advanced by one at the bottom of the loop, so
			; a branch that consumed an n-byte sequence moves u by n - 1 only

			either b1 < 80h					; single byte (ASCII)
			[
				res/value: b1				; and we are done
			][
				res/value: replacement
				; assume error by default - the slot is overwritten only when
				; a complete and acceptable sequence has been decoded

				unless b1 < C0h [			; 80h - BFh may not start a sequence
					case [
						b1 < E0h [			; start of two-byte sequence
							b2: as-integer u/2
							if all [
								b2 >= 80h b2 < C0h
							][
								cp:	(b1 - C0h << 6) or
									(b2 - 80h)
	;							if any [
	;								cp > 7Fh	; optional test for overlong
	;								cp = 0		; even so, must allow U+0000
	;							][
									res/value: cp
									u: u + 1
	;							]
							]
						]
						b1 < F0h [			; start of three-byte sequence
							b2: as-integer u/2
							if all [
								b2 >= 80h b2 < C0h
							][
								; u/3 is touched only now: b2 is a continuation
								; byte, hence not the terminator
								b3: as-integer u/3
								if all [
									b3 >= 80h b3 < C0h
								][
									cp:	(b1 - E0h << 12) or
										(b2 - 80h <<  6) or
										(b3 - 80h)
									if all [
										; reject the whole UTF-16 surrogate
										; area U+D800..U+DFFF
										any [cp < D800h cp > DFFFh]
	;									cp > 7FFh	; optional test for overlong
									][
										res/value: cp
										u: u + 2
									]
								]
							]
						]
						b1 < F8h [			; start of four-byte sequence
							b2: as-integer u/2
							if all [
								b2 >= 80h b2 < C0h
							][
								b3: as-integer u/3
								if all [
									b3 >= 80h b3 < C0h
								][
									b4: as-integer u/4
									if all [
										b4 >= 80h b4 < C0h
									][
										cp:	(b1 - F0h << 18) or
											(b2 - 80h << 12) or
											(b3 - 80h <<  6) or
											(b4 - 80h)
										if all [
											cp <= 10FFFFh	; beyond Unicode otherwise
	;										cp > FFFFh	; optional test for overlong
										][
											res/value: cp
											u: u + 3
										]
									]
								]
							]
						]
						; bytes F8h - FFh match no branch: the replacement
						; stored above stays in place
					]
				]
			]
			res: res + 1
			u: u + 1
		]
		res - res0
	]