Last active
June 1, 2016 19:58
Add support for Unicode and ASCII escape codes in triple-quoted strings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
proc parse_hex*(num: var int; ch: char) =
  ## Appends one hexadecimal digit to ``num``: the current value is shifted
  ## left by 4 bits and the value of ``ch`` (``0-9``, ``a-f`` or ``A-F``)
  ## is OR-ed into the freed low nibble.
  ##
  ## Raises ``ValueError`` when ``ch`` is not a hexadecimal digit; ``num`` is
  ## not modified in that case.
  let nibble =
    case ch
    of '0'..'9': ord(ch) - ord('0')
    of 'a'..'f': ord(ch) - ord('a') + 10
    of 'A'..'F': ord(ch) - ord('A') + 10
    else: raise newException(ValueError, "invalid unicode hexcode")
  num = (num shl 4) or nibble
template ones(n: untyped): untyped =
  ## Expands to an integer whose lowest ``n`` bits are set, e.g. ``ones(6)``
  ## is ``0b11_1111`` (63). Used as a bit mask when splitting a code point
  ## into 6-bit UTF-8 continuation payloads.
  ((1 shl n) - 1)
proc runeLenAt(s: string, i: Natural): int =
  ## Number of bytes occupied by the UTF-8 sequence whose lead byte is
  ## ``s[i]``.
  ##
  ## The answer is derived from the high bits of that single byte only; a
  ## byte that matches no lead-byte pattern (for instance a continuation
  ## byte) is reported as a 1-byte sequence.
  case ord(s[i])
  of 0x00..0x7F: 1   # 0xxxxxxx
  of 0xC0..0xDF: 2   # 110xxxxx
  of 0xE0..0xEF: 3   # 1110xxxx
  of 0xF0..0xF7: 4   # 11110xxx
  of 0xF8..0xFB: 5   # 111110xx
  of 0xFC..0xFD: 6   # 1111110x
  else: 1            # continuation bytes and 0xFE/0xFF
proc usc*(s: string): string {.compileTime.} =
  ## Decodes backslash escape codes found in ``s`` and returns the result.
  ## Meant for raw / triple-quoted literals, e.g. ``usc"""a\nb"""``.
  ##
  ## Recognised escapes:
  ## * ``\\``                  -> a single backslash
  ## * ``\n``, ``\l``, ``\L``  -> line feed
  ## * ``\0`` + octal digits   -> the byte with that octal value
  ## * ``\xHH``                -> code point ``HH``, emitted as UTF-8
  ## * ``\uHHHH``              -> code point ``HHHH``, emitted as UTF-8
  ##
  ## Any other escape (the backslash and the character after it) is dropped,
  ## and so is a lone backslash at the very end of ``s``. All other text is
  ## copied through one UTF-8 sequence at a time.
  ##
  ## Raises ``ValueError`` when a ``\x`` / ``\u`` escape is truncated or
  ## contains a non-hex digit, or when an octal escape exceeds ``0o377``.
  ##
  ## Code points are encoded as-is; no check for the surrogate range
  ## (``D800``-``DFFF``) is performed.
  result = ""
  var idx = 0
  while idx < s.len:
    if s[idx] != '\\':
      # Plain text: copy one whole UTF-8 sequence. The end is clamped so a
      # sequence cut short by the end of the input cannot index past it.
      let stop = min(idx + runeLenAt(s, idx), s.len)
      while idx < stop:
        result.add(s[idx])
        inc(idx)
      continue

    inc(idx)  # step from the backslash onto the escape selector
    if idx >= s.len:
      break   # lone trailing backslash: nothing to decode, drop it
    let ch = s[idx]
    case ch
    of '\\':  # escaped backslash
      result.add('\\')
    of 'n', 'l', 'L':  # newline literal
      result.add('\L')
    of '0':  # octal escape: consume every following octal digit
      var num = 0
      while idx + 1 < s.len and s[idx + 1] in {'0'..'7'}:
        num = (num shl 3) or (ord(s[idx + 1]) - ord('0'))
        if num > 0xFF:
          raise newException(ValueError, "octal escape code out of range")
        inc(idx)
      result.add(char(num))
    of 'u', 'x':  # hex escape: 4 digits for \u, 2 digits for \x
      let nChars = if ch == 'u': 4 else: 2
      if idx + nChars >= s.len:
        raise newException(ValueError, "truncated hex escape code")
      var num = 0
      for i in idx + 1 .. idx + nChars:
        parse_hex(num, s[i])
      # Inlined UTF-8 encoder (avoids depending on unicode / strutils).
      # At most four hex digits are read, so num never exceeds 0xFFFF and
      # three bytes are always enough.
      if num <= 0x7F:
        result.add(char(num))
      elif num <= 0x7FF:
        result.add(char((num shr 6) or 0b1100_0000))
        result.add(char((num and ones(6)) or 0b1000_0000))
      else:
        result.add(char((num shr 12) or 0b1110_0000))
        result.add(char(((num shr 6) and ones(6)) or 0b1000_0000))
        result.add(char((num and ones(6)) or 0b1000_0000))
      inc(idx, nChars)
    else:  # unknown escape code: skip it
      discard
    inc(idx)  # move past the last character of the escape
when isMainModule:
  # Demo: the same raw literal is printed twice -- first run through `usc`,
  # then untouched -- so the two output lines can be compared.
  const
    decoded = usc"""mn\lue \075 \x17 \u25cf\uFFFF"""
    verbatim = """mn\lue \075 \x17 \u25cf\uFFFF"""
  echo decoded
  echo verbatim
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment