Skip to content

Instantly share code, notes, and snippets.

@ssledz
Last active August 4, 2020 00:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ssledz/f26e9892d5153436bacc38239770b9a3 to your computer and use it in GitHub Desktop.
Save ssledz/f26e9892d5153436bacc38239770b9a3 to your computer and use it in GitHub Desktop.
Flexible and Economical UTF-8 Decoder
/**
 * Decodes UTF-8 byte sequences with a table-driven deterministic finite automaton.
 *
 * Algorithm and transition table: https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
 */
object Utf8Decoder {

  /** DFA state reached whenever a complete code point has been decoded. */
  private val Utf8Accept: Int = 0

  /** DFA state reached on a byte that cannot start or continue a sequence. */
  private val Utf8Reject: Int = 1

  /** U+FFFD, emitted in place of every malformed or truncated sequence. */
  private val ReplacementCodePoint: Int = 0xFFFD

  //@formatter:off
  /**
   * Entries 0..255 map a byte value to its character class. The entries from 256
   * on hold the transition rows, 16 entries per state, so the next state is found
   * at index `256 + state * 16 + characterClass`.
   */
  private val utf8d: Array[Int] = Array(
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1  // s7..s8
  )
  //@formatter:on

  /**
   * Decodes `bytes` as UTF-8.
   *
   * Well-formed input decodes to exactly the text it encodes. Each malformed or
   * truncated sequence is replaced by a single U+FFFD and decoding resumes right
   * after it, so one bad byte does not discard the rest of the input.
   *
   * @param bytes the UTF-8 encoded input; may be empty
   * @return the decoded string
   */
  def decode(bytes: Array[Byte]): String = {
    // Every emitted code point accounts for at least one consumed byte, so the
    // input length is a safe upper bound for the number of code points.
    val codePoints = new Array[Int](bytes.length)
    var written = 0
    var state = Utf8Accept
    var codePoint = 0
    var i = 0

    while (i < bytes.length) {
      val byte = bytes(i) & 0xff // widen to Int without sign extension
      val charClass = utf8d(byte)

      // From the accept state the byte starts a new sequence and keeps only the
      // bits selected by the class-dependent mask; otherwise its low six bits are
      // appended to the code point accumulated so far.
      codePoint =
        if (state == Utf8Accept) (0xff >> charClass) & byte
        else (codePoint << 6) | (byte & 0x3f)

      val previous = state
      state = utf8d(256 + state * 16 + charClass)

      if (state == Utf8Accept) {
        codePoints(written) = codePoint
        written += 1
      } else if (state == Utf8Reject) {
        // The reject state is absorbing in the table, so leave it explicitly.
        codePoints(written) = ReplacementCodePoint
        written += 1
        state = Utf8Accept
        // A byte that broke an unfinished sequence may itself start a valid one,
        // so it is decoded again from the accept state.
        if (previous != Utf8Accept) i -= 1
      }
      i += 1
    }

    // The input ended in the middle of a multi-byte sequence.
    if (state != Utf8Accept) {
      codePoints(written) = ReplacementCodePoint
      written += 1
    }

    new String(codePoints, 0, written)
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment