Skip to content

Instantly share code, notes, and snippets.

@oov
Last active October 14, 2018 00:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save oov/beb156800c6aae4ca07acad83b9d000d to your computer and use it in GitHub Desktop.
Save oov/beb156800c6aae4ca07acad83b9d000d to your computer and use it in GitHub Desktop.
UTF-8文字列を1文字ずつで分解するLuaスクリプト
-- The MIT License (MIT)
--
-- Copyright (c) 2018 oov
--
-- Permission is hereby granted, free of charge, to any person obtaining a copy of
-- this software and associated documentation files (the "Software"), to deal in
-- the Software without restriction, including without limitation the rights to
-- use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
-- the Software, and to permit persons to whom the Software is furnished to do so,
-- subject to the following conditions:
--
-- The above copyright notice and this permission notice shall be included in all
-- copies or substantial portions of the Software.
--
-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-- FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-- COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-- IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-- Reference: https://tools.ietf.org/html/rfc3629#section-4
--- Split a UTF-8 encoded string into its individual characters.
--
-- Sequences are validated against the well-formed byte ranges of RFC 3629
-- section 4, so overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and
-- values above U+10FFFF are rejected. A byte that does not start a
-- well-formed sequence (including a sequence truncated by the end of the
-- string) is skipped on its own and contributes nothing to either result.
--
-- @param s string holding the UTF-8 bytes to split
-- @return array of strings, one per decoded character (its raw bytes)
-- @return array of numbers, the code point of each character, index-aligned
--         with the first array
local function split_by_utf8_char(s)
  local chars, codepoints = {}, {}
  local count = 0
  local pos, len = 1, #s

  -- True when byte b exists (is not past the end of s) and lo <= b <= hi.
  local function within(b, lo, hi)
    return b ~= nil and lo <= b and b <= hi
  end

  while pos <= len do
    -- Bytes past the end of the string come back as nil.
    local c1, c2, c3, c4 = s:byte(pos, pos + 3)
    -- size stays 1 and cp stays nil when no well-formed sequence starts here.
    local size, cp = 1, nil

    if c1 <= 0x7f then
      cp = c1
    elseif 0xc2 <= c1 and c1 <= 0xdf then
      -- 0xc0 and 0xc1 are excluded: they could only encode overlong forms.
      if within(c2, 0x80, 0xbf) then
        size = 2
        cp = (c1 - 0xc0) * 0x40 + (c2 - 0x80)
      end
    elseif 0xe0 <= c1 and c1 <= 0xef then
      -- The second byte is restricted for two lead bytes:
      -- 0xe0 must avoid overlong forms, 0xed must avoid surrogates.
      local lo, hi = 0x80, 0xbf
      if c1 == 0xe0 then
        lo = 0xa0
      elseif c1 == 0xed then
        hi = 0x9f
      end
      if within(c2, lo, hi) and within(c3, 0x80, 0xbf) then
        size = 3
        cp = (c1 - 0xe0) * 0x1000 + (c2 - 0x80) * 0x40 + (c3 - 0x80)
      end
    elseif 0xf0 <= c1 and c1 <= 0xf4 then
      -- 0xf0 must avoid overlong forms, 0xf4 must stay at or below U+10FFFF.
      local lo, hi = 0x80, 0xbf
      if c1 == 0xf0 then
        lo = 0x90
      elseif c1 == 0xf4 then
        hi = 0x8f
      end
      if within(c2, lo, hi) and within(c3, 0x80, 0xbf)
        and within(c4, 0x80, 0xbf) then
        size = 4
        -- The low six bits come from the fourth byte (c4).
        cp = (c1 - 0xf0) * 0x40000 + (c2 - 0x80) * 0x1000
          + (c3 - 0x80) * 0x40 + (c4 - 0x80)
      end
    end

    if cp then
      count = count + 1
      chars[count] = s:sub(pos, pos + size - 1)
      codepoints[count] = cp
    end
    pos = pos + size
  end

  return chars, codepoints
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment