Skip to content

Instantly share code, notes, and snippets.

@masakielastic
Last active October 2, 2017 06:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save masakielastic/68e3b1ab1e35759c2656b3a7aacef7a5 to your computer and use it in GitHub Desktop.
Save masakielastic/68e3b1ab1e35759c2656b3a7aacef7a5 to your computer and use it in GitHub Desktop.
ラウンドトリップが保証されない文字を調べます。
# Builds a string from the raw bytes of +cp+ (its hexadecimal digits, most
# significant first) and labels it as +enc+ without validating it.
#
# Unlike Integer#chr this does not raise for values that are not characters
# in +enc+; callers can inspect the result with String#valid_encoding?.
#
# @param cp [Integer] non-negative value whose hex digits become the bytes
# @param enc [String, Encoding] encoding used to label the bytes
# @return [String] the packed bytes, force-encoded as +enc+
def chr_unsafe(cp, enc)
  hex = cp.to_s(16)
  # pack("H*") reads two hex digits per byte and pads a trailing odd digit on
  # the right ("a" would become 0xA0), so left-pad to keep the value intact.
  hex = "0#{hex}" if hex.length.odd?
  [hex].pack("H*").force_encoding(enc)
end
# Walk every value in 0x8000..0xffff for CP932 and tally:
#   count  - values for which Integer#chr raises RangeError
#   count2 - the subset of those whose raw bytes still pass valid_encoding?
# The hex form of every value in the second group is printed on one line.
enc = 'cp932'
count = 0
count2 = 0

(0x8000..0xffff).each do |cp|
  rejected = false
  begin
    cp.chr(enc)
  rescue RangeError
    rejected = true
  end
  next unless rejected

  count += 1
  next unless chr_unsafe(cp, enc).valid_encoding?

  print "#{cp.to_s(16)} "
  count2 += 1
end

puts
puts "#{count} RangeError"
puts "#{count2} RangeError + valid_encoding?"
# Run check for every encoding in Encoding.list. An encoding for which either
# step raises Encoding::ConverterNotFoundError is skipped.
# NOTE(review): encoding "a" to the same encoding looks like a probe for a
# usable converter - confirm that it can actually raise here.
Encoding.list.each do |encoding|
  begin
    0x61.chr(encoding).encode(encoding)
    check(encoding)
  rescue Encoding::ConverterNotFoundError
    # Nothing to do: continue with the next encoding.
  end
end
Big5
{:unsafe=>10, :error=>896}
Big5-HKSCS
{:unsafe=>10, :error=>165}
CP932
{:unsafe=>424, :error=>1777}
CP949
{:unsafe=>362, :error=>5641}
# Counts byte sequences in +enc+ that do not survive a round trip through
# UTF-8 and prints a summary.
#
# Every integer in 0..max is turned into raw bytes (its hexadecimal digits,
# most significant first) and labelled as +enc+. Sequences that are not valid
# in +enc+ are skipped. Each remaining string is converted to UTF-8 and back,
# with invalid or undefined conversions replaced by "?":
#
# * the result equals the replacement string -> counted as "error"
# * the result differs in any other way      -> counted as "unsafe"
#
# @param enc [String, Encoding] encoding under test
# @param max [Integer] largest value to test (inclusive)
# @return [nil] the encoding and both counters are written with puts
def check(enc, max=0xFFFF)
  replace = "?"
  opt = { invalid: :replace, undef: :replace, replace: replace }
  unsafe = 0
  error = 0

  (0..max).each do |cp|
    hex = cp.to_s(16)
    # pack("H*") pads a trailing odd digit on the right ("a" -> 0xA0), which
    # would build the wrong bytes for 0x0..0xF and 0x100..0xFFF; left-pad.
    hex = "0#{hex}" if hex.length.odd?
    c = [hex].pack("H*").force_encoding(enc)
    next unless c.valid_encoding?

    # Options must be passed as keywords: on Ruby 3+ a positional Hash is
    # taken as the source-encoding argument and String#encode raises.
    ret = c.encode("utf-8", **opt).encode(enc, **opt)
    next if ret == c

    if ret == replace
      error += 1
    else
      unsafe += 1
    end
  end

  puts enc
  puts "unsafe: #{unsafe}, error: #{error}"
end
# Encodings to examine:
#   Big5, Big5-HKSCS - Traditional Chinese
#   CP932            - Japanese
#   CP949            - Korean
list = %w[Big5 Big5-HKSCS CP932 CP949]

list.each { |enc| check(enc) }
# Counts characters in 0x8000..0xffff whose round trip +enc+ -> UTF-8 -> +enc+
# yields a different character, and prints the total.
#
# Values for which Integer#chr raises RangeError are skipped. Conversions use
# "?" as the replacement for invalid or undefined characters; a result equal
# to that replacement is not counted as round-trip unsafe.
#
# @param enc [String, Encoding] encoding under test
# @return [nil] the encoding and the count are written with puts
def check(enc)
  replace = "?"
  opt = { invalid: :replace, undef: :replace, replace: replace }
  count = 0

  (0x8000..0xffff).each do |cp|
    begin
      c = cp.chr(enc)
    rescue RangeError
      next
    end

    # Options must be passed as keywords: on Ruby 3+ a positional Hash is
    # taken as the source-encoding argument and String#encode raises.
    ret = c.encode('utf-8', **opt).encode(enc, **opt)
    next if ret == c || ret == replace

    # Uncomment to list every offending value with its round-tripped bytes:
    # print "[#{cp.to_s(16)}: #{ret.unpack('H*').first}] "
    count += 1
  end

  puts enc
  puts "round-trip unsafe: #{count}"
end
# Encodings to examine:
#   Big5  - Traditional Chinese
#   CP932 - Japanese
list = %w[Big5 CP932]

list.each { |enc| check(enc) }
# Packs the hexadecimal digits of +cp+ into raw bytes (most significant
# first) and labels them as +enc+ without checking that they form a valid
# character in that encoding.
#
# @param cp [Integer] non-negative value whose hex digits become the bytes
# @param enc [String, Encoding] encoding used to label the bytes
# @return [String] the packed bytes, force-encoded as +enc+
def chr_unsafe(cp, enc)
  hex = cp.to_s(16)
  # An odd digit count would make pack("H*") pad on the right (0xa -> 0xA0);
  # prepend a zero so the bytes match the numeric value.
  hex = "0#{hex}" if hex.length.odd?
  [hex].pack("H*").force_encoding(enc)
end
# Reports whether the raw bytes of +cp+ (its hexadecimal digits, most
# significant first) form a valid byte sequence in +enc+.
#
# @param cp [Integer] non-negative value whose hex digits become the bytes
# @param enc [String, Encoding] encoding to validate against
# @return [Boolean] result of String#valid_encoding? on the packed bytes
def valid_codepoint?(cp, enc)
  hex = cp.to_s(16)
  # An odd digit count would make pack("H*") pad on the right (0xa -> 0xA0);
  # prepend a zero so the tested bytes match the numeric value.
  hex = "0#{hex}" if hex.length.odd?
  [hex].pack("H*").force_encoding(enc).valid_encoding?
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment