Skip to content

Instantly share code, notes, and snippets.

@yujinakayama
Last active August 29, 2015 13:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yujinakayama/9511027 to your computer and use it in GitHub Desktop.
Save yujinakayama/9511027 to your computer and use it in GitHub Desktop.
# coding: utf-8
require 'tmpdir'
require 'parser/current'
# Runs the given block with the process working directory switched to a
# freshly created temporary directory.
#
# The previous working directory is restored when the block finishes, and
# Dir.mktmpdir removes the temporary directory afterwards.
#
# Returns whatever the block returns.
def in_tmpdir
  Dir.mktmpdir { |scratch_dir| Dir.chdir(scratch_dir) { yield } }
end
# Enumerates every combination of magic-comment encoding, regexp body and
# regexp option, together with a two-line Ruby program that prints the
# encoding of the resulting regexp literal.
#
# Yields four values per combination:
#   encoding - name used in the "# coding:" magic comment
#   regexp   - text placed between the slashes of the regexp literal
#   option   - regexp option suffix ('' for none)
#   source   - the generated program, without a trailing newline
#
# Returns an Enumerator when called without a block.
def each_sample
  return enum_for(:each_sample) unless block_given?

  encodings = %w[utf-8 us-ascii ascii-8bit koi8-r]
  # '\xff' is single-quoted on purpose: the sample program must contain the
  # four-character escape sequence, not a raw 0xFF byte.
  regexps = ['abc', 'À', 'д', '\xff']
  options = ['', 'u', 'n']

  # Array#product iterates with encodings outermost and options innermost.
  encodings.product(regexps, options) do |encoding, regexp, option|
    # Join the lines directly instead of stripping a "|" margin from a
    # heredoc, so the result does not depend on how this file is indented.
    source = [
      "# coding: #{encoding}",
      "puts /#{regexp}/#{option}.encoding"
    ].join("\n")

    yield encoding, regexp, option, source
  end
end
# Derived from https://github.com/whitequark/parser/blob/v2.1.7/lib/parser/builders/default.rb#L707-L726
#
# Builds a Regexp from a regexp node, using only its first two children:
# the first is treated as the body node (whose first child is the pattern
# string) and the second as the options node (whose children are option
# symbols).
#
# When an encoding option is present, the pattern string is transcoded
# before compilation. Precedence is :u, :e, :s, :n; only the first match is
# applied.
#
# Errors from String#encode or Regexp.new propagate to the caller.
def regexp_from_node(regexp_node)
  body_node, flags_node = *regexp_node
  pattern = [*body_node].first
  flags = [*flags_node]

  # Insertion order of this hash defines the precedence of the options.
  forced_encodings = {
    u: Encoding::UTF_8,
    e: Encoding::EUC_JP,
    s: Encoding::WINDOWS_31J,
    n: Encoding::BINARY
  }

  selected = forced_encodings.keys.find { |flag| flags.include?(flag) }
  pattern = pattern.encode(forced_encodings[selected]) if selected

  Regexp.new(pattern)
end
require 'rbconfig'

# Build a Markdown report comparing, for every sample program, the regexp
# encoding (or error) printed by a Ruby subprocess with the encoding (or
# error) obtained by rebuilding the regexp from the Parser AST node.
markdown = ''
markdown << "* Ruby: #{RUBY_ENGINE} #{RUBY_VERSION}\n"
markdown << "* Encoding of the source Parser was run on: #{''.encoding}\n"
markdown << "\n"
markdown << "Magic comment | Regexp | Ruby (Regexp encoding or error) | `Regexp.new().encoding` with Parser node or error | Parser regexp node \n"
markdown << "--------------|--------|---------------------------------|---------------------------------------------------|--------------------\n"

in_tmpdir do
  path = 'sample.rb'

  each_sample do |encoding, regexp, option, source|
    begin
      File.open(path, "w:#{encoding}") { |file| file.write(source) }
    rescue EncodingError
      # Only transcoding failures mean "this encoding cannot represent the
      # sample"; any other I/O error should surface instead of being hidden.
      next # Skip if the file encoding does not support the character
    end

    # Run the sample with the interpreter executing this script, so the
    # results match the RUBY_ENGINE/RUBY_VERSION reported in the header.
    # The array form avoids the shell; stderr is merged into stdout so error
    # messages end up in the table.
    ruby_result = IO.popen([RbConfig.ruby, path], err: [:child, :out], &:read)

    ast = Parser::CurrentRuby.parse_file(path)
    # The sample's only statement is `puts /.../.encoding`: children[2] is
    # the argument of `puts`, and its first child is the regexp node.
    regexp_node = ast.children[2].children[0]

    begin
      parser_result = regexp_from_node(regexp_node).encoding
    rescue => error
      parser_result = error.message
    end

    markdown << "#{encoding} | `/#{regexp}/#{option}` | #{ruby_result.chomp} | #{parser_result} | `#{regexp_node.inspect.gsub("\n", '')}`\n"
  end
end

# Written after in_tmpdir returns, so the report lands in the original
# working directory rather than in the temporary one.
File.write('result.md', markdown)
* Ruby: ruby 2.1.1
* Encoding of the source Parser was run on: UTF-8

Magic comment | Regexp | Ruby (Regexp encoding or error) | `Regexp.new().encoding` with Parser node or error | Parser regexp node
--------------|--------|---------------------------------|---------------------------------------------------|--------------------
utf-8 | `/abc/` | US-ASCII | US-ASCII | `(regexp (str "abc") (regopt))`
utf-8 | `/abc/u` | UTF-8 | US-ASCII | `(regexp (str "abc") (regopt :u))`
utf-8 | `/abc/n` | US-ASCII | US-ASCII | `(regexp (str "abc") (regopt :n))`
utf-8 | `/À/` | UTF-8 | UTF-8 | `(regexp (str "À") (regopt))`
utf-8 | `/À/u` | UTF-8 | UTF-8 | `(regexp (str "À") (regopt :u))`
utf-8 | `/À/n` | sample.rb:2: regexp encoding option 'n' differs from source encoding 'UTF-8' | U+00C0 from UTF-8 to ASCII-8BIT | `(regexp (str "À") (regopt :n))`
utf-8 | `/д/` | UTF-8 | UTF-8 | `(regexp (str "д") (regopt))`
utf-8 | `/д/u` | UTF-8 | UTF-8 | `(regexp (str "д") (regopt :u))`
utf-8 | `/д/n` | sample.rb:2: regexp encoding option 'n' differs from source encoding 'UTF-8' | U+0434 from UTF-8 to ASCII-8BIT | `(regexp (str "д") (regopt :n))`
utf-8 | `/\xff/` | sample.rb:2: invalid multibyte escape: /\xff/ | invalid multibyte escape: /\xff/ | `(regexp (str "\\xff") (regopt))`
utf-8 | `/\xff/u` | sample.rb:2: invalid multibyte escape: /\xff/ | invalid multibyte escape: /\xff/ | `(regexp (str "\\xff") (regopt :u))`
utf-8 | `/\xff/n` | ASCII-8BIT | ASCII-8BIT | `(regexp (str "\\xff") (regopt :n))`
us-ascii | `/abc/` | US-ASCII | US-ASCII | `(regexp (str "abc") (regopt))`
us-ascii | `/abc/u` | UTF-8 | US-ASCII | `(regexp (str "abc") (regopt :u))`
us-ascii | `/abc/n` | US-ASCII | US-ASCII | `(regexp (str "abc") (regopt :n))`
us-ascii | `/\xff/` | ASCII-8BIT | invalid multibyte escape: /\xff/ | `(regexp (str "\\xff") (regopt))`
us-ascii | `/\xff/u` | sample.rb:2: invalid multibyte escape: /\xff/ | invalid multibyte escape: /\xff/ | `(regexp (str "\\xff") (regopt :u))`
us-ascii | `/\xff/n` | ASCII-8BIT | ASCII-8BIT | `(regexp (str "\\xff") (regopt :n))`
ascii-8bit | `/abc/` | US-ASCII | US-ASCII | `(regexp (str "abc") (regopt))`
ascii-8bit | `/abc/u` | UTF-8 | US-ASCII | `(regexp (str "abc") (regopt :u))`
ascii-8bit | `/abc/n` | US-ASCII | US-ASCII | `(regexp (str "abc") (regopt :n))`
ascii-8bit | `/À/` | ASCII-8BIT | ASCII-8BIT | `(regexp (str "\xC3\x80") (regopt))`
ascii-8bit | `/À/u` | sample.rb:2: regexp encoding option 'u' differs from source encoding 'ASCII-8BIT' | "\xC3" from ASCII-8BIT to UTF-8 | `(regexp (str "\xC3\x80") (regopt :u))`
ascii-8bit | `/À/n` | ASCII-8BIT | ASCII-8BIT | `(regexp (str "\xC3\x80") (regopt :n))`
ascii-8bit | `/д/` | ASCII-8BIT | ASCII-8BIT | `(regexp (str "\xD0\xB4") (regopt))`
ascii-8bit | `/д/u` | sample.rb:2: regexp encoding option 'u' differs from source encoding 'ASCII-8BIT' | "\xD0" from ASCII-8BIT to UTF-8 | `(regexp (str "\xD0\xB4") (regopt :u))`
ascii-8bit | `/д/n` | ASCII-8BIT | ASCII-8BIT | `(regexp (str "\xD0\xB4") (regopt :n))`
ascii-8bit | `/\xff/` | ASCII-8BIT | ASCII-8BIT | `(regexp (str "\\xff") (regopt))`
ascii-8bit | `/\xff/u` | sample.rb:2: invalid multibyte escape: /\xff/ | invalid multibyte escape: /\xff/ | `(regexp (str "\\xff") (regopt :u))`
ascii-8bit | `/\xff/n` | ASCII-8BIT | ASCII-8BIT | `(regexp (str "\\xff") (regopt :n))`
koi8-r | `/abc/` | US-ASCII | US-ASCII | `(regexp (str "abc") (regopt))`
koi8-r | `/abc/u` | UTF-8 | US-ASCII | `(regexp (str "abc") (regopt :u))`
koi8-r | `/abc/n` | US-ASCII | US-ASCII | `(regexp (str "abc") (regopt :n))`
koi8-r | `/д/` | KOI8-R | UTF-8 | `(regexp (str "д") (regopt))`
koi8-r | `/д/u` | sample.rb:2: regexp encoding option 'u' differs from source encoding 'KOI8-R' | UTF-8 | `(regexp (str "д") (regopt :u))`
koi8-r | `/д/n` | sample.rb:2: regexp encoding option 'n' differs from source encoding 'KOI8-R' | U+0434 from UTF-8 to ASCII-8BIT | `(regexp (str "д") (regopt :n))`
koi8-r | `/\xff/` | KOI8-R | invalid multibyte escape: /\xff/ | `(regexp (str "\\xff") (regopt))`
koi8-r | `/\xff/u` | sample.rb:2: invalid multibyte escape: /\xff/ | invalid multibyte escape: /\xff/ | `(regexp (str "\\xff") (regopt :u))`
koi8-r | `/\xff/n` | ASCII-8BIT | ASCII-8BIT | `(regexp (str "\\xff") (regopt :n))`
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment