yujinakayama/generate_table.rb

## generate_table.rb
# coding: utf-8

require 'tmpdir'
require 'parser/current'

# Runs the given block with the working directory switched to a freshly
# created temporary directory.
#
# Dir.chdir (block form) restores the previous working directory and
# Dir.mktmpdir (block form) removes the directory once the block finishes.
# Returns whatever the block returns.
def in_tmpdir
  Dir.mktmpdir do |scratch_dir|
    Dir.chdir(scratch_dir) { yield }
  end
end

# Yields every combination of magic-comment encoding, regexp body and regexp
# option, along with a two-line Ruby program that prints the encoding of the
# resulting regexp literal.
#
# The block receives four arguments: encoding, regexp, option, source.
# Combinations are produced encoding-major, then regexp body, then option.
def each_sample
  magic_comments = ['utf-8', 'us-ascii', 'ascii-8bit', 'koi8-r']
  # '\xff' is single-quoted so the backslash escape is emitted verbatim into
  # the generated program instead of being interpreted here.
  bodies = ['abc', 'À', 'д', '\xff']
  flags = ['', 'u', 'n']

  magic_comments.each do |encoding|
    bodies.product(flags) do |regexp, option|
      source = [
        "# coding: #{encoding}",
        "puts /#{regexp}/#{option}.encoding"
      ].join("\n")
      yield encoding, regexp, option, source
    end
  end
end

# Derived from https://github.com/whitequark/parser/blob/v2.1.7/lib/parser/builders/default.rb#L707-L726
#
# Builds a Regexp from a regexp node whose first child carries the pattern
# text and whose second child carries the option symbols. When an encoding
# option (:u, :e, :s, :n — checked in that order) is present, the pattern
# text is transcoded to the matching encoding before Regexp.new is called.
#
# Any error raised by String#encode or Regexp.new propagates to the caller.
def regexp_from_node(regexp_node)
  body_node, option_node = *regexp_node
  pattern = [*body_node].first
  flags = [*option_node]

  # Insertion order matters: the first matching flag wins.
  forced_encodings = {
    u: Encoding::UTF_8,
    e: Encoding::EUC_JP,
    s: Encoding::WINDOWS_31J,
    n: Encoding::BINARY
  }
  _flag, forced = forced_encodings.find { |flag, _| flags.include?(flag) }
  pattern = pattern.encode(forced) if forced

  Regexp.new(pattern)
end

require 'rbconfig'

# Report generation: for every sample produced by each_sample, write the
# sample program to disk, record what Ruby itself reports for the regexp
# literal, record what regexp_from_node yields for the node Parser produces,
# and append one Markdown table row. The finished table is written to
# result.md in the directory the script was started from.
markdown = ''
markdown << "* Ruby: #{RUBY_ENGINE} #{RUBY_VERSION}\n"
markdown << "* Encoding of the source Parser was run on: #{''.encoding}\n"
markdown << "\n"
markdown << "Magic comment | Regexp | Ruby (Regexp encoding or error) | `Regexp.new().encoding` with Parser node or error | Parser regexp node \n"
markdown << "--------------|--------|---------------------------------|---------------------------------------------------|--------------------\n"

in_tmpdir do
  path = 'sample.rb'

  each_sample do |encoding, regexp, option, source|
    begin
      File.open(path, "w:#{encoding}") { |file| file.write(source) }
    rescue EncodingError
      # Only transcoding failures mean "this encoding cannot represent the
      # character"; any other I/O error should surface instead of being
      # silently treated as a skipped sample.
      next # Skip if the file encoding does not support the character
    end

    # Run the sample with the same interpreter whose engine/version is printed
    # in the report header (not whichever `ruby` happens to be first on PATH),
    # capturing stderr together with stdout. The array form avoids the shell.
    ruby_result = IO.popen([RbConfig.ruby, path, err: [:child, :out]], &:read)

    ast = Parser::CurrentRuby.parse_file(path)
    # The sample parses to (send nil :puts (send (regexp ...) :encoding)),
    # so the regexp node is the receiver of the puts argument.
    regexp_node = ast.children[2].children[0]

    begin
      parser_result = regexp_from_node(regexp_node).encoding
    rescue => error
      parser_result = error.message
    end

    markdown << "#{encoding} | `/#{regexp}/#{option}` | #{ruby_result.chomp} | #{parser_result} | `#{regexp_node.inspect.gsub("\n", '')}`\n"
  end
end

File.write('result.md', markdown)

## result.md

      
    Raw
  

              result.md
            
          
Ruby: ruby 2.1.1
Encoding of the source Parser was run on: UTF-8


Magic comment
Regexp
Ruby (Regexp encoding or error)
Regexp.new().encoding with Parser node or error
Parser regexp node


utf-8
/abc/
US-ASCII
US-ASCII
(regexp  (str "abc")  (regopt))


utf-8
/abc/u
UTF-8
US-ASCII
(regexp  (str "abc")  (regopt :u))


utf-8
/abc/n
US-ASCII
US-ASCII
(regexp  (str "abc")  (regopt :n))


utf-8
/À/
UTF-8
UTF-8
(regexp  (str "À")  (regopt))


utf-8
/À/u
UTF-8
UTF-8
(regexp  (str "À")  (regopt :u))


utf-8
/À/n
sample.rb:2: regexp encoding option 'n' differs from source encoding 'UTF-8'
U+00C0 from UTF-8 to ASCII-8BIT
(regexp  (str "À")  (regopt :n))


utf-8
/д/
UTF-8
UTF-8
(regexp  (str "д")  (regopt))


utf-8
/д/u
UTF-8
UTF-8
(regexp  (str "д")  (regopt :u))


utf-8
/д/n
sample.rb:2: regexp encoding option 'n' differs from source encoding 'UTF-8'
U+0434 from UTF-8 to ASCII-8BIT
(regexp  (str "д")  (regopt :n))


utf-8
/\xff/
sample.rb:2: invalid multibyte escape: /\xff/
invalid multibyte escape: /\xff/
(regexp  (str "\\xff")  (regopt))


utf-8
/\xff/u
sample.rb:2: invalid multibyte escape: /\xff/
invalid multibyte escape: /\xff/
(regexp  (str "\\xff")  (regopt :u))


utf-8
/\xff/n
ASCII-8BIT
ASCII-8BIT
(regexp  (str "\\xff")  (regopt :n))


us-ascii
/abc/
US-ASCII
US-ASCII
(regexp  (str "abc")  (regopt))


us-ascii
/abc/u
UTF-8
US-ASCII
(regexp  (str "abc")  (regopt :u))


us-ascii
/abc/n
US-ASCII
US-ASCII
(regexp  (str "abc")  (regopt :n))


us-ascii
/\xff/
ASCII-8BIT
invalid multibyte escape: /\xff/
(regexp  (str "\\xff")  (regopt))


us-ascii
/\xff/u
sample.rb:2: invalid multibyte escape: /\xff/
invalid multibyte escape: /\xff/
(regexp  (str "\\xff")  (regopt :u))


us-ascii
/\xff/n
ASCII-8BIT
ASCII-8BIT
(regexp  (str "\\xff")  (regopt :n))


ascii-8bit
/abc/
US-ASCII
US-ASCII
(regexp  (str "abc")  (regopt))


ascii-8bit
/abc/u
UTF-8
US-ASCII
(regexp  (str "abc")  (regopt :u))


ascii-8bit
/abc/n
US-ASCII
US-ASCII
(regexp  (str "abc")  (regopt :n))


ascii-8bit
/À/
ASCII-8BIT
ASCII-8BIT
(regexp  (str "\xC3\x80")  (regopt))


ascii-8bit
/À/u
sample.rb:2: regexp encoding option 'u' differs from source encoding 'ASCII-8BIT'
"\xC3" from ASCII-8BIT to UTF-8
(regexp  (str "\xC3\x80")  (regopt :u))


ascii-8bit
/À/n
ASCII-8BIT
ASCII-8BIT
(regexp  (str "\xC3\x80")  (regopt :n))


ascii-8bit
/д/
ASCII-8BIT
ASCII-8BIT
(regexp  (str "\xD0\xB4")  (regopt))


ascii-8bit
/д/u
sample.rb:2: regexp encoding option 'u' differs from source encoding 'ASCII-8BIT'
"\xD0" from ASCII-8BIT to UTF-8
(regexp  (str "\xD0\xB4")  (regopt :u))


ascii-8bit
/д/n
ASCII-8BIT
ASCII-8BIT
(regexp  (str "\xD0\xB4")  (regopt :n))


ascii-8bit
/\xff/
ASCII-8BIT
ASCII-8BIT
(regexp  (str "\\xff")  (regopt))


ascii-8bit
/\xff/u
sample.rb:2: invalid multibyte escape: /\xff/
invalid multibyte escape: /\xff/
(regexp  (str "\\xff")  (regopt :u))


ascii-8bit
/\xff/n
ASCII-8BIT
ASCII-8BIT
(regexp  (str "\\xff")  (regopt :n))


koi8-r
/abc/
US-ASCII
US-ASCII
(regexp  (str "abc")  (regopt))


koi8-r
/abc/u
UTF-8
US-ASCII
(regexp  (str "abc")  (regopt :u))


koi8-r
/abc/n
US-ASCII
US-ASCII
(regexp  (str "abc")  (regopt :n))


koi8-r
/д/
KOI8-R
UTF-8
(regexp  (str "д")  (regopt))


koi8-r
/д/u
sample.rb:2: regexp encoding option 'u' differs from source encoding 'KOI8-R'
UTF-8
(regexp  (str "д")  (regopt :u))


koi8-r
/д/n
sample.rb:2: regexp encoding option 'n' differs from source encoding 'KOI8-R'
U+0434 from UTF-8 to ASCII-8BIT
(regexp  (str "д")  (regopt :n))


koi8-r
/\xff/
KOI8-R
invalid multibyte escape: /\xff/
(regexp  (str "\\xff")  (regopt))


koi8-r
/\xff/u
sample.rb:2: invalid multibyte escape: /\xff/
invalid multibyte escape: /\xff/
(regexp  (str "\\xff")  (regopt :u))


koi8-r
/\xff/n
ASCII-8BIT
ASCII-8BIT
(regexp  (str "\\xff")  (regopt :n))
	# coding: utf-8

	require 'tmpdir'
	require 'parser/current'

	def in_tmpdir
	Dir.mktmpdir do \|tmpdir\|
	Dir.chdir(tmpdir) do
	yield
	end
	end
	end

	def each_sample
	encodings = [
	'utf-8',
	'us-ascii',
	'ascii-8bit',
	'koi8-r'
	]

	regexps = [
	'abc',
	'À',
	'д',
	'\xff'
	]

	options = [
	'',
	'u',
	'n'
	]

	encodings.each do \|encoding\|
	regexps.each do \|regexp\|
	options.each do \|option\|
	source = <<-END.gsub(/^\s+\\|/, '').chomp
	\|# coding: #{encoding}
	\|puts /#{regexp}/#{option}.encoding
	END
	yield encoding, regexp, option, source
	end
	end
	end
	end

	# Derived from https://github.com/whitequark/parser/blob/v2.1.7/lib/parser/builders/default.rb#L707-L726
	def regexp_from_node(regexp_node)
	str_node, opt_node = *regexp_node
	regexp_body, = *str_node
	regexp_opt = opt_node

	regexp_body = case
	when regexp_opt.include?(:u)
	regexp_body.encode(Encoding::UTF_8)
	when regexp_opt.include?(:e)
	regexp_body.encode(Encoding::EUC_JP)
	when regexp_opt.include?(:s)
	regexp_body.encode(Encoding::WINDOWS_31J)
	when regexp_opt.include?(:n)
	regexp_body.encode(Encoding::BINARY)
	else
	regexp_body
	end

	Regexp.new(regexp_body)
	end

	markdown = ''
	markdown << "* Ruby: #{RUBY_ENGINE} #{RUBY_VERSION}\n"
	markdown << "* Encoding of the source Parser was run on: #{''.encoding}\n"
	markdown << "\n"
	markdown << "Magic comment \| Regexp \| Ruby (Regexp encoding or error) \| `Regexp.new().encoding` with Parser node or error \| Parser regexp node \n"
	markdown << "--------------\|--------\|---------------------------------\|---------------------------------------------------\|--------------------\n"

	in_tmpdir do
	path = 'sample.rb'

	each_sample do \|encoding, regexp, option, source\|
	begin
	File.open(path, "w:#{encoding}") do \|file\|
	file.write(source)
	end
	rescue
	next # Skip if the file encoding does not support the character
	end

	ruby_result = `ruby #{path} 2>&1`

	ast = Parser::CurrentRuby.parse_file(path)
	regexp_node = ast.children[2].children[0]

	begin
	parser_result = regexp_from_node(regexp_node).encoding
	rescue => error
	parser_result = error.message
	end

	markdown << "#{encoding} \| `/#{regexp}/#{option}` \| #{ruby_result.chomp} \| #{parser_result} \| `#{regexp_node.inspect.gsub("\n", '')}`\n"
	end
	end

	File.write('result.md', markdown)
Magic comment	Regexp	Ruby (Regexp encoding or error)	`Regexp.new().encoding` with Parser node or error	Parser regexp node
utf-8	`/abc/`	US-ASCII	US-ASCII	`(regexp (str "abc") (regopt))`
utf-8	`/abc/u`	UTF-8	US-ASCII	`(regexp (str "abc") (regopt :u))`
utf-8	`/abc/n`	US-ASCII	US-ASCII	`(regexp (str "abc") (regopt :n))`
utf-8	`/À/`	UTF-8	UTF-8	`(regexp (str "À") (regopt))`
utf-8	`/À/u`	UTF-8	UTF-8	`(regexp (str "À") (regopt :u))`
utf-8	`/À/n`	sample.rb:2: regexp encoding option 'n' differs from source encoding 'UTF-8'	U+00C0 from UTF-8 to ASCII-8BIT	`(regexp (str "À") (regopt :n))`
utf-8	`/д/`	UTF-8	UTF-8	`(regexp (str "д") (regopt))`
utf-8	`/д/u`	UTF-8	UTF-8	`(regexp (str "д") (regopt :u))`
utf-8	`/д/n`	sample.rb:2: regexp encoding option 'n' differs from source encoding 'UTF-8'	U+0434 from UTF-8 to ASCII-8BIT	`(regexp (str "д") (regopt :n))`
utf-8	`/\xff/`	sample.rb:2: invalid multibyte escape: /\xff/	invalid multibyte escape: /\xff/	`(regexp (str "\\xff") (regopt))`
utf-8	`/\xff/u`	sample.rb:2: invalid multibyte escape: /\xff/	invalid multibyte escape: /\xff/	`(regexp (str "\\xff") (regopt :u))`
utf-8	`/\xff/n`	ASCII-8BIT	ASCII-8BIT	`(regexp (str "\\xff") (regopt :n))`
us-ascii	`/abc/`	US-ASCII	US-ASCII	`(regexp (str "abc") (regopt))`
us-ascii	`/abc/u`	UTF-8	US-ASCII	`(regexp (str "abc") (regopt :u))`
us-ascii	`/abc/n`	US-ASCII	US-ASCII	`(regexp (str "abc") (regopt :n))`
us-ascii	`/\xff/`	ASCII-8BIT	invalid multibyte escape: /\xff/	`(regexp (str "\\xff") (regopt))`
us-ascii	`/\xff/u`	sample.rb:2: invalid multibyte escape: /\xff/	invalid multibyte escape: /\xff/	`(regexp (str "\\xff") (regopt :u))`
us-ascii	`/\xff/n`	ASCII-8BIT	ASCII-8BIT	`(regexp (str "\\xff") (regopt :n))`
ascii-8bit	`/abc/`	US-ASCII	US-ASCII	`(regexp (str "abc") (regopt))`
ascii-8bit	`/abc/u`	UTF-8	US-ASCII	`(regexp (str "abc") (regopt :u))`
ascii-8bit	`/abc/n`	US-ASCII	US-ASCII	`(regexp (str "abc") (regopt :n))`
ascii-8bit	`/À/`	ASCII-8BIT	ASCII-8BIT	`(regexp (str "\xC3\x80") (regopt))`
ascii-8bit	`/À/u`	sample.rb:2: regexp encoding option 'u' differs from source encoding 'ASCII-8BIT'	"\xC3" from ASCII-8BIT to UTF-8	`(regexp (str "\xC3\x80") (regopt :u))`
ascii-8bit	`/À/n`	ASCII-8BIT	ASCII-8BIT	`(regexp (str "\xC3\x80") (regopt :n))`
ascii-8bit	`/д/`	ASCII-8BIT	ASCII-8BIT	`(regexp (str "\xD0\xB4") (regopt))`
ascii-8bit	`/д/u`	sample.rb:2: regexp encoding option 'u' differs from source encoding 'ASCII-8BIT'	"\xD0" from ASCII-8BIT to UTF-8	`(regexp (str "\xD0\xB4") (regopt :u))`
ascii-8bit	`/д/n`	ASCII-8BIT	ASCII-8BIT	`(regexp (str "\xD0\xB4") (regopt :n))`
ascii-8bit	`/\xff/`	ASCII-8BIT	ASCII-8BIT	`(regexp (str "\\xff") (regopt))`
ascii-8bit	`/\xff/u`	sample.rb:2: invalid multibyte escape: /\xff/	invalid multibyte escape: /\xff/	`(regexp (str "\\xff") (regopt :u))`
ascii-8bit	`/\xff/n`	ASCII-8BIT	ASCII-8BIT	`(regexp (str "\\xff") (regopt :n))`
koi8-r	`/abc/`	US-ASCII	US-ASCII	`(regexp (str "abc") (regopt))`
koi8-r	`/abc/u`	UTF-8	US-ASCII	`(regexp (str "abc") (regopt :u))`
koi8-r	`/abc/n`	US-ASCII	US-ASCII	`(regexp (str "abc") (regopt :n))`
koi8-r	`/д/`	KOI8-R	UTF-8	`(regexp (str "д") (regopt))`
koi8-r	`/д/u`	sample.rb:2: regexp encoding option 'u' differs from source encoding 'KOI8-R'	UTF-8	`(regexp (str "д") (regopt :u))`
koi8-r	`/д/n`	sample.rb:2: regexp encoding option 'n' differs from source encoding 'KOI8-R'	U+0434 from UTF-8 to ASCII-8BIT	`(regexp (str "д") (regopt :n))`
koi8-r	`/\xff/`	KOI8-R	invalid multibyte escape: /\xff/	`(regexp (str "\\xff") (regopt))`
koi8-r	`/\xff/u`	sample.rb:2: invalid multibyte escape: /\xff/	invalid multibyte escape: /\xff/	`(regexp (str "\\xff") (regopt :u))`
koi8-r	`/\xff/n`	ASCII-8BIT	ASCII-8BIT	`(regexp (str "\\xff") (regopt :n))`