nurse/abnf2onig.rb

## abnf2onig.rb
#!/usr/local/bin/ruby
# If you want a general-purpose ABNF-to-Regexp converter, see the projects below.
# http://www.a-k-r.org/abnf/
# https://github.com/martinthomson/abnf2regex
require 'strscan'

# Token patterns shared by the ABNF scanner.

# A rule name: a letter followed by letters, digits or "-".
RULENAME_ = /[A-Za-z][A-Za-z0-9\-]*/
# One or more "c-wsp" items: blanks/tabs, where a line break (optionally
# preceded by a ";" comment) only counts when the next line is indented.
C_WSPP_ = /[ \t]+(?:(?:;.*)?\n[ \t]+)*|(?:(?:;.*)?\n[ \t]+)+/
# Zero or more "c-wsp" items (same shape as C_WSPP_, but may match nothing).
C_WSPS_ = /[ \t]*(?:(?:;.*)?\n[ \t]+)*/
# "c-nl": an optional ";" comment followed by a newline.
C_NL_ = /(?:;.*)?\n/
# A run of decimal digits, as used in repeat counts.
DIGITS_ = /[0-9]+/

# Ordered set of integers (code points) kept as a flat, sorted array of
# inclusive range bounds: [from0, to0, from1, to1, ...].  Overlapping and
# adjacent ranges are merged on insertion.
#
# With an encoding the set renders as regexp source (a bracket expression, a
# shorthand class or a single escaped character); without one it renders as a
# list of decimal numbers, which is what the tests use.
class CC
  # Bytes that are written with a leading backslash: regexp metacharacters
  # plus the characters that are special inside a bracket expression.
  ESCAPED = '\\^$.|?*+()[]{}-'.bytes.freeze

  # encoding - nil for the numeric (debug) rendering, anything else for the
  #            regexp rendering.
  def initialize(encoding=nil)
    @encoding = encoding
    @cc = []
  end

  # Adds an Integer, a Range of integers, or every range of another CC.
  # Returns self so calls can be chained.
  # Raises TypeError for any other kind of value.
  def <<(v)
    case v
    when Integer # Fixnum was merged into Integer and removed in Ruby 3.2
      _bsearch_add2(v, v)
    when Range
      _bsearch_add2(v.first, v.last)
    when CC
      # dup: adding a set to itself must not iterate the array being edited
      v.to_a.dup.each_slice(2) do |from, to|
        _bsearch_add2(from, to)
      end
    else
      raise TypeError, "invalid value to add a charclass #{v.inspect}"
    end
    self
  end

  # Renders the set.  A single character in regexp mode is returned bare
  # (escaped if needed); [0-9] and [0-9A-F] use the \d and \h shorthands.
  def to_s
    return to_char(@cc.first) if @encoding && @cc.size == 2 && @cc.first == @cc.last

    case @cc
    when [0x30, 0x39] # [0-9]
      return '\d'
    when [0x30, 0x39, 0x41, 0x46] # [0-9A-F]
      return '\h'
    end

    numeric = @encoding.nil?
    buf = String.new('[')
    @cc.each_slice(2).with_index do |(from, to), i|
      buf << ',' if numeric && i > 0
      buf << to_char(from)
      next if from == to
      if to - from > 1
        buf << '-'
      elsif numeric
        # two neighbouring numbers are listed, not written as a range
        buf << ','
      end
      buf << to_char(to)
    end
    buf << ']'
  end

  def inspect
    to_s.inspect[1..-1]
  end

  # The internal boundary array (not a copy).
  def to_a
    @cc
  end

  def empty?
    @cc.empty?
  end

  private
  # Renders one boundary value: decimal digits in numeric mode, otherwise a
  # character that is safe both inside and outside a bracket expression.
  def to_char(v)
    return v.to_s unless @encoding

    case v
    when *ESCAPED
      "\\#{v.chr}"
    when 0x09
      '\t'
    when 0x0a
      '\n'
    when 0x0d
      '\r'
    when 0x20..0x7E
      v.chr
    when 0..0x7F
      '\x%02x' % v
    when 0..0xFFFF
      '\u%04x' % v
    else
      '\u{%x}' % v
    end
  end

  # Returns the index of a boundary equal to v, or else the index of the
  # first boundary greater than v (@cc.length when v is above everything).
  # An odd result therefore means v lies inside an existing range.
  def _bsearch(v)
    if @cc.empty? || v < @cc.first
      return 0
    elsif @cc.last < v
      return @cc.length
    end
    low = 0
    high = @cc.length - 1
    while low < high
      mid = (low + high) / 2
      case v <=> @cc[mid]
      when 1 # >
        low = mid + 1
        return low if v < @cc[low]
      when -1 # <
        high = mid - 1
        return mid if @cc[high] < v
      else # =
        return mid # it already has the val
      end
    end
    return low if @cc[low] == v
    # Not expected to happen while @cc is sorted; kept as an internal assertion.
    raise [low, high, mid].inspect
  end

  # Inserts the inclusive range from..to, widening it to swallow every range
  # it overlaps or touches, then splices the merged pair into @cc.
  def _bsearch_add2(from, to)
    pos1 = _bsearch(from)
    pos2 = from == to ? pos1 : _bsearch(to)
    if pos1.odd?
      # from is inside an existing range: start at that range's lower bound
      pos1 -= 1
      from = @cc[pos1]
    elsif pos1 > 0 && @cc[pos1-1] == from - 1
      # from directly follows the previous range: merge with it
      pos1 -= 2
      from = @cc[pos1]
    end
    if pos2.odd?
      # to is inside an existing range: end at that range's upper bound
      to = @cc[pos2]
    elsif to + 1 == @cc[pos2] || to == @cc[pos2]
      # to touches or equals the next range's lower bound: merge with it
      pos2 += 1
      to = @cc[pos2]
    else
      pos2 -= 1
    end
    @cc[pos1, pos2-pos1+1] = from, to
  end
end
#__END__

#require 'test/unit'
class TestCC#< Test::Unit::TestCase
  # Adds single integers one at a time and checks the numeric rendering of
  # the set after every insertion.
  def test_add
    cc = CC.new
    assert_equal('[]', cc.to_s)
    steps = [
      [1, '[1]'],
      [2, '[1,2]'],
      [2, '[1,2]'],
      [0, '[0-2]'],
      [9, '[0-2,9]'],
      [6, '[0-2,6,9]'],
      [3, '[0-3,6,9]'],
      [0, '[0-3,6,9]'],
      [5, '[0-3,5,6,9]'],
      [7, '[0-3,5-7,9]'],
      [4, '[0-7,9]'],
    ]
    steps.each do |value, expected|
      cc << value
      assert_equal(expected, cc.to_s)
    end
  end

  # Same idea with ranges: each scenario starts from an empty set and checks
  # the rendering after every range is merged in.
  def test_add2
    scenarios = [
      [
        [3..4, '[3,4]'],
        [1..2, '[1-4]'],
        [5..6, '[1-6]'],
        [8..9, '[1-6,8,9]'],
        [7..7, '[1-9]'],
        [2..4, '[1-9]'],
        [1..4, '[1-9]'],
        [7..9, '[1-9]'],
        [1..9, '[1-9]'],
      ],
      [
        [5..5, '[5]'],
        [3..3, '[3,5]'],
        [1..4, '[1-5]'],
      ],
    ]
    scenarios.each do |steps|
      cc = CC.new
      assert_equal('[]', cc.to_s)
      steps.each do |range, expected|
        cc << range
        assert_equal(expected, cc.to_s)
      end
    end
  end
end

# An ABNF alternation ("a / b / c"): a list of alternatives, normally
# Concatenation objects pushed by the parser.  A bracketed option "[ ... ]"
# is an Alternation whose #repeat is [0, 1].
class Alternation
  # nil, or a [min, max] pair applied to the whole group by #to_s.
  attr_accessor :repeat

  # parser   - object whose #rules maps rule-name Strings to Alternations.
  # encoding - handed to the CC that collects single-character alternatives.
  def initialize(parser, encoding)
    @parser = parser
    @encoding = encoding
    @ary = []
    @repeat = nil
  end

  # Appends one alternative.  Raises RuntimeError for nil.
  def <<(concatenation)
    raise "nil" if concatenation.nil?
    @ary << concatenation
  end

  # True when a quantifier may be appended directly to #to_s.  A group that
  # already carries its own repeat is never atomic: appending to "x?" would
  # produce a lazy or nested quantifier instead of repeating the group.
  def atomic?
    @repeat.nil? && body_atomic?
  end

  def empty?
    @ary.empty?
  end

  # True when the group is equivalent to one character class.  An optional
  # or repeated group is not, so it is never merged into a parent's class.
  def cc?
    @repeat.nil? && body_cc?
  end

  def single?
    @ary.size <= 1
  end

  # The internal list of alternatives (not a copy).
  def to_a
    @ary
  end

  # Regexp source for the group: string alternatives first, then the merged
  # character class, joined by "|", followed by the group's own repeat.
  def to_s
    strs, cc = to_s0
    parts = strs.map { |e| e.to_s }
    parts << cc.to_s if cc
    return '' if parts.empty?
    str = parts.join('|')

    case @repeat
    when nil
      str
    when [0, 0]
      ""
    when [0, 1], [nil, 1]
      body_atomic? ? "#{str}?" : "(?:#{str})?"
    else
      "(?:#{str}){#{@repeat.join(",")}}"
    end
  end

  private
  # Atomicity of the alternatives alone, ignoring @repeat.
  def body_atomic?
    return true if @ary.empty?
    return body_cc? if @ary.size > 1

    case e = @ary.first
    when Integer, Range
      true
    when String
      e.size <= 1
    when Symbol
      @parser.rules[e.to_s].atomic?
    when Alternation, Concatenation, Repetition
      e.atomic?
    else
      raise "unexpected alternative #{e.inspect}"
    end
  end

  # True when every alternative is a single character or character class.
  def body_cc?
    @ary.all? do |e|
      case e
      when Integer, Range
        true
      when String
        e.size == 1
      when Symbol
        @parser.rules[e.to_s].cc?
      when Alternation, Concatenation
        e.cc?
      else
        raise "unexpected alternative #{e.inspect}"
      end
    end
  end

  # Flattens the alternatives into [strings, char_class].  Nested groups that
  # are pure character classes are unpacked round by round so that all single
  # characters end up in one CC; everything else is rendered to a string.
  # char_class is nil when no single character was found.
  def to_s0
    strs = []
    cc = nil

    pending = @ary
    until pending.empty?
      unpacked = []
      pending.each do |e|
        case e
        when Range, Integer
          cc ||= CC.new(@encoding)
          cc << e
        when String
          strs << e
        when Alternation
          unpacked.concat e.to_a
        when Concatenation
          # an empty concatenation expands to nil and contributes nothing
          expanded = e.expand
          unpacked << expanded unless expanded.nil?
        when Repetition
          inner = e.element
          if !e.cc?
            strs << e.to_s
          elsif Alternation === inner || Concatenation === inner
            unpacked << inner
          else
            cc ||= CC.new(@encoding)
            # CC only accepts integers and ranges
            cc << (inner.is_a?(String) ? inner.ord : inner)
          end
        else
          raise e.inspect
        end
      end
      pending = unpacked
    end

    return strs, cc
  end
end

# An ABNF concatenation ("a b c"): an ordered list of items, normally
# Repetition objects pushed by the parser.
class Concatenation
  def initialize(parser, encoding)
    @parser = parser
    @encoding = encoding
    @ary = []
  end

  # Appends an item.  Empty Arrays, Strings, Alternations and Concatenations
  # are ignored.  Returns the internal item list.  Raises RuntimeError for nil.
  def <<(item)
    raise "nil" if item.nil?
    case item
    when Array, String, Alternation, Concatenation
      return @ary if item.empty?
    end
    @ary << item
  end

  # Regexp source of a single item.
  def e2s(e)
    e.to_s
  end

  # Returns nil when empty, the sole item itself when there is exactly one
  # (so the caller can still inspect its type), and otherwise a new String
  # with the source of every item joined together.  The stored items are
  # never modified, so repeated calls give the same result.
  def expand
    return @ary.first if @ary.size <= 1
    @ary.map { |e| e2s(e) }.join
  end

  # True when a quantifier may be appended directly to the expanded source:
  # only an empty concatenation or a single atomic item qualifies.
  def atomic?
    return true if @ary.empty?
    return false if @ary.size > 1

    case item = @ary.first
    when Integer, Range
      true
    when String
      item.size == 1
    when Symbol
      @parser.rules[item.to_s].atomic?
    when Alternation, Concatenation, Repetition
      item.atomic?
    else
      raise item.inspect
    end
  end

  # True when the concatenation is exactly one item that is a single
  # character or character class.
  def cc?
    return false if @ary.size != 1

    case item = @ary.first
    when Integer, Range
      true
    when String
      item.size == 1
    when Symbol
      @parser.rules[item.to_s].cc?
    when Alternation, Concatenation, Repetition
      item.cc?
    else
      raise item.inspect
    end
  end

  def empty?
    @ary.empty?
  end
end

# One ABNF repetition: an element plus an optional quantifier.
#
# element - Integer (code point), Range of code points, String (regexp
#           source), Symbol (rule name), Alternation or Concatenation.
# repeat  - nil, or the quantifier to append; the parser passes regexp
#           source such as "*", "+" or "{1,4}".  The array forms [0, 1] and
#           [nil, 1] are also accepted and mean "optional".
class Repetition
  # Integer and Range elements are turned into characters with this encoding.
  DEFAULT_ENCODING = Encoding::UTF_8

  def initialize(parser, element, repeat,rulename=nil)
    @parser = parser
    @element = element
    @repeat = repeat
    @rulename = rulename
  end

  # True when a quantifier may be appended directly to #to_s.  Anything that
  # already ends in its own quantifier is not atomic: appending "?" to "x+"
  # would make it lazy, and appending "+" would make it possessive.
  def atomic?
    @repeat.nil? && element_atomic?
  end

  # True when this is an unquantified single character or character class.
  def cc?
    return false unless @repeat.nil?
    case element = @element
    when Integer, Range
      true
    when String
      element.size == 1
    when Symbol
      rule = @parser.rules[element.to_s]
      rule ? rule.cc? : false
    when Alternation, Concatenation
      element.cc?
    else
      raise element.inspect
    end
  end

  def single?
    case @element
    when Integer, Range, String, Symbol
      true
    when Alternation, Concatenation, Repetition
      @element.single?
    else
      raise @element.inspect
    end
  end

  # The element, with a rule-name Symbol resolved to the rule's definition
  # (nil when the rule is not defined).
  def element
    case element = @element
    when Integer, Range, String
      element
    when Symbol
      @parser.rules[element.to_s]
    when Alternation, Concatenation
      element
    else
      raise element.inspect
    end
  end

  # Regexp source for the element followed by its quantifier.  The element is
  # wrapped in (?:...) whenever the quantifier would otherwise bind to only
  # part of it.
  def to_s
    # element_source may update the parser's autoexpand table, so it must run
    # before the atomicity check, as before.
    str = element_source

    case @repeat
    when nil
      str
    when [0, 1], [nil, 1]
      element_atomic? ? "#{str}?" : "(?:#{str})?"
    else
      element_atomic? ? "#{str}#@repeat" : "(?:#{str})#@repeat"
    end
  end

  private
  # Whether the element alone (ignoring @repeat) can take a quantifier
  # without being grouped.  A reference to an undefined rule is rendered as
  # a \g<name> call, which is atomic.
  def element_atomic?
    case element = @element
    when Integer, Range
      true
    when String
      element.size == 1
    when Symbol
      rule = @parser.rules[element.to_s]
      rule.nil? || rule.atomic?
    when Alternation, Concatenation, Repetition
      element.atomic?
    else
      raise element.inspect
    end
  end

  # Regexp source for the element without any quantifier.
  def element_source
    element = @element
    case element
    when Integer
      Regexp.quote(element.chr(DEFAULT_ENCODING))
    when Range
      # endpoints such as "[", "]", "\" or "^" must be escaped inside brackets
      first = Regexp.quote(element.first.chr(DEFAULT_ENCODING))
      last = Regexp.quote(element.last.chr(DEFAULT_ENCODING))
      "[#{first}-#{last}]"
    when String
      element
    when Alternation
      group_if_needed(element)
    when Symbol
      rule_source(element)
    else
      raise "unknown type of element #{element.inspect}"
    end
  end

  # Source for a rule reference: inlined for core and explicitly expanded
  # rules, a named group on the first use of an auto-expanded rule, and a
  # \g<name> subexpression call otherwise.
  def rule_source(name)
    r = @parser.rules[name.to_s]
    unless r
      warn "undefined rule #{name.inspect} referenced from #{@rulename.inspect}"
      return "\\g<#{name}>"
    end

    if Grammer::CORE_RULE_NAMES.include?(name) ||
      @parser.expand_rules.include?(name)
      group_if_needed(r)
    elsif @parser.autoexpand_rules[name]
      str = "(?<#{name}>#{r})"
      @parser.autoexpand_rules[name] = false
      str
    else
      "\\g<#{name}>"
    end
  end

  # Renders an alternation that is about to be placed inline.  Without a
  # quantifier of our own nothing else would group it, so "a|b" followed by
  # "c" would read as "a" or "bc"; wrap it unless it is a single branch, a
  # character class, or already carries its own repeat.
  def group_if_needed(alt)
    str = alt.to_s
    bare = @repeat.nil? && alt.repeat.nil? && !alt.single? && !alt.cc?
    bare ? "(?:#{str})" : str
  end
end

# Parses ABNF source into a table of rules and builds a Regexp for a chosen
# rule.  Several sources may be parsed into the same instance; a later
# definition of a rule replaces the earlier one (with a warning).
class Grammer
  attr_reader :encoding, :expand_rules, :autoexpand_rules, :rules
  attr_accessor :rulename

  # Rules that are always inlined instead of being emitted as named groups.
  CORE_RULE_NAMES = %i[ALPHA BIT CHAR CR CRLF CTL DIGIT DQUOTE HEXDIG HTAB LF LWSP OCTET SP VCHAR WSP OWS RWS BWS word token tchar special].freeze

  # src      - optional ABNF text to parse immediately.
  # encoding - optional encoding applied to src before parsing.
  def initialize(src=nil, encoding=nil)
    @rules = {}
    @refs = {}
    @rulename = nil
    @parser = self
    @encoding = encoding
    @expand_rules = nil
    @autoexpand_rules = nil
    parse(src, encoding) if src
  end

  # Scans the digits of a num-val (after "%b", "%d" or "%x").
  #
  # ss   - StringScanner positioned on the first digit.
  # reg  - pattern matching one run of digits in the given base.
  # base - numeric base of the digits.
  #
  # Returns an Integer for a single value, a Range for "lo-hi", and for a
  # dotted series ("0D.0A") a String of regexp source matching the characters
  # in order.  Raises RuntimeError when digits are missing.
  def scan_digits(ss, reg, base)
    c = ss.scan(reg) or raise 'missing digits in num-val'
    case ss.scan(/[.\-]/)
    when '.'
      encoding = ss.string.encoding
      chrs = c.to_i(base).chr(encoding)
      # The first "." is already consumed, so read a value before looking
      # for the next separator.
      loop do
        c = ss.scan(reg) or raise 'missing concatenated digits in num-val'
        chrs << c.to_i(base).chr(encoding)
        break unless ss.skip(/\./)
      end
      # String elements are used verbatim as regexp source, like char-vals.
      Regexp.quote(chrs)
    when '-'
      d = ss.scan(reg) or raise 'missing the end of range num-val'
      c.to_i(base)..d.to_i(base)
    else
      c.to_i(base)
    end
  end

  # Scans an optional repeat prefix and returns the matching regexp
  # quantifier ("*", "+", "?", "{n}", "{m,n}", "{m,}", "{,n}"), or nil when
  # there is none.
  def scan_repeat(ss)
    # repeat         =  1*DIGIT / (*DIGIT "*" *DIGIT)
    min = ss.scan(DIGITS_)
    star = ss.skip(/\*/)
    max = ss.scan(DIGITS_)
    if star
      if max && min
        "{#{min},#{max}}"
      elsif max
        max.to_i == 1 ? '?' : "{,#{max}}"
      elsif min
        min.to_i == 1 ? '+' : "{#{min},}"
      else
        '*'
      end
    elsif min
      # without "*" the digits are an exact count
      "{#{min}}"
    end
  end

  # Scans an alternation at the scanner position and returns it.
  #
  # state - Hash with :ss (the StringScanner) and :rulename (the rule being
  #         defined, used to count rule references).
  def scan_alternation(state)
    ss = state[:ss]
    encoding = ss.string.encoding
    alternation = Alternation.new(self, encoding)
    # alternation    =  concatenation *(*c-wsp "/" *c-wsp concatenation)
    loop do
      concatenation = Concatenation.new(self, encoding)
      $stderr.puts 'alternation: ' + ss.rest[/.*/][0, 100] if $DEBUG
      # concatenation  =  repetition *(1*c-wsp repetition)
      loop do
        $stderr.puts 'concatenation: ' + ss.rest[/.*/][0, 100] if $DEBUG
        # repetition     =  [repeat] element
        repeat = scan_repeat(ss)
        element = scan_element(state)
        if element.nil?
          raise 'missing element (even if it has repeat)' if repeat
          break
        end
        concatenation << Repetition.new(self, element, repeat, state[:rulename])
        break unless ss.skip(C_WSPP_)
      end
      alternation << concatenation
      ss.skip(C_WSPS_)
      break unless ss.skip(/\//)
      ss.skip(C_WSPS_)
    end
    alternation
  end

  # Parses ABNF text and adds its rules to this grammar.  The common leading
  # indentation is removed first.  The caller's string is left untouched.
  # Returns nil.  Raises RuntimeError on a syntax error.
  #
  # http://tools.ietf.org/html/rfc2234
  def parse(src, encoding=nil)
    # Work on a copy: the text is re-encoded and de-indented in place.
    src = src.dup
    src.force_encoding(encoding) if encoding
    indent = 999
    src.scan(/^ *(?=\S)/){|s| indent = s.bytesize if indent > s.bytesize }
    src.gsub!(/^ {#{indent}}/, '') if indent > 0 && indent != 999
    ss = StringScanner.new(src)
    state = {ss: ss, rulename: nil}

    #  rulelist       =  1*( rule / (*c-wsp c-nl) )
    until ss.eos?
      # rule           =  rulename defined-as elements c-nl
      if rulename = ss.scan(RULENAME_)
        state[:rulename] = rulename
        $stderr.puts 'rulename: ' + rulename if $DEBUG
        # defined-as     =  *c-wsp ("=" / "=/") *c-wsp
        ss.skip(C_WSPS_)
        incremental =
          case ss.skip(/=\/?/)
          when 2
            raise 'no previous definition even if this is OR-def' unless rules.key?(rulename)
            true
          when 1
            warn "duplicated definition #{rulename.inspect}" if @rules.key?(rulename)
            false
          else
            raise "missing '=' after rulename #{rulename.inspect}"
          end
        ss.skip(C_WSPS_)

        # elements       =  alternation *c-wsp
        # "=/" adds alternatives to the existing rule, so its reference
        # counts are kept and extended rather than reset.
        @refs[rulename] = {} unless incremental
        alternation = scan_alternation(state)
        if incremental
          alternation.to_a.each { |concatenation| @rules[rulename] << concatenation }
        else
          @rules[rulename] = alternation
        end
        ss.skip(C_WSPS_)

        skip_line_end(ss)
        state[:rulename] = nil
      else
        ss.skip(C_WSPS_)
        skip_line_end(ss)
      end
    end
    nil
  end

  # Builds a Regexp for the rule called name.
  #
  # expand_rules - rule names (Symbols) to inline wherever they are used.
  #
  # Rules referenced at most once are inlined as a named group at their point
  # of use.  The remaining referenced rules are emitted up front as
  # "(?<rule>...){0}" definitions and called with \g<rule>.
  # Raises RuntimeError when a referenced rule is not defined.
  def build(name, expand_rules=[].freeze)
    @expand_rules = expand_rules
    @autoexpand_rules = {}
    defs = {}
    _include_defs(name, defs)

    buf = ''.dup
    defs.each_pair do |key, value|
      autoexpand_rules[key.intern] = true if value <= 1
    end
    defs.each_key do |key|
      next if CORE_RULE_NAMES.include?(key.intern)
      next if @expand_rules.include?(key.intern)
      next if @autoexpand_rules.key?(key.intern)
      #next if @rules[key].cc?
      buf << "(?<#{key}>#{@rules[key]}){0}"
    end
    buf << "(?<#{name}>#{@rules[name]})"
    Regexp.new(buf)
  end

  def inspect
    "#<%s:%#016x>" % [self.class.name, self.__id__<<1]
  end

  private
  # Scans one element and returns it, or nil when the scanner is not on an
  # element.  Rule references are counted in @refs for the current rule.
  #
  # element        =  rulename / group / option / char-val / num-val / prose-val
  def scan_element(state)
    ss = state[:ss]
    # prose-val      =  "<" *(%x20-3D / %x3F-7E) ">"  (treated as a rule name)
    name = ss.scan(RULENAME_) || (ss.scan(/<([^>]+)>/) && ss[1])
    if name
      refs = @refs[state[:rulename]]
      refs[name] = (refs[name] || 0) + 1
      name.intern
    elsif ss.skip(/\(/)
      ss.skip(C_WSPS_)
      group = scan_alternation(state)
      ss.skip(C_WSPS_)
      ss.skip(/\)/) or raise "missing ')'"
      group
    elsif ss.skip(/\[/)
      ss.skip(C_WSPS_)
      option = scan_alternation(state)
      option.repeat = [0, 1]
      ss.skip(C_WSPS_)
      ss.skip(/\]/) or raise "missing ']'"
      option
    elsif ss.scan(/"([^"]*)"/)
      # char-val       =  DQUOTE *(%x20-21 / %x23-7E) DQUOTE
      # A single character becomes a code point so it can join a char class.
      text = ss[1]
      text.length == 1 ? text.ord : Regexp.quote(text)
    elsif ss.skip(/%/)
      # num-val        =  "%" (bin-val / dec-val / hex-val)
      case v = ss.getch
      when 'b'
        scan_digits(ss, /[01]+/, 2)
      when 'd'
        scan_digits(ss, /[0-9]+/, 10)
      when 'x'
        scan_digits(ss, /[0-9a-fA-F]+/, 16)
      else
        raise "unknown num-val type '#{v}'"
      end
    end
  end

  # Consumes the end of a line: an optional comment followed by a newline,
  # or by the end of the input when the last line has no newline.
  def skip_line_end(ss)
    return if ss.skip(C_NL_) || ss.skip(/(?:;.*)?\z/)
    raise 'unexpected: ' + ss.rest[/.+|\n.*/].dump
  end

  # Collects into defs how many times each rule is referenced, directly or
  # indirectly, starting from the rule called name.
  def _include_defs(name, defs)
    unless @refs[name]
      raise "missing definition of '#{name}'"
    end
    @refs[name].each_pair do |key, value|
      n = defs[key]
      if n
        defs[key] = n+value
      else
        defs[key] = value
        _include_defs(key, defs)
      end
    end
  end
end

# http://tools.ietf.org/html/rfc2234
# Lightweight, regexp-driven ABNF converter that does not build a grammar.
#
# src - ABNF text.
#
# Returns a Hash mapping each rule name to regexp source in which other
# rules are referenced as \g<name> subexpression calls.  Only rule names,
# quoted strings, %x values and ranges, "/", "[ ]" and "( )" are understood;
# any other atom is echoed to stderr and emitted as "{::atom::}".
def abnf2onig_lite(src)
  table = {}
  src.scan(/([\w\-]+)\s*=\s*(.*(?:\n\s*[^\w\s].*)*)/) do |name, expr|
    current = []
    stack = [current]
    expr.gsub!(/ ; .*/, '') # remove comments
    expr.scan(/\s*(\/|[0-9*]*(?:"[^"\s]+"|<?[%\w\-]+>?|[^%\w\-\s]+))\s*/) do |atom,|
      # Split the atom into its repeat prefix (1-5) and its body (6).
      m = /\A(?:([0-9]+)(\*([0-9]+)?)?|(\*([0-9]+)?))?(.*)\z/.match(atom)
      quantifier =
        if m[3] then "{#{m[1]},#{m[3]}}"
        elsif m[2] then "{#{m[1]},}"
        elsif m[1] then "{#{m[1]}}"
        elsif m[5] then "{,#{m[5]}}"
        elsif m[4] then '*'
        else ''
        end

      case m[6]
      when /\A<?([\w\-]+)>?\z/
        current << "\\g<#{Regexp.last_match(1)}>#{quantifier}"
      when /\A"([^"]+)"\z/
        text = Regexp.last_match(1)
        quoted = Regexp.quote(text)
        # a quantifier must apply to the whole literal, not its last character
        quoted = "(?:#{quoted})" if text.length > 1 && !quantifier.empty?
        current << "#{quoted}#{quantifier}"
      when /\A%x([0-9A-F]+)(?:-([0-9A-F]+))?\z/i
        low = Regexp.last_match(1)
        high = Regexp.last_match(2)
        chars = high ? "[\\u{#{low}}-\\u{#{high}}]" : "\\u{#{low}}"
        current << "#{chars}#{quantifier}"
      when '['
        current = ['(?:']
        stack << current
      when ']'
        current << ')?'
        current = stack[-2]
        current << stack.pop.join
      when '/'
        current << '|'
      when '('
        # The closing text is parked at the front until ")" moves it to the end.
        current = [")#{quantifier}", '(?:']
        stack << current
      when ')'
        current << current.shift
        current = stack[-2]
        current << stack.pop.join
      else
        $stderr.puts atom
        current << "{::#{atom}::}"
      end
    end
    table[name] = stack.join
  end
  table
end

# Demo / scratch entry point used when the file is run as a script.
#
# Only the first section (mail address and mailto grammars) runs: it builds
# four regexps and prints them.  Every section after a bare `return` is
# currently unreachable and appears to be kept so it can be re-enabled by
# moving or removing the `return` above it.
def main
  require 'pp'
  #g = Grammer.new
  #g.parse(CORE_RULES_)
  #g.parse($stdin.read, Encoding::US_ASCII)
  #p g.build(ARGV[0])
  #return

  # Active section: core rules plus the address and mailto grammars.
  # MAILTO_RULES_ is parsed last, so its definitions replace earlier ones
  # with the same name.
  g = Grammer.new
  g.parse(CORE_RULES_)
  g.parse(ADDR_SPEC_RULES_)
  g.parse(MAILTO_RULES_)
  p g.build('local-part', %i[atext dot-atom-text qcontent quoted-pair quoted-string])
  p g.build('domain', %i[atext dot-atom-text dtext-no-obs])
  p g.build('addr-spec', %i[atext dot-atom-text dtext-no-obs qcontent quoted-pair quoted-string])
  p g.build('mailtoURI', %i[atext dot-atom-text dtext-no-obs qchar pct-encoded hfname hfvalue qcontent quoted-pair quoted-string])
  return

  # Unreachable: URI grammar smoke checks (unanchored matches against
  # sample addresses), followed by sample URI and relative-ref matches.
  g = Grammer.new
  g.parse(CORE_RULES_)
  g.parse(URI_RULES_)

  unless (uri = g.build('IPv4address')) =~"127.0.0.1"
    puts "#{__LINE__} failed to match"
    p uri
    return
  end
  unless (uri = g.build('IPv6address')) =~ "::1"
    puts "#{__LINE__} failed to match"
    p uri
    return
  end
  unless (uri = g.build('IP-literal')) =~"[::1]"
    puts "#{__LINE__} failed to match"
    p uri
    return
  end
  p uri = g.build('URI', %i[h16 pchar pct-encoded])
  p uri.match("http://exmaple.org:80/foo")
  p uri.match("http://127.0.0.1/foo?hoge#fuga")
  p uri.match("https://[::1]:8080/%e3%81%82?hoge#fuga")
  p uri = g.build('relative-ref', %i[h16 pchar pct-encoded])
  p uri.match("/foo")
  p uri.match("/foo?hoge#fuga")
  p uri.match("/%e3%81%82?hoge#fuga")
  return

  # Unreachable: IRI grammar on top of the URI rules, matched against a
  # sample with non-ASCII characters.
  g.parse(IRI_RULES_, Encoding::UTF_8)
  p uri = g.build('IRI')
  p uri=~"http://\u{65e5}\u{672c}\u{8a9e}.jp/\u3042\u3044\u3046"

  # Unreachable: addr-spec built without any explicitly expanded rules.
  g = Grammer.new
  g.parse(CORE_RULES_)
  g.parse(ADDR_SPEC_RULES_)
  p ras = g.build('addr-spec')
  p ras =~ 'foo@example.com'
end

# http://tools.ietf.org/html/rfc5234
CORE_RULES_ = <<'_text'
ALPHA          =  %x41-5A / %x61-7A   ; A-Z / a-z

BIT            =  "0" / "1"

CHAR           =  %x01-7F
                       ; any 7-bit US-ASCII character, excluding NUL

CR             =  %x0D
                       ; carriage return

CRLF           =  CR LF
                       ; Internet standard newline

CTL            =  %x00-1F / %x7F
                       ; controls

DIGIT          =  %x30-39
                       ; 0-9

DQUOTE         =  %x22
                       ; " (Double Quote)

HEXDIG         =  DIGIT / "A" / "B" / "C" / "D" / "E" / "F"

HTAB           =  %x09
                       ; horizontal tab

LF             =  %x0A
                       ; linefeed

LWSP           =  *(WSP / CRLF WSP)
                       ; linear white space (past newline)

OCTET          =  %x00-FF
                       ; 8 bits of data

SP             =  %x20
                       ; space

VCHAR          =  %x21-7E
                       ; visible (printing) characters

WSP            =  SP / HTAB
                       ; white space

; http-bis-p1
OWS            = *( SP / HTAB )
                 ; "optional" whitespace
RWS            = 1*( SP / HTAB )
                 ; "required" whitespace
BWS            = OWS
                 ; "bad" whitespace

word           = token / quoted-string

token          = 1*tchar

tchar          = "!" / "#" / "$" / "%" / "&" / "'" / "*"
                / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
                / DIGIT / ALPHA
                ; any VCHAR, except special

special        = "(" / ")" / "<" / ">" / "@" / ","
                / ";" / ":" / "\" / DQUOTE / "/" / "["
                / "]" / "?" / "=" / "{" / "}"
_text

URI_RULES_ = <<'_text'
URI           = scheme ":" hier-part [ "?" query ] [ "#" fragment ]

hier-part     = "//" authority path-abempty
              / path-absolute
              / path-rootless
              / path-empty

URI-reference = URI / relative-ref

absolute-URI  = scheme ":" hier-part [ "?" query ]

relative-ref  = relative-part [ "?" query ] [ "#" fragment ]

relative-part = "//" authority path-abempty
              / path-absolute
              / path-noscheme
              / path-empty

scheme        = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )

authority     = [ userinfo "@" ] host [ ":" port ]
userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
host          = IP-literal / IPv4address / reg-name
port          = *DIGIT

IP-literal    = "[" ( IPv6address / IPvFuture  ) "]"

IPvFuture     = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )

IPv6address   =                            6( h16 ":" ) ls32
              /                       "::" 5( h16 ":" ) ls32
              / [               h16 ] "::" 4( h16 ":" ) ls32
              / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
              / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
              / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
              / [ *4( h16 ":" ) h16 ] "::"              ls32
              / [ *5( h16 ":" ) h16 ] "::"              h16
              / [ *6( h16 ":" ) h16 ] "::"

h16           = 1*4HEXDIG
ls32          = ( h16 ":" h16 ) / IPv4address
IPv4address   = dec-octet "." dec-octet "." dec-octet "." dec-octet

dec-octet     = DIGIT                 ; 0-9
              / %x31-39 DIGIT         ; 10-99
              / "1" 2DIGIT            ; 100-199
              / "2" %x30-34 DIGIT     ; 200-249
              / "25" %x30-35          ; 250-255

reg-name      = *( unreserved / pct-encoded / sub-delims )

path          = path-abempty    ; begins with "/" or is empty
              / path-absolute   ; begins with "/" but not "//"
              / path-noscheme   ; begins with a non-colon segment
              / path-rootless   ; begins with a segment
              / path-empty      ; zero characters

path-abempty  = *( "/" segment )
path-absolute = "/" [ segment-nz *( "/" segment ) ]
path-noscheme = segment-nz-nc *( "/" segment )
path-rootless = segment-nz *( "/" segment )
path-empty    = 0<pchar>

segment       = *pchar
segment-nz    = 1*pchar
segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
              ; non-zero-length segment without any colon ":"

pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"

query         = *( pchar / "/" / "?" )

fragment      = *( pchar / "/" / "?" )

pct-encoded   = "%" HEXDIG HEXDIG

unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
reserved      = gen-delims / sub-delims
gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
              / "*" / "+" / "," / ";" / "="
_text

IRI_RULES_ = <<'_text'
IRI            = scheme ":" ihier-part [ "?" iquery ]
                 [ "#" ifragment ]

ihier-part     = "//" iauthority ipath-abempty
               / ipath-absolute
               / ipath-rootless
               / ipath-empty

IRI-reference  = IRI / irelative-ref

absolute-IRI   = scheme ":" ihier-part [ "?" iquery ]

irelative-ref  = irelative-part [ "?" iquery ] [ "#" ifragment ]

irelative-part = "//" iauthority ipath-abempty
               / ipath-absolute
               / ipath-noscheme
               / ipath-empty

iauthority     = [ iuserinfo "@" ] ihost [ ":" port ]
iuserinfo      = *( iunreserved / pct-form / sub-delims / ":" )
ihost          = IP-literal / IPv4address / ireg-name

pct-form       = pct-encoded

ireg-name      = *( iunreserved / sub-delims )

ipath          = ipath-abempty   ; begins with "/" or is empty
               / ipath-absolute  ; begins with "/" but not "//"
               / ipath-noscheme  ; begins with a non-colon segment
               / ipath-rootless  ; begins with a segment
               / ipath-empty     ; zero characters

ipath-abempty  = *( path-sep isegment )
ipath-absolute = path-sep [ isegment-nz *( path-sep isegment ) ]
ipath-noscheme = isegment-nz-nc *( path-sep isegment )
ipath-rootless = isegment-nz *( path-sep isegment )
ipath-empty    = 0<ipchar>
path-sep       = "/"

isegment       = *ipchar
isegment-nz    = 1*ipchar
isegment-nz-nc = 1*( iunreserved / pct-form / sub-delims
                     / "@" )

               ; non-zero-length segment without any colon ":"

ipchar         = iunreserved / pct-form / sub-delims / ":"
               / "@"

iquery         = *( ipchar / iprivate / "/" / "?" )

ifragment      = *( ipchar / "/" / "?" )

iunreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar

ucschar        = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
               / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
               / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
               / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
               / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
               / %xD0000-DFFFD / %xE1000-EFFFD

iprivate       = %xE000-F8FF / %xE0000-E0FFF / %xF0000-FFFFD
               / %x100000-10FFFD
_text

ADDR_SPEC_RULES_ = <<'_text'

addr-spec       =   local-part "@" domain

local-part      =   dot-atom / quoted-string / obs-local-part

domain          =   dot-atom / domain-literal / obs-domain

domain-literal  =   [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]

dtext           =   %d33-90 /          ; Printable US-ASCII
                    %d94-126 /         ;  characters not including
                    obs-dtext          ;  "[", "]", or "\"

atext           =   ALPHA / DIGIT /    ; Printable US-ASCII
                    "!" / "#" /        ;  characters not including
                    "$" / "%" /        ;  specials.  Used for atoms.
                    "&" / "'" /
                    "*" / "+" /
                    "-" / "/" /
                    "=" / "?" /
                    "^" / "_" /
                    "`" / "{" /
                    "|" / "}" /
                    "~"

atom            =   [CFWS] 1*atext [CFWS]

dot-atom-text   =   1*atext *("." 1*atext)

dot-atom        =   [CFWS] dot-atom-text [CFWS]

specials        =   "(" / ")" /        ; Special characters that do
                    "<" / ">" /        ;  not appear in atext
                    "[" / "]" /
                    ":" / ";" /
                    "@" / "\" /
                    "," / "." /
                    DQUOTE

qtext           =   %d33 /             ; Printable US-ASCII
                    %d35-91 /          ;  characters not including
                    %d93-126 /         ;  "\" or the quote character
                    obs-qtext

quoted-pair     =   ("\" (VCHAR / WSP)) / obs-qp

FWS             =   ([*WSP CRLF] 1*WSP) /  obs-FWS
                                       ; Folding white space

ctext           =   %d33-39 /          ; Printable US-ASCII
                    %d42-91 /          ;  characters not including
                    %d93-126 /         ;  "(", ")", or "\"
                    obs-ctext

ccontent        =   ctext / quoted-pair / comment

comment         =   "(" *([FWS] ccontent) [FWS] ")"

CFWS            =   (1*([FWS] comment) [FWS]) / FWS

obs-FWS         =   1*WSP *(CRLF 1*WSP)

qcontent        =   qtext / quoted-pair

quoted-string   =   [CFWS]
                    DQUOTE *([FWS] qcontent) [FWS] DQUOTE
                    [CFWS]

word            =   atom / quoted-string

obs-local-part  =   word *("." word)

obs-domain      =   atom *("." atom)

obs-dtext       =   obs-NO-WS-CTL / quoted-pair

obs-NO-WS-CTL   =   %d1-8 /            ; US-ASCII control
                    %d11 /             ;  characters that do not
                    %d12 /             ;  include the carriage
                    %d14-31 /          ;  return, line feed, and
                    %d127              ;  white space characters

obs-ctext       =   obs-NO-WS-CTL

obs-qtext       =   obs-NO-WS-CTL

obs-qp          =   "\" (%d0 / obs-NO-WS-CTL / LF / CR)
_text

MAILTO_RULES_ = <<'_text'
; RFC6068
mailtoURI    = "mailto:" [ to ] [ hfields ]
to           = addr-spec *("," addr-spec )
hfields      = "?" hfield *( "&" hfield )
hfield       = hfname "=" hfvalue
hfname       = *qchar
hfvalue      = *qchar
addr-spec    = local-part "@" domain
local-part   = dot-atom-text / quoted-string
domain       = dot-atom-text / "[" *dtext-no-obs "]"
dtext-no-obs = %d33-90 / ; Printable US-ASCII
               %d94-126  ; characters not including
                         ; "[", "]", or "\"
qchar        = unreserved / pct-encoded / some-delims
some-delims  = "!" / "$" / "'" / "(" / ")" / "*"
             / "+" / "," / ";" / ":" / "@"

; RFC3986
unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
pct-encoded = "%" HEXDIG HEXDIG

; RFC5322
; override by RFC6068
;    3.  Whitespace and comments within <local-part> and <domain> MUST NOT
;        be used.  They would not have any operational semantics.
;    quoted-string   =   [CFWS]
;                DQUOTE *([FWS] qcontent) [FWS] DQUOTE
;                [CFWS]
quoted-string   =   DQUOTE *(qcontent) DQUOTE
; obs-qp          =   "\" (%d0 / obs-NO-WS-CTL / LF / CR)
; quoted-pair     =   ("\" (VCHAR / WSP)) / obs-qp
quoted-pair     =   "\" VCHAR
qtext           =   %d33 /             ; Printable US-ASCII
                    %d35-91 /          ;  characters not including
                    %d93-126           ;  "\" or the quote character
;                   / obs-qtext
atext           =   ALPHA / DIGIT /    ; Printable US-ASCII
                    "!" / ; "#" /        ;  characters not including
                    "$" / "%" /        ;  specials.  Used for atoms.
                    "&" / "'" /
                    "*" / "+" /
                    "-" / ; "/" /
                    "=" / "?" /
                    "^" / "_" /
                    "`" / "{" /
                    "|" / "}" /
                    "~"
_text

# Run the demo only when this file is executed directly, not when it is loaded.
main if __FILE__ == $0