Skip to content

Instantly share code, notes, and snippets.

@nurse
Created March 30, 2011 05:56
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nurse/893932 to your computer and use it in GitHub Desktop.
Save nurse/893932 to your computer and use it in GitHub Desktop.
ABNF to Oniguruma Regexp Converter
#!/usr/local/bin/ruby
# If you want an ABNF to Regexp, see below.
# http://www.a-k-r.org/abnf/
# https://github.com/martinthomson/abnf2regex
require 'strscan'
# ABNF rule name: a letter followed by letters, digits or hyphens.
RULENAME_ = /[A-Za-z][A-Za-z0-9\-]*/
# One or more c-wsp (never matches the empty string): blanks, where a line
# break (optionally preceded by a ";" comment) is only consumed when the
# following line starts with a blank, i.e. is a continuation line.
C_WSPP_ = /[ \t]+(?:(?:;.*)?\n[ \t]+)*|(?:(?:;.*)?\n[ \t]+)+/
# Zero or more c-wsp: same shape as C_WSPP_ but may match the empty string.
C_WSPS_ = /[ \t]*(?:(?:;.*)?\n[ \t]+)*/
# c-nl: an optional ";" comment followed by a newline ("\n" only).
C_NL_ = /(?:;.*)?\n/
# A run of decimal digits (used for repeat counts).
DIGITS_ = /[0-9]+/
# Set of integer code points kept as a flat, sorted array of inclusive
# pairs (+[from, to, from, to, ...]+). Adjacent and overlapping ranges are
# merged on insertion.
#
# With an encoding the set renders as an Oniguruma character class
# (e.g. +[A-Za-z]+); without one it renders a decimal debugging form
# (e.g. +[0-2,6,9]+). The sets 0-9 and 0-9A-F render as +\d+ / +\h+ either way.
class CC
  # @param encoding [Encoding, nil] nil selects the decimal debugging output
  def initialize(encoding = nil)
    @encoding = encoding
    @cc = []
  end

  # Adds a code point, a range of code points, or another CC to the set.
  #
  # @param v [Integer, Range, CC]
  # @return [CC] self, so calls can be chained
  # @raise [TypeError] for any other kind of value
  def <<(v)
    case v
    when Integer # was Fixnum, which no longer exists on Ruby 3.2+
      _bsearch_add2(v, v)
    when Range
      _bsearch_add2(v.first, v.last)
    when CC
      v.to_a.each_slice(2) { |from, to| _bsearch_add2(from, to) }
    else
      raise TypeError, "invalid value to add a charclass #{v.inspect}"
    end
    self
  end

  # Renders the set.
  #
  # @return [String] a bare (escaped) character when the set holds exactly one
  #   code point and an encoding is set, +\d+ / +\h+ for the two well-known
  #   sets, otherwise a bracketed class
  def to_s
    return to_literal(@cc.first) if @encoding && @cc.size == 2 && @cc.first == @cc.last

    case @cc
    when [0x30, 0x39] # [0-9]
      return '\d'
    when [0x30, 0x39, 0x41, 0x46] # [0-9A-F]
      return '\h'
    end

    buf = String.new('[')
    @cc.each_slice(2).with_index do |(from, to), index|
      # The decimal form needs a separator between pairs; the regexp form does not.
      buf << ',' if @encoding.nil? && index > 0
      buf << to_char(from)
      next if from == to

      if @encoding.nil?
        buf << (to - from > 1 ? '-' : ',')
      elsif to - from > 1
        # Two neighbouring characters are simply written side by side.
        buf << '-'
      end
      buf << to_char(to)
    end
    buf << ']'
  end

  # @return [String] the inspect form of #to_s without its leading quote
  def inspect
    to_s.inspect[1..-1]
  end

  # @return [Array<Integer>] the internal flat pair array (not a copy)
  def to_a
    @cc
  end

  def empty?
    @cc.empty?
  end

  private

  # Renders a single code point for use OUTSIDE a bracketed class, where
  # regexp metacharacters such as "." or "*" must be escaped as well.
  def to_literal(v)
    case v
    when 0x24, 0x28..0x2B, 0x2E, 0x3F, 0x7B..0x7D # $ ( ) * + . ? { | }
      "\\#{v.chr}"
    else
      to_char(v)
    end
  end

  # Renders a single code point for use inside a bracketed class
  # (or as plain decimal when no encoding is set).
  def to_char(v)
    return v.to_s unless @encoding

    case v
    when 0x2D, 0x5B, 0x5C, 0x5D, 0x5E # - [ \ ] ^ are special inside [...]
      "\\#{v.chr}"
    when 0x09
      '\t'
    when 0x0a
      '\n'
    when 0x0d
      '\r'
    when 0x20..0x7E
      v.chr
    when 0..0x7F
      '\x%02x' % v
    when 0..0xFFFF
      '\u%04x' % v
    else
      '\u{%x}' % v
    end
  end

  # Binary search over the flat pair array.
  #
  # Returns an index into @cc: 0 when v sorts before everything, @cc.length
  # when it sorts after everything, the index of an equal boundary when one
  # exists, otherwise the index of the first boundary greater than v. An odd
  # result therefore points at the "to" half of a pair.
  def _bsearch(v)
    return 0 if @cc.empty? || v < @cc.first
    return @cc.length if @cc.last < v

    low = 0
    high = @cc.length - 1
    while low < high
      mid = (low + high) / 2
      case v <=> @cc[mid]
      when 1 # >
        low = mid + 1
        return low if v < @cc[low]
      when -1 # <
        high = mid - 1
        return mid if @cc[high] < v
      else # =
        return mid # it already has the val
      end
    end
    return low if @cc[low] == v

    # Not expected to be reached while @cc stays sorted; kept as a sanity check.
    raise [low, high, mid].inspect
  end

  # Inserts the inclusive range from..to, merging it with any pairs it
  # overlaps or touches.
  def _bsearch_add2(from, to)
    pos1 = _bsearch(from)
    pos2 = from == to ? pos1 : _bsearch(to)
    if pos1.odd?
      # from lies inside an existing pair: extend that pair instead.
      pos1 -= 1
      from = @cc[pos1]
    elsif pos1 > 0 && @cc[pos1 - 1] == from - 1
      # from directly follows the previous pair: merge with it.
      pos1 -= 2
      from = @cc[pos1]
    end
    if pos2.odd?
      to = @cc[pos2]
    elsif to + 1 == @cc[pos2] || to == @cc[pos2]
      # to touches or equals the start of the next pair: swallow that pair.
      pos2 += 1
      to = @cc[pos2]
    else
      pos2 -= 1
    end
    @cc[pos1, pos2 - pos1 + 1] = from, to
  end
end
#__END__
#require 'test/unit'
# Checks for CC's decimal rendering. The Test::Unit superclass is commented
# out, so these methods only run if the class is wired into a test framework
# that provides assert_equal.
class TestCC # < Test::Unit::TestCase
  # Adds single integers one at a time and checks the rendering after each.
  def test_add
    cc = CC.new
    assert_equal('[]', cc.to_s)
    [
      [1, '[1]'],
      [2, '[1,2]'],
      [2, '[1,2]'],
      [0, '[0-2]'],
      [9, '[0-2,9]'],
      [6, '[0-2,6,9]'],
      [3, '[0-3,6,9]'],
      [0, '[0-3,6,9]'],
      [5, '[0-3,5,6,9]'],
      [7, '[0-3,5-7,9]'],
      [4, '[0-7,9]']
    ].each do |value, expected|
      cc << value
      assert_equal(expected, cc.to_s)
    end
  end

  # Adds ranges (including overlapping and adjacent ones) to two fresh sets.
  def test_add2
    sequences = [
      [
        [3..4, '[3,4]'],
        [1..2, '[1-4]'],
        [5..6, '[1-6]'],
        [8..9, '[1-6,8,9]'],
        [7..7, '[1-9]'],
        [2..4, '[1-9]'],
        [1..4, '[1-9]'],
        [7..9, '[1-9]'],
        [1..9, '[1-9]']
      ],
      [
        [5..5, '[5]'],
        [3..3, '[3,5]'],
        [1..4, '[1-5]']
      ]
    ]
    sequences.each do |steps|
      cc = CC.new
      assert_equal('[]', cc.to_s)
      steps.each do |range, expected|
        cc << range
        assert_equal(expected, cc.to_s)
      end
    end
  end
end
# An ABNF alternation ("a / b / c"): an ordered list of alternatives that
# renders as a regexp fragment. Single-character alternatives are folded
# into one character class; everything else is joined with "|".
class Alternation
  # Optional repeat as a [min, max] pair (nil for "exactly once").
  attr_accessor :repeat

  # @param parser   the grammar object; its #rules hash is used to resolve Symbols
  # @param encoding [Encoding, nil] passed on to the character classes built here
  def initialize(parser, encoding)
    @parser = parser
    @encoding = encoding
    @chars = CC.new(@encoding)
    @ary = []
    @repeat = nil
  end

  # Appends one alternative.
  #
  # @raise [RuntimeError] when given nil
  def <<(concatenation)
    raise "nil" if concatenation.nil?

    @ary << concatenation
  end

  # True when the rendered fragment can take a quantifier without being
  # wrapped in a group.
  def atomic?
    return true if @ary.empty?
    return cc? if @ary.size > 1

    only = @ary.first
    case only
    when Integer, Range
      true
    when String
      only.size <= 1
    when Symbol
      @parser.rules[only.to_s].atomic?
    when Alternation, Concatenation
      only.atomic?
    else
      raise
    end
  end

  def empty?
    @ary.empty?
  end

  # True when every alternative is a single character (or a set of them),
  # i.e. the whole alternation fits into one character class.
  def cc?
    @ary.all? do |alternative|
      case alternative
      when Integer, Range
        true
      when String
        alternative.size == 1
      when Symbol
        @parser.rules[alternative.to_s].cc?
      when Alternation, Concatenation
        alternative.cc?
      else
        raise
      end
    end
  end

  def single?
    @ary.size <= 1
  end

  # @return [Array] the internal list of alternatives (not a copy)
  def to_a
    @ary
  end

  # Renders the alternation, applying #repeat if one is set.
  #
  # @return [String] regexp source; '' when there is nothing to render
  def to_s
    literals, char_class = to_s0
    pieces = literals.map(&:to_s)
    pieces << char_class.to_s if char_class
    return '' if pieces.empty?

    body = pieces.join('|')
    case @repeat
    when nil
      body
    when [0, 0]
      ""
    when [0, 1], [nil, 1]
      atomic? ? body << '?' : "(?:#{body})?"
    else
      "(?:#{body}){#{@repeat.join(",")}}"
    end
  end

  private

  # Flattens the alternatives breadth-first into a list of regexp strings
  # plus one optional character class collecting all single characters.
  #
  # @return [Array(Array<String>, CC|nil)]
  def to_s0
    literals = []
    char_class = nil
    # FIFO worklist: nested alternatives are appended and visited after the
    # current level, in the order they were found.
    pending = @ary.dup
    until pending.empty?
      item = pending.shift
      case item
      when Range, Integer
        (char_class ||= CC.new(@encoding)) << item
      when String
        literals << item
      # Disabled branch kept from the original source:
      #   when Symbol
      #     p e
      #     if CORE_RULE_NAMES.include?(e)
      #       alts << @parser.rules[e.to_s]
      #     elsif r = @parser.rules[e.to_s] and r.atomic?
      #       alts << r
      #     else
      #       unless r
      #         p [__LINE__, e, r]
      #         puts caller
      #       end
      #       strs << "\\g<#{e}>"
      #     end
      when Alternation
        pending.concat(item.to_a)
      when Concatenation
        pending << item.expand
      when Repetition
        inner = item.element
        if !item.cc?
          literals << item.to_s
        elsif inner.is_a?(Alternation) || inner.is_a?(Concatenation)
          pending << inner
        else
          (char_class ||= CC.new(@encoding)) << inner
        end
      else
        raise item.inspect
      end
    end
    return literals, char_class
  end
end
# An ABNF concatenation ("a b c"): an ordered sequence of items rendered one
# after another.
class Concatenation
  # @param parser   the grammar object; its #rules hash is used to resolve Symbols
  # @param encoding [Encoding, nil] stored for parity with Alternation
  def initialize(parser, encoding)
    @parser = parser
    @encoding = encoding
    @ary = []
  end

  # Appends an item; empty arrays, strings, alternations and concatenations
  # are silently dropped.
  #
  # @return [Array] the internal item list
  # @raise [RuntimeError] when given nil
  def <<(item)
    case item
    when nil
      raise "nil"
    when Array, String, Alternation, Concatenation
      return @ary if item.empty?
    end
    @ary << item
  end

  # Renders one item as regexp source.
  def e2s(e)
    e.to_s
  end

  # Collapses the sequence.
  #
  # @return [Object, String, nil] nil when empty, the sole item itself when
  #   there is exactly one (callers inspect its type), otherwise the rendered
  #   items joined into a new String
  def expand
    first, *rest = @ary
    return first if rest.empty?

    # Start from a copy so that a stored String item is never appended to;
    # items are rendered strictly left to right.
    rest.inject(e2s(first).dup) { |buf, item| buf << e2s(item) }
  end

  # True when the rendered fragment can take a quantifier without being
  # wrapped in a group: empty, or exactly one atomic item.
  #
  # @raise [RuntimeError] for an item of an unsupported type
  def atomic?
    return true if @ary.empty?
    return false if @ary.size > 1

    case item = @ary.first
    when Integer, Range
      true
    when String
      item.size == 1
    when Symbol
      @parser.rules[item.to_s].atomic?
    when Alternation, Concatenation, Repetition
      item.atomic?
    else
      raise item.inspect
    end
  end

  # True when the sequence is exactly one item that fits in a character class.
  #
  # @raise [RuntimeError] for an item of an unsupported type
  def cc?
    return false if @ary.size != 1

    case item = @ary.first
    when Integer, Range
      true
    when String
      item.size == 1
    when Symbol
      @parser.rules[item.to_s].cc?
    when Alternation, Concatenation, Repetition
      item.cc?
    else
      raise item.inspect
    end
  end

  def empty?
    @ary.empty?
  end
end
# An ABNF repetition: one element plus an optional, already rendered regexp
# quantifier such as "*", "+" or "{2,3}".
class Repetition
  # Encoding used to turn code points into characters. The original code read
  # an instance variable that was never assigned, so this was always UTF-8.
  DEFAULT_ENCODING = Encoding::UTF_8

  # @param parser   the grammar object (needs #rules, #expand_rules and
  #   #autoexpand_rules when the element is a Symbol)
  # @param element  [Integer, Range, String, Symbol, Alternation]
  # @param repeat   [String, nil] quantifier source, nil for "exactly once"
  # @param rulename [String, nil] name of the rule this element belongs to
  def initialize(parser, element, repeat, rulename = nil)
    @parser = parser
    @element = element
    @repeat = repeat
    @rulename = rulename
  end

  # True when the element renders to something a quantifier can follow
  # directly. Note that the repeat itself is not taken into account.
  #
  # @raise [RuntimeError] for an element of an unsupported type
  def atomic?
    case element = @element
    when Integer, Range
      true
    when String
      element.size == 1
    when Symbol
      @parser.rules[element.to_s].atomic?
    when Alternation, Concatenation, Repetition
      element.atomic?
    else
      raise element.inspect
    end
  end

  # True when there is no repeat and the element fits in a character class.
  #
  # @raise [RuntimeError] for an element of an unsupported type
  def cc?
    return false unless @repeat.nil?

    case element = @element
    when Integer, Range
      true
    when String
      element.size == 1
    when Symbol
      @parser.rules[element.to_s].cc?
    when Alternation, Concatenation
      element.cc?
    else
      raise element.inspect
    end
  end

  # @raise [RuntimeError] for an element of an unsupported type
  def single?
    case @element
    when Integer, Range, String, Symbol
      true
    when Alternation, Concatenation, Repetition
      @element.single?
    else
      raise @element.inspect
    end
  end

  # The element, with a Symbol resolved to the rule it names
  # (nil when the parser has no such rule).
  #
  # @raise [RuntimeError] for an element of an unsupported type
  def element
    case element = @element
    when Integer, Range, String
      element
    when Symbol
      @parser.rules[element.to_s]
    when Alternation, Concatenation
      element
    else
      raise element.inspect
    end
  end

  # Renders the element followed by its quantifier, wrapping the element in a
  # non-capturing group when it is not atomic.
  #
  # @return [String] regexp source
  # @raise [RuntimeError] for an element of an unsupported type
  def to_s
    element = @element
    str =
      case element
      when Integer
        Regexp.quote(element.chr(DEFAULT_ENCODING))
      when Range
        # Quote both endpoints: characters such as \ ] [ ^ - would otherwise
        # change the meaning of the bracket expression.
        low = Regexp.quote(element.first.chr(DEFAULT_ENCODING))
        high = Regexp.quote(element.last.chr(DEFAULT_ENCODING))
        "[#{low}-#{high}]"
      when String
        element
      when Alternation
        element.to_s
      when Symbol
        rule_reference(element)
      else
        raise "unknown type of element #{element.inspect}"
      end

    case @repeat
    when nil
      str
    when [0, 1], [nil, 1]
      # Build a new string instead of appending to str, which may be the
      # stored element itself.
      atomic? ? "#{str}?" : "(?:#{str})?"
    else
      atomic? ? "#{str}#{@repeat}" : "(?:#{str})#{@repeat}"
    end
  end

  private

  # Renders a reference to another rule: inlined for core rules and rules the
  # caller asked to expand, defined in place as a named group on first use of
  # an auto-expanded rule, and a \g<name> subexpression call otherwise.
  def rule_reference(name)
    rule = @parser.rules[name.to_s]
    unless rule
      # Unknown rule: emit debugging output and fall back to a plain call.
      p [__LINE__, name, rule]
      puts caller
      return "\\g<#{name}>"
    end

    if Grammer::CORE_RULE_NAMES.include?(name) || @parser.expand_rules.include?(name)
      rule.to_s
    elsif @parser.autoexpand_rules[name]
      str = "(?<#{name}>#{rule})"
      # Only the first occurrence carries the definition.
      @parser.autoexpand_rules[name] = false
      str
    else
      "\\g<#{name}>"
    end
  end
end
# Parses ABNF (RFC 5234 syntax) into Alternation/Concatenation/Repetition
# trees and builds an Oniguruma regexp for a chosen rule.
class Grammer
  attr_reader :encoding, :expand_rules, :autoexpand_rules, :rules
  attr_accessor :rulename

  # Rules that are always inlined instead of being emitted as named groups.
  CORE_RULE_NAMES = %i[ALPHA BIT CHAR CR CRLF CTL DIGIT DQUOTE HEXDIG HTAB LF LWSP OCTET SP VCHAR WSP OWS RWS BWS word token tchar special].freeze

  # @param src      [String, nil] ABNF source to parse right away
  # @param encoding [Encoding, nil] encoding to apply to src before parsing
  def initialize(src = nil, encoding = nil)
    @rules = {}
    @refs = {}
    @rulename = nil
    @parser = self
    @encoding = encoding
    @expand_rules = nil
    @autoexpand_rules = nil
    parse(src, encoding) if src
  end

  # Scans the digits of a num-val (the part after "%x", "%d" or "%b").
  #
  # @param ss   [StringScanner]
  # @param reg  [Regexp] pattern matching one run of digits in the given base
  # @param base [Integer]
  # @return [Integer, Range, String] a single code point, an inclusive range
  #   ("41-5A"), or the concatenated characters ("0D.0A")
  # @raise [RuntimeError] when expected digits are missing
  def scan_digits(ss, reg, base)
    c = ss.scan(reg) || raise('missing digits in num-val')
    case ss.scan(/[.\-]/)
    when '.'
      encoding = ss.string.encoding
      chrs = c.to_i(base).chr(encoding)
      loop do
        c = ss.scan(reg) || raise('missing concatenated digits in num-val')
        chrs << c.to_i(base).chr(encoding)
        # A Regexp (not a String) keeps this working with older strscan versions.
        break unless ss.skip(/\./)
      end
      chrs
    when '-'
      d = ss.scan(reg) || raise('missing the end of range num-val')
      c.to_i(base)..d.to_i(base)
    else
      c.to_i(base)
    end
  end

  # Scans an optional ABNF repeat prefix and converts it to a quantifier.
  #
  #   repeat = 1*DIGIT / (*DIGIT "*" *DIGIT)
  #
  # @param ss [StringScanner]
  # @return [String, nil] regexp quantifier source, nil when there is no repeat
  def scan_repeat(ss)
    min = ss.scan(DIGITS_)
    # Without "*" the only possible form is an exact count (or nothing).
    return min && "{#{min}}" unless ss.skip(/\*/)

    max = ss.scan(DIGITS_)
    if max
      if min
        "{#{min},#{max}}"
      elsif max == '1' # scanned values are Strings
        '?'
      else
        "{,#{max}}"
      end
    elsif min
      min == '1' ? '+' : "{#{min},}"
    else
      '*'
    end
  end

  # Scans an alternation at the scanner's current position.
  #
  #   alternation   = concatenation *(*c-wsp "/" *c-wsp concatenation)
  #   concatenation = repetition *(1*c-wsp repetition)
  #   repetition    = [repeat] element
  #   element       = rulename / group / option / char-val / num-val / prose-val
  #
  # Every rule name met is counted in @refs under the rule being defined.
  #
  # @param state [Hash] with :ss (StringScanner) and :rulename (String)
  # @return [Alternation]
  # @raise [RuntimeError] on malformed input
  def scan_alternation(state)
    ss = state[:ss]
    encoding = ss.string.encoding
    alternation = Alternation.new(self, encoding)
    loop do
      concatenation = Concatenation.new(self, encoding)
      $stderr.puts 'alternation: ' + ss.rest[/.*/][0, 100] if $DEBUG
      loop do
        $stderr.puts 'concatenation: ' + ss.rest[/.*/][0, 100] if $DEBUG
        repeat = scan_repeat(ss)
        # prose-val = "<" *(%x20-3D / %x3F-7E) ">" is treated like a rule name.
        name = ss.scan(RULENAME_) || (ss.scan(/<([^>]+)>/) && ss[1])
        if name
          element = name.intern
          refs = @refs[state[:rulename]]
          refs[name] = (refs[name] || 0) + 1
        elsif ss.skip(/\(/)
          ss.skip(C_WSPS_)
          element = scan_alternation(state)
          ss.skip(C_WSPS_)
          ss.skip(/\)/) or raise "missing ')'"
        elsif ss.skip(/\[/)
          ss.skip(C_WSPS_)
          element = scan_alternation(state)
          element.repeat = [0, 1]
          ss.skip(C_WSPS_)
          ss.skip(/\]/) or raise "missing ']'"
        elsif ss.scan(/"([^"]*)"/)
          # char-val = DQUOTE *(%x20-21 / %x23-7E) DQUOTE
          # A one-character literal is kept as a code point so that it can be
          # merged into a character class later.
          element = ss[1].length == 1 ? ss[1].ord : Regexp.quote(ss[1])
        elsif ss.skip(/%/)
          # num-val = "%" (bin-val / dec-val / hex-val)
          case v = ss.getch
          when 'b'
            element = scan_digits(ss, /[01]+/, 2)
          when 'd'
            element = scan_digits(ss, /[0-9]+/, 10)
          when 'x'
            element = scan_digits(ss, /[0-9a-fA-F]+/, 16)
          else
            raise "unknown num-val type '#{v}'"
          end
        else
          raise 'missing element (even if it has repeat)' if repeat
          break
        end
        concatenation << Repetition.new(self, element, repeat, state[:rulename])
        break unless ss.skip(C_WSPP_)
      end
      alternation << concatenation
      ss.skip(C_WSPS_)
      break unless ss.skip(/\//)
      ss.skip(C_WSPS_)
    end
    alternation
  end

  # Parses ABNF source and adds its rules to this grammar.
  # See http://tools.ietf.org/html/rfc2234
  #
  # A rule defined with "=" replaces an earlier rule of the same name (with a
  # warning); a rule defined with "=/" adds alternatives to the earlier rule.
  # The given string is not modified.
  #
  # @param src      [String] ABNF source; a common leading indentation of
  #   spaces is removed first
  # @param encoding [Encoding, nil] encoding to apply to (a copy of) src
  # @return [nil]
  # @raise [RuntimeError] on malformed input
  def parse(src, encoding = nil)
    # Work on a copy: force_encoding and gsub! below would otherwise alter
    # the caller's string (and fail on a frozen one).
    src = src.dup
    src.force_encoding(encoding) if encoding
    indent = src.scan(/^ *(?=\S)/).map(&:bytesize).min
    src.gsub!(/^ {#{indent}}/, '') if indent && indent > 0
    ss = StringScanner.new(src)
    state = {ss: ss, rulename: nil}
    # rulelist = 1*( rule / (*c-wsp c-nl) )
    until ss.eos?
      # rule = rulename defined-as elements c-nl
      if rulename = ss.scan(RULENAME_)
        state[:rulename] = rulename
        $stderr.puts 'rulename: ' + rulename if $DEBUG
        # defined-as = *c-wsp ("=" / "=/") *c-wsp
        ss.skip(C_WSPS_)
        incremental =
          case ss.skip(/=\/?/)
          when 2
            raise 'no previous definition even if this is OR-def' unless @rules.key?(rulename)
            true
          when 1
            warn "duplicated definition #{rulename.inspect}" if @rules.key?(rulename)
            false
          else
            raise "missing '=' after rule name #{rulename.inspect}"
          end
        ss.skip(C_WSPS_)
        # elements = alternation *c-wsp
        if incremental
          # "=/" extends the existing rule: keep its reference counts and
          # append the new alternatives to it.
          @refs[rulename] ||= {}
          scan_alternation(state).to_a.each { |concatenation| @rules[rulename] << concatenation }
        else
          @refs[rulename] = {}
          @rules[rulename] = scan_alternation(state)
        end
        ss.skip(C_WSPS_)
        _skip_rule_end(ss)
        state[:rulename] = nil
      else
        ss.skip(C_WSPS_)
        _skip_rule_end(ss)
      end
    end
    nil
  end

  # Builds a regexp for the named rule. Rules referenced more than once are
  # emitted up front as never-matching named groups ("(?<name>...){0}") and
  # called with \g<name>; rules referenced once are defined where used.
  #
  # @param name         [String] rule to build
  # @param expand_rules [Array<Symbol>] rules to inline at every use
  # @return [Regexp]
  # @raise [RuntimeError] when the rule or one it refers to is not defined
  def build(name, expand_rules = [].freeze)
    @expand_rules = expand_rules
    @autoexpand_rules = {}
    defs = {}
    _include_defs(name, defs)
    defs.each_pair do |key, count|
      autoexpand_rules[key.intern] = true if count <= 1
    end
    buf = String.new('')
    defs.each_key do |key|
      next if CORE_RULE_NAMES.include?(key.intern)
      next if @expand_rules.include?(key.intern)
      next if @autoexpand_rules.key?(key.intern)
      #next if @rules[key].cc?
      buf << "(?<#{key}>#{@rules[key]}){0}"
    end
    buf << "(?<#{name}>#{@rules[name]})"
    Regexp.new(buf)
  end

  # Short form without the (potentially huge) rule tables.
  def inspect
    "#<%s:%#016x>" % [self.class.name, self.__id__<<1]
  end

  private

  # Consumes the end of a rule or of a blank/comment line. The end of input
  # is accepted too, so the last line does not need a trailing newline.
  def _skip_rule_end(ss)
    return if ss.skip(C_NL_) || ss.eos?

    raise 'unexpected: ' + ss.rest[/.+|\n.*/].dump
  end

  # Collects, into defs, how often each rule is referenced by +name+ and,
  # transitively, by the rules it refers to.
  def _include_defs(name, defs)
    unless @refs[name]
      raise "missing definition of '#{name}'"
    end
    @refs[name].each_pair do |key, count|
      seen = defs[key]
      if seen
        defs[key] = seen + count
      else
        defs[key] = count
        _include_defs(key, defs)
      end
    end
  end
end
# http://tools.ietf.org/html/rfc2234
# Quick regexp-based ABNF to Oniguruma translation that does not build a
# parse tree: every rule becomes a regexp source string in which other rules
# are referenced as \g<name> subexpression calls.
#
# @param src [String] ABNF source
# @return [Hash{String=>String}] rule name => regexp source
def abnf2onig_lite(src)
  table = {}
  src.scan(/([\w\-]+)\s*=\s*(.*(?:\n\s*[^\w\s].*)*)/) do |name, expr|
    current = []
    stack = [current]
    expr.gsub!(/ ; .*/, '') # remove comments
    expr.scan(/\s*(\/|[0-9*]*(?:"[^"\s]+"|<?[%\w\-]+>?|[^%\w\-\s]+))\s*/) do |atom,|
      # Split an optional repeat prefix (n, n*, n*m, *, *m) from the element.
      parts = /\A(?:([0-9]+)(\*([0-9]+)?)?|(\*([0-9]+)?))?(.*)\z/.match(atom)
      min, min_star, min_star_max, star, star_max, body = parts.captures
      quantifier =
        if min_star_max then "{#{min},#{min_star_max}}"
        elsif min_star then "{#{min},}"
        elsif min then "{#{min}}"
        elsif star_max then "{,#{star_max}}"
        elsif star then '*'
        else ''
        end
      # NOTE(review): kept from the original; min_star always starts with "*",
      # so this guard looks like it can never fire — confirm the intent.
      next '' if min_star == '0'

      case body
      when /\A<?([\w\-]+)>?\z/
        current << "\\g<#{Regexp.last_match(1)}>#{quantifier}"
      when /\A"([^"]+)"\z/
        current << "#{Regexp.quote(Regexp.last_match(1))}#{quantifier}"
      when /\A%x([0-9A-F]+)(?:-([0-9A-F]+))?\z/
        low = Regexp.last_match(1)
        high = Regexp.last_match(2)
        # The quantifier used to be dropped for hex values.
        current.push((high ? "[\\u{#{low}}-\\u{#{high}}]" : "\\u{#{low}}") + quantifier)
      when '['
        current = ['(?:']
        stack << current
      when ']'
        current << ')?'
        current = stack[-2]
        current << stack.pop.join
      when '/'
        current << '|'
      when '('
        # The closing part is parked at the front and moved to the end on ")".
        current = [")#{quantifier}", '(?:']
        stack << current
      when ')'
        current << current.shift
        current = stack[-2]
        current << stack.pop.join
      else
        # Unsupported token: report it and leave a visible marker in the
        # output (this used to raise NameError on an undefined variable).
        $stderr.puts atom
        current << "{::#{atom}::}"
      end
    end
    table[name] = stack.join
  end
  table
end
# Demo entry point: loads the core, addr-spec and mailto grammars and prints
# the regexps built for a few of their rules. Everything after the first
# bare +return+ is disabled demo code kept for reference.
def main
  require 'pp'
  #g = Grammer.new
  #g.parse(CORE_RULES_)
  #g.parse($stdin.read, Encoding::US_ASCII)
  #p g.build(ARGV[0])
  #return
  mail = Grammer.new
  [CORE_RULES_, ADDR_SPEC_RULES_, MAILTO_RULES_].each { |rules| mail.parse(rules) }
  {
    'local-part' => %i[atext dot-atom-text qcontent quoted-pair quoted-string],
    'domain' => %i[atext dot-atom-text dtext-no-obs],
    'addr-spec' => %i[atext dot-atom-text dtext-no-obs qcontent quoted-pair quoted-string],
    'mailtoURI' => %i[atext dot-atom-text dtext-no-obs qchar pct-encoded hfname hfvalue qcontent quoted-pair quoted-string]
  }.each do |rule, inlined|
    p mail.build(rule, inlined)
  end
  return

  # --- disabled: URI grammar checks ---
  web = Grammer.new
  [CORE_RULES_, URI_RULES_].each { |rules| web.parse(rules) }
  { 'IPv4address' => "127.0.0.1", 'IPv6address' => "::1", 'IP-literal' => "[::1]" }.each do |rule, sample|
    uri = web.build(rule)
    next if uri =~ sample

    puts "#{__LINE__} failed to match"
    p uri
    return
  end
  inlined = %i[h16 pchar pct-encoded]
  p uri = web.build('URI', inlined)
  [
    "http://exmaple.org:80/foo",
    "http://127.0.0.1/foo?hoge#fuga",
    "https://[::1]:8080/%e3%81%82?hoge#fuga"
  ].each { |sample| p uri.match(sample) }
  p uri = web.build('relative-ref', inlined)
  ["/foo", "/foo?hoge#fuga", "/%e3%81%82?hoge#fuga"].each { |sample| p uri.match(sample) }
  return

  # --- disabled: IRI and addr-spec checks ---
  web.parse(IRI_RULES_, Encoding::UTF_8)
  p uri = web.build('IRI')
  p uri=~"http://\u{65e5}\u{672c}\u{8a9e}.jp/\u3042\u3044\u3046"
  addr = Grammer.new
  addr.parse(CORE_RULES_)
  addr.parse(ADDR_SPEC_RULES_)
  p ras = addr.build('addr-spec')
  p ras =~ 'foo@example.com'
end
# http://tools.ietf.org/html/rfc5234
# Core ABNF rules (RFC 5234 appendix B) plus a few HTTP helper rules.
# Continuation lines of a rule must start with whitespace: the parser only
# treats a line break as part of a rule when the next line is indented.
CORE_RULES_ = <<'_text'
ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
BIT = "0" / "1"
CHAR = %x01-7F
; any 7-bit US-ASCII character, excluding NUL
CR = %x0D
; carriage return
CRLF = CR LF
; Internet standard newline
CTL = %x00-1F / %x7F
; controls
DIGIT = %x30-39
; 0-9
DQUOTE = %x22
; " (Double Quote)
HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
HTAB = %x09
; horizontal tab
LF = %x0A
; linefeed
LWSP = *(WSP / CRLF WSP)
; linear white space (past newline)
OCTET = %x00-FF
; 8 bits of data
SP = %x20
; space
VCHAR = %x21-7E
; visible (printing) characters
WSP = SP / HTAB
; white space
; http-bis-p1
OWS = *( SP / HTAB )
; "optional" whitespace
RWS = 1*( SP / HTAB )
; "required" whitespace
BWS = OWS
; "bad" whitespace
word = token / quoted-string
token = 1*tchar
tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*"
        / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
        / DIGIT / ALPHA
; any VCHAR, except special
special = "(" / ")" / "<" / ">" / "@" / ","
        / ";" / ":" / "\" / DQUOTE / "/" / "["
        / "]" / "?" / "=" / "{" / "}"
_text
# URI grammar (RFC 3986 appendix A).
# Continuation lines of a rule must start with whitespace: the parser only
# treats a line break as part of a rule when the next line is indented.
URI_RULES_ = <<'_text'
URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
hier-part = "//" authority path-abempty
        / path-absolute
        / path-rootless
        / path-empty
URI-reference = URI / relative-ref
absolute-URI = scheme ":" hier-part [ "?" query ]
relative-ref = relative-part [ "?" query ] [ "#" fragment ]
relative-part = "//" authority path-abempty
        / path-absolute
        / path-noscheme
        / path-empty
scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
authority = [ userinfo "@" ] host [ ":" port ]
userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
host = IP-literal / IPv4address / reg-name
port = *DIGIT
IP-literal = "[" ( IPv6address / IPvFuture ) "]"
IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
IPv6address = 6( h16 ":" ) ls32
        / "::" 5( h16 ":" ) ls32
        / [ h16 ] "::" 4( h16 ":" ) ls32
        / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
        / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
        / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
        / [ *4( h16 ":" ) h16 ] "::" ls32
        / [ *5( h16 ":" ) h16 ] "::" h16
        / [ *6( h16 ":" ) h16 ] "::"
h16 = 1*4HEXDIG
ls32 = ( h16 ":" h16 ) / IPv4address
IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
dec-octet = DIGIT ; 0-9
        / %x31-39 DIGIT ; 10-99
        / "1" 2DIGIT ; 100-199
        / "2" %x30-34 DIGIT ; 200-249
        / "25" %x30-35 ; 250-255
reg-name = *( unreserved / pct-encoded / sub-delims )
path = path-abempty ; begins with "/" or is empty
        / path-absolute ; begins with "/" but not "//"
        / path-noscheme ; begins with a non-colon segment
        / path-rootless ; begins with a segment
        / path-empty ; zero characters
path-abempty = *( "/" segment )
path-absolute = "/" [ segment-nz *( "/" segment ) ]
path-noscheme = segment-nz-nc *( "/" segment )
path-rootless = segment-nz *( "/" segment )
path-empty = 0<pchar>
segment = *pchar
segment-nz = 1*pchar
segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
; non-zero-length segment without any colon ":"
pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
query = *( pchar / "/" / "?" )
fragment = *( pchar / "/" / "?" )
pct-encoded = "%" HEXDIG HEXDIG
unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
reserved = gen-delims / sub-delims
gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
        / "*" / "+" / "," / ";" / "="
_text
IRI_RULES_ = <<'_text'
IRI = scheme ":" ihier-part [ "?" iquery ]
[ "#" ifragment ]
ihier-part = "//" iauthority ipath-abempty
/ ipath-absolute
/ ipath-rootless
/ ipath-empty
IRI-reference = IRI / irelative-ref
absolute-IRI = scheme ":" ihier-part [ "?" iquery ]
irelative-ref = irelative-part [ "?" iquery ] [ "#" ifragment ]
irelative-part = "//" iauthority ipath-abempty
/ ipath-absolute
/ ipath-noscheme
/ ipath-empty
iauthority = [ iuserinfo "@" ] ihost [ ":" port ]
iuserinfo = *( iunreserved / pct-form / sub-delims / ":" )
ihost = IP-literal / IPv4address / ireg-name
pct-form = pct-encoded
ireg-name = *( iunreserved / sub-delims )
ipath = ipath-abempty ; begins with "/" or is empty
/ ipath-absolute ; begins with "/" but not "//"
/ ipath-noscheme ; begins with a non-colon segment
/ ipath-rootless ; begins with a segment
/ ipath-empty ; zero characters
ipath-abempty = *( path-sep isegment )
ipath-absolute = path-sep [ isegment-nz *( path-sep isegment ) ]
ipath-noscheme = isegment-nz-nc *( path-sep isegment )
ipath-rootless = isegment-nz *( path-sep isegment )
ipath-empty = 0<ipchar>
path-sep = "/"
isegment = *ipchar
isegment-nz = 1*ipchar
isegment-nz-nc = 1*( iunreserved / pct-form / sub-delims
/ "@" )
; non-zero-length segment without any colon ":"
ipchar = iunreserved / pct-form / sub-delims / ":"
/ "@"
iquery = *( ipchar / iprivate / "/" / "?" )
ifragment = *( ipchar / "/" / "?" )
iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
/ %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
/ %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
/ %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
/ %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
/ %xD0000-DFFFD / %xE1000-EFFFD
iprivate = %xE000-F8FF / %xE0000-E0FFF / %xF0000-FFFFD
/ %x100000-10FFFD
_text
ADDR_SPEC_RULES_ = <<'_text'
addr-spec = local-part "@" domain
local-part = dot-atom / quoted-string / obs-local-part
domain = dot-atom / domain-literal / obs-domain
domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]
dtext = %d33-90 / ; Printable US-ASCII
%d94-126 / ; characters not including
obs-dtext ; "[", "]", or "\"
atext = ALPHA / DIGIT / ; Printable US-ASCII
"!" / "#" / ; characters not including
"$" / "%" / ; specials. Used for atoms.
"&" / "'" /
"*" / "+" /
"-" / "/" /
"=" / "?" /
"^" / "_" /
"`" / "{" /
"|" / "}" /
"~"
atom = [CFWS] 1*atext [CFWS]
dot-atom-text = 1*atext *("." 1*atext)
dot-atom = [CFWS] dot-atom-text [CFWS]
specials = "(" / ")" / ; Special characters that do
"<" / ">" / ; not appear in atext
"[" / "]" /
":" / ";" /
"@" / "\" /
"," / "." /
DQUOTE
qtext = %d33 / ; Printable US-ASCII
%d35-91 / ; characters not including
%d93-126 / ; "\" or the quote character
obs-qtext
quoted-pair = ("\" (VCHAR / WSP)) / obs-qp
FWS = ([*WSP CRLF] 1*WSP) / obs-FWS
; Folding white space
ctext = %d33-39 / ; Printable US-ASCII
%d42-91 / ; characters not including
%d93-126 / ; "(", ")", or "\"
obs-ctext
ccontent = ctext / quoted-pair / comment
comment = "(" *([FWS] ccontent) [FWS] ")"
CFWS = (1*([FWS] comment) [FWS]) / FWS
obs-FWS = 1*WSP *(CRLF 1*WSP)
qcontent = qtext / quoted-pair
quoted-string = [CFWS]
DQUOTE *([FWS] qcontent) [FWS] DQUOTE
[CFWS]
word = atom / quoted-string
obs-local-part = word *("." word)
obs-domain = atom *("." atom)
obs-dtext = obs-NO-WS-CTL / quoted-pair
obs-NO-WS-CTL = %d1-8 / ; US-ASCII control
%d11 / ; characters that do not
%d12 / ; include the carriage
%d14-31 / ; return, line feed, and
%d127 ; white space characters
obs-ctext = obs-NO-WS-CTL
obs-qtext = obs-NO-WS-CTL
obs-qp = "\" (%d0 / obs-NO-WS-CTL / LF / CR)
_text
# mailto URI grammar (RFC 6068) with the RFC 3986 / RFC 5322 rules it
# overrides; meant to be loaded after CORE_RULES_ and ADDR_SPEC_RULES_.
# Continuation lines of a rule must start with whitespace: the parser only
# treats a line break as part of a rule when the next line is indented.
MAILTO_RULES_ = <<'_text'
; RFC6068
mailtoURI = "mailto:" [ to ] [ hfields ]
to = addr-spec *("," addr-spec )
hfields = "?" hfield *( "&" hfield )
hfield = hfname "=" hfvalue
hfname = *qchar
hfvalue = *qchar
addr-spec = local-part "@" domain
local-part = dot-atom-text / quoted-string
domain = dot-atom-text / "[" *dtext-no-obs "]"
dtext-no-obs = %d33-90 / ; Printable US-ASCII
        %d94-126 ; characters not including
; "[", "]", or "\"
qchar = unreserved / pct-encoded / some-delims
some-delims = "!" / "$" / "'" / "(" / ")" / "*"
        / "+" / "," / ";" / ":" / "@"
; RFC3986
unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
pct-encoded = "%" HEXDIG HEXDIG
; RFC5322
; override by RFC6068
; 3. Whitespace and comments within <local-part> and <domain> MUST NOT
; be used. They would not have any operational semantics.
; quoted-string = [CFWS]
; DQUOTE *([FWS] qcontent) [FWS] DQUOTE
; [CFWS]
quoted-string = DQUOTE *(qcontent) DQUOTE
; obs-qp = "\" (%d0 / obs-NO-WS-CTL / LF / CR)
; quoted-pair = ("\" (VCHAR / WSP)) / obs-qp
quoted-pair = "\" VCHAR
qtext = %d33 / ; Printable US-ASCII
        %d35-91 / ; characters not including
        %d93-126 ; "\" or the quote character
; / obs-qtext
atext = ALPHA / DIGIT / ; Printable US-ASCII
        "!" / ; "#" / ; characters not including
        "$" / "%" / ; specials. Used for atoms.
        "&" / "'" /
        "*" / "+" /
        "-" / ; "/" /
        "=" / "?" /
        "^" / "_" /
        "`" / "{" /
        "|" / "}" /
        "~"
_text
# Run the demo only when this file is executed directly, not when required.
main if __FILE__ == $PROGRAM_NAME
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment