Created
March 30, 2011 05:56
-
-
Save nurse/893932 to your computer and use it in GitHub Desktop.
ABNF to Oniguruma Regexp Converter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/ruby
# ABNF (RFC 5234) to Oniguruma regexp converter.
# If you want a general-purpose ABNF-to-Regexp tool, see the projects below.
# http://www.a-k-r.org/abnf/
# https://github.com/martinthomson/abnf2regex
require 'strscan'

# Lexical building blocks used by the ABNF scanner. The names in the comments
# are the RFC 5234 productions each pattern approximates.

# rulename = ALPHA *(ALPHA / DIGIT / "-")
RULENAME_ = /[A-Za-z][A-Za-z0-9\-]*/
# 1*c-wsp: at least one blank, or a (possibly commented) line break followed
# by an indented continuation line.
C_WSPP_ = /[ \t]+(?:(?:;.*)?\n[ \t]+)*|(?:(?:;.*)?\n[ \t]+)+/
# *c-wsp: same as above but may match the empty string.
C_WSPS_ = /[ \t]*(?:(?:;.*)?\n[ \t]+)*/
# c-nl = comment / newline (only "\n" is recognised as a line end here)
C_NL_ = /(?:;.*)?\n/
# 1*DIGIT, used for repeat counts
DIGITS_ = /[0-9]+/
# A set of code points kept as a flat, sorted array of inclusive range
# endpoints: [from0, to0, from1, to1, ...]. Overlapping and adjacent ranges
# are merged on insertion.
#
# With an encoding, #to_s renders regexp source (a bracket expression or a
# single escaped character). Without one it renders the numbers themselves,
# e.g. "[0-3,6,9]".
class CC
  # encoding:: any truthy value switches #to_s to regexp output; nil selects
  #            the numeric form.
  def initialize(encoding = nil)
    @encoding = encoding
    @cc = []
  end

  # Adds an Integer code point, a Range of code points, or another CC.
  # Returns self so calls can be chained.
  # Raises TypeError for any other argument.
  def <<(v)
    case v
    when Integer # was Fixnum, which no longer exists on Ruby >= 3.2
      _bsearch_add2(v, v)
    when Range
      _bsearch_add2(v.first, v.last)
    when CC
      # dup: v may be self, and _bsearch_add2 rewrites @cc in place
      v.to_a.dup.each_slice(2) { |from, to| _bsearch_add2(from, to) }
    else
      raise TypeError, "invalid value to add a charclass #{v.inspect}"
    end
    self
  end

  # Renders the set. Always returns a fresh String (callers append to it).
  def to_s
    # A single character in regexp mode is emitted without brackets, so it
    # has to be escaped for use OUTSIDE a bracket expression.
    return to_atom(@cc.first) if @encoding && @cc.size == 2 && @cc.first == @cc.last

    # NOTE(review): these shortcuts also fire when no encoding is set, as in
    # the original. In Ruby's regexp \h also matches a-f.
    case @cc
    when [0x30, 0x39] # [0-9]
      return '\d'.dup
    when [0x30, 0x39, 0x41, 0x46] # [0-9A-F]
      return '\h'.dup
    end

    buf = '['.dup
    n = @encoding ? nil : 0 # numeric mode separates ranges with ","
    @cc.each_slice(2) do |from, to|
      if n
        n += 1
        buf << ',' if n > 1
      end
      buf << to_char(from)
      next if from == to

      if @encoding.nil?
        buf << (to - from > 1 ? '-' : ',')
      elsif to - from > 1
        buf << '-' # two neighbouring characters are simply written side by side
      end
      buf << to_char(to)
    end
    buf << ']'
  end

  # #to_s as it would appear inside a double-quoted literal, without the
  # surrounding quotes.
  def inspect
    to_s.inspect[1..-2]
  end

  # The underlying endpoint array (not a copy).
  def to_a
    @cc
  end

  def empty?
    @cc.empty?
  end

  private

  # Source text for one endpoint inside a bracket expression.
  def to_char(v)
    return v.to_s unless @encoding

    case v
    when 0x2D, 0x5B, 0x5C, 0x5D, 0x5E # - [ \ ] ^ are special inside [...]
      "\\#{v.chr}"
    when 0x09
      '\t'
    when 0x0a
      '\n'
    when 0x0d
      '\r'
    when 0x20..0x7E
      v.chr
    when 0..0x7F
      '\x%02x' % v
    when 0..0xFFFF
      '\u%04x' % v
    else
      '\u{%x}' % v
    end
  end

  # Source text for a single character used outside a bracket expression.
  def to_atom(v)
    (0x20..0x7E).cover?(v) ? Regexp.quote(v.chr) : to_char(v).dup
  end

  # Locates v in @cc. An odd result means v lies inside (or on the upper end
  # of) an existing range; an even result is the index of the first range
  # that starts at or after v (or @cc.length when there is none).
  def _bsearch(v)
    if @cc.empty? || v < @cc.first
      return 0
    elsif @cc.last < v
      return @cc.length
    end

    low = 0
    high = @cc.length - 1
    while low < high
      mid = (low + high) / 2
      case v <=> @cc[mid]
      when 1 # >
        low = mid + 1
        return low if v < @cc[low]
      when -1 # <
        high = mid - 1
        return mid if @cc[high] < v
      else # =
        return mid # it already has the val
      end
    end
    return low if @cc[low] == v

    # Reaching this point means the search invariant was broken.
    raise [low, high, mid].inspect
  end

  # Inserts the inclusive range from..to, merging it with every range it
  # overlaps or touches.
  def _bsearch_add2(from, to)
    pos1 = _bsearch(from)
    pos2 = from == to ? pos1 : _bsearch(to)
    if pos1.odd?
      # from falls inside an existing range: extend that range instead
      pos1 -= 1
      from = @cc[pos1]
    elsif pos1 > 0 && @cc[pos1 - 1] == from - 1
      # from directly follows the previous range: merge with it
      pos1 -= 2
      from = @cc[pos1]
    end
    if pos2.odd?
      to = @cc[pos2]
    elsif to + 1 == @cc[pos2] || to == @cc[pos2]
      # to touches the start of the next range: swallow that range
      pos2 += 1
      to = @cc[pos2]
    else
      pos2 -= 1
    end
    @cc[pos1, pos2 - pos1 + 1] = from, to
  end
end
#__END__
#require 'test/unit'
# Test cases for CC's numeric rendering. The Test::Unit superclass is commented
# out, so these methods only run once 'test/unit' is required above and the
# superclass is restored (assert_equal comes from Test::Unit).
class TestCC#< Test::Unit::TestCase
  # Single code points: adjacent values merge into ranges.
  def test_add
    cc = CC.new
    assert_equal('[]', cc.to_s)
    cc << 1
    assert_equal('[1]', cc.to_s)
    cc << 2
    assert_equal('[1,2]', cc.to_s)
    cc << 2
    assert_equal('[1,2]', cc.to_s)
    cc << 0
    assert_equal('[0-2]', cc.to_s)
    cc << 9
    assert_equal('[0-2,9]', cc.to_s)
    cc << 6
    assert_equal('[0-2,6,9]', cc.to_s)
    cc << 3
    assert_equal('[0-3,6,9]', cc.to_s)
    cc << 0
    assert_equal('[0-3,6,9]', cc.to_s)
    cc << 5
    assert_equal('[0-3,5,6,9]', cc.to_s)
    cc << 7
    assert_equal('[0-3,5-7,9]', cc.to_s)
    cc << 4
    assert_equal('[0-7,9]', cc.to_s)
  end

  # Ranges: overlapping, touching and contained ranges.
  def test_add2
    cc = CC.new
    assert_equal('[]', cc.to_s)
    cc << (3..4)
    assert_equal('[3,4]', cc.to_s)
    cc << (1..2)
    assert_equal('[1-4]', cc.to_s)
    cc << (5..6)
    assert_equal('[1-6]', cc.to_s)
    cc << (8..9)
    assert_equal('[1-6,8,9]', cc.to_s)
    cc << (7..7)
    assert_equal('[1-9]', cc.to_s)
    cc << (2..4)
    assert_equal('[1-9]', cc.to_s)
    cc << (1..4)
    assert_equal('[1-9]', cc.to_s)
    cc << (7..9)
    assert_equal('[1-9]', cc.to_s)
    cc << (1..9)
    assert_equal('[1-9]', cc.to_s)

    cc = CC.new
    assert_equal('[]', cc.to_s)
    cc << (5..5)
    assert_equal('[5]', cc.to_s)
    cc << (3..3)
    assert_equal('[3,5]', cc.to_s)
    cc << (1..4)
    assert_equal('[1-5]', cc.to_s)
  end
end
# An ABNF alternation ("a / b / c"): an ordered list of alternatives, with an
# optional repeat of the form [min, max] (set to [0, 1] for "[ ... ]" options).
#
# #to_s renders regexp source. Alternatives that are single characters or
# ranges are collected into one character class; everything else is joined
# with "|".
class Alternation
  attr_accessor :repeat

  # parser::   object whose #rules maps rule names (String) to Alternations
  # encoding:: passed on to the CC used for the merged character class
  def initialize(parser, encoding)
    @parser = parser
    @encoding = encoding
    @ary = []
    @repeat = nil
  end

  # Appends one alternative. Raises RuntimeError on nil.
  def <<(concatenation)
    raise "nil" if concatenation.nil?

    @ary << concatenation
  end

  # True when the rendered text can take a quantifier without grouping.
  def atomic?
    return true if @ary.empty?
    return cc? unless @ary.size == 1

    case e = @ary.first
    when Integer, Range
      true
    when String
      e.size <= 1
    when Symbol
      @parser.rules[e.to_s].atomic?
    when Alternation, Concatenation
      e.atomic?
    else
      raise "unexpected alternative #{e.inspect}"
    end
  end

  def empty?
    @ary.empty?
  end

  # True when every alternative is a single character or character class.
  def cc?
    @ary.all? do |e|
      case e
      when Integer, Range
        true
      when String
        e.size == 1
      when Symbol
        @parser.rules[e.to_s].cc?
      when Alternation, Concatenation
        e.cc?
      else
        raise "unexpected alternative #{e.inspect}"
      end
    end
  end

  # True when there is at most one alternative.
  def single?
    @ary.size <= 1
  end

  # The list of alternatives (not a copy).
  def to_a
    @ary
  end

  # Regexp source for this alternation, including its repeat. An alternation
  # with nothing to render yields '' regardless of the repeat.
  def to_s
    strs, cc = to_s0
    parts = strs.map { |e| e.to_s }
    parts << cc.to_s if cc # the merged character class always comes last
    return '' if parts.empty?

    str = parts.join('|')
    case @repeat
    when nil
      str
    when [0, 0]
      ""
    when [0, 1], [nil, 1]
      atomic? ? "#{str}?" : "(?:#{str})?"
    else
      "(?:#{str}){#{@repeat.join(",")}}"
    end
  end

  private

  # Flattens the alternatives breadth-first.
  # Returns [strs, cc]: strs holds the alternatives that must be rendered
  # separately, cc is a CC of all single-character alternatives (nil if none).
  def to_s0
    strs = []
    cc = nil
    pending = @ary
    until pending.empty?
      nested = [] # alternatives discovered at this level, handled next round
      pending.each do |e|
        case e
        when Range, Integer
          cc ||= CC.new(@encoding)
          cc << e
        when String
          strs << e
        # Disabled in the original (was wrapped in =begin/=end):
        # when Symbol
        #   p e
        #   if CORE_RULE_NAMES.include?(e)
        #     alts << @parser.rules[e.to_s]
        #   elsif r = @parser.rules[e.to_s] and r.atomic?
        #     alts << r
        #   else
        #     unless r
        #       p [__LINE__, e, r]
        #       puts caller
        #     end
        #     strs << "\\g<#{e}>"
        #   end
        when Alternation
          nested.concat e.to_a
        when Concatenation
          # a one-element concatenation expands to that element itself
          nested << e.expand
        when Repetition
          target = e.element
          case target
          when Alternation, Concatenation
            if e.cc?
              nested << target
            else
              strs << e.to_s
            end
          else
            if e.cc?
              cc ||= CC.new(@encoding)
              cc << target
            else
              strs << e.to_s
            end
          end
        else
          raise e.inspect
        end
      end
      pending = nested
    end
    return strs, cc
  end
end
class Concatenation | |
def initialize(parser, encoding) | |
@parser = parser | |
@encoding = encoding | |
@ary = [] | |
end | |
def <<(item) | |
case item | |
when nil | |
raise "nil" | |
when Array, String, Alternation, Concatenation | |
return @ary if item.empty? | |
end | |
@ary << item | |
end | |
def e2s(e) | |
e.to_s | |
end | |
def expand | |
buf = nil | |
@ary.each do |e| | |
case buf | |
when nil | |
buf = e | |
when String | |
buf << e2s(e) | |
else | |
buf = e2s(buf).dup | |
buf << e2s(e) | |
end | |
end | |
buf | |
end | |
def atomic? | |
return true if @ary.empty? | |
return false if @ary.size > 1 | |
return @ary.first.atomic? | |
case item = @ary.first | |
when Integer, Range | |
true | |
when String | |
item.size == 1 | |
when Symbol | |
@parser.rules[item.to_s].atomic? | |
when Alternation, Concatenation, Repetition | |
item.atomic? | |
else | |
raise item.inspect | |
end | |
end | |
def cc? | |
return false if @ary.size != 1 | |
case item = @ary.first | |
when Integer, Range | |
true | |
when String | |
item.size == 1 | |
when Symbol | |
@parser.rules[item.to_s].cc? | |
when Alternation, Concatenation, Repetition | |
item.cc? | |
else | |
raise item.inspect | |
end | |
end | |
def empty? | |
@ary.empty? | |
end | |
end | |
# One ABNF repetition: an element plus an optional quantifier.
#
# element:: Integer (code point), Range (of code points), String (regexp
#           source), Symbol (rule reference) or Alternation (group/option).
# repeat::  nil, or the regexp quantifier text produced by the scanner
#           ("*", "+", "?", "{2,4}", ...). [0, 1] / [nil, 1] are also
#           understood as "optional".
class Repetition
  def initialize(parser, element, repeat, rulename = nil)
    @parser = parser
    @element = element
    @repeat = repeat
    @rulename = rulename
  end

  # True when the rendered text can take a further quantifier without
  # grouping. A repetition that already carries a quantifier never can.
  def atomic?
    @repeat.nil? && element_atomic?
  end

  # True when this is an unquantified single character or character class.
  def cc?
    return false unless @repeat.nil?

    case element = @element
    when Integer, Range
      true
    when String
      element.size == 1
    when Symbol
      @parser.rules[element.to_s].cc?
    when Alternation, Concatenation
      element.cc?
    else
      raise element.inspect
    end
  end

  def single?
    case @element
    when Integer, Range, String, Symbol
      true
    when Alternation, Concatenation, Repetition
      @element.single?
    else
      raise @element.inspect
    end
  end

  # The element, with a Symbol resolved to the rule it names (nil when the
  # rule is not defined).
  def element
    case element = @element
    when Integer, Range, String
      element
    when Symbol
      @parser.rules[element.to_s]
    when Alternation, Concatenation
      element
    else
      raise element.inspect
    end
  end

  # Regexp source for the element followed by its quantifier.
  def to_s
    str, inlined = render_element
    case @repeat
    when nil
      # An inlined "a|b" must not leak its "|" into the surrounding sequence.
      bare_alternation?(inlined) ? "(?:#{str})" : str
    when [0, 1], [nil, 1]
      element_atomic? ? "#{str}?" : "(?:#{str})?"
    else
      element_atomic? ? "#{str}#{@repeat}" : "(?:#{str})#{@repeat}"
    end
  end

  private

  # Whether the element alone (ignoring @repeat) can take a quantifier
  # without grouping.
  def element_atomic?
    case element = @element
    when Integer, Range
      true
    when String
      element.size == 1
    when Symbol
      @parser.rules[element.to_s].atomic?
    when Alternation, Concatenation, Repetition
      element.atomic?
    else
      raise element.inspect
    end
  end

  # Returns [source, alternation]; alternation is the Alternation whose text
  # was inserted verbatim, or nil when the source is already self-contained.
  def render_element
    element = @element
    # The original read an @encoding ivar that is never assigned here, so it
    # always fell back to UTF-8.
    enc = Encoding::UTF_8
    case element
    when Integer
      [Regexp.quote(element.chr(enc)), nil]
    when Range
      lo = Regexp.quote(element.first.chr(enc))
      hi = Regexp.quote(element.last.chr(enc))
      ["[#{lo}-#{hi}]", nil]
    when String
      [element, nil]
    when Alternation
      [element.to_s, element]
    when Symbol
      render_reference(element)
    else
      raise "unknown type of element #{element.inspect}"
    end
  end

  # Renders a rule reference: inlined for core/expand rules, defined in place
  # as a named group on first use for auto-expanded rules, otherwise a
  # subexpression call.
  def render_reference(name)
    rule = @parser.rules[name.to_s]
    unless rule
      warn "undefined rule #{name.inspect} referenced"
      return ["\\g<#{name}>", nil]
    end

    if Grammer::CORE_RULE_NAMES.include?(name) ||
       @parser.expand_rules.include?(name)
      [rule.to_s, rule]
    elsif @parser.autoexpand_rules[name]
      str = "(?<#{name}>#{rule})"
      # cleared only after rendering, as in the original
      @parser.autoexpand_rules[name] = false
      [str, nil]
    else
      ["\\g<#{name}>", nil]
    end
  end

  # True when alt renders as several "|"-separated branches with no group
  # of its own around them.
  def bare_alternation?(alt)
    return false if alt.nil?

    alt.repeat.nil? && !alt.single? && !alt.cc?
  end
end
# ABNF grammar parser and regexp builder.
#
# #parse reads rule definitions into #rules (rule name => Alternation) and
# records, per rule, how often it references other rules. #build turns one
# rule and everything it references into a single Regexp that uses named
# groups and \g<name> calls.
class Grammer
  attr_reader :encoding, :expand_rules, :autoexpand_rules, :rules
  attr_accessor :rulename

  # Rules that are always inlined rather than emitted as named groups.
  CORE_RULE_NAMES = %i[ALPHA BIT CHAR CR CRLF CTL DIGIT DQUOTE HEXDIG HTAB LF LWSP OCTET SP VCHAR WSP OWS RWS BWS word token tchar special].freeze

  # src::      optional ABNF text to parse immediately
  # encoding:: optional encoding applied to src
  def initialize(src = nil, encoding = nil)
    @rules = {}
    @refs = {}
    @rulename = nil
    @parser = self
    @encoding = encoding
    @expand_rules = nil
    @autoexpand_rules = nil
    parse(src, encoding) if src
  end

  # Scans the digits of a num-val (after "%b" / "%d" / "%x").
  # Returns an Integer for a single value, a Range for "a-b", or a
  # regexp-quoted String for a concatenated value "a.b.c".
  # Raises RuntimeError when digits are missing.
  def scan_digits(ss, reg, base)
    head = ss.scan(reg) || raise('missing digits in num-val')
    case ss.scan(/[.\-]/)
    when '.'
      encoding = ss.string.encoding
      chars = head.to_i(base).chr(encoding)
      # The "." just consumed must be followed by digits; further
      # ".digits" groups may follow.
      loop do
        digits = ss.scan(reg) || raise('missing concatenated digits in num-val')
        chars << digits.to_i(base).chr(encoding)
        break unless ss.skip(/\./)
      end
      # Strings are used as regexp source downstream, so quote them like
      # char-vals are quoted.
      Regexp.quote(chars)
    when '-'
      last = ss.scan(reg) || raise('missing the end of range num-val')
      head.to_i(base)..last.to_i(base)
    else
      head.to_i(base)
    end
  end

  # Scans an optional ABNF repeat prefix and returns the matching regexp
  # quantifier text, or nil when there is none.
  def scan_repeat(ss)
    # repeat = 1*DIGIT / (*DIGIT "*" *DIGIT)
    min = ss.scan(DIGITS_)
    star = ss.skip(/\*/)
    max = ss.scan(DIGITS_)
    unless star
      # Without "*" the first scan already consumed every digit, so max is nil.
      return min ? "{#{min}}" : nil
    end

    if max
      if min
        "{#{min},#{max}}"
      else
        max == '1' ? '?' : "{,#{max}}"
      end
    elsif min
      min == '1' ? '+' : "{#{min},}"
    else
      '*'
    end
  end

  # Scans an alternation at the scanner position in state[:ss] and returns it
  # as an Alternation. Rule references are counted in @refs[state[:rulename]].
  def scan_alternation(state)
    ss = state[:ss]
    encoding = ss.string.encoding
    alternation = Alternation.new(self, encoding)
    # alternation = concatenation *(*c-wsp "/" *c-wsp concatenation)
    loop do
      $stderr.puts 'alternation: ' + ss.rest[/.*/][0, 100] if $DEBUG
      alternation << scan_concatenation(state, encoding)
      ss.skip(C_WSPS_)
      break unless ss.skip(/\//)

      ss.skip(C_WSPS_)
    end
    alternation
  end

  # Parses ABNF text and adds its rules to this grammar. src is not modified.
  # A rule defined with "=/" extends the existing rule with more alternatives.
  # Raises RuntimeError on syntax errors. Returns nil.
  #
  # http://tools.ietf.org/html/rfc2234
  def parse(src, encoding = nil)
    src = src.dup # keep the caller's string (and its encoding) untouched
    src.force_encoding(encoding) if encoding
    # Strip the indentation shared by all non-blank lines.
    indent = 999
    src.scan(/^ *(?=\S)/) { |s| indent = s.bytesize if indent > s.bytesize }
    src.gsub!(/^ {#{indent}}/, '') if indent > 0 && indent != 999
    ss = StringScanner.new(src)
    state = {ss: ss, rulename: nil}
    # rulelist = 1*( rule / (*c-wsp c-nl) )
    until ss.eos?
      # rule = rulename defined-as elements c-nl
      if (rulename = ss.scan(RULENAME_))
        state[:rulename] = rulename
        $stderr.puts 'rulename: ' + rulename if $DEBUG
        # defined-as = *c-wsp ("=" / "=/") *c-wsp
        ss.skip(C_WSPS_)
        incremental =
          case ss.skip(/=\/?/)
          when 2
            raise 'no previous definition even if this is OR-def' unless @rules.key?(rulename)

            true
          when 1
            warn "duplicated definition #{rulename.inspect}" if @rules.key?(rulename)
            false
          else
            raise "missing '=' or '=/' after rulename #{rulename.inspect}"
          end
        ss.skip(C_WSPS_)
        # elements = alternation *c-wsp
        if incremental
          @refs[rulename] ||= {} # keep the counts of the earlier definition
        else
          @refs[rulename] = {}
        end
        alternation = scan_alternation(state)
        if incremental
          @rules[rulename] << alternation
        else
          @rules[rulename] = alternation
        end
        ss.skip(C_WSPS_)
        expect_newline(ss)
        state[:rulename] = nil
      else
        ss.skip(C_WSPS_)
        expect_newline(ss)
      end
    end
    nil
  end

  # Builds a Regexp for rule +name+.
  # expand_rules:: Symbols of rules to inline instead of defining as groups.
  # Rules referenced at most once are inlined as a named group at their point
  # of use; all others are defined up front as "(?<name>...){0}".
  # Raises RuntimeError when a referenced rule is undefined.
  def build(name, expand_rules = [].freeze)
    @expand_rules = expand_rules
    @autoexpand_rules = {}
    _include_defs(name, defs = {})
    defs.each_pair do |key, count|
      @autoexpand_rules[key.intern] = true if count <= 1
    end
    buf = ''.dup
    # Rendering a rule can clear autoexpand flags, so keep this sequential.
    defs.each_key do |key|
      sym = key.intern
      next if CORE_RULE_NAMES.include?(sym)
      next if @expand_rules.include?(sym)
      next if @autoexpand_rules.key?(sym)

      #next if @rules[key].cc?
      buf << "(?<#{key}>#{@rules[key]}){0}"
    end
    buf << "(?<#{name}>#{@rules[name]})"
    Regexp.new(buf)
  end

  def inspect
    "#<%s:%#016x>" % [self.class.name, self.__id__ << 1]
  end

  private

  # Accumulates into defs (name => count) the reference counts of every rule
  # reachable from +name+.
  def _include_defs(name, defs)
    unless @refs[name]
      raise "missing definition of '#{name}'"
    end

    @refs[name].each_pair do |key, value|
      n = defs[key]
      if n
        defs[key] = n + value
      else
        defs[key] = value
        _include_defs(key, defs)
      end
    end
  end

  # concatenation = repetition *(1*c-wsp repetition)
  def scan_concatenation(state, encoding)
    ss = state[:ss]
    concatenation = Concatenation.new(self, encoding)
    loop do
      $stderr.puts 'concatenation: ' + ss.rest[/.*/][0, 100] if $DEBUG
      # repetition = [repeat] element
      repeat = scan_repeat(ss)
      element = scan_element(state)
      if element.nil?
        raise 'missing element (even if it has repeat)' if repeat

        break
      end
      concatenation << Repetition.new(self, element, repeat, state[:rulename])
      break unless ss.skip(C_WSPP_)
    end
    concatenation
  end

  # element = rulename / group / option / char-val / num-val / prose-val
  # Returns the scanned element, or nil when none starts here.
  def scan_element(state)
    ss = state[:ss]
    # prose-val = "<" *(%x20-3D / %x3F-7E) ">" is treated as a rule reference
    name = ss.scan(RULENAME_) || (ss.scan(/<([^>]+)>/) && ss[1])
    if name
      refs = @refs[state[:rulename]]
      refs[name] = (refs[name] || 0) + 1
      name.intern
    elsif ss.skip(/\(/)
      scan_group(state, /\)/, "missing ')'")
    elsif ss.skip(/\[/)
      option = scan_group(state, /\]/, "missing ']'")
      option.repeat = [0, 1]
      option
    elsif ss.scan(/"([^"]*)"/)
      # char-val = DQUOTE *(%x20-21 / %x23-7E) DQUOTE
      text = ss[1]
      text.length == 1 ? text.ord : Regexp.quote(text)
    elsif ss.skip(/%/)
      # num-val = "%" (bin-val / dec-val / hex-val)
      case v = ss.getch
      when 'b'
        scan_digits(ss, /[01]+/, 2)
      when 'd'
        scan_digits(ss, /[0-9]+/, 10)
      when 'x'
        scan_digits(ss, /[0-9a-fA-F]+/, 16)
      else
        raise "unknown num-val type '#{v}'"
      end
    end
  end

  # Scans the inside of a group/option up to and including +closer+.
  def scan_group(state, closer, message)
    ss = state[:ss]
    ss.skip(C_WSPS_)
    alternation = scan_alternation(state)
    ss.skip(C_WSPS_)
    ss.skip(closer) || raise(message)
    alternation
  end

  # Consumes a c-nl or raises with a dump of the offending text.
  def expect_newline(ss)
    ss.skip(C_NL_) || raise('unexpected: ' + (ss.eos? ? 'EOS' : ss.rest[/.+|\n.*/].dump))
  end
end
# Quick, regexp-only ABNF converter: turns each "name = elements" rule in src
# into Oniguruma regexp source, without building a syntax tree.
#
# Returns a Hash mapping rule name to regexp source (String). Rule references
# become \g<name> calls. Only "%x" terminals are understood; anything else
# that is not recognised is reported on stderr and emitted as "{::atom::}".
#
# http://tools.ietf.org/html/rfc2234
def abnf2onig_lite(src)
  table = {}
  src.scan(/([\w\-]+)\s*=\s*(.*(?:\n\s*[^\w\s].*)*)/) do |name, expr|
    current = []
    stack = [current] # one output list per open group
    expr.gsub!(/ ; .*/, '') # remove comments
    expr.scan(/\s*(\/|[0-9*]*(?:"[^"\s]+"|<?[%\w\-]+>?|[^%\w\-\s]+))\s*/) do |atom,|
      # Split the atom into its repeat prefix and the element proper.
      m = /\A(?:([0-9]+)(\*([0-9]+)?)?|(\*([0-9]+)?))?(.*)\z/.match(atom)
      quantifier =
        if m[3] then "{#{m[1]},#{m[3]}}"
        elsif m[2] then "{#{m[1]},}"
        elsif m[1] then "{#{m[1]}}"
        elsif m[5] then "{,#{m[5]}}"
        elsif m[4] then '*'
        else ''
        end
      # NOTE(review): the original had `next '' if $2 == '0'` here. $2 always
      # starts with "*", so that check could never fire and was dropped.
      case m[6]
      when /\A<?([\w\-]+)>?\z/
        current << "\\g<#{Regexp.last_match(1)}>#{quantifier}"
      when /\A"([^"]+)"\z/
        current << "#{Regexp.quote(Regexp.last_match(1))}#{quantifier}"
      when /\A%x([0-9A-F]+)(?:-([0-9A-F]+))?\z/i
        lo = Regexp.last_match(1)
        hi = Regexp.last_match(2)
        current << "#{hi ? "[\\u{#{lo}}-\\u{#{hi}}]" : "\\u{#{lo}}"}#{quantifier}"
      when '['
        current = ['(?:']
        stack << current
      when ']'
        current << ')?'
        current = stack[-2]
        current << stack.pop.join
      when '/'
        current << '|'
      when '('
        # The closing text is parked at the front until ")" moves it to the end.
        current = [")#{quantifier}", '(?:']
        stack << current
      when ')'
        current << current.shift
        current = stack[-2]
        current << stack.pop.join
      else
        $stderr.puts atom
        current << "{::#{atom}::}"
      end
    end
    table[name] = stack.join
  end
  table
end
# Script driver: builds regexps from the bundled grammars and prints them.
# Only the mailto section runs; the early `return`s deliberately leave the
# URI / IRI / addr-spec experiments below in place but disabled.
def main
  require 'pp'
  #g = Grammer.new
  #g.parse(CORE_RULES_)
  #g.parse($stdin.read, Encoding::US_ASCII)
  #p g.build(ARGV[0])
  #return

  # mailto: URIs (RFC 6068 on top of the RFC 5322 addr-spec rules)
  g = Grammer.new
  g.parse(CORE_RULES_)
  g.parse(ADDR_SPEC_RULES_)
  g.parse(MAILTO_RULES_)
  p g.build('local-part', %i[atext dot-atom-text qcontent quoted-pair quoted-string])
  p g.build('domain', %i[atext dot-atom-text dtext-no-obs])
  p g.build('addr-spec', %i[atext dot-atom-text dtext-no-obs qcontent quoted-pair quoted-string])
  p g.build('mailtoURI', %i[atext dot-atom-text dtext-no-obs qchar pct-encoded hfname hfvalue qcontent quoted-pair quoted-string])
  return

  # URIs (RFC 3986) -- currently unreachable
  g = Grammer.new
  g.parse(CORE_RULES_)
  g.parse(URI_RULES_)
  unless (uri = g.build('IPv4address')) =~"127.0.0.1"
    puts "#{__LINE__} failed to match"
    p uri
    return
  end
  unless (uri = g.build('IPv6address')) =~ "::1"
    puts "#{__LINE__} failed to match"
    p uri
    return
  end
  unless (uri = g.build('IP-literal')) =~"[::1]"
    puts "#{__LINE__} failed to match"
    p uri
    return
  end
  p uri = g.build('URI', %i[h16 pchar pct-encoded])
  p uri.match("http://exmaple.org:80/foo")
  p uri.match("http://127.0.0.1/foo?hoge#fuga")
  p uri.match("https://[::1]:8080/%e3%81%82?hoge#fuga")
  p uri = g.build('relative-ref', %i[h16 pchar pct-encoded])
  p uri.match("/foo")
  p uri.match("/foo?hoge#fuga")
  p uri.match("/%e3%81%82?hoge#fuga")
  return

  # IRIs (RFC 3987) and plain addr-spec -- currently unreachable
  g.parse(IRI_RULES_, Encoding::UTF_8)
  p uri = g.build('IRI')
  p uri=~"http://\u{65e5}\u{672c}\u{8a9e}.jp/\u3042\u3044\u3046"
  g = Grammer.new
  g.parse(CORE_RULES_)
  g.parse(ADDR_SPEC_RULES_)
  p ras = g.build('addr-spec')
  p ras =~ 'foo@example.com'
end
# Core rules of ABNF (RFC 5234 appendix B) plus a few HTTP helper rules.
# Continuation lines must start with whitespace for Grammer#parse.
# http://tools.ietf.org/html/rfc5234
CORE_RULES_ = <<'_text'
ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
BIT = "0" / "1"
CHAR = %x01-7F
    ; any 7-bit US-ASCII character, excluding NUL
CR = %x0D
    ; carriage return
CRLF = CR LF
    ; Internet standard newline
CTL = %x00-1F / %x7F
    ; controls
DIGIT = %x30-39
    ; 0-9
DQUOTE = %x22
    ; " (Double Quote)
HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
HTAB = %x09
    ; horizontal tab
LF = %x0A
    ; linefeed
LWSP = *(WSP / CRLF WSP)
    ; linear white space (past newline)
OCTET = %x00-FF
    ; 8 bits of data
SP = %x20
    ; space
VCHAR = %x21-7E
    ; visible (printing) characters
WSP = SP / HTAB
    ; white space
; http-bis-p1
OWS = *( SP / HTAB )
    ; "optional" whitespace
RWS = 1*( SP / HTAB )
    ; "required" whitespace
BWS = OWS
    ; "bad" whitespace
word = token / quoted-string
token = 1*tchar
tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*"
    / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
    / DIGIT / ALPHA
    ; any VCHAR, except special
special = "(" / ")" / "<" / ">" / "@" / ","
    / ";" / ":" / "\" / DQUOTE / "/" / "["
    / "]" / "?" / "=" / "{" / "}"
_text
# URI grammar (RFC 3986 appendix A).
# Continuation lines must start with whitespace for Grammer#parse.
URI_RULES_ = <<'_text'
URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
hier-part = "//" authority path-abempty
    / path-absolute
    / path-rootless
    / path-empty
URI-reference = URI / relative-ref
absolute-URI = scheme ":" hier-part [ "?" query ]
relative-ref = relative-part [ "?" query ] [ "#" fragment ]
relative-part = "//" authority path-abempty
    / path-absolute
    / path-noscheme
    / path-empty
scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
authority = [ userinfo "@" ] host [ ":" port ]
userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
host = IP-literal / IPv4address / reg-name
port = *DIGIT
IP-literal = "[" ( IPv6address / IPvFuture ) "]"
IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
IPv6address = 6( h16 ":" ) ls32
    / "::" 5( h16 ":" ) ls32
    / [ h16 ] "::" 4( h16 ":" ) ls32
    / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
    / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
    / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
    / [ *4( h16 ":" ) h16 ] "::" ls32
    / [ *5( h16 ":" ) h16 ] "::" h16
    / [ *6( h16 ":" ) h16 ] "::"
h16 = 1*4HEXDIG
ls32 = ( h16 ":" h16 ) / IPv4address
IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
dec-octet = DIGIT ; 0-9
    / %x31-39 DIGIT ; 10-99
    / "1" 2DIGIT ; 100-199
    / "2" %x30-34 DIGIT ; 200-249
    / "25" %x30-35 ; 250-255
reg-name = *( unreserved / pct-encoded / sub-delims )
path = path-abempty ; begins with "/" or is empty
    / path-absolute ; begins with "/" but not "//"
    / path-noscheme ; begins with a non-colon segment
    / path-rootless ; begins with a segment
    / path-empty ; zero characters
path-abempty = *( "/" segment )
path-absolute = "/" [ segment-nz *( "/" segment ) ]
path-noscheme = segment-nz-nc *( "/" segment )
path-rootless = segment-nz *( "/" segment )
path-empty = 0<pchar>
segment = *pchar
segment-nz = 1*pchar
segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
    ; non-zero-length segment without any colon ":"
pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
query = *( pchar / "/" / "?" )
fragment = *( pchar / "/" / "?" )
pct-encoded = "%" HEXDIG HEXDIG
unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
reserved = gen-delims / sub-delims
gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
    / "*" / "+" / "," / ";" / "="
_text
IRI_RULES_ = <<'_text' | |
IRI = scheme ":" ihier-part [ "?" iquery ] | |
[ "#" ifragment ] | |
ihier-part = "//" iauthority ipath-abempty | |
/ ipath-absolute | |
/ ipath-rootless | |
/ ipath-empty | |
IRI-reference = IRI / irelative-ref | |
absolute-IRI = scheme ":" ihier-part [ "?" iquery ] | |
irelative-ref = irelative-part [ "?" iquery ] [ "#" ifragment ] | |
irelative-part = "//" iauthority ipath-abempty | |
/ ipath-absolute | |
/ ipath-noscheme | |
/ ipath-empty | |
iauthority = [ iuserinfo "@" ] ihost [ ":" port ] | |
iuserinfo = *( iunreserved / pct-form / sub-delims / ":" ) | |
ihost = IP-literal / IPv4address / ireg-name | |
pct-form = pct-encoded | |
ireg-name = *( iunreserved / sub-delims ) | |
ipath = ipath-abempty ; begins with "/" or is empty | |
/ ipath-absolute ; begins with "/" but not "//" | |
/ ipath-noscheme ; begins with a non-colon segment | |
/ ipath-rootless ; begins with a segment | |
/ ipath-empty ; zero characters | |
ipath-abempty = *( path-sep isegment ) | |
ipath-absolute = path-sep [ isegment-nz *( path-sep isegment ) ] | |
ipath-noscheme = isegment-nz-nc *( path-sep isegment ) | |
ipath-rootless = isegment-nz *( path-sep isegment ) | |
ipath-empty = 0<ipchar> | |
path-sep = "/" | |
isegment = *ipchar | |
isegment-nz = 1*ipchar | |
isegment-nz-nc = 1*( iunreserved / pct-form / sub-delims | |
/ "@" ) | |
; non-zero-length segment without any colon ":" | |
ipchar = iunreserved / pct-form / sub-delims / ":" | |
/ "@" | |
iquery = *( ipchar / iprivate / "/" / "?" ) | |
ifragment = *( ipchar / "/" / "?" ) | |
iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar | |
ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF | |
/ %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD | |
/ %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD | |
/ %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD | |
/ %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD | |
/ %xD0000-DFFFD / %xE1000-EFFFD | |
iprivate = %xE000-F8FF / %xE0000-E0FFF / %xF0000-FFFFD | |
/ %x100000-10FFFD | |
_text | |
# addr-spec grammar (RFC 5322), including the obsolete forms it references.
# Continuation lines must start with whitespace for Grammer#parse.
ADDR_SPEC_RULES_ = <<'_text'
addr-spec = local-part "@" domain
local-part = dot-atom / quoted-string / obs-local-part
domain = dot-atom / domain-literal / obs-domain
domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]
dtext = %d33-90 / ; Printable US-ASCII
    %d94-126 / ; characters not including
    obs-dtext ; "[", "]", or "\"
atext = ALPHA / DIGIT / ; Printable US-ASCII
    "!" / "#" / ; characters not including
    "$" / "%" / ; specials. Used for atoms.
    "&" / "'" /
    "*" / "+" /
    "-" / "/" /
    "=" / "?" /
    "^" / "_" /
    "`" / "{" /
    "|" / "}" /
    "~"
atom = [CFWS] 1*atext [CFWS]
dot-atom-text = 1*atext *("." 1*atext)
dot-atom = [CFWS] dot-atom-text [CFWS]
specials = "(" / ")" / ; Special characters that do
    "<" / ">" / ; not appear in atext
    "[" / "]" /
    ":" / ";" /
    "@" / "\" /
    "," / "." /
    DQUOTE
qtext = %d33 / ; Printable US-ASCII
    %d35-91 / ; characters not including
    %d93-126 / ; "\" or the quote character
    obs-qtext
quoted-pair = ("\" (VCHAR / WSP)) / obs-qp
FWS = ([*WSP CRLF] 1*WSP) / obs-FWS
    ; Folding white space
ctext = %d33-39 / ; Printable US-ASCII
    %d42-91 / ; characters not including
    %d93-126 / ; "(", ")", or "\"
    obs-ctext
ccontent = ctext / quoted-pair / comment
comment = "(" *([FWS] ccontent) [FWS] ")"
CFWS = (1*([FWS] comment) [FWS]) / FWS
obs-FWS = 1*WSP *(CRLF 1*WSP)
qcontent = qtext / quoted-pair
quoted-string = [CFWS]
    DQUOTE *([FWS] qcontent) [FWS] DQUOTE
    [CFWS]
word = atom / quoted-string
obs-local-part = word *("." word)
obs-domain = atom *("." atom)
obs-dtext = obs-NO-WS-CTL / quoted-pair
obs-NO-WS-CTL = %d1-8 / ; US-ASCII control
    %d11 / ; characters that do not
    %d12 / ; include the carriage
    %d14-31 / ; return, line feed, and
    %d127 ; white space characters
obs-ctext = obs-NO-WS-CTL
obs-qtext = obs-NO-WS-CTL
obs-qp = "\" (%d0 / obs-NO-WS-CTL / LF / CR)
_text
# mailto URI grammar (RFC 6068) with the RFC 5322 rules it narrows.
# Meant to be parsed after ADDR_SPEC_RULES_, whose rules it redefines.
# Continuation lines must start with whitespace for Grammer#parse.
MAILTO_RULES_ = <<'_text'
; RFC6068
mailtoURI = "mailto:" [ to ] [ hfields ]
to = addr-spec *("," addr-spec )
hfields = "?" hfield *( "&" hfield )
hfield = hfname "=" hfvalue
hfname = *qchar
hfvalue = *qchar
addr-spec = local-part "@" domain
local-part = dot-atom-text / quoted-string
domain = dot-atom-text / "[" *dtext-no-obs "]"
dtext-no-obs = %d33-90 / ; Printable US-ASCII
    %d94-126 ; characters not including
    ; "[", "]", or "\"
qchar = unreserved / pct-encoded / some-delims
some-delims = "!" / "$" / "'" / "(" / ")" / "*"
    / "+" / "," / ";" / ":" / "@"
; RFC3986
unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
pct-encoded = "%" HEXDIG HEXDIG
; RFC5322
; override by RFC6068
; 3. Whitespace and comments within <local-part> and <domain> MUST NOT
; be used. They would not have any operational semantics.
; quoted-string = [CFWS]
; DQUOTE *([FWS] qcontent) [FWS] DQUOTE
; [CFWS]
quoted-string = DQUOTE *(qcontent) DQUOTE
; obs-qp = "\" (%d0 / obs-NO-WS-CTL / LF / CR)
; quoted-pair = ("\" (VCHAR / WSP)) / obs-qp
quoted-pair = "\" VCHAR
qtext = %d33 / ; Printable US-ASCII
    %d35-91 / ; characters not including
    %d93-126 ; "\" or the quote character
    ; / obs-qtext
atext = ALPHA / DIGIT / ; Printable US-ASCII
    "!" / ; "#" / ; characters not including
    "$" / "%" / ; specials. Used for atoms.
    "&" / "'" /
    "*" / "+" /
    "-" / ; "/" /
    "=" / "?" /
    "^" / "_" /
    "`" / "{" /
    "|" / "}" /
    "~"
_text
# Run the demo only when this file is executed directly, not when required.
if __FILE__ == $0
  main
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment