Skip to content

Instantly share code, notes, and snippets.

@takaokouji
Created April 7, 2011 14:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save takaokouji/907895 to your computer and use it in GitHub Desktop.
Save takaokouji/907895 to your computer and use it in GitHub Desktop.
sanitize_regexp_string for "<pattern>{..}*" in Ruby
require 'test/unit'
# Rewrites every "<atom>{n}*", "<atom>{n,}*" and "<atom>{n,m}*" found in a
# regular-expression source string as "(?:<atom>{...})*", so that the "*"
# applies to an explicit non-capturing group instead of directly to the
# interval quantifier.
#
# The pattern is scanned once, left to right. An atom is one of:
# * a backslash escape ("\w", "\\", "\{", ...),
# * a complete character class ("[0-9]", "[[:word:]]", "[)]", ...),
# * a balanced parenthesised group ("(abc)", "((ab)(c))", ...),
# * any other single character that is not listed below.
#
# Deliberately left untouched:
# * intervals that are not digits with an optional ",digits" tail
#   ("{}", "{,}", "{,8}", "{a}", "{1,2,3}"),
# * intervals whose "{" or "}" is escaped,
# * intervals with no atom in front of them (start of the string, right after
#   "(" or after a previous rewrite),
# * intervals that follow an unescaped "]", "}", "*", "+", "?", "|", "^" or
#   "$" -- none of these is a repeatable atom on its own,
# * everything from an unterminated "[" to the end of the string, because that
#   text belongs to the (broken) character class.
#
# @param chars [String] regular-expression source; it is never modified
# @return [String] a new string holding the rewritten pattern
def sanitize_regexp_string(chars)
  # "{n}", "{n,}" or "{n,m}" at the scan position, immediately followed by "*".
  # \G pins the match to the position handed to String#match.
  interval_before_star = /\G\{[0-9]+(?:,[0-9]*)?\}(?=\*)/
  # Unescaped characters that can never be the target of the interval.
  non_atoms = "]}*+?|^$"

  out = chars[0, 0]   # empty, unfrozen, same encoding as the input
  atom_start = nil    # index in +out+ where the most recent atom begins
  open_groups = []    # indexes in +out+ of the "(" that are still open
  pos = 0
  len = chars.length

  while pos < len
    c = chars[pos]
    case c
    when "\\"
      # A backslash and the character it escapes form one atom.
      atom_start = out.length
      out << chars[pos, 2]
      pos += 2
    when "["
      close_pos = sanitize_regexp_char_class_end(chars, pos)
      if close_pos.nil?
        # Unterminated class: the rest of the string is class content.
        out << chars[pos..-1]
        break
      end
      atom_start = out.length
      out << chars[pos..close_pos]
      pos = close_pos + 1
    when "("
      open_groups.push(out.length)
      atom_start = nil
      out << c
      pos += 1
    when ")"
      out << c
      # The group that just closed is the atom; nil when ")" is unbalanced.
      atom_start = open_groups.pop
      pos += 1
    when "{"
      interval = atom_start && chars.match(interval_before_star, pos)
      if interval
        out.insert(atom_start, "(?:")
        out << interval[0] << ")*"
        pos += interval[0].length + 1   # also consume the "*"
        atom_start = nil                # nothing repeatable right after "*"
      else
        # Not a rewritable interval: "{" is an ordinary character.
        atom_start = out.length
        out << c
        pos += 1
      end
    else
      atom_start = non_atoms.include?(c) ? nil : out.length
      out << c
      pos += 1
    end
  end

  out
end

# Finds the "]" that closes the character class opened at chars[open_pos].
#
# Backslash escapes are skipped, an unescaped "[" inside the class opens a
# nested class (this also covers POSIX brackets such as "[:word:]"), and a "]"
# placed directly after "[" or "[^" is taken as a literal member of the class.
#
# @param chars [String] regular-expression source
# @param open_pos [Integer] index of the opening "["
# @return [Integer, nil] index of the matching "]", or nil when the class is
#   never closed
def sanitize_regexp_char_class_end(chars, open_pos)
  depth = 0
  i = open_pos
  len = chars.length
  while i < len
    case chars[i]
    when "\\"
      i += 1                          # skip the escaped character
    when "["
      depth += 1
      i += 1 if chars[i + 1] == "^"   # negation marker
      i += 1 if chars[i + 1] == "]"   # leading "]" is a literal
    when "]"
      depth -= 1
      return i if depth.zero?
    end
    i += 1
  end
  nil
end
# Table-driven tests for sanitize_regexp_string.
#
# Each row is [label, expected output, input pattern, compile check]. One test
# method is generated per row; when the last element is true the sanitized
# output must additionally be accepted by Regexp.new.
class TestMethod < Test::Unit::TestCase
  message_expected_input_check =
    [
      ["minimum", "(?:.{8})*", ".{8}*", true],
      ["escaped right brace 1", ".{8\\}*", ".{8\\}*", false],
      ["escaped right brace 2", ".{8\\\\}*", ".{8\\\\}*", false],
      ["escaped right brace 3", ".{8\\\\\\}*", ".{8\\\\\\}*", false],
      ["escaped right brace 4", ".{8\\\\\\\\}*", ".{8\\\\\\\\}*", false],
      ["escaped left brace 1", ".\\{8}*", ".\\{8}*", false],
      ["escaped left brace 2", ".(?:\\\\{8})*", ".\\\\{8}*", true],
      ["escaped left brace 3", ".\\\\\\{8}*", ".\\\\\\{8}*", false],
      ["escaped left brace 4", ".\\\\(?:\\\\{8})*", ".\\\\\\\\{8}*", true],
      ["multiple 1", "(?:.{8})*(?:.{8})*", ".{8}*.{8}*", true],
      ["multiple 2", "(?:.{8})*abc abc abc(?:.{8})*", ".{8}*abc abc abc.{8}*", true],
      ["multiple 3",
       ".{8\\}*(?:.{8})*.\\{8}*(?:.{8})*.\\{8\\}*",
       ".{8\\}*.{8}*.\\{8}*.{8}*.\\{8\\}*", false],
      ["brace 1", "(?:.{0,8})*", ".{0,8}*", true],
      ["brace 2", "(?:.{0,})*", ".{0,}*", true],
      ["brace 3", ".{,}*", ".{,}*", false],
      ["brace 4", ".{}*", ".{}*", false],
      ["brace 5", ".{a}*", ".{a}*", false],
      ["brace 6", ".{a,1}*", ".{a,1}*", false],
      ["brace 7", ".{1,2,3}*", ".{1,2,3}*", false],
      ["backslash 1", "(?:\\w{8})*", "\\w{8}*", true],
      ["backslash 2", "\\\\(?:w{8})*", "\\\\w{8}*", true],
      ["backslash 3", "\\\\(?:\\w{8})*", "\\\\\\w{8}*", true],
      ["backslash 4", "\\\\\\\\(?:w{8})*", "\\\\\\\\w{8}*", true],
      ["char class 1", "(?:[0-9]{8})*", "[0-9]{8}*", true],
      ["char class 2", "(?:[[:word:]]{8})*", "[[:word:]]{8}*", true],
      ["char class 3", "(?:[[:word:][:digit:]]{8})*", "[[:word:][:digit:]]{8}*", false],
      ["char class 4", "[abc](?:[[:word:][:digit:]]{8})*", "[abc][[:word:][:digit:]]{8}*", false],
      # The inner "[" opens a nested class that is never closed (same situation
      # as "char class 8"), so the input is expected back unchanged.
      ["char class 5", "[[]{8}*", "[[]{8}*", false],
      ["char class 6", "(?:[\\[]{8})*", "[\\[]{8}*", true],
      ["char class 7", "(?:[\\w]{8})*", "[\\w]{8}*", true],
      ["char class 8", "[\\\\[]{8}*", "[\\\\[]{8}*", false],
      ["char class 9", "(?:\\]{8})*", "\\]{8}*", false],
      ["char class 10", "\\\\]{8}*", "\\\\]{8}*", false],
      ["group 1", "(?:(abc){8})*", "(abc){8}*", true],
      ["group 2", "(?:((ab)(c)){8})*", "((ab)(c)){8}*", true],
      ["group 3", "(ABC)(?:((ab)(c)){8})*", "(ABC)((ab)(c)){8}*", true],
      ["char class and group 1", "(?:[)]{8})*", "[)]{8}*", true],
      ["char class and group 2", "((((?:[)))]{8})*)))", "((([)))]{8}*)))", true],
    ]

  message_expected_input_check.each do |msg, expected, input, check|
    # Whitespace in the label becomes "_" so it can be part of a method name.
    define_method("test_sanitize_regexp_string__#{msg.gsub(/\s+/, "_")}") do
      assert_equal(expected, sanitize_regexp_string(input), msg)
      if check
        assert_nothing_raised do
          Regexp.new(sanitize_regexp_string(input))
        end
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment