Created
March 13, 2010 09:27
-
-
Save jballanc/331222 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/re.cpp b/re.cpp | |
index dc7fbe7..3d800fe 100644 | |
--- a/re.cpp | |
+++ b/re.cpp | |
@@ -9,6 +9,7 @@ | |
#include "unicode/regex.h" | |
#include "unicode/unistr.h" | |
+#include "unicode/schriter.h" | |
#include "ruby/ruby.h" | |
#include "encoding.h" | |
#include "objc.h" | |
@@ -108,7 +109,7 @@ str_to_unistr(VALUE str) | |
} | |
static void | |
-sanitize_regexp_string(UnicodeString *unistr) | |
+sanitize_regexp_string(UnicodeString *unistr, int option) | |
{ | |
// ICU does not support [[:word:]], so we need to replace all | |
// occurrences by \w. | |
@@ -120,6 +121,58 @@ sanitize_regexp_string(UnicodeString *unistr) | |
while ((pos = unistr->indexOf(word_str)) >= 0) { | |
unistr->replace(pos, 10, repl_str); | |
} | |
+ | |
+ // ICU treats '#' as the start of a comment, even inside of '[]', | |
+ // so we need to escape the '#' | |
+ if (option & REGEXP_OPT_EXTENDED) { | |
+ StringCharacterIterator iter(*unistr); | |
+ UChar curr_char = iter.first(); | |
+ int32_t brackets = 0, braces = 0, parens = 0; | |
+ do { | |
+ switch (curr_char) { | |
+ case '[': | |
+ if (unistr->charAt(iter.getIndex() - 1) != '\\') { | |
+ brackets++; | |
+ } | |
+ break; | |
+ case ']': | |
+ if ((brackets > 0) && (unistr->charAt(iter.getIndex() - 1) != '\\')) { | |
+ brackets--; | |
+ } | |
+ break; | |
+ case '{': | |
+ if (unistr->charAt(iter.getIndex() - 1) != '\\') { | |
+ braces++; | |
+ } | |
+ break; | |
+ case '}': | |
+ if ((braces > 0) && (unistr->charAt(iter.getIndex() - 1) != '\\')) { | |
+ braces--; | |
+ } | |
+ break; | |
+ case '(': | |
+ if (unistr->charAt(iter.getIndex() - 1) != '\\') { | |
+ parens++; | |
+ } | |
+ break; | |
+ case ')': | |
+ if ((parens > 0) && (unistr->charAt(iter.getIndex() - 1) != '\\')) { | |
+ parens--; | |
+ } | |
+ break; | |
+ case '#': | |
+ if (((brackets > 0) || (braces > 0) || (parens > 0)) | |
+ && (unistr->charAt(iter.getIndex() - 1) != '?') | |
+ && (unistr->charAt(iter.getIndex() - 1) != '\\')) { | |
+ pos = iter.getIndex(); | |
+ unistr->insert(pos, '\\'); | |
+ iter.setText(*unistr); | |
+ iter.setIndex(pos + 1); | |
+ } | |
+ }; | |
+ curr_char = iter.next(); | |
+ } while (iter.hasNext()); | |
+ } | |
} | |
static bool | |
@@ -130,7 +183,7 @@ init_from_string(rb_regexp_t *regexp, VALUE str, int option, VALUE *excp) | |
UnicodeString *unistr = str_to_unistr(str); | |
assert(unistr != NULL); | |
- sanitize_regexp_string(unistr); | |
+ sanitize_regexp_string(unistr, option); | |
UParseError pe; | |
UErrorCode status = U_ZERO_ERROR; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment