Skip to content

Instantly share code, notes, and snippets.

@jballanc
Created March 13, 2010 09:27
Show Gist options
  • Save jballanc/331222 to your computer and use it in GitHub Desktop.
Save jballanc/331222 to your computer and use it in GitHub Desktop.
diff --git a/re.cpp b/re.cpp
index dc7fbe7..3d800fe 100644
--- a/re.cpp
+++ b/re.cpp
@@ -9,6 +9,7 @@
#include "unicode/regex.h"
#include "unicode/unistr.h"
+#include "unicode/schriter.h"
#include "ruby/ruby.h"
#include "encoding.h"
#include "objc.h"
@@ -108,7 +109,7 @@ str_to_unistr(VALUE str)
}
static void
-sanitize_regexp_string(UnicodeString *unistr)
+sanitize_regexp_string(UnicodeString *unistr, int option)
{
// ICU does not support [[:word::], so we need to replace all
// occurences by \w.
@@ -120,6 +121,58 @@ sanitize_regexp_string(UnicodeString *unistr)
while ((pos = unistr->indexOf(word_str)) >= 0) {
unistr->replace(pos, 10, repl_str);
}
+
+ // In extended mode, ICU treats '#' as the start of a comment, even inside
+ // '[]', so we backslash-escape '#' found inside brackets, braces or parens.
+ if (option & REGEXP_OPT_EXTENDED) {
+ StringCharacterIterator iter(*unistr);
+ UChar curr_char = iter.first();
+ int32_t brackets = 0, braces = 0, parens = 0;
+ do {
+ switch (curr_char) {
+ case '[':
+ if (unistr->charAt(iter.getIndex() - 1) != '\\') {
+ brackets++;
+ }
+ break;
+ case ']':
+ if ((brackets > 0) && (unistr->charAt(iter.getIndex() - 1) != '\\')) {
+ brackets--;
+ }
+ break;
+ case '{':
+ if (unistr->charAt(iter.getIndex() - 1) != '\\') {
+ braces++;
+ }
+ break;
+ case '}':
+ if ((braces > 0) && (unistr->charAt(iter.getIndex() - 1) != '\\')) {
+ braces--;
+ }
+ break;
+ case '(':
+ if (unistr->charAt(iter.getIndex() - 1) != '\\') {
+ parens++;
+ }
+ break;
+ case ')':
+ if ((parens > 0) && (unistr->charAt(iter.getIndex() - 1) != '\\')) {
+ parens--;
+ }
+ break;
+ case '#':
+ if (((brackets > 0) || (braces > 0) || (parens > 0))
+ && (unistr->charAt(iter.getIndex() - 1) != '?')
+ && (unistr->charAt(iter.getIndex() - 1) != '\\')) {
+ pos = iter.getIndex();
+ unistr->insert(pos, '\\');
+ iter.setText(*unistr);
+ iter.setIndex(pos + 1);
+ }
+ };
+ curr_char = iter.next();
+ } while (iter.hasNext());
+ }
}
static bool
@@ -130,7 +183,7 @@ init_from_string(rb_regexp_t *regexp, VALUE str, int option, VALUE *excp)
UnicodeString *unistr = str_to_unistr(str);
assert(unistr != NULL);
- sanitize_regexp_string(unistr);
+ sanitize_regexp_string(unistr, option);
UParseError pe;
UErrorCode status = U_ZERO_ERROR;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment