jballanc/gist:331222

## gistfile1.diff
diff --git a/re.cpp b/re.cpp
index dc7fbe7..3d800fe 100644
--- a/re.cpp
+++ b/re.cpp
@@ -9,6 +9,7 @@

 #include "unicode/regex.h"
 #include "unicode/unistr.h"
+#include "unicode/schriter.h"
 #include "ruby/ruby.h"
 #include "encoding.h"
 #include "objc.h"
@@ -108,7 +109,7 @@ str_to_unistr(VALUE str)
 }

 static void
-sanitize_regexp_string(UnicodeString *unistr)
+sanitize_regexp_string(UnicodeString *unistr, int option)
 {
     // ICU does not support [[:word::], so we need to replace all
     // occurences by \w.
@@ -120,6 +121,58 @@ sanitize_regexp_string(UnicodeString *unistr)
     while ((pos = unistr->indexOf(word_str)) >= 0) {
 	unistr->replace(pos, 10, repl_str);
     }
+
+    // ICU treats '#' as the start of a comment, even inside of '[]',
+    // so we need to escape the '#'
+    if (option & REGEXP_OPT_EXTENDED) {
+        StringCharacterIterator iter(*unistr);
+        UChar curr_char = iter.first();
+        int32_t brackets = 0, braces = 0, parens = 0;
+        do {
+            switch (curr_char) {
+                case '[':
+                    if (unistr->charAt(iter.getIndex() - 1) != '\\') {
+                        brackets++;
+                    }
+                    break;
+                case ']':
+                    if ((brackets > 0) && (unistr->charAt(iter.getIndex() - 1) != '\\')) {
+                        brackets--;
+                    }
+                    break;
+                case '{':
+                    if (unistr->charAt(iter.getIndex() - 1) != '\\') {
+                        braces++;
+                    }
+                    break;
+                case '}':
+                    if ((braces > 0) && (unistr->charAt(iter.getIndex() - 1) != '\\')) {
+                        braces--;
+                    }
+                    break;
+                case '(':
+                    if (unistr->charAt(iter.getIndex() - 1) != '\\') {
+                        parens++;
+                    }
+                    break;
+                case ')':
+                    if ((parens > 0) && (unistr->charAt(iter.getIndex() - 1) != '\\')) {
+                        parens--;
+                    }
+                    break;
+                case '#':
+                    if (((brackets > 0) || (braces > 0) || (parens > 0))
+                        && (unistr->charAt(iter.getIndex() - 1) != '?')
+                        && (unistr->charAt(iter.getIndex() - 1) != '\\')) {
+                        pos = iter.getIndex();
+                        unistr->insert(pos, '\\');
+                        iter.setText(*unistr);
+                        iter.setIndex(pos + 1);
+                    }
+            };
+            curr_char = iter.next();
+        } while (iter.hasNext());
+    }
 }

 static bool
@@ -130,7 +183,7 @@ init_from_string(rb_regexp_t *regexp, VALUE str, int option, VALUE *excp)
     UnicodeString *unistr = str_to_unistr(str);
     assert(unistr != NULL);

-    sanitize_regexp_string(unistr);
+    sanitize_regexp_string(unistr, option);

     UParseError pe;
     UErrorCode status = U_ZERO_ERROR;
	diff --git a/re.cpp b/re.cpp
	index dc7fbe7..3d800fe 100644
	--- a/re.cpp
	+++ b/re.cpp
	@@ -9,6 +9,7 @@

	#include "unicode/regex.h"
	#include "unicode/unistr.h"
	+#include "unicode/schriter.h"
	#include "ruby/ruby.h"
	#include "encoding.h"
	#include "objc.h"
	@@ -108,7 +109,7 @@ str_to_unistr(VALUE str)
	}

	static void
	-sanitize_regexp_string(UnicodeString *unistr)
	+sanitize_regexp_string(UnicodeString *unistr, int option)
	{
	// ICU does not support [[:word::], so we need to replace all
	// occurences by \w.
	@@ -120,6 +121,58 @@ sanitize_regexp_string(UnicodeString *unistr)
	while ((pos = unistr->indexOf(word_str)) >= 0) {
	unistr->replace(pos, 10, repl_str);
	}
	+
	+ // ICU treats '#' as the start of a comment, even inside of '[]',
	+ // so we need to escape the '#'
	+ if (option & REGEXP_OPT_EXTENDED) {
	+ StringCharacterIterator iter(*unistr);
	+ UChar curr_char = iter.first();
	+ int32_t brackets = 0, braces = 0, parens = 0;
	+ do {
	+ switch (curr_char) {
	+ case '[':
	+ if (unistr->charAt(iter.getIndex() - 1) != '\\') {
	+ brackets++;
	+ }
	+ break;
	+ case ']':
	+ if ((brackets > 0) && (unistr->charAt(iter.getIndex() - 1) != '\\')) {
	+ brackets--;
	+ }
	+ break;
	+ case '{':
	+ if (unistr->charAt(iter.getIndex() - 1) != '\\') {
	+ braces++;
	+ }
	+ break;
	+ case '}':
	+ if ((braces > 0) && (unistr->charAt(iter.getIndex() - 1) != '\\')) {
	+ braces--;
	+ }
	+ break;
	+ case '(':
	+ if (unistr->charAt(iter.getIndex() - 1) != '\\') {
	+ parens++;
	+ }
	+ break;
	+ case ')':
	+ if ((parens > 0) && (unistr->charAt(iter.getIndex() - 1) != '\\')) {
	+ parens--;
	+ }
	+ break;
	+ case '#':
	+ if (((brackets > 0) || (braces > 0) || (parens > 0))
	+ && (unistr->charAt(iter.getIndex() - 1) != '?')
	+ && (unistr->charAt(iter.getIndex() - 1) != '\\')) {
	+ pos = iter.getIndex();
	+ unistr->insert(pos, '\\');
	+ iter.setText(*unistr);
	+ iter.setIndex(pos + 1);
	+ }
	+ };
	+ curr_char = iter.next();
	+ } while (iter.hasNext());
	+ }
	}

	static bool
	@@ -130,7 +183,7 @@ init_from_string(rb_regexp_t *regexp, VALUE str, int option, VALUE *excp)
	UnicodeString *unistr = str_to_unistr(str);
	assert(unistr != NULL);

	- sanitize_regexp_string(unistr);
	+ sanitize_regexp_string(unistr, option);

	UParseError pe;
	UErrorCode status = U_ZERO_ERROR;