Skip to content

Instantly share code, notes, and snippets.

@lrz
Created April 27, 2011 04:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lrz/943692 to your computer and use it in GitHub Desktop.
Save lrz/943692 to your computer and use it in GitHub Desktop.
diff --git a/re.c b/re.c
index 523ba17..0df6349 100644
--- a/re.c
+++ b/re.c
@@ -696,34 +696,56 @@ rb_reg_matcher_new(VALUE re, VALUE str)
u_errorName(status));
}
- long chars_len = 0;
- UChar *chars = rb_str_xcopy_uchars(str, &chars_len);
-
- if (chars_len == 0) {
- // uregex_setText() will complain if we pass a NULL pattern or a
- // pattern length of 0, so we do pass an empty pattern with a length
- // of -1 which indicates it's terminated by \0.
- chars = (UChar *)xmalloc(sizeof(UChar));
- *chars = '\0';
- chars_len = -1;
+ // Fast path when applying a regexp to a UTF-8 (or ASCII) encoded text
+ // string. (Only if ICU is 4.6 or higher).
+ bool need_uchars = true;
+#if U_ICU_VERSION_MAJOR_NUM >= 4 && U_ICU_VERSION_MINOR_NUM >= 6
+ if (IS_RSTR(str)
+ && (IS_UTF8_ENC(RSTR(str)->encoding)
+ || IS_ASCII_ENC(RSTR(str)->encoding))) {
+ UText *text = utext_openUTF8(NULL, RSTR(str)->bytes,
+ RSTR(str)->length_in_bytes, &status);
+ if (status == U_ZERO_ERROR) {
+ uregex_setUText(match_pattern, text, &status);
+ utext_close(text);
+ if (status == U_ZERO_ERROR) {
+ need_uchars = false;
+ }
+ }
+ status = U_ZERO_ERROR;
}
+#endif
+
+ // Slow path, converting the text string into a buffer of uchars.
+ if (need_uchars) {
+ long chars_len = 0;
+ UChar *chars = rb_str_xcopy_uchars(str, &chars_len);
+
+ if (chars_len == 0) {
+ // uregex_setText() will complain if we pass a NULL pattern or a
+ // pattern length of 0, so we do pass an empty pattern with a
+ // length of -1 which indicates it's terminated by \0.
+ chars = (UChar *)xmalloc(sizeof(UChar));
+ *chars = '\0';
+ chars_len = -1;
+ }
- uregex_setText(match_pattern, chars, chars_len, &status);
+ uregex_setText(match_pattern, chars, chars_len, &status);
- if (status != U_ZERO_ERROR) {
- uregex_close(match_pattern);
- rb_raise(rb_eRegexpError, "can't set pattern text: %s",
- u_errorName(status));
+ if (status != U_ZERO_ERROR) {
+ uregex_close(match_pattern);
+ rb_raise(rb_eRegexpError, "can't set pattern text: %s",
+ u_errorName(status));
+ }
+ // Apparently uregex_setText doesn't copy the given string, so we need
+ // to keep it around until we finally destroy the matcher object.
+ GC_WB(&matcher->text_chars, chars);
}
matcher->pattern = match_pattern;
matcher->frozen_str = 0; // set lazily
GC_WB(&matcher->orig_str, str);
- // Apparently uregex_setText doesn't copy the given string, so we need
- // to keep it around until we finally destroy the matcher object.
- GC_WB(&matcher->text_chars, chars);
-
return (VALUE)matcher;
}
$ time ./miniruby -e "s=File.read('parse.c'); 1000.times { s.match(/./) }"
real 0m3.868s
user 0m3.863s
sys 0m0.197s
$ time macruby -e "s=File.read('parse.c'); 1000.times { s.match(/./) }"
real 0m3.269s
user 0m2.907s
sys 0m0.550s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment