Created
August 14, 2013 07:47
-
-
Save nobu/6228803 to your computer and use it in GitHub Desktop.
String#scrub and String#scrub! for 2.0.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build script for the String#scrub backport extension.
require "mkmf"

# Generate the Makefile for the extension that is loaded as "string/scrub".
create_makefile "string/scrub"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <ruby.h> | |
#include <ruby/encoding.h> | |
/* Boolean constants, defined only when no earlier header supplied them. */
#if !defined(TRUE)
# define TRUE 1
#endif
#if !defined(FALSE)
# define FALSE 0
#endif

/* Encoding object (rb_encoding *) of a String, looked up via its encoding index. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
/*
 * Locate the first non-ASCII byte in the range [p, e).
 *
 * When VALUE is 4 or 8 bytes wide and the range is long enough, the aligned
 * middle part is tested one VALUE-sized word at a time against a mask of the
 * high bit of every byte; the unaligned head and the remaining tail are
 * tested byte by byte.
 *
 * Returns a pointer to the first non-ASCII byte, or NULL if there is none.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_bits = sizeof(VALUE) - 1;
        /* first word boundary at or after p, last word boundary at or before e */
        const VALUE *word = (const VALUE *)(~align_bits & ((VALUE)p + align_bits));
        const VALUE *word_end = (const VALUE *)(~align_bits & (VALUE)e);

        /* unaligned head: one byte at a time */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p)) return p;
        }
        /* aligned middle: stop at the first word with any high bit set */
        for (; word < word_end; word++) {
            if (*word & NONASCII_MASK) break;
        }
        /* resume the byte scan at the suspicious word (or at the tail) */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p)) return p;
    }
    return NULL;
}
static VALUE | |
str_compat_and_valid(VALUE str, rb_encoding *enc) | |
{ | |
int cr; | |
str = StringValue(str); | |
cr = rb_enc_str_coderange(str); | |
if (cr == ENC_CODERANGE_BROKEN) { | |
rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str); | |
} | |
else if (cr == ENC_CODERANGE_7BIT) { | |
rb_encoding *e = STR_ENC_GET(str); | |
if (!rb_enc_asciicompat(enc)) { | |
rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", | |
rb_enc_name(enc), rb_enc_name(e)); | |
} | |
} | |
else { /* ENC_CODERANGE_VALID */ | |
rb_encoding *e = STR_ENC_GET(str); | |
if (enc != e) { | |
rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s", | |
rb_enc_name(enc), rb_enc_name(e)); | |
} | |
} | |
return str; | |
} | |
/** | |
* @param repl the replacement character | |
* @return If given string is invalid, returns a new string. Otherwise, returns Qnil. | |
*/ | |
static VALUE | |
str_scrub0(int argc, VALUE *argv, VALUE str) | |
{ | |
int cr = ENC_CODERANGE(str); | |
rb_encoding *enc; | |
int encidx; | |
VALUE repl; | |
if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) | |
return Qnil; | |
enc = STR_ENC_GET(str); | |
rb_scan_args(argc, argv, "01", &repl); | |
if (argc != 0) { | |
repl = str_compat_and_valid(repl, enc); | |
} | |
if (rb_enc_dummy_p(enc)) { | |
return Qnil; | |
} | |
encidx = rb_enc_to_index(enc); | |
#define DEFAULT_REPLACE_CHAR(str) do { \ | |
static const char replace[sizeof(str)-1] = str; \ | |
rep = replace; replen = (int)sizeof(replace); \ | |
} while (0) | |
if (rb_enc_asciicompat(enc)) { | |
const char *p = RSTRING_PTR(str); | |
const char *e = RSTRING_END(str); | |
const char *p1 = p; | |
const char *rep; | |
long replen; | |
int rep7bit_p; | |
VALUE buf = Qnil; | |
if (rb_block_given_p()) { | |
rep = NULL; | |
replen = 0; | |
rep7bit_p = FALSE; | |
} | |
else if (!NIL_P(repl)) { | |
rep = RSTRING_PTR(repl); | |
replen = RSTRING_LEN(repl); | |
rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT); | |
} | |
else if (encidx == rb_utf8_encindex()) { | |
DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD"); | |
rep7bit_p = FALSE; | |
} | |
else { | |
DEFAULT_REPLACE_CHAR("?"); | |
rep7bit_p = TRUE; | |
} | |
cr = ENC_CODERANGE_7BIT; | |
p = search_nonascii(p, e); | |
if (!p) { | |
p = e; | |
} | |
while (p < e) { | |
int ret = rb_enc_precise_mbclen(p, e, enc); | |
if (MBCLEN_NEEDMORE_P(ret)) { | |
break; | |
} | |
else if (MBCLEN_CHARFOUND_P(ret)) { | |
cr = ENC_CODERANGE_VALID; | |
p += MBCLEN_CHARFOUND_LEN(ret); | |
} | |
else if (MBCLEN_INVALID_P(ret)) { | |
/* | |
* p1~p: valid ascii/multibyte chars | |
* p ~e: invalid bytes + unknown bytes | |
*/ | |
long clen = rb_enc_mbmaxlen(enc); | |
if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str)); | |
if (p > p1) { | |
rb_str_buf_cat(buf, p1, p - p1); | |
} | |
if (e - p < clen) clen = e - p; | |
if (clen <= 2) { | |
clen = 1; | |
} | |
else { | |
const char *q = p; | |
clen--; | |
for (; clen > 1; clen--) { | |
ret = rb_enc_precise_mbclen(q, q + clen, enc); | |
if (MBCLEN_NEEDMORE_P(ret)) break; | |
if (MBCLEN_INVALID_P(ret)) continue; | |
UNREACHABLE; | |
} | |
} | |
if (rep) { | |
rb_str_buf_cat(buf, rep, replen); | |
if (!rep7bit_p) cr = ENC_CODERANGE_VALID; | |
} | |
else { | |
repl = rb_yield(rb_enc_str_new(p1, clen, enc)); | |
repl = str_compat_and_valid(repl, enc); | |
rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); | |
if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID) | |
cr = ENC_CODERANGE_VALID; | |
} | |
p += clen; | |
p1 = p; | |
p = search_nonascii(p, e); | |
if (!p) { | |
p = e; | |
break; | |
} | |
} | |
else { | |
UNREACHABLE; | |
} | |
} | |
if (NIL_P(buf)) { | |
if (p == e) { | |
ENC_CODERANGE_SET(str, cr); | |
return Qnil; | |
} | |
buf = rb_str_buf_new(RSTRING_LEN(str)); | |
} | |
if (p1 < p) { | |
rb_str_buf_cat(buf, p1, p - p1); | |
} | |
if (p < e) { | |
if (rep) { | |
rb_str_buf_cat(buf, rep, replen); | |
if (!rep7bit_p) cr = ENC_CODERANGE_VALID; | |
} | |
else { | |
repl = rb_yield(rb_enc_str_new(p, e-p, enc)); | |
repl = str_compat_and_valid(repl, enc); | |
rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); | |
if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID) | |
cr = ENC_CODERANGE_VALID; | |
} | |
} | |
ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr); | |
return buf; | |
} | |
else { | |
/* ASCII incompatible */ | |
const char *p = RSTRING_PTR(str); | |
const char *e = RSTRING_END(str); | |
const char *p1 = p; | |
VALUE buf = Qnil; | |
const char *rep; | |
long replen; | |
long mbminlen = rb_enc_mbminlen(enc); | |
if (!NIL_P(repl)) { | |
rep = RSTRING_PTR(repl); | |
replen = RSTRING_LEN(repl); | |
} | |
else if (!strcasecmp(rb_enc_name(enc), "UTF-16BE")) { | |
DEFAULT_REPLACE_CHAR("\xFF\xFD"); | |
} | |
else if (!strcasecmp(rb_enc_name(enc), "UTF-16LE")) { | |
DEFAULT_REPLACE_CHAR("\xFD\xFF"); | |
} | |
else if (!strcasecmp(rb_enc_name(enc), "UTF-32BE")) { | |
DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD"); | |
} | |
else if (!strcasecmp(rb_enc_name(enc), "UTF-32lE")) { | |
DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00"); | |
} | |
else { | |
DEFAULT_REPLACE_CHAR("?"); | |
} | |
while (p < e) { | |
int ret = rb_enc_precise_mbclen(p, e, enc); | |
if (MBCLEN_NEEDMORE_P(ret)) { | |
break; | |
} | |
else if (MBCLEN_CHARFOUND_P(ret)) { | |
p += MBCLEN_CHARFOUND_LEN(ret); | |
} | |
else if (MBCLEN_INVALID_P(ret)) { | |
const char *q = p; | |
long clen = rb_enc_mbmaxlen(enc); | |
if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str)); | |
if (p > p1) rb_str_buf_cat(buf, p1, p - p1); | |
if (e - p < clen) clen = e - p; | |
if (clen <= mbminlen * 2) { | |
clen = mbminlen; | |
} | |
else { | |
clen -= mbminlen; | |
for (; clen > mbminlen; clen-=mbminlen) { | |
ret = rb_enc_precise_mbclen(q, q + clen, enc); | |
if (MBCLEN_NEEDMORE_P(ret)) break; | |
if (MBCLEN_INVALID_P(ret)) continue; | |
UNREACHABLE; | |
} | |
} | |
if (rep) { | |
rb_str_buf_cat(buf, rep, replen); | |
} | |
else { | |
repl = rb_yield(rb_enc_str_new(p, e-p, enc)); | |
repl = str_compat_and_valid(repl, enc); | |
rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); | |
} | |
p += clen; | |
p1 = p; | |
} | |
else { | |
UNREACHABLE; | |
} | |
} | |
if (NIL_P(buf)) { | |
if (p == e) { | |
ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); | |
return Qnil; | |
} | |
buf = rb_str_buf_new(RSTRING_LEN(str)); | |
} | |
if (p1 < p) { | |
rb_str_buf_cat(buf, p1, p - p1); | |
} | |
if (p < e) { | |
if (rep) { | |
rb_str_buf_cat(buf, rep, replen); | |
} | |
else { | |
repl = rb_yield(rb_enc_str_new(p, e-p, enc)); | |
repl = str_compat_and_valid(repl, enc); | |
rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); | |
} | |
} | |
ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), ENC_CODERANGE_VALID); | |
return buf; | |
} | |
} | |
/* | |
* call-seq: | |
* str.scrub -> new_str | |
* str.scrub(repl) -> new_str | |
* str.scrub{|bytes|} -> new_str | |
* | |
* If the string is invalid byte sequence then replace invalid bytes with given replacement | |
* character, else returns self. | |
* If block is given, replace invalid bytes with returned value of the block. | |
* | |
* "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD" | |
* "abc\u3042\x81".scrub("*") #=> "abc\u3042*" | |
* "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>" | |
*/ | |
VALUE | |
rb_str_scrub(int argc, VALUE *argv, VALUE str) | |
{ | |
VALUE new = str_scrub0(argc, argv, str); | |
return NIL_P(new) ? rb_str_dup(str): new; | |
} | |
/* | |
* call-seq: | |
* str.scrub! -> str | |
* str.scrub!(repl) -> str | |
* str.scrub!{|bytes|} -> str | |
* | |
* If the string is invalid byte sequence then replace invalid bytes with given replacement | |
* character, else returns self. | |
* If block is given, replace invalid bytes with returned value of the block. | |
* | |
* "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD" | |
* "abc\u3042\x81".scrub!("*") #=> "abc\u3042*" | |
* "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>" | |
*/ | |
static VALUE | |
str_scrub_bang(int argc, VALUE *argv, VALUE str) | |
{ | |
VALUE new = str_scrub0(argc, argv, str); | |
if (!NIL_P(new)) rb_str_replace(str, new); | |
return str; | |
} | |
void | |
Init_scrub(void) | |
{ | |
rb_define_method(rb_cString, "scrub", rb_str_scrub, -1); | |
rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: US-ASCII
require 'test/unit'

# Tests for String#scrub and String#scrub!.
class TestScrub < Test::Unit::TestCase
  # Helpers that return a copy of +str+ tagged with a given encoding.
  module AESU
    def ua(str)
      str.dup.force_encoding("US-ASCII")
    end

    def a(str)
      str.dup.force_encoding("ASCII-8BIT")
    end

    def e(str)
      str.dup.force_encoding("EUC-JP")
    end

    def s(str)
      str.dup.force_encoding("Windows-31J")
    end

    def u(str)
      str.dup.force_encoding("UTF-8")
    end
  end
  include AESU

  def test_scrub
    # a valid string, and a string in a dummy encoding, still yield a new object
    valid = "\u3042\u3044"
    assert_not_same(valid, valid.scrub)
    valid.force_encoding(Encoding::ISO_2022_JP) # dummy encoding
    assert_not_same(valid, valid.scrub)

    assert_equal("\uFFFD\uFFFD\uFFFD", u("\x80\x80\x80").scrub)
    assert_equal("\uFFFDA", u("\xF4\x80\x80A").scrub)

    # examples in Unicode 6.1.0 D93b
    [
      ["\x41\uFFFD\uFFFD\x41\uFFFD\x41",
       "\x41\xC0\xAF\x41\xF4\x80\x80\x41"],
      ["\x41\uFFFD\uFFFD\uFFFD\x41",
       "\x41\xE0\x9F\x80\x41"],
      ["\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064",
       "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64"],
      ["abcdefghijklmnopqrstuvwxyz\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064",
       "abcdefghijklmnopqrstuvwxyz\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64"],
    ].each do |expected, broken|
      assert_equal(expected, u(broken).scrub)
    end

    # explicit replacement argument
    assert_equal("\u3042\u3013", u("\xE3\x81\x82\xE3\x81").scrub("\u3013"))
    assert_raise(Encoding::CompatibilityError) do
      u("\xE3\x81\x82\xE3\x81").scrub(e("\xA4\xA2"))
    end
    assert_raise(TypeError) do
      u("\xE3\x81\x82\xE3\x81").scrub(1)
    end
    assert_raise(ArgumentError) do
      u("\xE3\x81\x82\xE3\x81\x82\xE3\x81").scrub(u("\x81"))
    end
    assert_equal(e("\xA4\xA2\xA2\xAE"), e("\xA4\xA2\xA4").scrub(e("\xA2\xAE")))

    # replacement supplied by a block
    hexified = u("\xE3\x81\x82\xE3\x81").scrub do |bytes|
      '<'+bytes.unpack('H*')[0]+'>'
    end
    assert_equal("\u3042<e381>", hexified)
    assert_raise(Encoding::CompatibilityError) do
      u("\xE3\x81\x82\xE3\x81").scrub { e("\xA4\xA2") }
    end
    assert_raise(TypeError) do
      u("\xE3\x81\x82\xE3\x81").scrub { 1 }
    end
    assert_raise(ArgumentError) do
      u("\xE3\x81\x82\xE3\x81\x82\xE3\x81").scrub { u("\x81") }
    end
    assert_equal(e("\xA4\xA2\xA2\xAE"), e("\xA4\xA2\xA4").scrub { e("\xA2\xAE") })

    # ASCII-incompatible encodings use U+FFFD in the matching byte order
    [
      ["UTF-16BE", "\uFFFD\u3042", "\xD8\x00\x30\x42"],
      ["UTF-16LE", "\uFFFD\u3042", "\x00\xD8\x42\x30"],
      ["UTF-32BE", "\uFFFD", "\xff"],
      ["UTF-32LE", "\uFFFD", "\xff"],
    ].each do |enc_name, expected, raw|
      assert_equal(expected.encode(enc_name), raw.force_encoding(enc_name).scrub)
    end
  end

  def test_scrub_bang
    # scrub! always returns the receiver itself
    subject = "\u3042\u3044"
    assert_same(subject, subject.scrub!)
    subject.force_encoding(Encoding::ISO_2022_JP) # dummy encoding
    assert_same(subject, subject.scrub!)

    subject = u("\x80\x80\x80")
    subject.scrub!
    assert_same(subject, subject.scrub!)
    assert_equal("\uFFFD\uFFFD\uFFFD", subject)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment