Skip to content

Instantly share code, notes, and snippets.

@nobu
Created August 14, 2013 07:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nobu/6228803 to your computer and use it in GitHub Desktop.
Save nobu/6228803 to your computer and use it in GitHub Desktop.
String#scrub and String#scrub! for Ruby 2.0.
# Build script for the extension (presumably extconf.rb -- confirm):
# asks mkmf to generate the Makefile for the 'string/scrub' target.
require 'mkmf'
create_makefile('string/scrub')
#include <ruby.h>
#include <ruby/encoding.h>
#ifndef TRUE
#define TRUE 1
#endif
#ifndef FALSE
#define FALSE 0
#endif
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
/*
 * Returns a pointer to the first non-ASCII byte in [p, e), or NULL when
 * every byte in the range is ASCII.
 *
 * When VALUE is 4 or 8 bytes wide and the range is long enough, the middle
 * of the range is tested one VALUE-sized word at a time: any byte with its
 * high bit set makes (word & NONASCII_MASK) non-zero.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if (e - p > (int)sizeof(VALUE) * 2) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first word boundary at or after p */
        const VALUE *word = (const VALUE *)(((VALUE)p + align_mask) & ~align_mask);
        /* last word boundary at or before e */
        const VALUE *const word_end = (const VALUE *)((VALUE)e & ~align_mask);

        /* unaligned head: byte by byte */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }

        /* aligned middle: stop at the first word holding a high-bit byte */
        while (word < word_end && !(*word & NONASCII_MASK)) {
            word++;
        }

        /* resume the byte scan inside that word, or at the unaligned tail */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
/*
 * Validates a replacement string against the encoding of the string being
 * scrubbed.
 *
 * str is converted with StringValue(). Raises ArgumentError when it is itself
 * a broken byte sequence, and Encoding::CompatibilityError when it cannot be
 * embedded in enc: a 7-bit replacement needs an ASCII-compatible enc, any
 * other replacement must have exactly the encoding enc.
 *
 * Returns the (possibly converted) replacement string.
 */
static VALUE
str_compat_and_valid(VALUE str, rb_encoding *enc)
{
    rb_encoding *str_enc;
    int coderange;
    int incompatible;

    str = StringValue(str);
    coderange = rb_enc_str_coderange(str);
    if (coderange == ENC_CODERANGE_BROKEN) {
        rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
    }

    str_enc = STR_ENC_GET(str);
    if (coderange == ENC_CODERANGE_7BIT) {
        incompatible = !rb_enc_asciicompat(enc);
    }
    else { /* ENC_CODERANGE_VALID */
        incompatible = (enc != str_enc);
    }

    if (incompatible) {
        rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
                 rb_enc_name(enc), rb_enc_name(str_enc));
    }
    return str;
}
/**
 * Shared implementation of String#scrub and String#scrub!.
 *
 * Walks str character by character and copies it into a new buffer,
 * substituting every invalid byte sequence. The substitute is, in order of
 * precedence: the value returned by the block (called with the offending
 * bytes), the replacement string argument, or a default (U+FFFD for the
 * UTF-8/16/32 encodings, "?" otherwise).
 *
 * @param argc number of Ruby-level arguments (0 or 1)
 * @param argv argv[0], when given, is the replacement string
 * @param str  the string to scrub; its code range may be updated when it
 *             turns out to be valid
 * @return If given string is invalid, returns a new string. Otherwise
 *         (or when its encoding is a dummy encoding), returns Qnil.
 */
static VALUE
str_scrub0(int argc, VALUE *argv, VALUE str)
{
    int cr = ENC_CODERANGE(str);
    rb_encoding *enc;
    int encidx;
    VALUE repl;

    /* Check arity up front so a wrong argument count is reported even when
     * the string needs no scrubbing. */
    rb_scan_args(argc, argv, "01", &repl);

    if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID)
        return Qnil;

    enc = STR_ENC_GET(str);
    if (argc != 0) {
        repl = str_compat_and_valid(repl, enc);
    }

    if (rb_enc_dummy_p(enc)) {
        return Qnil;
    }
    encidx = rb_enc_to_index(enc);

/* Points rep/replen at a static copy of the literal (without its NUL). */
#define DEFAULT_REPLACE_CHAR(str) do { \
        static const char replace[sizeof(str)-1] = str; \
        rep = replace; replen = (int)sizeof(replace); \
    } while (0)

    if (rb_enc_asciicompat(enc)) {
        const char *p = RSTRING_PTR(str);
        const char *e = RSTRING_END(str);
        const char *p1 = p;     /* start of the not-yet-copied valid run */
        const char *rep;        /* replacement bytes; NULL means "ask the block" */
        long replen;
        int rep7bit_p;
        VALUE buf = Qnil;

        if (rb_block_given_p()) {
            rep = NULL;
            replen = 0;
            rep7bit_p = FALSE;
        }
        else if (!NIL_P(repl)) {
            rep = RSTRING_PTR(repl);
            replen = RSTRING_LEN(repl);
            rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
        }
        else if (encidx == rb_utf8_encindex()) {
            DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
            rep7bit_p = FALSE;
        }
        else {
            DEFAULT_REPLACE_CHAR("?");
            rep7bit_p = TRUE;
        }
        /* Result code range: stays 7BIT until a multibyte character or a
         * non-7bit replacement is emitted. */
        cr = ENC_CODERANGE_7BIT;

        /* Pure-ASCII stretches cannot be invalid; skip them quickly. */
        p = search_nonascii(p, e);
        if (!p) {
            p = e;
        }
        while (p < e) {
            int ret = rb_enc_precise_mbclen(p, e, enc);
            if (MBCLEN_NEEDMORE_P(ret)) {
                /* truncated character at the end; handled after the loop */
                break;
            }
            else if (MBCLEN_CHARFOUND_P(ret)) {
                cr = ENC_CODERANGE_VALID;
                p += MBCLEN_CHARFOUND_LEN(ret);
            }
            else if (MBCLEN_INVALID_P(ret)) {
                /*
                 * p1~p: valid ascii/multibyte chars
                 * p ~e: invalid bytes + unknown bytes
                 */
                long clen = rb_enc_mbmaxlen(enc);
                if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
                if (p > p1) {
                    rb_str_buf_cat(buf, p1, p - p1);
                }

                /* Find how many bytes form one invalid unit: the longest
                 * prefix that is still a truncated character, else 1 byte. */
                if (e - p < clen) clen = e - p;
                if (clen <= 2) {
                    clen = 1;
                }
                else {
                    const char *q = p;
                    clen--;
                    for (; clen > 1; clen--) {
                        ret = rb_enc_precise_mbclen(q, q + clen, enc);
                        if (MBCLEN_NEEDMORE_P(ret)) break;
                        if (MBCLEN_INVALID_P(ret)) continue;
                        UNREACHABLE;
                    }
                }
                if (rep) {
                    rb_str_buf_cat(buf, rep, replen);
                    if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
                }
                else {
                    /* yield the invalid bytes themselves (they start at p,
                     * not at p1, which is the start of the valid run) */
                    repl = rb_yield(rb_enc_str_new(p, clen, enc));
                    repl = str_compat_and_valid(repl, enc);
                    rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
                    if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
                        cr = ENC_CODERANGE_VALID;
                }
                p += clen;
                p1 = p;
                p = search_nonascii(p, e);
                if (!p) {
                    p = e;
                    break;
                }
            }
            else {
                UNREACHABLE;
            }
        }
        if (NIL_P(buf)) {
            if (p == e) {
                /* nothing was invalid after all: just cache the code range */
                ENC_CODERANGE_SET(str, cr);
                return Qnil;
            }
            buf = rb_str_buf_new(RSTRING_LEN(str));
        }
        if (p1 < p) {
            rb_str_buf_cat(buf, p1, p - p1);
        }
        if (p < e) {
            /* trailing truncated character: replaced as a single unit */
            if (rep) {
                rb_str_buf_cat(buf, rep, replen);
                if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
            }
            else {
                repl = rb_yield(rb_enc_str_new(p, e-p, enc));
                repl = str_compat_and_valid(repl, enc);
                rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
                if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
                    cr = ENC_CODERANGE_VALID;
            }
        }
        ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
        return buf;
    }
    else {
        /* ASCII incompatible */
        const char *p = RSTRING_PTR(str);
        const char *e = RSTRING_END(str);
        const char *p1 = p;     /* start of the not-yet-copied valid run */
        VALUE buf = Qnil;
        const char *rep;        /* replacement bytes; NULL means "ask the block" */
        long replen;
        long mbminlen = rb_enc_mbminlen(enc);

        if (rb_block_given_p()) {
            /* same precedence as the ASCII-compatible branch: the block
             * wins over an explicit or default replacement */
            rep = NULL;
            replen = 0;
        }
        else if (!NIL_P(repl)) {
            rep = RSTRING_PTR(repl);
            replen = RSTRING_LEN(repl);
        }
        else if (!strcasecmp(rb_enc_name(enc), "UTF-16BE")) {
            DEFAULT_REPLACE_CHAR("\xFF\xFD");
        }
        else if (!strcasecmp(rb_enc_name(enc), "UTF-16LE")) {
            DEFAULT_REPLACE_CHAR("\xFD\xFF");
        }
        else if (!strcasecmp(rb_enc_name(enc), "UTF-32BE")) {
            DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
        }
        else if (!strcasecmp(rb_enc_name(enc), "UTF-32LE")) {
            DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
        }
        else {
            DEFAULT_REPLACE_CHAR("?");
        }

        while (p < e) {
            int ret = rb_enc_precise_mbclen(p, e, enc);
            if (MBCLEN_NEEDMORE_P(ret)) {
                /* truncated character at the end; handled after the loop */
                break;
            }
            else if (MBCLEN_CHARFOUND_P(ret)) {
                p += MBCLEN_CHARFOUND_LEN(ret);
            }
            else if (MBCLEN_INVALID_P(ret)) {
                const char *q = p;
                long clen = rb_enc_mbmaxlen(enc);
                if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
                if (p > p1) rb_str_buf_cat(buf, p1, p - p1);

                /* Same search as the ASCII-compatible case, but stepping in
                 * units of the encoding's minimum character length. */
                if (e - p < clen) clen = e - p;
                if (clen <= mbminlen * 2) {
                    clen = mbminlen;
                }
                else {
                    clen -= mbminlen;
                    for (; clen > mbminlen; clen-=mbminlen) {
                        ret = rb_enc_precise_mbclen(q, q + clen, enc);
                        if (MBCLEN_NEEDMORE_P(ret)) break;
                        if (MBCLEN_INVALID_P(ret)) continue;
                        UNREACHABLE;
                    }
                }
                /* never read or step past the end of the string */
                if (clen > e - p) clen = e - p;

                if (rep) {
                    rb_str_buf_cat(buf, rep, replen);
                }
                else {
                    /* yield only the invalid unit, not the whole remainder */
                    repl = rb_yield(rb_enc_str_new(p, clen, enc));
                    repl = str_compat_and_valid(repl, enc);
                    rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
                }
                p += clen;
                p1 = p;
            }
            else {
                UNREACHABLE;
            }
        }
        if (NIL_P(buf)) {
            if (p == e) {
                /* nothing was invalid after all: just cache the code range */
                ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
                return Qnil;
            }
            buf = rb_str_buf_new(RSTRING_LEN(str));
        }
        if (p1 < p) {
            rb_str_buf_cat(buf, p1, p - p1);
        }
        if (p < e) {
            /* trailing truncated character: replaced as a single unit */
            if (rep) {
                rb_str_buf_cat(buf, rep, replen);
            }
            else {
                repl = rb_yield(rb_enc_str_new(p, e-p, enc));
                repl = str_compat_and_valid(repl, enc);
                rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
            }
        }
        ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), ENC_CODERANGE_VALID);
        return buf;
    }
}
/*
 * call-seq:
 * str.scrub -> new_str
 * str.scrub(repl) -> new_str
 * str.scrub{|bytes|} -> new_str
 *
 * Returns a new string in which invalid bytes are replaced with the given
 * replacement string. If the string contains no invalid byte sequence, a
 * copy of it is returned.
 * If a block is given, invalid bytes are replaced with the value returned
 * by the block.
 *
 * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
 * "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
 * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
 */
VALUE
rb_str_scrub(int argc, VALUE *argv, VALUE str)
{
    VALUE scrubbed = str_scrub0(argc, argv, str);

    /* Qnil means nothing had to be replaced; still hand back a new object. */
    if (NIL_P(scrubbed)) {
        return rb_str_dup(str);
    }
    return scrubbed;
}
/*
 * call-seq:
 * str.scrub! -> str
 * str.scrub!(repl) -> str
 * str.scrub!{|bytes|} -> str
 *
 * Replaces invalid bytes in the receiver with the given replacement string
 * and returns the receiver. A string without invalid byte sequences is left
 * untouched.
 * If a block is given, invalid bytes are replaced with the value returned
 * by the block.
 *
 * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
 * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
 * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
 */
static VALUE
str_scrub_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE scrubbed = str_scrub0(argc, argv, str);

    /* Qnil means there was nothing to replace. */
    if (NIL_P(scrubbed)) {
        return str;
    }
    rb_str_replace(str, scrubbed);
    return str;
}
void
Init_scrub(void)
{
rb_define_method(rb_cString, "scrub", rb_str_scrub, -1);
rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
}
# coding: US-ASCII
require 'test/unit'
# Tests for String#scrub and String#scrub!.
class TestScrub < Test::Unit::TestCase
# Helpers returning a copy of the given string tagged with a fixed encoding.
module AESU
def ua(str) str.dup.force_encoding("US-ASCII") end
def a(str) str.dup.force_encoding("ASCII-8BIT") end
def e(str) str.dup.force_encoding("EUC-JP") end
def s(str) str.dup.force_encoding("Windows-31J") end
def u(str) str.dup.force_encoding("UTF-8") end
end
include AESU
# scrub: replacement of invalid bytes by the default character, an explicit
# replacement string, or a block, across several encodings.
def test_scrub
# scrub always returns a new object, even when nothing needs replacing
str = "\u3042\u3044"
assert_not_same(str, str.scrub)
str.force_encoding(Encoding::ISO_2022_JP) # dummy encoding
assert_not_same(str, str.scrub)
# default replacement for UTF-8 is U+FFFD
assert_equal("\uFFFD\uFFFD\uFFFD", u("\x80\x80\x80").scrub)
assert_equal("\uFFFDA", u("\xF4\x80\x80A").scrub)
# examples in Unicode 6.1.0 D93b
assert_equal("\x41\uFFFD\uFFFD\x41\uFFFD\x41",
u("\x41\xC0\xAF\x41\xF4\x80\x80\x41").scrub)
assert_equal("\x41\uFFFD\uFFFD\uFFFD\x41",
u("\x41\xE0\x9F\x80\x41").scrub)
assert_equal("\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064",
u("\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64").scrub)
assert_equal("abcdefghijklmnopqrstuvwxyz\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064",
u("abcdefghijklmnopqrstuvwxyz\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64").scrub)
# explicit replacement string: must be a compatible, valid String
assert_equal("\u3042\u3013", u("\xE3\x81\x82\xE3\x81").scrub("\u3013"))
assert_raise(Encoding::CompatibilityError){ u("\xE3\x81\x82\xE3\x81").scrub(e("\xA4\xA2")) }
assert_raise(TypeError){ u("\xE3\x81\x82\xE3\x81").scrub(1) }
assert_raise(ArgumentError){ u("\xE3\x81\x82\xE3\x81\x82\xE3\x81").scrub(u("\x81")) }
assert_equal(e("\xA4\xA2\xA2\xAE"), e("\xA4\xA2\xA4").scrub(e("\xA2\xAE")))
# block form: the block's return value is held to the same requirements
assert_equal("\u3042<e381>", u("\xE3\x81\x82\xE3\x81").scrub{|x|'<'+x.unpack('H*')[0]+'>'})
assert_raise(Encoding::CompatibilityError){ u("\xE3\x81\x82\xE3\x81").scrub{e("\xA4\xA2")} }
assert_raise(TypeError){ u("\xE3\x81\x82\xE3\x81").scrub{1} }
assert_raise(ArgumentError){ u("\xE3\x81\x82\xE3\x81\x82\xE3\x81").scrub{u("\x81")} }
assert_equal(e("\xA4\xA2\xA2\xAE"), e("\xA4\xA2\xA4").scrub{e("\xA2\xAE")})
# UTF-16 / UTF-32: the default replacement is U+FFFD in the same encoding
assert_equal("\uFFFD\u3042".encode("UTF-16BE"),
"\xD8\x00\x30\x42".force_encoding(Encoding::UTF_16BE).
scrub)
assert_equal("\uFFFD\u3042".encode("UTF-16LE"),
"\x00\xD8\x42\x30".force_encoding(Encoding::UTF_16LE).
scrub)
assert_equal("\uFFFD".encode("UTF-32BE"),
"\xff".force_encoding(Encoding::UTF_32BE).
scrub)
assert_equal("\uFFFD".encode("UTF-32LE"),
"\xff".force_encoding(Encoding::UTF_32LE).
scrub)
end
# scrub!: always returns the receiver itself and modifies it in place.
def test_scrub_bang
str = "\u3042\u3044"
assert_same(str, str.scrub!)
str.force_encoding(Encoding::ISO_2022_JP) # dummy encoding
assert_same(str, str.scrub!)
# the first call scrubs in place; a second call on the result still
# returns the receiver
str = u("\x80\x80\x80")
str.scrub!
assert_same(str, str.scrub!)
assert_equal("\uFFFD\uFFFD\uFFFD", str)
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment