Created
June 24, 2012 12:48
-
-
Save nanaya/2983139 to your computer and use it in GitHub Desktop.
Test speed of various methods of sanitizing invalid UTF-8 characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8
require "benchmark"

# Use UTF-8 as the process-wide default internal encoding for this script.
Encoding.default_internal = Encoding::UTF_8
# Sanitizes +str+ in place. The string is re-tagged as UTF-8; if its bytes
# are not well-formed UTF-8 it is handed to replace_invalid_characters
# (defined elsewhere in this file) to be patched.
#
# @param str [String] string to sanitize; the receiver itself is mutated
# @return [String] the very same +str+ object, now tagged as UTF-8
def encode1(str)
  utf8 = str.force_encoding(Encoding::UTF_8) # force_encoding returns the receiver
  return utf8 if utf8.valid_encoding?

  replace_invalid_characters(utf8)
  utf8
end
# Returns a sanitized UTF-8 copy of +str+ in which every invalid character
# is replaced by "?". The argument itself is left untouched.
#
# The bytes are tagged as UTF-8 *before* the string is split into
# characters. Splitting a binary-tagged string first yields single bytes,
# and each byte of a well-formed multi-byte character (e.g. "£") would then
# fail validation on its own and be replaced.
#
# @param str [String] bytes to interpret as UTF-8 (any encoding tag)
# @return [String] a new string with invalid characters replaced by "?"
def encode2(str)
  utf8 = str.dup.force_encoding(Encoding::UTF_8)
  utf8.each_char.map { |c| c.valid_encoding? ? c : "?" }.join
end
# Re-tags +str+ as UTF-8 (mutating it). A well-formed string is returned
# as-is; otherwise a new string is built in which each invalid character is
# swapped for "?".
#
# @param str [String] string to sanitize; its encoding tag is changed in place
# @return [String] +str+ itself when valid, otherwise a new sanitized string
def encode3(str)
  str.force_encoding(Encoding::UTF_8)
  return str if str.valid_encoding?

  str.each_char.map { |ch| ch.valid_encoding? ? ch : "?" }.join
end
# Sanitizes by round-tripping through UTF-16: +str+ is re-tagged as UTF-8
# (in place), transcoded to UTF-16 with invalid byte sequences replaced by
# the empty string (i.e. dropped), then transcoded back to UTF-8.
#
# @param str [String] string to sanitize; its encoding tag is changed in place
# @return [String] a new UTF-8 string without the invalid sequences
def encode4(str)
  utf8 = str.force_encoding('UTF-8')
  utf16 = utf8.encode('UTF-16', invalid: :replace, replace: '')
  utf16.encode('UTF-8')
end
# Overwrites, in place, every character of +str+ that is not valid in the
# string's current encoding with "?".
#
# Defined as a plain top-level method (like the encodeN methods) rather than
# as a singleton method of +main+, so it resolves no matter which object is
# +self+ when a caller such as encode1 runs.
#
# @param str [String] string to patch; it is mutated
# @return [Range] the index range that was scanned (incidental; callers
#   rely on the in-place mutation)
def replace_invalid_characters(str)
  # Each replacement swaps one character for one character, so the indices
  # computed up front stay valid while the string is being modified.
  (0...str.size).each do |idx|
    str[idx] = "?" unless str[idx].valid_encoding?
  end
end
# Number of calls made to each encoder inside one benchmark report.
N = 100_000

# Well-formed UTF-8 bytes carrying a binary (ASCII-8BIT) encoding tag.
VALID = "foo£bar£car".encode(Encoding::UTF_8).force_encoding(Encoding::ASCII_8BIT).freeze

# Same kind of input, but containing the bytes C0 8A twice; that sequence is
# an overlong encoding of U+000A and therefore not valid UTF-8.
INVALID = "£foo\xC0\x8Abar\xC0\x8Acar".dup.force_encoding(Encoding::ASCII_8BIT).freeze

# For each sample: print what every encoder produces, then time the encoders.
# Each call receives a fresh +dup+ because some encoders mutate their argument.
# Reports are labelled so the rows of the bmbm output can be told apart.
[VALID, INVALID].each do |sample|
  puts encode1(sample.dup)
  puts encode2(sample.dup)
  puts encode3(sample.dup)
  puts encode4(sample.dup)

  Benchmark.bmbm do |x|
    x.report("encode1") { N.times { encode1(sample.dup) } }
    x.report("encode2") { N.times { encode2(sample.dup) } }
    x.report("encode3") { N.times { encode3(sample.dup) } }
    x.report("encode4") { N.times { encode4(sample.dup) } }
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment