Skip to content

Instantly share code, notes, and snippets.

@nanaya
Created June 24, 2012 12:48
Show Gist options
  • Save nanaya/2983139 to your computer and use it in GitHub Desktop.
Save nanaya/2983139 to your computer and use it in GitHub Desktop.
Test speed of various methods of sanitizing invalid UTF-8 characters
# encoding: utf-8
require "benchmark"
# Make UTF-8 the process-wide default internal encoding.
Encoding.default_internal = Encoding::UTF_8
# Re-tags +str+ as UTF-8 in place and, when its bytes are not valid UTF-8,
# passes it to replace_invalid_characters for in-place repair.
#
# @param str [String] string to sanitize; it is mutated
# @return [String] the very same +str+ object that was passed in
def encode1(str)
  str.force_encoding(Encoding::UTF_8)
  replace_invalid_characters(str) unless str.valid_encoding?
  str
end
# Builds a sanitized UTF-8 copy of +str+ by checking it one character at a
# time and substituting "?" for every character that is not valid UTF-8.
#
# The bytes are reinterpreted as UTF-8 *before* they are split into
# characters. Splitting first would cut a binary-tagged string into single
# bytes, and every byte of a valid multibyte character would then be replaced.
#
# @param str [String] source bytes under any encoding tag; not modified
# @return [String] new string with invalid characters replaced by "?"
def encode2(str)
  # Work on a copy so the caller's string keeps its original encoding tag.
  utf8 = str.dup.force_encoding(Encoding::UTF_8)
  utf8.each_char.map { |char| char.valid_encoding? ? char : "?" }.join
end
# Re-tags +str+ as UTF-8 in place. A string that is already valid is returned
# untouched; otherwise a new string is built in which every invalid character
# is replaced by "?".
#
# @param str [String] string to sanitize; its encoding tag is changed
# @return [String] +str+ itself when valid, else a sanitized new string
def encode3(str)
  str.force_encoding(Encoding::UTF_8)
  return str if str.valid_encoding?

  sanitized = str.each_char.map { |char| char.valid_encoding? ? char : "?" }
  sanitized.join
end
def encode4(str)
str.force_encoding('UTF-8').encode('UTF-16', :invalid => :replace, :replace => '').encode('UTF-8')
end
# Overwrites, in place, every character of +str+ that is not valid in the
# string's encoding with "?", visiting character positions from first to last.
#
# @param str [String] string to repair; it is mutated
# @return [Range] the range of character indexes that was walked
def self.replace_invalid_characters(str)
  (0...str.size).each do |position|
    str[position] = "?" unless str[position].valid_encoding?
  end
end
# Number of calls each sanitizer makes inside a single benchmark report.
N = 100_000

# Well-formed UTF-8 bytes deliberately tagged as binary, so every sanitizer
# starts from a string whose encoding label it cannot rely on.
VALID = "foo£bar£car".encode(Encoding::UTF_8).force_encoding(Encoding::ASCII_8BIT).freeze

# Show what each method produces for the valid sample before timing it.
puts encode1(VALID.dup)
puts encode2(VALID.dup)
puts encode3(VALID.dup)
puts encode4(VALID.dup)

# Every call gets a fresh copy because some sanitizers re-tag or rewrite their
# argument. Reports are labelled so each result row names the method measured.
Benchmark.bmbm do |x|
  x.report("encode1") { N.times { encode1(VALID.dup) } }
  x.report("encode2") { N.times { encode2(VALID.dup) } }
  x.report("encode3") { N.times { encode3(VALID.dup) } }
  x.report("encode4") { N.times { encode4(VALID.dup) } }
end
# Binary-tagged sample containing two \xC0\x8A byte pairs, which are not valid
# UTF-8 (an overlong encoding of a line feed), mixed with valid text.
INVALID = "£foo\xC0\x8Abar\xC0\x8Acar".force_encoding(Encoding::ASCII_8BIT).freeze

# Show what each method produces for the invalid sample before timing it.
puts encode1(INVALID.dup)
puts encode2(INVALID.dup)
puts encode3(INVALID.dup)
puts encode4(INVALID.dup)

# Every call gets a fresh copy because some sanitizers re-tag or rewrite their
# argument. Reports are labelled so each result row names the method measured.
Benchmark.bmbm do |x|
  x.report("encode1") { N.times { encode1(INVALID.dup) } }
  x.report("encode2") { N.times { encode2(INVALID.dup) } }
  x.report("encode3") { N.times { encode3(INVALID.dup) } }
  x.report("encode4") { N.times { encode4(INVALID.dup) } }
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment