Skip to content

Instantly share code, notes, and snippets.

@hrp
Created December 28, 2013 01:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hrp/8155066 to your computer and use it in GitHub Desktop.
Save hrp/8155066 to your computer and use it in GitHub Desktop.
Benchmark: ways to strip non-ASCII characters from a Ruby String
# encoding: utf-8
require 'benchmark'
require 'iconv'
# Benchmark candidates: several alternative ways of stripping non-ASCII
# characters from a String, patched onto String so that each of them can be
# invoked as a plain instance method.
class String
  # Rewrites the receiver IN PLACE, keeping only the characters whose code
  # point lies in 33..127 and substituting +replacement+ for every other
  # character (control characters, the space character, anything non-ASCII).
  #
  # The previous test was <tt>b[0].to_i</tt>. On Ruby 1.9+ <tt>b[0]</tt> is a
  # one-character String, so +to_i+ parsed it as a decimal number (0 for
  # almost every character) and every single character ended up replaced.
  # The character's code point is compared instead.
  #
  # @param replacement [String] text inserted for each rejected character
  # @return [String] the cleaned text
  def remove_nonascii(replacement = '')
    pieces = each_char.map do |char|
      # ascii_only? caps the code point at 127 and is simply false for
      # malformed bytes, for which String#ord would raise ArgumentError.
      (char.ascii_only? && char.ord >= 33) ? char : replacement
    end

    # Empty the receiver and rebuild it so the change stays in place.
    clear
    pieces.each { |piece| concat(piece) }
    to_s
  end

  # Returns a copy with every non-ASCII character replaced by +replacement+.
  #
  # The previous pattern, <tt>[\u0080-\u00ff]</tt>, only covered U+0080 to
  # U+00FF, so characters above that range were left in the result.
  #
  # @param replacement [String] substitution passed to String#gsub
  # @return [String]
  def remove_non_ascii(replacement = '')
    gsub(/\P{ASCII}/, replacement)
  end

  # Converts the receiver from UTF-8 to US-ASCII with Iconv, using the
  # //IGNORE suffix so unconvertible characters are dropped.
  #
  # Relies on the Iconv constant supplied by <tt>require 'iconv'</tt>. Iconv
  # left Ruby's standard library in 2.0, so newer Rubies need the iconv gem.
  #
  # @return [String]
  def iconv
    Iconv.conv('US-ASCII//IGNORE', 'UTF-8', self)
  end

  # Transcodes the receiver to ASCII via String#encode, deleting anything
  # that is invalid or has no ASCII equivalent.
  #
  # The options are written as trailing <tt>key: value</tt> arguments rather
  # than handed over as a positional Hash: since Ruby 3.0 separates keyword
  # arguments, a positional Hash would be read as the source encoding.
  #
  # @return [String]
  def with_encode
    encode(
      'ASCII',
      invalid: :replace,       # replace invalid byte sequences
      undef: :replace,         # replace anything not defined in ASCII
      replace: '',             # ...and replace it with nothing
      universal_newline: true  # always break lines with \n
    )
  end

  # Keeps only the ASCII characters, collapses every run of whitespace into
  # a single space, deletes literal "&nbsp;" entities and strips the ends.
  #
  # The whitespace pattern used to be <tt>/[(\s)]+/</tt>, a character class
  # that also matched literal "(" and ")" and therefore deleted parentheses.
  #
  # @return [String]
  def with_chars
    chars.select(&:ascii_only?).join.strip.gsub(/\s+/, ' ').gsub('&nbsp;', '').strip
  end

  # Returns a copy with every character outside the ASCII property removed.
  #
  # @return [String]
  def regex
    gsub(/\P{ASCII}/, '')
  end
end
# Sample input: a run of non-ASCII symbols followed by the word "ascii",
# repeated three times.
sample = '☼☹☼✿☺☻☹☃⌇♒♒⌨☝♡“¥¥ß©®@÷π≠ascii' * 3

# Print what each strategy produces so the outputs can be compared by eye.
# puts sample.force_encoding( Encoding.find('ASCII') )
puts sample.with_encode
puts sample.with_chars
puts sample.iconv
puts sample.regex
# puts sample.encode Encoding.find('ASCII'), undef: :replace

# Number of times each strategy is invoked per benchmark report.
iterations = 10_000

Benchmark.bm do |bench|
  bench.report('iconv') do
    iterations.times { sample.iconv }
  end
  bench.report('encod') do
    iterations.times { sample.with_encode }
  end
  bench.report('chars') do
    iterations.times { sample.with_chars }
  end
  # bench.report('force') { iterations.times { sample.force_encoding('US-ASCII') } }
  bench.report('regex') do
    iterations.times { sample.regex }
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment