# Helpers for coercing strings into a known encoding (UTF-8, ASCII, binary or
# the process default) without raising on invalid or undefined byte sequences.
#
# The top-level helpers are module functions: call them as
# StringFormatter.as_utf8(str), or include the module and call them privately.
module StringFormatter
  # Matches any single byte outside the 7-bit ASCII range. The /n flag makes
  # the pattern compare raw bytes (see http://www.ruby-forum.com/topic/183413).
  NON_ASCII_BYTES = /[\x80-\xff]/n

  module_function

  # Transcodes +str+ to UTF-8, replacing invalid or undefined byte sequences
  # with '_'.
  #
  # Unfrozen strings are converted in place and returned; frozen strings are
  # duplicated first because encode! mutates its receiver. Objects that do not
  # respond to encode! are returned untouched.
  #
  # @param str [String, Object] value to convert
  # @return [String, Object] the UTF-8 string, or +str+ itself if it is not
  #   string-like
  def as_utf8(str)
    return str unless str.respond_to?(:encode!)

    str = str.dup if str.frozen?

    if defined?(JRUBY_VERSION)
      # don't blow up https://github.com/jruby/jruby/issues/375
      str = as_ascii(str)
    else
      # Converting to a higher character set (UTF-16) and then back (to UTF-8)
      # ensures that invalid or undefined byte sequences are stripped away.
      str.encode!(Encoding::UTF_16LE, :invalid => :replace, :undef => :replace, :replace => '_')
    end

    str.encode!(Encoding::UTF_8)
  rescue => e
    # Report the failure when Airbrake is loaded, then fall back to relabelling
    # the bytes; a missing reporter must not hide the fallback result.
    Airbrake.notify(e) if defined?(Airbrake)
    force_utf8_encoding(str)
  end

  # Helps out with copy-and-paste errors in good-looking email addresses by
  # stripping out non-ASCII characters.
  #
  # @param str [String] text to reduce to ASCII
  # @return [String] result of +str.to_ascii+ when available, otherwise +str+
  #   transcoded to US-ASCII with unconvertible characters dropped
  def as_ascii(str)
    # with stringex or talentbox-unidecoder
    return str.to_ascii if str.respond_to?(:to_ascii)

    # avoids invalid multi-byte escape error
    ascii_text = str.encode(Encoding::ASCII, invalid: :replace, undef: :replace, replace: '')
    # Defensive pass: drop any high bytes that survived the transcoding.
    ascii_text.gsub(NON_ASCII_BYTES, '')
  end

  # Attempts to fix issues with strings that are SafeBuffers breaking
  # URI.escape and RightAws::AwsUtils.URLencode.
  #
  # @param str [Object, nil] value to normalise
  # @return [String, nil] +nil+ and plain Strings are returned as-is; anything
  #   else is converted with +to_s.to_str+
  def regular_string(str)
    # Do not check is_a?(String) here since ActiveSupport::SafeBuffer and
    # ActiveSupport::OutputBuffer return true; only exact String instances
    # may pass through untouched.
    return str if str.nil? || str.instance_of?(String)

    str.to_s.to_str
  end

  # Relabels +str+ with Encoding.default_external without transcoding.
  #
  # Frozen strings are duplicated first; objects that do not respond to
  # force_encoding are returned unchanged.
  #
  # @param str [String, Object]
  # @return [String, Object]
  def force_external_encoding(str)
    return str unless str.respond_to?(:force_encoding)

    str = str.dup if str.frozen?
    str.force_encoding(Encoding.default_external)
  end

  # Relabels +str+ as UTF-8; if its bytes are not valid UTF-8 they are
  # reinterpreted as ISO-8859-1 and transcoded to UTF-8 instead.
  #
  # for reference, see http://www.zendesk.com/blog/upgrade-the-road-to-1-9
  # and https://gist.github.com/jeffyip/4091200#file_additional_monkey_patches.rb
  # and http://www.benjaminfleischer.com/2013/06/09/ruby-19-upgrade-and-encoding-hell/
  #
  # @param str [String, Object] unfrozen strings are modified in place
  # @return [String, Object] the UTF-8 string; non-Strings are returned as-is
  def force_utf8_encoding(str)
    return str unless str.is_a?(String)

    str = str.dup if str.frozen?
    str.force_encoding(Encoding::UTF_8)
    str.encode!(Encoding::UTF_8, Encoding::ISO_8859_1) unless str.valid_encoding?
    str
  end

  # Relabels +str+ as binary (ASCII-8BIT) without touching its bytes.
  #
  # for reference, see http://www.zendesk.com/blog/upgrade-the-road-to-1-9
  #
  # @param str [String, Object] unfrozen strings are modified in place
  # @return [String, Object] the binary string; non-Strings are returned as-is
  def force_binary_encoding(str)
    return str unless str.is_a?(String)

    str = str.dup if str.frozen?
    str.force_encoding(Encoding::BINARY)
  end

  # Transcodes +str+ from encoding +from+ to encoding +to+, replacing
  # characters that are undefined in the target encoding.
  #
  # @param to [String, Encoding] destination encoding
  # @param from [String, Encoding] encoding the bytes of +str+ are read as
  # @param str [String] input; never modified
  # @return [String] a new string in the +to+ encoding
  def convert_string_encoding(to, from, str)
    str.encode(to, from, :undef => :replace)
  end

  module Other
    # a binread in https://github.com/seattlerb/flog/blob/master/lib/flog.rb#L5
    #
    # NOTE(review): because it is nested here, this defines
    # StringFormatter::Other::File rather than reopening ::File - confirm
    # whether the core class was the intended target.
    class File
      RUBY19 = '<3'.respond_to?(:encoding) unless defined?(RUBY19)

      class << self
        alias :binread :read unless RUBY19
      end
    end

    # new rubygems executables do
    #
    # @param str [String, Object]
    # @return [String, nil] a binary-tagged copy of +str+, or nil when +str+
    #   does not respond to force_encoding
    def force_binary_string(str)
      return unless str.respond_to?(:force_encoding)

      str.dup.force_encoding('BINARY')
    end
  end

  # or use iconv?
  #
  # Mixin for objects that keep the path of a data file in @file_name.
  module Iconv
    # U+FEFF, the byte order mark as it appears once the data is UTF-8.
    BYTE_ORDER_MARK = "\uFEFF"

    # BOM handling
    #
    # Rewrites the file at @file_name in place: its bytes are read as Little
    # Endian UTF-16, transcoded to UTF-8, and every byte order mark is removed.
    # If the contents cannot be transcoded the error is printed and the file is
    # left untouched.
    #
    # @return [Integer, nil] bytes written, or nil when transcoding failed
    def convert_to_utf8
      # Data files are exported as Little Endian UTF-16. We need to parse as UTF-8
      raw = ::File.binread(@file_name)

      begin
        utf8 = raw.force_encoding(Encoding::UTF_16LE).encode(Encoding::UTF_8)
        utf8.delete!(BYTE_ORDER_MARK) # strip the BOM (byte order mark)
        # The file is only opened for writing once the conversion succeeded,
        # and the block form closes (and flushes) the handle.
        ::File.open(@file_name, 'w') { |output| output.write(utf8) }
      rescue EncodingError => e
        puts e.inspect
      end
    end
  end
end