bf4/README.md Secret

## README.md

      
    Raw
  

              README.md
            
          
    I'm sure this problem is solved or half-solved all over the place.  Let's get it all together.
I have collected a bunch of relevant links in this blog post (especially this Airbnb gist), but I still don't have a general solution that I can deploy.

Magic comments? BOM-handling?
Handling errors?
Why do so many gems make a 'binread' method?

Here's some example code based on approaches I've seen, but I'd love to have a general solution. Maybe stringex to convert and Charlock Holmes to detect the encoding?
e.g. issues 1, 2, 3, 4, 5, 6
e.g. test suites 1/spec, 2

  
## string_formatter.rb
# Helpers for coercing strings of unknown or broken encoding into something
# safe to use. Every method defined directly in this module is exposed as a
# module function, e.g. StringFormatter.as_utf8(str).
module StringFormatter
  module_function

  # Transcodes +str+ to UTF-8, replacing invalid or undefined byte sequences
  # with '_'.
  #
  # Objects that do not respond to +encode!+ are returned untouched. Unfrozen
  # strings are converted in place; frozen strings are converted on a copy.
  # If transcoding raises, the error is reported to Airbrake (when it is
  # loaded) and the string is passed through force_utf8_encoding instead.
  def as_utf8(str)
    return str unless str.respond_to?(:encode!)

    # encode! mutates its receiver, so never hand it a frozen string
    str = str.dup if str.frozen?
    if defined?(JRUBY_VERSION)
      # don't blow up https://github.com/jruby/jruby/issues/375
      str = as_ascii(str)
      str = str.dup if str.frozen?
    else
      # Converting to a wider character set (UTF-16) and then back (to UTF-8)
      # strips away invalid or undefined byte sequences.
      str.encode!(Encoding::UTF_16LE, :invalid => :replace, :undef => :replace, :replace => '_')
    end
    str.encode!(Encoding::UTF_8)
  rescue => e
    # The error reporter is optional; a missing constant must not turn the
    # fallback path into a NameError.
    Airbrake.notify(e) if defined?(Airbrake)
    force_utf8_encoding(str)
  end

  # help out copy and pasting errors of good-looking email addresses
  # by stripping out non-ASCII characters
  #
  # Uses String#to_ascii when the receiver provides it (stringex or
  # talentbox-unidecoder); otherwise transcodes to US-ASCII and drops
  # everything that cannot be represented.
  def as_ascii(str)
    return str.to_ascii if str.respond_to?(:to_ascii)

    # avoids invalid multi-byte escape error
    ascii_text = str.encode(Encoding::US_ASCII, :invalid => :replace, :undef => :replace, :replace => '')
    return ascii_text if ascii_text.ascii_only?

    # Safety net for high bytes that survived transcoding: strip them at the
    # byte level. A regexp literal is used because the three-argument form of
    # Regexp.new is no longer accepted by current Ruby versions.
    # see http://www.ruby-forum.com/topic/183413
    stripped = ascii_text.dup.force_encoding(Encoding::BINARY).gsub(/[\x80-\xff]/n, '')
    stripped.force_encoding(Encoding::US_ASCII)
  end

  # This is an attempt to fix issues with strings that are SafeBuffers
  #   breaking URI.escape and RightAws::AwsUtils.URLencode
  #
  # Returns nil and plain String instances unchanged; anything else is
  # converted with to_s.to_str.
  def regular_string(str)
    return str.to_s unless RUBY_VERSION >= '1.9'
    # do not check is_a?(String) here since ActiveSupport::SafeBuffer and
    # ActiveSupport::OutputBuffer return true
    return str if str.nil? || str.instance_of?(String)

    str.to_s.to_str
  end

  # Tags +str+ with Encoding.default_external without transcoding its bytes.
  #
  # Mirrors the other force_* helpers: frozen strings are re-tagged on a copy
  # and the (possibly copied) string is always returned; non-strings are
  # returned unchanged.
  def force_external_encoding(str)
    if str.is_a?(String) && str.respond_to?(:force_encoding) && Encoding.default_external
      str = str.dup if str.frozen?
      str.force_encoding(Encoding.default_external)
    end

    str
  end

  # Tags +str+ as UTF-8. If the bytes are not valid UTF-8 they are
  # reinterpreted as ISO-8859-1 and transcoded to UTF-8.
  # Frozen strings are handled on a copy; non-strings are returned unchanged.
  #
  # for reference, see http://www.zendesk.com/blog/upgrade-the-road-to-1-9
  # and https://gist.github.com/jeffyip/4091200#file_additional_monkey_patches.rb
  # and http://www.benjaminfleischer.com/2013/06/09/ruby-19-upgrade-and-encoding-hell/
  def force_utf8_encoding(str)
    if str.is_a?(String) && str.respond_to?(:force_encoding)
      str = str.dup if str.frozen?

      str.force_encoding(Encoding::UTF_8)

      unless str.valid_encoding?
        #logger.warn("encoding: forcing invalid UTF-8 string; text is #{str}")
        str.encode!(Encoding::UTF_8, Encoding::ISO_8859_1)
      end
    end

    str
  end

  # Tags +str+ as binary (ASCII-8BIT) without touching its bytes.
  # Frozen strings are handled on a copy; non-strings are returned unchanged.
  #
  # for reference, see http://www.zendesk.com/blog/upgrade-the-road-to-1-9
  def force_binary_encoding(str)
    if str.is_a?(String) && str.respond_to?(:force_encoding)
      str = str.dup if str.frozen?

      str.force_encoding(Encoding::BINARY)
    end

    str
  end

  # Encodes a string from encoding "from" to encoding "to" in
  # a way that works for both ruby 1.8 and 1.9
  #
  # Returns a new string; +str+ itself is not modified.
  def convert_string_encoding(to, from, str)
    if "1.9".respond_to?(:force_encoding)
      # String#encode does not mutate, so frozen input needs no copy
      str.encode(to, from, :undef => :replace)
    else
      require 'iconv'
      # Leading :: is required: a bare Iconv would resolve to the nested
      # StringFormatter::Iconv module defined below.
      ::Iconv.conv(to, from, str)
    end
  end


  module Other
    # a binread in https://github.com/seattlerb/flog/blob/master/lib/flog.rb#L5
    # NOTE(review): nested here this defines StringFormatter::Other::File, not
    # the core ::File class -- confirm whether reopening ::File was intended.
    class File
      RUBY19 = "<3".respond_to? :encoding unless defined? RUBY19
      class << self
        alias :binread :read unless RUBY19
      end
    end
    # new rubygems executables do
    # Returns a binary-tagged copy of +str+, or nil when +str+ does not
    # respond to force_encoding.
    def force_binary_string(str)
      str.dup.force_encoding("BINARY") if str.respond_to? :force_encoding
    end
  end
  # or use iconv?
  module Iconv
    # BOM handling
    #
    # Rewrites the file named by @file_name in place, converting its contents
    # from little-endian UTF-16 to UTF-8 and dropping a leading byte order
    # mark. Conversion errors are printed and leave the file untouched.
    def convert_to_utf8
      # Data files are exported as Little Endian UTF-16. We need to parse as UTF-8
      # Read in binary mode so no newline or encoding translation is applied,
      # and use the block form so the handle is always closed.
      contents = File.open(@file_name, 'rb') { |input| input.read }
      begin
        converted = contents.force_encoding(Encoding::UTF_16LE).encode(Encoding::UTF_8)
        bom = "\xEF\xBB\xBF"
        # strip the BOM (byte order mark) from the first line of input
        converted.sub!(bom, '') if converted.start_with?(bom)
        File.open(@file_name, 'w') { |output| output.write(converted) }
      rescue EncodingError => e
        puts e.inspect
      end
    end
  end
end
	# Helpers for coercing strings of unknown or broken encoding into something
	# safe to use. Every method defined directly in this module is exposed as a
	# module function, e.g. StringFormatter.as_utf8(str).
	module StringFormatter
	  module_function

	  # Transcodes +str+ to UTF-8, replacing invalid or undefined byte sequences
	  # with '_'.
	  #
	  # Objects that do not respond to +encode!+ are returned untouched. Unfrozen
	  # strings are converted in place; frozen strings are converted on a copy.
	  # If transcoding raises, the error is reported to Airbrake (when it is
	  # loaded) and the string is passed through force_utf8_encoding instead.
	  def as_utf8(str)
	    return str unless str.respond_to?(:encode!)

	    # encode! mutates its receiver, so never hand it a frozen string
	    str = str.dup if str.frozen?
	    if defined?(JRUBY_VERSION)
	      # don't blow up https://github.com/jruby/jruby/issues/375
	      str = as_ascii(str)
	      str = str.dup if str.frozen?
	    else
	      # Converting to a wider character set (UTF-16) and then back (to UTF-8)
	      # strips away invalid or undefined byte sequences.
	      str.encode!(Encoding::UTF_16LE, :invalid => :replace, :undef => :replace, :replace => '_')
	    end
	    str.encode!(Encoding::UTF_8)
	  rescue => e
	    # The error reporter is optional; a missing constant must not turn the
	    # fallback path into a NameError.
	    Airbrake.notify(e) if defined?(Airbrake)
	    force_utf8_encoding(str)
	  end

	  # help out copy and pasting errors of good-looking email addresses
	  # by stripping out non-ASCII characters
	  #
	  # Uses String#to_ascii when the receiver provides it (stringex or
	  # talentbox-unidecoder); otherwise transcodes to US-ASCII and drops
	  # everything that cannot be represented.
	  def as_ascii(str)
	    return str.to_ascii if str.respond_to?(:to_ascii)

	    # avoids invalid multi-byte escape error
	    ascii_text = str.encode(Encoding::US_ASCII, :invalid => :replace, :undef => :replace, :replace => '')
	    return ascii_text if ascii_text.ascii_only?

	    # Safety net for high bytes that survived transcoding: strip them at the
	    # byte level. A regexp literal is used because the three-argument form of
	    # Regexp.new is no longer accepted by current Ruby versions.
	    # see http://www.ruby-forum.com/topic/183413
	    stripped = ascii_text.dup.force_encoding(Encoding::BINARY).gsub(/[\x80-\xff]/n, '')
	    stripped.force_encoding(Encoding::US_ASCII)
	  end

	  # This is an attempt to fix issues with strings that are SafeBuffers
	  # breaking URI.escape and RightAws::AwsUtils.URLencode
	  #
	  # Returns nil and plain String instances unchanged; anything else is
	  # converted with to_s.to_str.
	  def regular_string(str)
	    return str.to_s unless RUBY_VERSION >= '1.9'
	    # do not check is_a?(String) here since ActiveSupport::SafeBuffer and
	    # ActiveSupport::OutputBuffer return true
	    return str if str.nil? || str.instance_of?(String)

	    str.to_s.to_str
	  end

	  # Tags +str+ with Encoding.default_external without transcoding its bytes.
	  #
	  # Mirrors the other force_* helpers: frozen strings are re-tagged on a copy
	  # and the (possibly copied) string is always returned; non-strings are
	  # returned unchanged.
	  def force_external_encoding(str)
	    if str.is_a?(String) && str.respond_to?(:force_encoding) && Encoding.default_external
	      str = str.dup if str.frozen?
	      str.force_encoding(Encoding.default_external)
	    end

	    str
	  end

	  # Tags +str+ as UTF-8. If the bytes are not valid UTF-8 they are
	  # reinterpreted as ISO-8859-1 and transcoded to UTF-8.
	  # Frozen strings are handled on a copy; non-strings are returned unchanged.
	  #
	  # for reference, see http://www.zendesk.com/blog/upgrade-the-road-to-1-9
	  # and https://gist.github.com/jeffyip/4091200#file_additional_monkey_patches.rb
	  # and http://www.benjaminfleischer.com/2013/06/09/ruby-19-upgrade-and-encoding-hell/
	  def force_utf8_encoding(str)
	    if str.is_a?(String) && str.respond_to?(:force_encoding)
	      str = str.dup if str.frozen?

	      str.force_encoding(Encoding::UTF_8)

	      unless str.valid_encoding?
	        #logger.warn("encoding: forcing invalid UTF-8 string; text is #{str}")
	        str.encode!(Encoding::UTF_8, Encoding::ISO_8859_1)
	      end
	    end

	    str
	  end

	  # Tags +str+ as binary (ASCII-8BIT) without touching its bytes.
	  # Frozen strings are handled on a copy; non-strings are returned unchanged.
	  #
	  # for reference, see http://www.zendesk.com/blog/upgrade-the-road-to-1-9
	  def force_binary_encoding(str)
	    if str.is_a?(String) && str.respond_to?(:force_encoding)
	      str = str.dup if str.frozen?

	      str.force_encoding(Encoding::BINARY)
	    end

	    str
	  end

	  # Encodes a string from encoding "from" to encoding "to" in
	  # a way that works for both ruby 1.8 and 1.9
	  #
	  # Returns a new string; +str+ itself is not modified.
	  def convert_string_encoding(to, from, str)
	    if "1.9".respond_to?(:force_encoding)
	      # String#encode does not mutate, so frozen input needs no copy
	      str.encode(to, from, :undef => :replace)
	    else
	      require 'iconv'
	      # Leading :: is required: a bare Iconv would resolve to the nested
	      # StringFormatter::Iconv module defined below.
	      ::Iconv.conv(to, from, str)
	    end
	  end


	  module Other
	    # a binread in https://github.com/seattlerb/flog/blob/master/lib/flog.rb#L5
	    # NOTE(review): nested here this defines StringFormatter::Other::File, not
	    # the core ::File class -- confirm whether reopening ::File was intended.
	    class File
	      RUBY19 = "<3".respond_to? :encoding unless defined? RUBY19
	      class << self
	        alias :binread :read unless RUBY19
	      end
	    end
	    # new rubygems executables do
	    # Returns a binary-tagged copy of +str+, or nil when +str+ does not
	    # respond to force_encoding.
	    def force_binary_string(str)
	      str.dup.force_encoding("BINARY") if str.respond_to? :force_encoding
	    end
	  end
	  # or use iconv?
	  module Iconv
	    # BOM handling
	    #
	    # Rewrites the file named by @file_name in place, converting its contents
	    # from little-endian UTF-16 to UTF-8 and dropping a leading byte order
	    # mark. Conversion errors are printed and leave the file untouched.
	    def convert_to_utf8
	      # Data files are exported as Little Endian UTF-16. We need to parse as UTF-8
	      # Read in binary mode so no newline or encoding translation is applied,
	      # and use the block form so the handle is always closed.
	      contents = File.open(@file_name, 'rb') { |input| input.read }
	      begin
	        converted = contents.force_encoding(Encoding::UTF_16LE).encode(Encoding::UTF_8)
	        bom = "\xEF\xBB\xBF"
	        # strip the BOM (byte order mark) from the first line of input
	        converted.sub!(bom, '') if converted.start_with?(bom)
	        File.open(@file_name, 'w') { |output| output.write(converted) }
	      rescue EncodingError => e
	        puts e.inspect
	      end
	    end
	  end
	end