Skip to content

Instantly share code, notes, and snippets.

@bernerdschaefer
Forked from durran/string.rb
Created July 5, 2012 12:48
Show Gist options
  • Save bernerdschaefer/3053502 to your computer and use it in GitHub Desktop.
Save bernerdschaefer/3053502 to your computer and use it in GitHub Desktop.
Safely encode input strings that may be flagged with the wrong encoding.
# Core extension: helpers for strings whose encoding tag may not match the
# bytes they actually contain.
class String
  # Retags the receiver, in place, until its bytes are valid for its encoding.
  #
  # A string that is already valid is returned untouched. Otherwise the
  # encodings from Encoding.list are applied one after another with
  # force_encoding and the first one under which the bytes validate is kept.
  # NOTE(review): in practice this is expected to be the binary encoding,
  # which accepts any byte sequence - confirm for non-MRI runtimes.
  #
  # @return [String] self
  # @raise [StopIteration] if every listed encoding has been tried without
  #   finding a valid one
  def force_valid_encoding
    find_encoding(Encoding.list.to_enum)
  end

  # Returns a copy of the string in the encoding +name+, never raising because
  # of invalid or unconvertible characters.
  #
  # The receiver is first passed through #force_valid_encoding (which may
  # retag it in place). Then:
  #
  # * If the receiver is tagged binary and its bytes are already valid in the
  #   target encoding, a copy tagged with that encoding is returned. Binary
  #   carries no character information, so transcoding such a string would
  #   treat every non-ASCII byte as undefined and drop it (e.g. UTF-8 text
  #   read from a socket would lose all of its non-ASCII characters).
  #   This shortcut is limited to ASCII-compatible targets so that plain
  #   ASCII bytes keep converting exactly as they did before; for targets
  #   such as UTF-16 the bytes are transcoded as usual.
  # * Otherwise the string is transcoded, with invalid and undefined
  #   sequences replaced by the empty string.
  #
  # @param name [String, Encoding] the target encoding
  # @return [String] a new string in the target encoding
  def safe_encode(name)
    force_valid_encoding

    if encoding == Encoding::BINARY
      candidate = dup
      begin
        candidate.force_encoding(name)
      rescue ArgumentError
        # Unknown encoding name: fall through so that #encode reports the
        # problem the same way it always has.
        candidate = nil
      end

      if candidate && candidate.encoding.ascii_compatible? && candidate.valid_encoding?
        return candidate
      end
    end

    encode(name, :undef => :replace, :invalid => :replace, :replace => "")
  end

  private

  # Returns self once the current encoding tag is valid for the bytes;
  # otherwise moves on to the next candidate encoding.
  #
  # @param encodings [Enumerator] the encodings still to be tried
  # @return [String] self
  def find_encoding(encodings)
    if valid_encoding?
      self
    else
      force_next_encoding(encodings)
    end
  end

  # Retags the receiver with the next candidate encoding and re-checks it.
  #
  # @param encodings [Enumerator] the encodings still to be tried
  # @return [String] self
  def force_next_encoding(encodings)
    force_encoding(encodings.next)
    find_encoding(encodings)
  end
end
# encoding: utf-8
# Specs for the String core extensions loaded from ./string:
# String#force_valid_encoding and String#safe_encode.
require "rspec"
require "./string"
describe String do
# force_valid_encoding: keeps the current tag when the bytes are valid for
# it, otherwise retags the string with another encoding.
describe "#force_valid_encoding" do
context "when it is flagged with a valid encoding" do
let(:string) do
"\xff\xdb\xff\xdf".force_encoding("UTF-16")
end
let(:forced) do
string.force_valid_encoding
end
it "retains the valid encoding" do
forced.encoding.name.should == "UTF-16"
end
end
context "when it is flagged with another valid encoding" do
let(:string) do
"\xff\xdb\xff\xdf".force_encoding("ASCII-8BIT")
end
let(:forced) do
string.force_valid_encoding
end
it "retains the other encoding if valid" do
forced.encoding.name.should == "ASCII-8BIT"
end
end
# The same four bytes are not valid UTF-8, so the tag is expected to change.
context "when it is flagged with an invalid encoding" do
let(:string) do
"\xff\xdb\xff\xdf".force_encoding("UTF-8")
end
let(:forced) do
string.force_valid_encoding
end
it "forces the first the valid encoding" do
forced.encoding.name.should == "ASCII-8BIT"
end
end
end
# safe_encode: converts to the requested encoding without raising on bytes
# that cannot be converted.
describe "#safe_encode" do
# Binary bytes that are not valid UTF-8: all of them are expected to be dropped.
context "when coding with invalid characters" do
let(:string) do
"\xff\xdb\xff\xdf".force_encoding("ASCII-8BIT")
end
let(:encoded) do
string.safe_encode("UTF-8")
end
it "replaces invalid sequences and characters with an empty string" do
encoded.should be_empty
end
end
# Two zero bytes tagged UTF-16LE are expected to convert to "\u0000".
context "when encoding with valid characters" do
let(:string) do
"\x00\x00".force_encoding("UTF-16LE")
end
let(:encoded) do
string.safe_encode("UTF-8")
end
it "keeps the string intact" do
encoded.should == "\u0000"
end
end
# UTF-8 text (including the non-ASCII "ä") mislabelled as binary: the result
# is expected to be the same bytes tagged UTF-8, with no characters dropped.
context "when utf-8 string is tagged as binary" do
let(:string) do
"Fachkräftemangel".force_encoding 'binary'
end
let(:encoded) do
string.safe_encode('utf-8')
end
it "fixes the encoding" do
# eql (String#eql?) is used for the comparison against a UTF-8-tagged copy.
encoded.should eql string.dup.force_encoding('utf-8')
end
end
end
end
@durran
Copy link

durran commented Jul 5, 2012

Yeah that's acceptable to me.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment