Skip to content

Instantly share code, notes, and snippets.

@durran
Created July 5, 2012 10:42
Show Gist options
  • Save durran/3052912 to your computer and use it in GitHub Desktop.
Save durran/3052912 to your computer and use it in GitHub Desktop.
Safely encode input strings that may be flagged with the wrong encoding.
# Core extension for strings whose encoding flag may not match their bytes
# (for example, external input tagged with the wrong encoding).
class String
  # Retags the receiver, in place, with the first encoding under which its
  # bytes form a valid string.
  #
  # The current encoding is kept when it is already valid. Otherwise the
  # encodings from Encoding.list are tried in order until one validates.
  # Only the encoding flag changes; the bytes themselves are never touched.
  #
  # @return [String] the receiver itself
  # @raise [FrozenError] (RuntimeError on older Rubies) when the receiver is
  #   frozen and needs to be retagged
  def force_valid_encoding
    Encoding.list.each do |candidate|
      # Stop as soon as the current flag fits the bytes; binary (ASCII-8BIT)
      # accepts any byte sequence, so the search always ends.
      break if valid_encoding?

      force_encoding(candidate)
    end
    self
  end

  # Transcodes the string to the encoding +name+, dropping every byte
  # sequence that is invalid in the source encoding or undefined in the
  # target encoding.
  #
  # An unfrozen receiver with an invalid encoding flag is retagged in place
  # first (see #force_valid_encoding). A frozen receiver cannot be retagged,
  # so in that case the transcoding runs on a copy and the receiver is left
  # untouched instead of raising.
  #
  # @param name [String, Encoding] the target encoding
  # @return [String] a new string in the target encoding
  def safe_encode(name)
    source = (frozen? && !valid_encoding?) ? dup : self
    source.force_valid_encoding
    source.encode(name, :undef => :replace, :invalid => :replace, :replace => "")
  end
end
require "spec_helper"
# Specs for the String encoding helpers #force_valid_encoding and #safe_encode.
describe String do
  describe "#force_valid_encoding" do
    # Evaluated lazily per example; each context below supplies its own `raw`.
    let(:retagged) { raw.force_valid_encoding }

    context "when it is flagged with a valid encoding" do
      let(:raw) { "\xff\xdb\xff\xdf".force_encoding("UTF-16") }

      it "retains the valid encoding" do
        retagged.encoding.name.should == "UTF-16"
      end
    end

    context "when it is flagged with another valid encoding" do
      let(:raw) { "\xff\xdb\xff\xdf".force_encoding("ASCII-8BIT") }

      it "retains the other encoding if valid" do
        retagged.encoding.name.should == "ASCII-8BIT"
      end
    end

    context "when it is flagged with an invalid encoding" do
      # The same bytes are not a valid UTF-8 sequence, so a retag is expected.
      let(:raw) { "\xff\xdb\xff\xdf".force_encoding("UTF-8") }

      it "forces the first the valid encoding" do
        retagged.encoding.name.should == "ASCII-8BIT"
      end
    end
  end

  describe "#safe_encode" do
    # Every example transcodes its context's `raw` string to UTF-8.
    let(:transcoded) { raw.safe_encode("UTF-8") }

    context "when coding with invalid characters" do
      let(:raw) { "\xff\xdb\xff\xdf".force_encoding("ASCII-8BIT") }

      it "replaces invalid sequences and characters with an empty string" do
        transcoded.should be_empty
      end
    end

    context "when encoding with valid characters" do
      let(:raw) { "\x00\x00".force_encoding("UTF-16LE") }

      it "keeps the string intact" do
        transcoded.should == "\u0000"
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment