# Clean a string of invalid byte sequences.
class String | |
# Defaults for String#scrubbed_utf8's options argument | |
ScrubbedUtf8Defaults = {invalid: :replace, undef: :replace} | |
# Similar to what String#encode does, the options argument is also the same, | |
# but it defaults to :replace for both :invalid and :undef options. | |
# | |
# :invalid: | |
# If the value is :replace, #encode replaces invalid byte sequences in str | |
# with the replacement character. The default is :replace. | |
# | |
# :undef: | |
# If the value is :replace, #encode replaces characters which are undefined in | |
# the destination encoding with the replacement character. The default | |
# is :replace. | |
# | |
# :replace: | |
# Sets the replacement string to the given value. The default replacement | |
# string is "uFFFD" for Unicode encoding forms, and "?" otherwise. | |
# | |
# :fallback: | |
# Sets the replacement string by the given object for undefined character. | |
# The object should be a Hash, a Proc, a Method, or an object which has [] | |
# method. Its key is an undefined character encoded in the source encoding of | |
# current transcoder. Its value can be any encoding until it can be converted | |
# into the destination encoding of the transcoder. | |
# | |
# :xml: | |
# The value must be :text or :attr. If the value is :text #encode replaces | |
# undefined characters with their (upper-case hexadecimal) numeric character | |
# references. '&', '<', and '>' are converted to "&", "<", and ">", | |
# respectively. If the value is :attr, #encode also quotes the replacement | |
# result (using '"'), and replaces '"' with """. | |
# | |
# :cr_newline: | |
# Replaces LF ("n") with CR ("r") if value is true. | |
# | |
# :crlf_newline: | |
# Replaces LF ("n") with CRLF ("r\n") if value is true. | |
# | |
# :universal_newline: | |
# Replaces CRLF ("r\n") and CR ("r") with LF ("n") if value is true. | |
def scrubbed_utf8(options=nil) | |
options = options ? ScrubbedUtf8Defaults.merge(options) : ScrubbedUtf8Defaults | |
if encoding.equal?(Encoding::UTF_8) | |
if valid_encoding? | |
self | |
else | |
encode(Encoding::UTF_16, options).encode(Encoding::UTF_8) | |
end | |
else | |
encode(Encoding::UTF_8, options) | |
end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment