Skip to content

Instantly share code, notes, and snippets.

@troelskn
Created June 23, 2021 07:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save troelskn/0ed926153097da9591a8b741a0377644 to your computer and use it in GitHub Desktop.
Save troelskn/0ed926153097da9591a8b741a0377644 to your computer and use it in GitHub Desktop.
# Detects and repairs two encoding mix-ups ("mojibake") between UTF-8 and
# ISO-8859-1 (Latin-1). The letters æ, ø, å and their upper-case forms are
# used as "canary" characters whose garbled forms reveal the mix-up.
module Mojibake
  # UTF-8 content, interpreted as latin1
  #
  # The text was UTF-8, but its bytes were read as Latin-1 and transcoded to
  # UTF-8 again, so every non-ASCII character shows up as two characters
  # (for example "æ" becomes "Ã¦").
  class Utf8
    # The garbled, two-character UTF-8 rendering of each canary letter.
    CANARY = "æøåÆØÅ".chars.map { |c| c.encode(Encoding::UTF_8).force_encoding(Encoding::ISO_8859_1).encode(Encoding::UTF_8) }.freeze

    # @param content [String] text to inspect; expected to be UTF-8 tagged
    # @return [Boolean] true when any garbled canary occurs in +content+
    # @raise [Encoding::CompatibilityError] when +content+ holds non-ASCII
    #   data in an encoding that is not compatible with UTF-8
    def detect?(content)
      CANARY.any? { |c| content.include?(c) }
    end

    # Reverses the mix-up: transcodes to Latin-1 to get the original bytes
    # back, then labels those bytes as the UTF-8 they always were.
    #
    # @param content [String] garbled text (not modified)
    # @return [String] a new UTF-8 tagged string
    # @raise [Encoding::UndefinedConversionError] when +content+ contains a
    #   character that has no ISO-8859-1 equivalent
    def repair(content)
      content.encode(Encoding::ISO_8859_1).force_encoding(Encoding::UTF_8)
    end
  end

  # Latin1 content, interpreted as UTF-8
  #
  # The string is tagged UTF-8 but contains raw Latin-1 bytes, which makes it
  # an invalid UTF-8 byte sequence.
  class Latin1
    # Each canary letter as its single Latin-1 byte, tagged as UTF-8. These
    # strings are intentionally invalid UTF-8.
    CANARY = "æøåÆØÅ".chars.map { |c| c.encode(Encoding::ISO_8859_1).force_encoding(Encoding::UTF_8) }.freeze

    # The canary bytes as integers. String#include? never matches a needle
    # that is itself an invalid byte sequence, so detection compares bytes.
    CANARY_BYTES = CANARY.map { |c| c.getbyte(0) }.freeze

    # @param content [String] text to inspect
    # @return [Boolean] true when +content+ is UTF-8 tagged, is not valid
    #   UTF-8, and one of its invalid bytes is a canary byte
    def detect?(content)
      return false if content.encoding != Encoding::UTF_8 || content.valid_encoding?

      # On a broken string each_char yields every invalid byte as its own
      # one-byte "character", so valid multi-byte characters whose lead byte
      # happens to equal a canary byte are not mistaken for canaries.
      content.each_char.any? do |char|
        !char.valid_encoding? && CANARY_BYTES.include?(char.getbyte(0))
      end
    end

    # Replaces every invalid byte sequence with the character those bytes
    # denote in Latin-1; valid UTF-8 characters are left untouched.
    #
    # @param content [String] text with stray Latin-1 bytes (not modified)
    # @return [String] a new, valid UTF-8 string
    def repair(content)
      content.encode(Encoding::UTF_8).scrub do |invalid_bytes|
        invalid_bytes.encode(Encoding::UTF_8, Encoding::ISO_8859_1)
      end
    end
  end

  class << self
    # @return [Array<#detect?, #repair>] a fresh instance of every sniffer,
    #   in the order in which they are consulted
    def sniffers
      [Utf8, Latin1].map(&:new)
    end

    # Tells whether +content+ needs {repair}.
    #
    # Binary (ASCII-8BIT) strings always count as needing repair. Any other
    # string is normalised with {auto_encode} first, so that input tagged
    # with an encoding other than UTF-8 can be compared against the canaries
    # without an Encoding::CompatibilityError.
    #
    # @param content [String] text to inspect (not modified)
    # @return [Boolean]
    # @raise [EncodingError] when +content+ cannot be transcoded to UTF-8
    def detect?(content)
      return true if content.encoding == Encoding::ASCII_8BIT

      normalized = auto_encode(content)
      sniffers.any? { |sniffer| sniffer.detect?(normalized) }
    end

    # Normalises +content+ to UTF-8 and applies the first sniffer that
    # recognises a problem.
    #
    # @param content [String] text to repair (not modified)
    # @return [String] the repaired text, or the normalised text when no
    #   sniffer matched
    def repair(content)
      content = auto_encode(content)
      sniffers.each do |sniffer|
        return sniffer.repair(content) if sniffer.detect?(content)
      end
      content
    end

    # Transcodes +mixed+ to UTF-8. For binary (ASCII-8BIT) input the encoding
    # is guessed: UTF-8 if the bytes form valid UTF-8, otherwise ISO-8859-1.
    #
    # Note that a string already tagged UTF-8 is returned as a copy without
    # validation, so it may still contain invalid bytes.
    #
    # @param mixed [String] input in any encoding (not modified)
    # @return [String] a UTF-8 tagged string
    # @raise [EncodingError] when a non-binary string cannot be transcoded
    def auto_encode(mixed)
      return mixed.encode(Encoding::UTF_8) unless mixed.encoding == Encoding::ASCII_8BIT

      [Encoding::UTF_8, Encoding::ISO_8859_1].each do |encoding|
        candidate = mixed.dup.force_encoding(encoding)
        return candidate.encode(Encoding::UTF_8) if candidate.valid_encoding?
      end
      # Defensive only: every byte sequence is valid ISO-8859-1, so the loop
      # above is expected to have returned already.
      raise "Unable to determine encoding of ASCII_8BIT"
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment