Created
February 10, 2017 10:00
-
-
Save carpodaster/2809ebdda26d016860b438e70bc842f2 to your computer and use it in GitHub Desktop.
Elixir: Converting Latin1-encoded HTML to UTF-8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule Latin1Convert do
  @moduledoc """
  Converts Latin-1 (ISO-8859) encoded HTML documents to UTF-8.

  The source encoding is read from the document's
  `<meta http-equiv="Content-Type">` tag. Documents that do not declare an
  ISO-8859 charset there are returned untouched.
  """

  @doc """
  Converts an input HTML string to UTF-8.

  When the document declares an ISO-8859 charset, its bytes are transcoded
  from Latin-1 to UTF-8 and the now-stale encoding declaration is filtered
  out of the `<head>` (the document is re-serialised by Floki in the process).
  Any other input (UTF-8 declared, unknown charset, or no declaration at all)
  is returned unchanged.
  """
  @spec call(String.t()) :: String.t()
  def call(html) do
    case content_type_from_header(html) do
      :latin1 ->
        html
        |> :unicode.characters_to_binary(:latin1)
        |> remove_meta_http_equiv_encoding()

      _other ->
        html
    end
  end

  @doc """
  Looks for a `<meta http-equiv="Content-Type">` node in the input string's
  HTML header and returns an atom representing the encoding.

  Returns `:latin1` when the declaration contains `iso-8859`, `:unicode` when
  it contains `utf-8` (both checked case-insensitively), and `nil` otherwise.

  Note that every `ISO-8859-*` variant is reported as `:latin1`, not only
  `ISO-8859-1`.
  """
  @spec content_type_from_header(String.t()) :: :latin1 | :unicode | nil
  def content_type_from_header(html) do
    encoding = meta_http_equiv_encoding(html)

    cond do
      Regex.match?(~r(iso-8859)i, encoding) -> :latin1
      Regex.match?(~r(utf-8)i, encoding) -> :unicode
      true -> nil
    end
  end

  # `~S` keeps the `\"` escapes in the example intact; a plain heredoc would
  # unescape them and leave the doctest with unbalanced quotes.
  @doc ~S"""
  Retrieves the content type indication from `html`.

  The whole document is downcased before it is queried so that the
  `http-equiv` attribute matches regardless of its original casing. As a
  consequence the returned value is always lowercase. Returns `""` when no
  matching tag is found.

  ## Examples

      iex> "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\"></head></html>" |> Latin1Convert.meta_http_equiv_encoding()
      "text/html; charset=iso-8859-1"

      iex> Latin1Convert.meta_http_equiv_encoding("<html></html>")
      ""

  """
  @spec meta_http_equiv_encoding(String.t()) :: String.t()
  def meta_http_equiv_encoding(html) do
    html
    |> String.downcase()
    |> Floki.attribute("head > meta[http-equiv=content-type]", "content")
    |> List.first()
    # `List.first/1` yields `nil` for an empty result; `to_string/1` maps it to "".
    |> to_string()
  end

  # Filters the encoding declaration out of the document head and serialises
  # the result back to an HTML string.
  #
  # Caveat: this is not a truly case-insensitive check for the DOM node.
  # Floki doesn't seem to understand `$=foo i` queries, and we can't
  # `String.downcase/1` here as that would mess up the content flowing through
  # the filter chain. The substring selector skips the leading "C" so that both
  # "Content-Type" and "content-type" match; it also matches any other
  # `http-equiv` value that contains "ontent-".
  defp remove_meta_http_equiv_encoding(html) do
    html
    |> Floki.filter_out("head > meta[http-equiv*=ontent-]")
    |> Floki.raw_html()
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment