Skip to content

Instantly share code, notes, and snippets.

@ocramz
Created April 23, 2023 12:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ocramz/cf0a41d455aebb6fc1991d6ab99ddfd3 to your computer and use it in GitHub Desktop.
Save ocramz/cf0a41d455aebb6fc1991d6ab99ddfd3 to your computer and use it in GitHub Desktop.
decoding URI-encoded strings in Haskell
import Data.Bits ((.|.),(.&.),shiftL)
import Data.Char (chr, isHexDigit, digitToInt)
import Data.Text (Text, pack, unpack)
{-
sources :
uri-encode (https://hackage.haskell.org/package/uri-encode-1.5.0.7/docs/src/Network.URI.Encode.html#decodeText)
network-uri (https://hackage.haskell.org/package/network-uri-2.6.4.2/docs/src/Network.URI.html#unEscapeString)
-}
-- | URI decode a 'Text', unicode aware.
decodeText :: Text -> Text
decodeText = pack . unEscapeString . unpack
-- |Turns all instances of escaped characters in the string back
-- into literal characters.
--
unEscapeString :: String -> String
unEscapeString [] = ""
unEscapeString s@(c:cs) = case unEscapeByte s of
Just (byte, rest) -> unEscapeUtf8 byte rest
Nothing -> c : unEscapeString cs
unEscapeByte :: String -> Maybe (Int, String)
unEscapeByte ('%':x1:x2:s) | isHexDigit x1 && isHexDigit x2 =
Just (digitToInt x1 * 16 + digitToInt x2, s)
unEscapeByte _ = Nothing
-- Adapted from http://hackage.haskell.org/package/utf8-string
-- by Eric Mertens, BSD3
unEscapeUtf8 :: Int -> String -> String
unEscapeUtf8 c rest
| c < 0x80 = chr c : unEscapeString rest
| c < 0xc0 = replacement_character : unEscapeString rest
| c < 0xe0 = multi1
| c < 0xf0 = multi_byte 2 0xf 0x800
| c < 0xf8 = multi_byte 3 0x7 0x10000
| c < 0xfc = multi_byte 4 0x3 0x200000
| c < 0xfe = multi_byte 5 0x1 0x4000000
| otherwise = replacement_character : unEscapeString rest
where
replacement_character = '\xfffd'
multi1 = case unEscapeByte rest of
Just (c1, ds) | c1 .&. 0xc0 == 0x80 ->
let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f)
in if d >= 0x000080 then toEnum d : unEscapeString ds
else replacement_character : unEscapeString ds
_ -> replacement_character : unEscapeString rest
multi_byte :: Int -> Int -> Int -> String
multi_byte i mask overlong =
aux i rest (unEscapeByte rest) (c .&. mask)
where
aux 0 rs _ acc
| overlong <= acc && acc <= 0x10ffff &&
(acc < 0xd800 || 0xdfff < acc) &&
(acc < 0xfffe || 0xffff < acc) = chr acc : unEscapeString rs
| otherwise = replacement_character : unEscapeString rs
aux n _ (Just (r, rs)) acc
| r .&. 0xc0 == 0x80 = aux (n-1) rs (unEscapeByte rs)
$! shiftL acc 6 .|. (r .&. 0x3f)
aux _ rs _ _ = replacement_character : unEscapeString rs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment