Skip to content

Instantly share code, notes, and snippets.

@michaelt
Created February 13, 2014 17:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save michaelt/8979818 to your computer and use it in GitHub Desktop.
Save michaelt/8979818 to your computer and use it in GitHub Desktop.
streamUtf8 tests
{-#LANGUAGE ScopedTypeVariables#-}
import Test.QuickCheck hiding ((.&.))
import Test.Framework (Test, testGroup, defaultMain)
import Test.Framework.Providers.QuickCheck2 (testProperty)
import Debug.Trace (trace)
import Control.Exception (SomeException, evaluate, try)
import System.IO.Unsafe (unsafePerformIO)
import Data.Bits ((.&.))
import Data.Char (chr)
import Data.String
import Data.Char (chr)
import Data.List (intersperse)
import Control.Monad
import Data.Either (partitionEithers)
import qualified Data.ByteString as B
import qualified Data.Text as T
import qualified Data.Text.Encoding as E
import Data.Text.StreamDecoding
main :: IO ()
main = defaultMain [tests]
-- >>> :main -a 10000 -t badmiddle
tests = testGroup "stream_utf8" [
testProperty "t_utf8_stream_badmiddle" t_utf8_stream_badmiddle,
testProperty "t_utf8_stream_badend" t_utf8_stream_badend,
testProperty "t_utf8_stream_badstart" t_utf8_stream_badstart]
(<>) = B.append
chunk n bs = let (a,b) = B.splitAt n bs in if B.null a then [] else a : chunk n b
space = intersperse B.empty
setup = do
Positive n <- arbitrary
Positive k <- arbitrary
Positive u <- arbitrary
txt <- genUnicode :: Gen T.Text
let chunkSize = mod n 7 + 1 :: Int
vecSize = mod k 7 + 1 :: Int
spaces = foldr (.) id (take (mod u 2) (repeat space))
return (spaces, txt, chunkSize, vecSize)
t_utf8_stream_badend = do
(spaces, txt, chunkSize, vecSize) <- setup
forAll (vector vecSize) $
(roundtrip' . spaces . chunk chunkSize . appendBytes txt)
`eq` (appendBytes txt)
where appendBytes txt bts = E.encodeUtf8 txt <> B.pack bts
t_utf8_stream_badmiddle = do
(spaces, txt, chunkSize, vecSize) <- setup
forAll (vector vecSize) $
(roundtrip' . spaces . chunk chunkSize . insertBytes txt)
`eq` (insertBytes txt)
where insertBytes txt bts = let n = T.length txt
(a,b) = T.splitAt n txt
in E.encodeUtf8 a <> B.pack bts <> E.encodeUtf8 b
t_utf8_stream_badstart = do
(spaces, txt, chunkSize, vecSize) <- setup
forAll (vector vecSize) $
(roundtrip' . spaces . chunk chunkSize . affixBytes txt)
`eq` (affixBytes txt)
where affixBytes txt bts = B.pack bts <> E.encodeUtf8 txt
roundtrip' :: [B.ByteString] -> B.ByteString
roundtrip' bss = let (ts,bs) = twolists bss in B.concat $ map E.encodeUtf8 ts ++ bs
where
twolists :: [B.ByteString] -> ([T.Text],[B.ByteString])
twolists = partitionEithers . mark streamUtf8 where
mark dec [] = case dec B.empty of
DecodeResultSuccess t dec' -> []
DecodeResultFailure t bs' -> Left t : Right bs': []
mark dec (bs:bss) = case dec bs of
DecodeResultSuccess t dec' -> Left t : mark dec' bss
DecodeResultFailure t bs' -> Left t : map Right (bs':bss)
roundtrip :: [B.ByteString] -> B.ByteString
roundtrip bss = go streamUtf8 B.empty bss where
go dec acc [] = case dec B.empty of
DecodeResultSuccess t dec' -> acc
DecodeResultFailure t bs' -> acc <> bs'
go dec acc (bs:bss) = case dec bs of
DecodeResultSuccess t dec' -> go dec' (acc <> E.encodeUtf8 t) bss
DecodeResultFailure t bs' -> acc <> E.encodeUtf8 t <> bs' <> B.concat bss
-- Helpers from the `text` test suite
-- Ensure that two potentially bottom values (in the sense of crashing
-- for some inputs, not looping infinitely) either both crash, or both
-- give comparable results for some input.
(=^=) :: (Eq a, Show a) => a -> a -> Bool
i =^= j = unsafePerformIO $ do
x <- try (evaluate i)
y <- try (evaluate j)
case (x,y) of
(Left (_ :: SomeException), Left (_ :: SomeException))
-> return True
(Right a, Right b) -> return (a == b)
e -> trace ("*** Divergence: " ++ show e) return False
infix 4 =^=
{-# NOINLINE (=^=) #-}
-- Do two functions give the same answer?
eq :: (Eq a, Show a) => (t -> a) -> (t -> a) -> t -> Bool
eq a b s = a s =^= b s
instance Arbitrary B.ByteString where
arbitrary = B.pack `fmap` arbitrary
genUnicode :: IsString a => Gen a
genUnicode = fmap fromString string where
string = sized $ \n ->
do k <- choose (0,n)
sequence [ char | _ <- [1..k] ]
excluding :: [a -> Bool] -> Gen a -> Gen a
excluding bad gen = loop
where
loop = do
x <- gen
if or (map ($ x) bad)
then loop
else return x
reserved = [lowSurrogate, highSurrogate, noncharacter]
lowSurrogate c = c >= 0xDC00 && c <= 0xDFFF
highSurrogate c = c >= 0xD800 && c <= 0xDBFF
noncharacter c = masked == 0xFFFE || masked == 0xFFFF
where
masked = c .&. 0xFFFF
ascii = choose (0,0x7F)
plane0 = choose (0xF0, 0xFFFF)
plane1 = oneof [ choose (0x10000, 0x10FFF)
, choose (0x11000, 0x11FFF)
, choose (0x12000, 0x12FFF)
, choose (0x13000, 0x13FFF)
, choose (0x1D000, 0x1DFFF)
, choose (0x1F000, 0x1FFFF)
]
plane2 = oneof [ choose (0x20000, 0x20FFF)
, choose (0x21000, 0x21FFF)
, choose (0x22000, 0x22FFF)
, choose (0x23000, 0x23FFF)
, choose (0x24000, 0x24FFF)
, choose (0x25000, 0x25FFF)
, choose (0x26000, 0x26FFF)
, choose (0x27000, 0x27FFF)
, choose (0x28000, 0x28FFF)
, choose (0x29000, 0x29FFF)
, choose (0x2A000, 0x2AFFF)
, choose (0x2B000, 0x2BFFF)
, choose (0x2F000, 0x2FFFF)
]
plane14 = choose (0xE0000, 0xE0FFF)
planes = [ascii, plane0, plane1, plane2, plane14]
char = chr `fmap` excluding reserved (oneof planes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment