Skip to content

Instantly share code, notes, and snippets.

@sjakobi
Created November 10, 2018 01:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sjakobi/f0e6842fb6644e3a37dc41f869f6c158 to your computer and use it in GitHub Desktop.
Save sjakobi/f0e6842fb6644e3a37dc41f869f6c158 to your computer and use it in GitHub Desktop.
SplitUtf8StringSpec.hs
{-# language BangPatterns, MagicHash, UnboxedTuples, MultiWayIf #-}
module Main (main, spec) where
import Test.Hspec
import Test.QuickCheck
import Data.ByteString
import qualified Data.ByteString as BS
import qualified Data.ByteString.Internal as BS
import Foreign
import GHC.Exts
import Encoding
import FastFunctions
import GHC.ForeignPtr
main :: IO ()
main = hspec spec
spec :: Spec
spec = do
describe "utf8SplitAtByteString" $ do
it "concatenation" $ property $
\(Utf8String bs) n -> let (a, b) = utf8SplitAtByteString n bs in a <> b === bs
newtype Utf8String = Utf8String ByteString
deriving Show
instance Arbitrary Utf8String where
arbitrary = do
Utf8String bs <- mkUtf8String . getUnicodeString <$> arbitrary
n <- arbitrary
pure (Utf8String (snd (utf8SplitAtByteString n bs)))
mkUtf8String :: String -> Utf8String
mkUtf8String s =
inlinePerformIO $ do
let len = utf8EncodedLength s
buf <- mallocForeignPtrBytes len
withForeignPtr buf $ \ptr -> do
utf8EncodeString ptr s
pure (Utf8String (BS.fromForeignPtr buf 0 len))
-- | Split after a given number of characters.
-- Negative values are treated as if they are 0.
utf8SplitAtByteString :: Int -> ByteString -> (ByteString, ByteString)
utf8SplitAtByteString n0 bs@(BS.PS fptr off0 len)
| n0 <= 0 = (BS.empty, bs)
| otherwise =
case go n0 start of
ptr | ptr >= end -> (bs, BS.empty)
ptr ->
let d = ptr `minusPtr` start
in (BS.PS fptr off0 d, BS.PS fptr (off0 + d) (len - d))
where
!start = unsafeForeignPtrToPtr fptr `plusPtr` off0
!end = start `plusPtr` len
go n ptr
| n > 0 && ptr < end = go (pred n) (ptr `plusPtr` utf8CharSize ptr)
| otherwise = ptr
-- | Returns the size of UTF8-encoded character at the given 'Addr#'.
--
-- The validity of the encoding is not checked.
{-# INLINE utf8CharSize# #-}
utf8CharSize# :: Addr# -> Int#
utf8CharSize# a# =
let !ch0 = indexWord8OffAddr# a# 0# in
if | isTrue# (ch0 `leWord#` 0x7F##) -> 1#
| isTrue# (ch0 `leWord#` 0xDF##) -> 2#
| isTrue# (ch0 `leWord#` 0xEF##) -> 3#
| otherwise -> 4#
-- | Returns the size of UTF8-encoded character beginning at the given
-- @'Ptr' 'Word8'@.
--
-- The validity of the encoding is not checked.
utf8CharSize :: Ptr Word8 -> Int
utf8CharSize (Ptr a#) = I# (utf8CharSize# a#)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment