Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Lower-casing a ByteString
{-# LANGUAGE OverloadedStrings, MagicHash, UnboxedTuples, BangPatterns #-}
import Test.Hspec (hspec)
import Test.Hspec.QuickCheck (prop)
import Criterion.Main (defaultMain, bench, whnf)
import qualified Data.ByteString as S
import qualified Data.ByteString.Char8 as S8
import Data.ByteString.Unsafe (unsafePackAddressLen, unsafeUseAsCStringLen)
import qualified Data.Char
import qualified Data.Word8
import System.IO.Unsafe (unsafePerformIO)
import Foreign.Storable (peekByteOff)
import Foreign.Ptr (Ptr)
import Foreign.C.String (CStringLen)
import GHC.Prim
( leWord#, Word#, plusWord#, State#, RealWorld, Addr#
, writeWord8Array#, (==#), byteArrayContents#, unsafeFreezeByteArray#
, newPinnedByteArray#
)
import GHC.Base (Int (..))
import GHC.Word (Word (..), Word8 (..))
import GHC.Types (IO (..))
main :: IO ()
main = do
hspec $ do
prop "char8 == word8" $ \s ->
let bs = S.pack s
in S8.map Data.Char.toLower bs == S.map Data.Word8.toLower bs
prop "char8 == bsToLower" $ \s ->
let bs = S.pack s
in S8.map Data.Char.toLower bs == bsToLower bs
prop "char8 == map toLowerC" $ \s ->
let bs = S.pack s
in S8.map Data.Char.toLower bs == S8.map toLowerC bs
input <- S.readFile "bench.hs"
defaultMain
[ bench "Char8" $ whnf (S.length . S8.map Data.Char.toLower) input
, bench "Char8 toLowerC" $ whnf (S.length . S8.map toLowerC) input
, bench "Word8" $ whnf (S.length . S.map Data.Word8.toLower) input
, bench "bsToLower" $ whnf (S.length . bsToLower) input
]
bsToLower :: S.ByteString -> S.ByteString
bsToLower bs
| S.null bs = S.empty
| otherwise = unsafePerformIO $ unsafeUseAsCStringLen bs cstrlenToLower
cstrlenToLower :: CStringLen -> IO S.ByteString
cstrlenToLower (input, !len'@(I# len)) = IO $ \state0 ->
case newPinnedByteArray# len state0 of
(# state1, mba #) ->
case loop mba 0 state1 of
state2 ->
case unsafeFreezeByteArray# mba state2 of
(# state3, ba #) ->
unsafePackAddressLen' len' (byteArrayContents# ba) state3
where
loop mba offset@(I# offset') stateA
| offset' ==# len = stateA
| otherwise =
case peek' input offset stateA of
(# stateB, W8# w #) ->
case writeWord8Array# mba offset' (toLowerU w) stateB of
stateC -> loop mba (offset + 1) stateC
unsafePackAddressLen' :: Int
-> Addr#
-> State# RealWorld
-> (# State# RealWorld, S.ByteString #)
unsafePackAddressLen' i a =
f
where
IO f = unsafePackAddressLen i a
peek' :: Ptr b -> Int -> State# RealWorld -> (# State# RealWorld, Word8 #)
peek' ptr offset =
f
where
IO f = peekByteOff ptr offset
toLowerU :: Word# -> Word#
toLowerU w
| isUpperU w = w `plusWord#` space
| otherwise = w
where
!(W8# space) = 0x20
isUpperU :: Word# -> Bool
isUpperU w =
(l1 `leWord#` w && w `leWord#` h1) ||
(l2 `leWord#` w && w `leWord#` h2) ||
(l3 `leWord#` w && w `leWord#` h3)
where
!(W# l1) = 0x41
!(W# h1) = 0x5a
!(W# l2) = 0xc0
!(W# h2) = 0xd6
!(W# l3) = 0xd8
!(W# h3) = 0xde
toLowerC :: Char -> Char
toLowerC w
| isUpperC w = toEnum $ fromEnum w + 0x20
| otherwise = w
isUpperC :: Char -> Bool
isUpperC w =
(l1 <= w && w <= h1) ||
(l2 <= w && w <= h2) ||
(l3 <= w && w <= h3)
where
l1 = '\x41'
h1 = '\x5a'
l2 = '\xc0'
h2 = '\xd6'
l3 = '\xd8'
h3 = '\xde'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment