Skip to content

Instantly share code, notes, and snippets.

@ulysses4ever
Created August 10, 2020 20:34
Show Gist options
  • Save ulysses4ever/23926a5d2a115363534abe47f0467b9b to your computer and use it in GitHub Desktop.
Save ulysses4ever/23926a5d2a115363534abe47f0467b9b to your computer and use it in GitHub Desktop.
{-# LANGUAGE RecordWildCards, FlexibleContexts #-}
-- The Computer Language Benchmarks Game
-- https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
--
-- Contributed by cahu ette
module Main where
import Data.Bits
import Data.List
import Data.Word
import Data.Traversable
import Text.Printf
import Data.STRef
import Control.Monad
import Control.Monad.ST
import Control.Parallel.Strategies
import qualified Data.Vector.Hashtables.Internal as H
import qualified Data.Vector.Storable.Mutable as VM
import qualified Data.Vector.Storable as VS
import qualified Data.Vector.Generic as V
import Data.Primitive.MutVar
import qualified Data.Map.Strict as M
import qualified Data.ByteString.Char8 as B
type Key = Int
type Cont = VM.MVector
type FreezedCont = VS.Vector Int
type HashTable s = H.Dictionary s Cont Key Cont Int
{---------------------------------------------------------------
Table Keys Management
Note: conversions Char <--> Key (~Int)
---------------------------------------------------------------}
{- By using only 2 bits to encode keys, it's important to use a different table
- for different key sizes. Otherwise, if we encode 'A' as 0x00, "AT" and
- "AAT" would map to the same bucket in the table.
-
- We could use 3 bits per letter to avoid this problem if needed.
-}
bitsForChar :: Char -> Key
bitsForChar 'a' = 0
bitsForChar 'A' = 0
bitsForChar 'c' = 1
bitsForChar 'C' = 1
bitsForChar 'g' = 2
bitsForChar 'G' = 2
bitsForChar 't' = 3
bitsForChar 'T' = 3
bitsForChar _ = error "Ay, Caramba!"
charForBits :: Key -> Char
charForBits 0 = 'A'
charForBits 1 = 'C'
charForBits 2 = 'G'
charForBits 3 = 'T'
charForBits _ = error "Ay, Caramba!"
packKey :: B.ByteString -> Key
packKey = go zeroBits
where
go k bs = case B.uncons bs of
Nothing -> k
Just (c, cs) -> go (unsafeShiftL k 2 .|. bitsForChar c) cs
{-# INLINE packKey #-}
unpackKey :: Int -> Key -> B.ByteString
unpackKey = go []
where
go s 0 _ = B.pack s
go s l i = go (charForBits (i .&. 3) : s) (l-1) (unsafeShiftR i 2)
{-# INLINE unpackKey #-}
{---------------------------------------------------------------
Table Access
---------------------------------------------------------------}
{-
newTable =
H.initialize 1
-- newSTRef H.empty
-- H.new
-}
incKey ::
HashTable s ->
Key ->
ST s ()
incKey t k = do
i <- H.findEntry t k
case i of {- should be an API for this: updateWith -}
(-1) ->
H.insert t k 1
_ -> do
H.Dictionary{..} <- readMutVar . H.getDRef $ t
value `VM.unsafeModify` (+ 1) $ i
{-# INLINE incKey #-}
getVal ::
HashTable s ->
Key ->
ST s Int
getVal t k = do
i <- H.findEntry t k
case i of {- should be an API for this: getOrDefault -}
(-1) -> return 0
_ -> do
H.Dictionary{..} <- readMutVar . H.getDRef $ t
value H.! i
tableToList ::
HashTable s ->
ST s (FreezedCont, FreezedCont)
tableToList t = do
ks <- H.keys t
vs <- H.values t
return (ks,vs)
{---------------------------------------------------------------
Main Algorithms
---------------------------------------------------------------}
countOccurrences ::
forall s.
Int ->
Int ->
B.ByteString ->
ST s (HashTable s)
countOccurrences jumpSize frameSize input = do
t <- H.initialize 1 -- newTable
let
takeFrame = B.take frameSize
nextFrame = B.drop jumpSize
go :: B.ByteString -> ST s ()
go bs
| B.length bs < frameSize = return ()
| otherwise =
let
k = takeFrame bs
in
incKey t (packKey k) >>
go (nextFrame bs)
go input
return t
extractSequence :: String -> B.ByteString -> B.ByteString
extractSequence s = findSeq
where
prefix = B.pack ('>' : s)
skipSeq =
B.dropWhile (/= '>')
. B.drop 1
takeSeq =
B.filter (/= '\n')
. B.takeWhile (/= '>') -- extract until next header
. B.dropWhile (/= '\n') -- skip header
findSeq str
| prefix `B.isPrefixOf` str = takeSeq str
| otherwise = findSeq (skipSeq str)
main :: IO ()
main = do
s <- extractSequence "THREE" <$> B.getContents
let keys = ["GGT","GGTA","GGTATT","GGTATTTTAATT","GGTATTTTAATTTATAGT"]
let threads = [0 .. 63]
let threadWorkOcc key tid = runST $ do
t <- countOccurrences (length threads) (B.length key) (B.drop tid s)
getVal t (packKey key)
let calcOcc key = sum $ runEval $
mapM (rpar . threadWorkOcc (B.pack key)) threads
let threadWorkFreq len tid = runST $ do
t <- countOccurrences (length threads) len (B.drop tid s)
(ks, vs) <- tableToList t
let vs' = V.foldr ((:) . freq) [] vs
ks' = V.foldr (\k ks -> B.unpack (unpackKey len k) : ks) [] ks
return (ks', vs')
where
freq v = 100 * fromIntegral v / fromIntegral (B.length s - len + 1)
let calcFreq len =
let ksvs = runEval $ mapM (rpar . threadWorkFreq len) threads
m = foldr
(\(ks, vs) m ->
foldr (uncurry $ M.insertWith (+)) m (zip ks vs))
M.empty
ksvs
in
M.toList m
let resultsOcc = map (\k -> (k, calcOcc k)) keys
printFreq (calcFreq 1)
putStrLn ""
printFreq (calcFreq 2)
putStrLn ""
forM_ resultsOcc $ \(k, r) -> printf "%d\t%s\n" r k
where
sortFreq = sortBy
(\ (k :: String, v :: Double) (k', v') ->
compare v' v `mappend` compare k k')
printFreq :: [(String, Double)] -> IO ()
printFreq l = forM_ (sortFreq l) $ uncurry (printf "%s %.3f\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment