Skip to content

Instantly share code, notes, and snippets.

@Bodigrim
Created March 4, 2024 23:55
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Bodigrim/e145727a5d9f44cfe2d5292007fffdce to your computer and use it in GitHub Desktop.
Save Bodigrim/e145727a5d9f44cfe2d5292007fffdce to your computer and use it in GitHub Desktop.
#!/usr/bin/env cabal
{- cabal:
build-depends: base >= 4.19, bytestring, containers
default-language: GHC2021
-}
{-# LANGUAGE ExtendedLiterals #-}
{-# LANGUAGE MagicHash #-}
{-# OPTIONS_GHC -Wall -O2 -threaded #-}
import Control.Monad
import Data.ByteString (ByteString)
import Data.ByteString qualified as B
import Data.ByteString.Char8 qualified as C8
import Data.ByteString.Unsafe qualified as B
import Data.Foldable
import Data.Map (Map)
import Data.Map.Strict qualified as M
import Data.Map.Internal qualified as M (balance)
import Data.Map.Strict.Internal qualified as M
import Text.Printf
import GHC.Word
import System.IO
import Data.IORef
import Control.Concurrent
newtype Station = Station ByteString
deriving (Eq, Show)
instance Ord Station where
compare (Station xs) (Station ys) =
compare (B.length xs) (B.length ys) <> compare xs ys
data Entry = Entry
{ _station :: !Station
, _temperature :: !Int
} deriving (Show)
-- Bayawan;-21.1
-- Andranomenatsa;-1.2
-- Benton Harbor;36.2
-- Taulahā;0.6
parseLine :: ByteString -> Entry
parseLine xs = case x4 of
W8# 59#Word8 -- ord ';'
-> Entry (Station $ B.unsafeTake (l - 4) xs) (x3' * 10 + x1' - 528)
W8# 45#Word8 -- ord '-'
-> Entry (Station $ B.unsafeTake (l - 5) xs) (528 - x3' * 10 - x1')
_ -> case x5 of
W8# 59#Word8 -- ord ';'
-> Entry (Station $ B.unsafeTake (l - 5) xs) (x4' * 100 + x3' * 10 + x1' - 5328)
_ -- ord '-'
-> Entry (Station $ B.unsafeTake (l - 6) xs) (5328 - x4' * 100 - x3' * 10 - x1')
where
l = B.length xs
x1 = B.unsafeIndex xs (l - 1) -- last digit
x3 = B.unsafeIndex xs (l - 3) -- another digit
x4 = B.unsafeIndex xs (l - 4) -- digit or sign or semicolon
x5 = B.unsafeIndex xs (l - 5) -- sign or semicolon
x1' = fromIntegral x1
x3' = fromIntegral x3
x4' = fromIntegral x4
validateParseLine :: ByteString -> IO ()
validateParseLine xs
| ds == 0
= unless (xs == ss <> C8.pack ";0.0" || xs == ss <> C8.pack ";-0.0")
$ error $ "bad parse " ++ C8.unpack xs
| otherwise
= unless (xs == ss <> C8.pack (';' : printf "%.1f" (fromIntegral ds / 10 :: Double)))
$ error $ "bad parse " ++ C8.unpack xs ++ " but got " ++ show (parseLine xs)
where
Entry (Station ss) ds = parseLine xs
data Quartet = Quartet
{ _min :: !Int
, _total :: !Int
, _cnt :: !Word
, _max :: !Int
} deriving (Eq)
mkQuartet :: Int -> Quartet
mkQuartet x = Quartet x x 1 x
updateQuartet :: Int -> Quartet -> Quartet
updateQuartet x (Quartet a b c d) = Quartet (min a x) (b + x) (c + 1) (max d x)
instance Semigroup Quartet where
Quartet a b c d <> Quartet a' b' c' d' =
Quartet (min a a') (b + b') (c + c') (max d d')
-- https://github.com/haskell/containers/issues/809
upsert :: forall k a. Ord k => (Maybe a -> a) -> k -> Map k a -> Map k a
upsert f = go
where
go :: k -> Map k a -> Map k a
go !k M.Tip = M.singleton k (f Nothing)
go k (M.Bin sx kx x l r) = case compare k kx of
LT -> M.balance kx x (go k l) r
GT -> M.balance kx x l (go k r)
EQ -> let fx = f (Just x) in
fx `seq` M.Bin sx kx fx l r
parse :: ByteString -> Map Station Quartet
parse xs = foldl' go mempty entries
where
entries = map parseLine $ C8.lines xs
go :: Map Station Quartet -> Entry -> Map Station Quartet
go m (Entry ss ds) = upsert (maybe (mkQuartet ds) (updateQuartet ds)) ss m
aggregate :: Map Station Quartet -> ByteString
aggregate m = C8.cons '{' (C8.snoc (B.drop 2 (M.foldMapWithKey go m')) '}')
where
m' = M.fromList $ map (\(Station ss, q) -> (ss, q)) $ M.assocs m
go ss (Quartet a b c d) = C8.pack ", " <> ss <> C8.pack
(printf "=%.1f/%.1f/%.1f" (fromIntegral a / 10 :: Double) (fromIntegral b / (fromIntegral c * 10) :: Double) (fromIntegral d / 10 :: Double))
parseInChunks :: MVar () -> IORef (Map Station Quartet) -> Handle -> IO ()
parseInChunks mv ref h = do
xs <- B.hGetSome h 1048576
-- putStrLn $ "read " ++ show (B.length xs)
eof <- hIsEOF h
xs' <- if eof then pure xs else do
let (ys, zs) = C8.breakEnd (== '\n') xs
-- putStrLn $ "seek back " ++ show (B.length zs)
hSeek h (RelativeSeek) (negate (toInteger (B.length zs)))
pure ys
_ <- forkIO $ do
atomicModifyIORef' ref (\old -> (M.unionWith (<>) old (parse xs'), ()))
when eof $
putMVar mv ()
if eof then pure () else
parseInChunks mv ref h
main :: IO ()
main = do
cnt <- B.readFile "data/measurements.txt"
let ls = C8.lines cnt
when debug $
traverse_ validateParseLine ls
let parsed = parse cnt
h <- openFile "data/measurements.txt" ReadMode
parsedRef <- newIORef mempty
mv <- newEmptyMVar
parseInChunks mv parsedRef h
!_ <- takeMVar mv
parsed' <- readIORef parsedRef
when debug $
print (parsed == parsed')
C8.putStrLn $ aggregate parsed'
debug :: Bool
debug = False
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment