Created
April 28, 2017 14:37
-
-
Save nh2/16c52a261c41cd03dae9b6e7338af8c4 to your computer and use it in GitHub Desktop.
Versioned serialisation in Haskell without per-value versioning overhead
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- This snippet shows an idea I had about versioned serialisation. | |
-- | |
-- Many serialisation frameworks don't deal with backwards compatibility: | |
-- If the serialisation logic is changed, new programs can no longer | |
-- read data generated by old code. | |
-- | |
-- This creates issues in upgradability, see e.g. | |
-- * https://www.reddit.com/r/haskell/comments/44w15q/psa_if_youre_serializing_floating_point_numbers/ | |
-- | |
-- Solutions so far included | |
-- * fix serialisation for any given type forever | |
-- (no way to fix e.g. performance mistakes later) | |
-- * change serialisation (breaking backwards compatibility) | |
-- * make a new serialisation framework | |
-- | |
-- An obvious upgradable solution is to tag every serialised value | |
-- with a version, so that deserialisation code can inspect the version | |
-- to read old serialisations. | |
-- This has the drawback of vastly bloated binary representations: | |
-- If each Word8 carries e.g. a Word32-sized version tag, the storage | |
-- needed increases 5-fold compared to storing just the data. | |
-- | |
-- The idea is now to store the version tags not for each sub value in | |
-- the object to serialise, but only once for each type, in a "header" | |
-- next to the top-level object. | |
-- For example, this header would carry the information about which | |
-- version *all* of the Word8's in the objects are serialised; the | |
-- Word8 version tag is stored once, instead of next to each Word8. | |
-- | |
-- As a result, the versioning overhead in the serialised value is | |
-- O(number of types in the object) | |
-- instead of | |
-- O(number of values in the object). | |
-- | |
-- This snippet demonstrates this idea. | |
-- | |
-- Details: | |
-- * Each type that can be serialised is an instance of the `Serialize` | |
-- typeclass. | |
-- * Each such type has a globally unique `typeId` that never changes, | |
-- and a `typeVersion` that should be bumped when the serialisation | |
-- format is changed. | |
-- (Global uniqueness is enforced by randomly generating the `typeId` | |
-- enough bits so that collisions are unlikely, as is common practice | |
-- for UUIDs; in this example I use 64 bits but 128 would be better.) | |
-- * By inspecting the `typeVersion`, old formats can be deserialised. | |
-- * The `Serialize` typeclass provides the functionality of collecting, | |
-- for a serialisable type `a`, all the `typeId`s and their | |
-- `typeVersion`s of any type that can appear in any sub-object of | |
-- `a`, so that a global map from `typeId` to `typeVersion` can be | |
-- stored at top-level along with the serialised `a`. | |
-- * For deserialisation, this map map is decoded first from the | |
-- top-level header, and then passed along to the deserialisation | |
-- function for each type, so that each instance can look up its | |
-- `typeId` in the map to decide what version it needs to deserialise. | |
-- TODO: | |
-- * Add default `collectSubTypeVersions` implementation using | |
-- `GHC.Generics`. | |
-- * Use the usual efficient machinery for serialisation (like a Builder); | |
-- put `serialize`/`deserialisation` behind standard `Get`/`Put` | |
-- monads (not done here for simplicity and demonstration). | |
-- Once done, the `TypeVersionMap` argument to `deserialize` should | |
-- live in the `Get` context instead, and `deserialize` should take | |
-- a `TypeVersion` argument instead, so that each deserialisation | |
-- function can only see the version it should deserialise, an not | |
-- the versions of all types in the `TypeVersionMap`. | |
-- * Currently we store all types that can occur in a value in the | |
-- `TypeVersionMap` (we can compute it statically). An alternative | |
-- is to store only those types that do actually occur in a value. | |
-- This can make the `TypeVersionMap` smaller when sum types are used, | |
-- but requires a full traversal of the value to serialise in order | |
-- to compute the header (loss of streaming serialisation support). | |
-- Most likely, both approaches are useful an should be offered. | |
{-# LANGUAGE BangPatterns #-} | |
{-# LANGUAGE DeriveGeneric #-} | |
{-# LANGUAGE ScopedTypeVariables #-} | |
{-# LANGUAGE TypeApplications #-} | |
{-# OPTIONS_GHC -Wall #-} | |
module ToplevelVersionedSerialization where | |
import Data.Binary (Binary) | |
import qualified Data.Binary as Binary | |
import qualified Data.ByteString.Lazy as BSL | |
import Data.Map.Strict (Map) | |
import qualified Data.Map.Strict as Map | |
import Data.Text (Text) | |
import Data.Word (Word64) | |
import Data.Coerce (coerce) | |
import GHC.Generics (Generic) | |
newtype TypeId a = TypeId { unTypeId :: Word64 } deriving (Eq, Ord, Show, Generic) | |
newtype TypeVersion a = TypeVersion { unTypeVersion :: Word64 } deriving (Eq, Ord, Show, Generic) | |
newtype TypeVersionMap a = TypeVersionMap { unTypeVersionMap :: Map Word64 Word64 } deriving (Eq, Ord, Show, Generic) | |
instance Binary (TypeVersionMap a) | |
lookupTypeVersion :: TypeId a -> TypeVersionMap a -> Maybe (TypeVersion a) | |
lookupTypeVersion (TypeId i) (TypeVersionMap m) = TypeVersion <$> Map.lookup i m | |
castTypeVersionMap :: TypeVersionMap a -> TypeVersionMap b | |
castTypeVersionMap = coerce | |
class Serialize a where | |
-- | Unique ID for this type; must never change. | |
typeId :: TypeId a | |
-- | Current serialisation representation version for this type; | |
-- change it when you change how this type is serialised. | |
typeVersion :: TypeVersion a | |
-- | Contains the versions of all sub types. | |
-- | |
-- To get a map that also contains an entry for the current type (a), | |
-- use `typeVersionMap`. | |
-- | |
-- Implementations should call `typeVersionMap` on all of the data | |
-- types immediate sub types (fields / record fields). | |
collectSubTypeVersions :: TypeVersionMap a | |
-- | Serialise a value. | |
serialize :: a -> BSL.ByteString | |
-- | Tries to deserialise a value at a given version. | |
-- On success, returns the value and the remaining input `ByteString`. | |
-- | |
-- Implementations should query which version is in the `ByteString` | |
-- using `version = lookupTypeVersion typeId versionMap` | |
-- and then deserialise that version appropriately. | |
deserialize :: TypeVersionMap a -> BSL.ByteString -> Either String (a, BSL.ByteString) | |
-- | Given a type `a`, returns a map that tells us the `TypeVersion` of | |
-- any recursive sub-type used inside `a`; also contains an entry | |
-- for `a` itself. | |
-- | |
-- Use `lookupTypeVersion` on the result to query versions. | |
-- | |
-- This function ensures termination on recursive data types; | |
-- its outputs are always finite. | |
typeVersionMap :: forall a . (Serialize a) => TypeVersionMap a | |
typeVersionMap = addUnseenTypes $ TypeVersionMap (Map.singleton (unTypeId (typeId @a)) (unTypeVersion (typeVersion @a))) | |
where | |
addUnseenTypes :: TypeVersionMap a -> TypeVersionMap a | |
addUnseenTypes !m = case lookupTypeVersion (typeId @a) m of | |
Just{} -> m -- already in the map; stop recursing | |
Nothing -> addUnseenTypes $ TypeVersionMap (unTypeVersionMap m `Map.union` unTypeVersionMap (collectSubTypeVersions @a)) | |
-- | Serialise a data type, encoding a header containing all versions | |
-- of all types inside it along with it. | |
toplevelSerialize :: forall a . (Serialize a) => a -> BSL.ByteString | |
toplevelSerialize a = enc (typeVersionMap @a, serialize a) | |
-- | Deserialise a value serialised with `toplevelSerialize`. | |
toplevelDeserialize :: forall a . (Serialize a) => BSL.ByteString -> Either String (a, BSL.ByteString) | |
toplevelDeserialize bs = case dec bs of | |
Left errMsg -> Left errMsg | |
Right ((typeVersions :: TypeVersionMap a, serializedA :: BSL.ByteString), shouldBeEmptyRest) | |
| not (BSL.null shouldBeEmptyRest) -> Left "toplevelDeserialize: shouldBeEmptyRest not empty" | |
| otherwise -> deserialize typeVersions serializedA | |
-- Some helpers to turn values into `ByteString`s an the other way around. | |
enc :: (Binary a) => a -> BSL.ByteString | |
enc = Binary.encode | |
dec :: (Binary a) => BSL.ByteString -> Either String (a, BSL.ByteString) | |
dec bs = case Binary.decodeOrFail bs of | |
Left (_bs, _offset, errMsg) -> Left errMsg | |
Right (restBs, _offset, a) -> Right (a, restBs) | |
-- Example usage | |
instance Serialize Int where | |
typeId = TypeId 0x1f1ccad4d2f5ae37 | |
typeVersion = TypeVersion 1 | |
collectSubTypeVersions = TypeVersionMap $ Map.fromList [] | |
serialize = enc | |
deserialize versionMap bs = let i = typeId in case lookupTypeVersion i versionMap of | |
Nothing -> Left $ "deserialize @Int: Missing typeId " ++ show i ++ " in versionMap" | |
Just version | |
| version == typeVersion -> dec bs | |
| otherwise -> Left $ "deserialize @Int: Unknown typeVersion " ++ show version | |
instance Serialize Text where | |
typeId = TypeId 0xb1cc1583d92aca3c | |
typeVersion = TypeVersion 1 | |
collectSubTypeVersions = TypeVersionMap $ Map.fromList [] | |
serialize = enc | |
deserialize versionMap bs = let i = typeId in case lookupTypeVersion i versionMap of | |
Nothing -> Left $ "deserialize @Int: Missing typeId " ++ show i ++ " in versionMap" | |
Just version | |
| version == typeVersion -> dec bs | |
| otherwise -> Left $ "deserialize @Text: Unknown typeVersion " ++ show version | |
data User = User | |
{ userName :: Text | |
, userAge :: Int | |
} deriving (Eq, Ord, Show) | |
instance Serialize User where | |
typeId = TypeId 0x74286ebf6d139844 | |
typeVersion = TypeVersion 1 | |
collectSubTypeVersions = TypeVersionMap $ Map.unions | |
[ unTypeVersionMap (typeVersionMap @Text) | |
, unTypeVersionMap (typeVersionMap @Int) | |
] | |
serialize (User name age) = serialize name `BSL.append` serialize age | |
deserialize versionMap bs = let i = typeId in case lookupTypeVersion i versionMap of | |
Nothing -> Left $ "deserialize @User: Missing typeId " ++ show i ++ " in versionMap" | |
Just version | |
| version == typeVersion -> do | |
(name, restBsAge) <- deserialize (castTypeVersionMap versionMap) bs | |
(age, restBs) <- deserialize (castTypeVersionMap versionMap) restBsAge | |
return (User name age, restBs) | |
| otherwise -> Left $ "deserialize @User: Unknown typeVersion " ++ show version | |
data NewUser = NewUser | |
{ userName2 :: Text | |
, userAge2 :: Int | |
, userVisits :: Int | |
} deriving (Eq, Ord, Show) | |
instance Serialize NewUser where | |
typeId = TypeId 0x74286ebf6d139844 -- same as `User` | |
typeVersion = TypeVersion 2 -- updated from `User` | |
collectSubTypeVersions = TypeVersionMap $ Map.unions | |
[ unTypeVersionMap (typeVersionMap @Text) | |
, unTypeVersionMap (typeVersionMap @Int) | |
, unTypeVersionMap (typeVersionMap @Int) -- technically unnecessary duplicate, but we just do one or each record field, like a Generic instance would do | |
] | |
serialize (NewUser name age visits) = serialize name `BSL.append` serialize age `BSL.append` serialize visits | |
deserialize versionMap bs = let i = typeId in case lookupTypeVersion i versionMap of | |
Nothing -> Left $ "deserialize @NewUser: Missing typeId " ++ show i ++ " in versionMap" | |
Just version | |
| version == typeVersion -> do | |
(name, restBsAge) <- deserialize (castTypeVersionMap versionMap) bs | |
(age, restBsVisits) <- deserialize (castTypeVersionMap versionMap) restBsAge | |
(visits, restBs) <- deserialize (castTypeVersionMap versionMap) restBsVisits | |
return (NewUser name age visits, restBs) | |
| version == TypeVersion 1 -> do | |
(User name age, restBs) <- deserialize (castTypeVersionMap versionMap) bs | |
return (NewUser name age 0, restBs) | |
| otherwise -> Left $ "deserialize @NewUser: Unknown typeVersion " ++ show version | |
instance (Serialize a) => Serialize (Maybe a) where | |
typeId = TypeId 0x1f92d0dbf020caa4 | |
typeVersion = TypeVersion 1 | |
collectSubTypeVersions = TypeVersionMap $ Map.unions | |
[ unTypeVersionMap (typeVersionMap @a) | |
] | |
serialize m = case m of | |
Nothing -> "0" | |
Just x -> "1" `BSL.append` serialize x | |
deserialize versionMap bs = let i = typeId in case lookupTypeVersion i versionMap of | |
Nothing -> Left $ "deserialize @(Maybe a): Missing typeId " ++ show i ++ " in versionMap" | |
Just version | |
| version == typeVersion -> do | |
let (tag, rest) = BSL.splitAt 1 bs | |
case tag of | |
"0" -> return (Nothing, rest) | |
"1" -> do | |
(val, restBs) <- deserialize (castTypeVersionMap versionMap) rest | |
return (Just val, restBs) | |
_ -> Left $ "deserialize @(Maybe a): Unknown tag " ++ show tag | |
| otherwise -> Left $ "deserialize @(Maybe a): Unknown typeVersion " ++ show version | |
data Recursive = Recursive | |
{ recValue :: Int | |
, recNext :: Maybe Recursive | |
} deriving (Eq, Ord, Show) | |
instance Serialize Recursive where | |
typeId = TypeId 0xbb8a2db28b32caf7 | |
typeVersion = TypeVersion 1 | |
collectSubTypeVersions = TypeVersionMap $ Map.unions | |
[ unTypeVersionMap (typeVersionMap @Int) | |
, unTypeVersionMap (typeVersionMap @Recursive) | |
] | |
serialize (Recursive value m'next) = serialize value `BSL.append` maybe "" serialize m'next | |
deserialize versionMap bs = let i = typeId in case lookupTypeVersion i versionMap of | |
Nothing -> Left $ "deserialize @Recursive: Missing typeId " ++ show i ++ " in versionMap" | |
Just version | |
| version == typeVersion -> do | |
(value, restBsNext) <- deserialize (castTypeVersionMap versionMap) bs | |
(next, restBs) <- deserialize (castTypeVersionMap versionMap) restBsNext | |
return (Recursive value next, restBs) | |
| otherwise -> Left $ "deserialize @Recursive: Unknown typeVersion " ++ show version | |
main :: IO () | |
main = do | |
-- Simple serialisation / deserialisation | |
print $ | |
toplevelDeserialize @User (toplevelSerialize (User "name" 42)) | |
-- Deserialisation of old versions | |
let oldVersionBytes :: BSL.ByteString | |
oldVersionBytes = toplevelSerialize (User "name" 42) | |
print $ | |
toplevelDeserialize @NewUser oldVersionBytes | |
-- Recursive data type | |
print $ | |
toplevelDeserialize @Recursive (toplevelSerialize (Recursive 1 (Just (Recursive 2 Nothing)))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment