andrewthad/block_leak.hs

## block_leak.hs
{-# language BangPatterns #-}
{-# language MagicHash #-}
{-# language UnboxedTuples #-}

{-# OPTIONS_GHC -O2 -Wall -fforce-recomp #-}

import GHC.Exts
import GHC.IO (IO(..))
import System.Mem (performMajorGC)

-- This is an example program intended to illustrate how overallocating
-- space for byte arrays can lead to fragmentation. These configurations
-- lead to these results:
--
--   +--------------------------+-----------------------------------+
--   | Configuration            | Results                           |
--   +-------------+------------+-------------------+---------------+
--   | Buffer Size | Pinnedness | Gen 0 Collections | Max residency |
--   +=============+============+===================+===============+
--   | 2020        | Unpinned   | 7                 | 8468336       |
--   | 2056        | Unpinned   | 15                | 8599408       |
--   | 2020        | Pinned     | 13                | 8533928       |
--   | 2056        | Pinned     | 13                | 8664968       |
--   +-------------+------------+-------------------+---------------+
--
-- To change to pinnedness, toggle between newByteArray# and
-- newPinnedByteArray# in this file. To change the block size, set
-- the top-level bufSize constant. Maximum residency stays the same
-- across the board. The thing this benchmark is supposed to demonstrate
-- is fragmentation unreported by GHC. Notice that with unpinned memory,
-- bumping the buffers that are being used from just under 2KB to
-- just over it leads to twice as many minor GCs. The reason for this,
-- discussed in http://web.mit.edu/~ezyang/Public/blocks.pdf, is that
--
--   > We would like the nursery to be a contiguous region. Thus, the
--   > nursery requests as large acontiguous block group as possible
--   > (blocks per mblock). There are, however, two important ancillary
--   > considerations.
--   >
--   > First, it is not actually desirable for the nursery to be a
--   > single large block, for the purposes of parallel garbage collection.
--   > A single large block cannot load balanced over multiple processors,
--   > whereas a collection of small blockscan be. Thus, after receiving
--   > a block group, the nursery then chops the block group into individual
--   > blocks, which just happen to be contiguous. Second, requests for large
--   > blocks can result in bad fragmentation, since the bigger the request,
--   > the harder it is to fulfill. So while it is nice to have contiguous
--   > memory, it is more important to make sure that small block groups get
--   > used up. So in practice, the nursery will be a chain of blocks, which
--   > happen to be somewhat contiguous.
--
-- By constrast, large/pinned object are less likely to exhibit this behavior.
-- The blocks used for these object get coalesced when possible.
--
-- Keep in mind that the numbers in the table do not imply any particular
-- performance characteristics. They merely demonstrate that a lot of bytes
-- in the nursery do not get used when there are lots of unpinned consective
-- allocations of just over half a block.

main :: IO ()
main = do
  let go !ix !xs = if ix >= (0 :: Int)
        then do
          b <- newByteArray bufSize
          go (ix - 1) (Cons b xs)
        else pure xs
  r <- go 4095 Nil
  performMajorGC
  print (bufListLen 0 r)

bufSize :: Int
bufSize = 2056

data BufList = Cons !MutableByteArray !BufList | Nil

data MutableByteArray = MutableByteArray (MutableByteArray# RealWorld)

sizeofMutableByteArray :: MutableByteArray -> Int
sizeofMutableByteArray (MutableByteArray arr#) = I# (sizeofMutableByteArray# arr#)

newByteArray :: Int -> IO MutableByteArray
newByteArray (I# n#)
  = IO (\s# -> case newPinnedByteArray# n# s# of
                        (# s'#, arr# #) -> (# s'#, MutableByteArray arr# #))

bufListLen :: Int -> BufList -> Int
bufListLen !acc Nil = acc
bufListLen !acc (Cons b ys) = bufListLen (acc + sizeofMutableByteArray b) ys
	{-# language BangPatterns #-}
	{-# language MagicHash #-}
	{-# language UnboxedTuples #-}

	{-# OPTIONS_GHC -O2 -Wall -fforce-recomp #-}

	import GHC.Exts
	import GHC.IO (IO(..))
	import System.Mem (performMajorGC)

	-- This is an example program intended to illustrate how overallocating
	-- space for byte arrays can lead to fragmentation. These configurations
	-- lead to these results:
	--
	-- +--------------------------+-----------------------------------+
	-- \| Configuration \| Results \|
	-- +-------------+------------+-------------------+---------------+
	-- \| Buffer Size \| Pinnedness \| Gen 0 Collections \| Max residency \|
	-- +=============+============+===================+===============+
	-- \| 2020 \| Unpinned \| 7 \| 8468336 \|
	-- \| 2056 \| Unpinned \| 15 \| 8599408 \|
	-- \| 2020 \| Pinned \| 13 \| 8533928 \|
	-- \| 2056 \| Pinned \| 13 \| 8664968 \|
	-- +-------------+------------+-------------------+---------------+
	--
	-- To change to pinnedness, toggle between newByteArray# and
	-- newPinnedByteArray# in this file. To change the block size, set
	-- the top-level bufSize constant. Maximum residency stays the same
	-- across the board. The thing this benchmark is supposed to demonstrate
	-- is fragmentation unreported by GHC. Notice that with unpinned memory,
	-- bumping the buffers that are being used from just under 2KB to
	-- just over it leads to twice as many minor GCs. The reason for this,
	-- discussed in http://web.mit.edu/~ezyang/Public/blocks.pdf, is that
	--
	-- > We would like the nursery to be a contiguous region. Thus, the
	-- > nursery requests as large acontiguous block group as possible
	-- > (blocks per mblock). There are, however, two important ancillary
	-- > considerations.
	-- >
	-- > First, it is not actually desirable for the nursery to be a
	-- > single large block, for the purposes of parallel garbage collection.
	-- > A single large block cannot load balanced over multiple processors,
	-- > whereas a collection of small blockscan be. Thus, after receiving
	-- > a block group, the nursery then chops the block group into individual
	-- > blocks, which just happen to be contiguous. Second, requests for large
	-- > blocks can result in bad fragmentation, since the bigger the request,
	-- > the harder it is to fulfill. So while it is nice to have contiguous
	-- > memory, it is more important to make sure that small block groups get
	-- > used up. So in practice, the nursery will be a chain of blocks, which
	-- > happen to be somewhat contiguous.
	--
	-- By constrast, large/pinned object are less likely to exhibit this behavior.
	-- The blocks used for these object get coalesced when possible.
	--
	-- Keep in mind that the numbers in the table do not imply any particular
	-- performance characteristics. They merely demonstrate that a lot of bytes
	-- in the nursery do not get used when there are lots of unpinned consective
	-- allocations of just over half a block.

	main :: IO ()
	main = do
	let go !ix !xs = if ix >= (0 :: Int)
	then do
	b <- newByteArray bufSize
	go (ix - 1) (Cons b xs)
	else pure xs
	r <- go 4095 Nil
	performMajorGC
	print (bufListLen 0 r)

	bufSize :: Int
	bufSize = 2056

	data BufList = Cons !MutableByteArray !BufList \| Nil

	data MutableByteArray = MutableByteArray (MutableByteArray# RealWorld)

	sizeofMutableByteArray :: MutableByteArray -> Int
	sizeofMutableByteArray (MutableByteArray arr#) = I# (sizeofMutableByteArray# arr#)

	newByteArray :: Int -> IO MutableByteArray
	newByteArray (I# n#)
	= IO (\s# -> case newPinnedByteArray# n# s# of
	(# s'#, arr# #) -> (# s'#, MutableByteArray arr# #))

	bufListLen :: Int -> BufList -> Int
	bufListLen !acc Nil = acc
	bufListLen !acc (Cons b ys) = bufListLen (acc + sizeofMutableByteArray b) ys