peregrine.conf
###
# Main peregrine configuration file. All values here can be overridden on the
# command line by passing --param=value. For example, to change the default
# basedir you can run with --basedir=/d0/peregrine-fs
#
###
###
#
# The controller, which sends and receives RPC messages and handles job
# execution. This should be the machine you execute your jobs from, and the
# controller should stay up and online for the full duration of the job.
#
controller=localhost:11111
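#
# In a real cluster this would typically point at a dedicated machine rather
# than localhost, for example (illustrative hostname only):
#
#   controller=controller01.example.com:11111
#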
###
#
# Control port for normal node operations (PFS, job scheduling, etc).
#
port=11112
###
#
# Base directory for all filesystem operations. The root directory is built
# from the basedir by appending /host/port as a suffix. This is done so that
# you can run two daemons per box, which is also useful for testing.
#
# For a multi-tenancy configuration it is best to use a dedicated basedir for
# each disk that you intend to use in production.
#
basedir=/tmp/peregrine-fs
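#
# For example (illustrative, assuming the defaults in this file), a daemon
# using this basedir with host localhost and port 11112 would keep its data
# under a root directory along the lines of:
#
#   /tmp/peregrine-fs/localhost/11112
#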
###
#
# Host concurrency, which determines how many jobs can run on a host at a
# given time. If you are on a multicore machine it is better to have a
# daemon use the right mix of hardware and keep the concurrency per disk low.
# Ideally the concurrency would be about 1 per HDD, but not everyone can run
# with this configuration.
concurrency=2
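#
# For example (illustrative), a daemon that owns a single dedicated HDD would
# ideally be configured with:
#
#   concurrency=1
#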
###
#
# Number of replicas to maintain per partition. For smaller clusters it may be
# cost effective to use one replica and just reschedule the job if it fails.
# For larger clusters, using 2-3 replicas means you can complete a job without
# it failing: the more hosts you add, the higher the probability that the
# entire job will fail due to one machine crashing.
#
# It is generally better to have more hosts in your cluster as this allows
# greater parallel recovery. Maximum parallel recovery is available until the
# number of failed hosts is > nr_hosts - nr_replicas, at which point we won't
# be able to evenly distribute partitions among new hosts and performance will
# fall during recovery.
replicas=1
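#
# As a rough illustration of the failure argument above (the 1% figure is
# purely hypothetical): if each host independently has a 1% chance of crashing
# during a long job, then with replicas=1 a 10-host cluster loses the whole job
# with probability 1 - 0.99^10 (about 10%), while a 100-host cluster loses it
# with probability 1 - 0.99^100 (about 63%).
#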
###
# The size of the buffer used for receiving shuffle data. Larger is better as
# this allows more contiguous data to be laid out on disk. You do not want to
# over-allocate this as Peregrine will run out of memory and need to reschedule
# jobs. We commit to disk at 1/2 this amount so the individual shuffle files
# are batched and written as units. This allows us to group partition data
# together, which enables faster sequential reads during the reduce phase.
shuffleBufferSize=200M
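#
# For example, with the default of 200M above, shuffle data is committed to
# disk in batches of roughly 100M (1/2 the buffer size).
#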
###
#
# The max number of files we attempt to merge from at a time during the reduce
# phase when merging shuffle segments. In the first phase of shuffling, we read
# all the shuffle group files off disk, sort them, and write them back to disk.
# If the total number of sorted files/segments is less than
# shuffleSegmentMergeParallelism we merge them directly. If not, we merge the
# first shuffleSegmentMergeParallelism files into an intermediate file, repeat
# this for each subsequent batch of files, and then run one more stage that
# merges those intermediate children, at which point the reduce is complete.
#
shuffleSegmentMergeParallelism=75
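#
# As a worked example (illustrative segment counts): with the default of 75, a
# reduce that produced 150 sorted segments would merge segments 1-75 into one
# intermediate file and segments 76-150 into another, then merge those two
# intermediate files in a final stage.
#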
###
#
# When > 0 we enable fallocate and allocate files in extents, which allows
# contiguous reads and avoids problems with ext3, ext4 (and to some extent XFS)
# writing fragmented files when writing to multiple locations.
fallocateExtentSize=10M
###
#
# Set the sorting buffer size, used when performing a reduce. The larger the
# better as this reduces disk IO. This memory is allocated outside the JVM via
# mmap and mlock and is usually read implicitly or explicitly via fadvise
# WILLNEED and DONTNEED.
sortBufferSize=250M
###
#
# When true we run posix_fadvise POSIX_FADV_DONTNEED to evict pages from the
# page cache which we know will not be needed again. This includes map chunks
# which have already been mapped and shuffle files that have been reduced. If
# your job can fit into memory and runs many iterations, it may be better to
# disable this setting. The rationale behind fadvising away pages is
# that some Linux kernels get confused with the VFS cache pressure and start
# swapping. In production environments you may also consider compiling your
# kernel without swap (which is what we do with production jobs) which would
# also fix this problem.
#
# For larger jobs that are much greater than memory (2-10x) a better strategy is
# to use all available memory for the combiner and sort buffer. There may be
# situations where you see slight speedups due to IO being read directly from
# cache but a more effective strategy is to allocate all memory to
# sortBufferSize and shuffleBufferSize to minimize data written over the network
# and to sort more efficiently.
fadviseDontNeedEnabled=true
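#
# Illustrative example only (sizes depend entirely on your hardware): on a box
# with roughly 4G of RAM free for Peregrine, following the advice above for
# jobs much larger than memory you might configure something like:
#
#   sortBufferSize=1500M
#   shuffleBufferSize=1500M
#   fadviseDontNeedEnabled=true
#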
###
# Specify the chunk size used for storing files in PFS. The default of 128MB is
# almost certainly acceptable.
#
chunkSize=128M
###
# When true, purge shuffle data sent to nodes during computation. For all
# production jobs this should be set to true. Disabling it is almost always a
# debug option for developers to analyze intermediate data, usually to optimize
# performance.
purgeShuffleData=true
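#
# For example, while debugging a job you might temporarily run with:
#
#   purgeShuffleData=false
#
# so that intermediate shuffle data stays on disk for inspection.
#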
###
# Set the key partitioner implementation. The default is probably acceptable
# but if you have a custom partitioner you can set the implementation here.
# This is a class name using standard Java class name syntax (com.example.Name),
# but if you set it to a short name the peregrine internal package is used.
partitionerDelegate=RangePartitioner
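#
# For example, both of the following forms are accepted (the custom class
# below is hypothetical):
#
#   partitionerDelegate=RangePartitioner
#   partitionerDelegate=com.example.MyCustomPartitioner
#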
###
#
# Sync page writing. When we write IO to disk we periodically sync it so that
# we don't end up filling the VFS page cache with dirty buffers only for them to
# be evicted later. Most of our jobs are LONG running jobs where the IO needs
# to be written ANYWAY so having these in the page cache is not helpful. If we
# were to let this happen the kernel would write pages in the background (even
# after an algorithm is finished) and a future algorithm would have to compete
# for IOs with background writes, which would yield poor performance. We sync
# every syncWriteSize bytes so that the write IO is balanced out over the full
# write of the file. This also means that when we close() the file only a
# small amount of IO has to be written (not hundreds of megs to gigs), which
# gives us a smooth IO pattern that makes things easier to debug and
# understand, and lets us take advantage of periods of low IO activity.
#
# Set to zero to disable.
syncWriteSize=20M
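#
# For example, with the default of 20M above, writing a 1G file results in
# roughly 50 syncs spread evenly over the write rather than one large flush at
# close().
#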
###
#
# Attempt to set the maximum number of file handles that we can use. This is
# only supported on Linux at the moment. If this fails due to permissions or
# any other reason you will probably need to raise it explicitly via ulimit or
# by changing the global system limits.
#
# Set to zero to disable.
maxOpenFileHandles=10000
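#
# For example, to raise the limit manually for the shell that launches the
# daemon you can typically run:
#
#   ulimit -n 10000
#
# or raise the system-wide limits (e.g. via /etc/security/limits.conf on most
# Linux distributions).
#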