tobert/cassandra.yaml

## cassandra.yaml
# save some memory on indexes
index_interval: 512

# 0 disables compaction throttling; this can hurt your read latency, but in my
# experience it's better to stay caught up on compaction
compaction_throughput_mb_per_sec: 0

# if you have lots & lots of connections, e.g. from Hadoop, saves memory
rpc_server_type: hsha

## fstab
# /etc/fstab
# Use "nobootwait" option in /etc/fstab on Ubuntu or upstart WILL troll you!
# relatime is the default in Linux since around 2.6.31, so skip the relatime/noatime unless
# you've actually measured a difference
/dev/md7 /data_raid xfs    nobootwait,defaults    0 0

## jvm-numa.sh
# Option 1: turn on the JVM's own NUMA support with a HotSpot flag
# (add it to the existing java command line).
java -XX:+UseNUMA
# or
# Option 2: have numactl interleave the process's memory across nodes.
# --interleave requires a node list; "all" spreads pages over every node.
numactl --interleave=all java -jar foo.jar

## limits.conf
# /etc/security/limits.conf
# this is not a multi-user system, ulimits are useless
* - memlock unlimited
* - nofile 1048576
* - fsize unlimited
* - nproc 999999

## rc.local
# put in /etc/rc.local
# Linux RAID tuning
# Al Tobey 2011-09-19 <al@ooyala.com>
# Haven't tested this in ages, YMMV.

# Per-drive readahead target in bytes (16 KiB). Spelled as a literal because
# /etc/rc.local is commonly run by /bin/sh, and POSIX sh arithmetic has no
# '**' operator.
drive_ra=16384

# 'blockdev --setra' counts in 512-byte units no matter what logical sector
# size the device reports, so byte values are always divided by 512.
ra_unit=512

# Print the device (first fstab field) of the entry mounted on $1.
#   $1 - mount point to look up, matched exactly against the second field
#   $2 - fstab file to read (optional, defaults to /etc/fstab)
# Comment lines are skipped and only the first match is printed; prints
# nothing when there is no such entry.
fstab_device_for() {
    awk -v mp="$1" '$1 !~ /^#/ && $2 == mp { print $1; exit }' "${2:-/etc/fstab}"
}

for sysent in /sys/block/sd[a-z]
do
    # the glob is left as a literal string when no sd* drives exist
    [ -e "$sysent" ] || continue
    drive=${sysent##*/}

    # it should already be CFQ, make sure
    echo cfq > "/sys/block/$drive/queue/scheduler"

    # allow 256 in-flight BIOs per drive for better IO merging
    echo 256 > "/sys/block/$drive/queue/nr_requests"

    # adjust readahead (bytes -> 512-byte units)
    blockdev --setra $(($drive_ra / $ra_unit)) "/dev/$drive"
done

# we always mount the critical data volume on /data_raid
dr_vol=$(fstab_device_for /data_raid)
# short name, e.g. 'md7'
dr_vol_name=${dr_vol##*/}
# Only md arrays expose md/raid_disks; anything else (no fstab entry,
# UUID=/LABEL= specs, non-md devices) is left untouched.
if [ -n "$dr_vol" ] && [ -r "/sys/block/$dr_vol_name/md/raid_disks" ] ; then
    # count the number of devices in the raid
    # (cat instead of the bash-only $(< file) form)
    dr_vol_devcnt=$(cat "/sys/block/$dr_vol_name/md/raid_disks")
    # number of drives in the raid * readahead set on the drives earlier
    dr_vol_rabytes=$(($drive_ra * $dr_vol_devcnt))

    # significantly increase the number of entries in the stripe cache (default 128)
    if [ -e "/sys/block/$dr_vol_name/md/stripe_cache_size" ] ; then
        echo 16384 > "/sys/block/$dr_vol_name/md/stripe_cache_size"
    fi
    # set readahead on the raid device (bytes -> 512-byte units)
    blockdev --setra $(($dr_vol_rabytes / $ra_unit)) "$dr_vol"
fi

## schema.txt
# Enable compression! surprisingly good
compression_options = {'sstable_compression': 'org.apache.cassandra.io.compress.SnappyCompressor'};

# Examine bloom filter false-positives
# nodetool -h localhost cfstats |grep Bloom
bloom_filter_fp_chance = 0.1 # diminishing returns

# Reduce ssTable count
# memory pressure caused frequent memtable flushes, and compaction throttling made it worse
compaction_strategy_options = {'sstable_size_in_mb': 256}

# Give yourself time to repair
gc_grace = 5184000 # 60 days

## sysctl.conf
net.ipv4.ip_forward=0
net.ipv6.conf.all.forwarding=0
kernel.sysrq = 1
kernel.panic = 300
fs.file-max = 1048576
kernel.pid_max = 999999
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
net.ipv4.tcp_rmem = 4096 65536 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216
vm.max_map_count = 1048576
	# save some memory on indexes
	index_interval: 512

	# 0 disables compaction throttling; this can hurt your read latency, but in my
	# experience it's better to stay caught up on compaction
	compaction_throughput_mb_per_sec: 0

	# if you have lots & lots of connections, e.g. from Hadoop, saves memory
	rpc_server_type: hsha
	# /etc/fstab
	# Use "nobootwait" option in /etc/fstab on Ubuntu or upstart WILL troll you!
	# relatime is the default in Linux since around 2.6.31, so skip the relatime/noatime unless
	# you've actually measured a difference
	/dev/md7 /data_raid xfs nobootwait,defaults 0 0
	# /etc/security/limits.conf
	# this is not a multi-user system, ulimits are useless
	* - memlock unlimited
	* - nofile 1048576
	* - fsize unlimited
	* - nproc 999999
	# put in /etc/rc.local
	# Linux RAID tuning
	# Al Tobey 2011-09-19 <al@ooyala.com>
	# Haven't tested this in ages, YMMV.

	# Per-drive readahead target in bytes (16 KiB). Spelled as a literal because
	# /etc/rc.local is commonly run by /bin/sh, and POSIX sh arithmetic has no
	# '**' operator.
	drive_ra=16384

	# 'blockdev --setra' counts in 512-byte units no matter what logical sector
	# size the device reports, so byte values are always divided by 512.
	ra_unit=512

	# Print the device (first fstab field) of the entry mounted on $1.
	#   $1 - mount point to look up, matched exactly against the second field
	#   $2 - fstab file to read (optional, defaults to /etc/fstab)
	# Comment lines are skipped and only the first match is printed; prints
	# nothing when there is no such entry.
	fstab_device_for() {
	    awk -v mp="$1" '$1 !~ /^#/ && $2 == mp { print $1; exit }' "${2:-/etc/fstab}"
	}

	for sysent in /sys/block/sd[a-z]
	do
	    # the glob is left as a literal string when no sd* drives exist
	    [ -e "$sysent" ] || continue
	    drive=${sysent##*/}

	    # it should already be CFQ, make sure
	    echo cfq > "/sys/block/$drive/queue/scheduler"

	    # allow 256 in-flight BIOs per drive for better IO merging
	    echo 256 > "/sys/block/$drive/queue/nr_requests"

	    # adjust readahead (bytes -> 512-byte units)
	    blockdev --setra $(($drive_ra / $ra_unit)) "/dev/$drive"
	done

	# we always mount the critical data volume on /data_raid
	dr_vol=$(fstab_device_for /data_raid)
	# short name, e.g. 'md7'
	dr_vol_name=${dr_vol##*/}
	# Only md arrays expose md/raid_disks; anything else (no fstab entry,
	# UUID=/LABEL= specs, non-md devices) is left untouched.
	if [ -n "$dr_vol" ] && [ -r "/sys/block/$dr_vol_name/md/raid_disks" ] ; then
	    # count the number of devices in the raid
	    # (cat instead of the bash-only $(< file) form)
	    dr_vol_devcnt=$(cat "/sys/block/$dr_vol_name/md/raid_disks")
	    # number of drives in the raid * readahead set on the drives earlier
	    dr_vol_rabytes=$(($drive_ra * $dr_vol_devcnt))

	    # significantly increase the number of entries in the stripe cache (default 128)
	    if [ -e "/sys/block/$dr_vol_name/md/stripe_cache_size" ] ; then
	        echo 16384 > "/sys/block/$dr_vol_name/md/stripe_cache_size"
	    fi
	    # set readahead on the raid device (bytes -> 512-byte units)
	    blockdev --setra $(($dr_vol_rabytes / $ra_unit)) "$dr_vol"
	fi
	# Enable compression! surprisingly good
	compression_options = {'sstable_compression': 'org.apache.cassandra.io.compress.SnappyCompressor'};

	# Examine bloom filter false-positives
	# nodetool -h localhost cfstats | grep Bloom
	bloom_filter_fp_chance = 0.1 # diminishing returns

	# Reduce ssTable count
	# memory pressure caused frequent memtable flushes, and compaction throttling made it worse
	compaction_strategy_options = {'sstable_size_in_mb': 256}

	# Give yourself time to repair
	gc_grace = 5184000 # 60 days
	net.ipv4.ip_forward=0
	net.ipv6.conf.all.forwarding=0
	kernel.sysrq = 1
	kernel.panic = 300
	fs.file-max = 1048576
	kernel.pid_max = 999999
	net.core.rmem_max = 16777216
	net.core.wmem_max = 16777216
	net.ipv4.tcp_rmem = 4096 65536 16777216
	net.ipv4.tcp_wmem = 4096 65536 16777216
	vm.max_map_count = 1048576