[WORK IN PROGRESS] automatically generate latency graphs
#!/bin/bash
#
# Generates a "latency heat map", showing both on-CPU and off-CPU stacks. In
# addition, attempts to color code the on-CPU stacks by CPI, by diffing the
# on-CPU stacks sampled at ~1000 Hz against those sampled every ~3e6
# instructions. Intuitively, if a stack is more common in the frequency
# stacks than in the count stacks, then it represents a "slower" stack.
#
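#
# Example usage (the file names below are hypothetical; the gist does not
# name its two scripts, so run them under whatever names you saved them as):
#
#   ./collect-latency-data.sh /tmp/latency "$(pgrep -d, myserver)"
#   ./generate-latency-graph.sh /tmp/latency /tmp/out 'MyFunction'
#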
perf_data_dir=$1
dest_dir=$2
function_filter=$3
script_dir=$(dirname "$0")
export PATH="$script_dir:$PATH"
freq_data_file=$perf_data_dir/freq.data
freq_ms_stacks_file=$dest_dir/freq.ms_stacks
sched_data_file=$perf_data_dir/sched.data
sched_diffed_stacks_file=$dest_dir/sched.diffed_stacks
count_data_file=$perf_data_dir/count.data
count_raw_stacks_file=$dest_dir/count.raw_stacks
latency_svg_file=$dest_dir/latency_graph.svg
# TODO(awreece) This is a gross hack that needs to go as soon as possible.
sudo -l | grep -q NOPASSWD || \
    (echo "Cannot execute passwordless sudo" >&2; false) || exit 1
echo "Condensing stacks from $perf_data_dir into $dest_dir"
#
# Parse the perf data files into stacks for consumption by flamegraph.pl.
# Since we will be comparing sched stacks to cpu frequency stacks, we need to
# convert both to a common unit: weight in milliseconds. To simplify this
# logic, we perform the conversion before c++filt (which may add spaces, etc).
#
# Since we sampled at 997 samples/sec, each sample represents 1000/997 ms,
# so we scale the per-stack sample count by 1000/997 to convert to ms.
sudo perf script -i "$freq_data_file" | stackcollapse-perf.pl | \
    awk '{print $1, int($2 * 1000 / 997)}' | \
    c++filt | grep "$function_filter" >"$freq_ms_stacks_file" &
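#
# Worked example of the conversion (numbers invented): a stack that collected
# 1994 samples at 997 Hz was on-CPU for int(1994 * 1000 / 997) = 2000 ms.
#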
#
# We do not need to convert the count stacks, since they are normalized
# in difffolded.pl and not directly compared to the sched stacks.
#
sudo perf script -i "$count_data_file" | stackcollapse-perf.pl | \
    c++filt | grep "$function_filter" >"$count_raw_stacks_file" &
#
# Since this reports sleep time in us, we need to divide by 1000 to get ms.
# To simplify later processing, we emit already "diffed" stacks at this stage.
# By emitting "0, <time ms>" for each sched stack and using `--negate` as an
# argument to flamegraph.pl below, we ensure the sched stacks will be colored
# bright blue and sized according to the time spent off cpu.
#
sudo perf script -f time,comm,pid,tid,event,ip,sym,dso,trace \
    -i "$sched_data_file" | \
    stackcollapse-perf-sched.awk | \
    awk '(int($2/1000) > 0) {print $1, 0, int($2/1000)}' | \
    c++filt | grep "$function_filter" >"$sched_diffed_stacks_file" &
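#
# Illustrative output line (stack invented): "main;do_io;sys_read 0 1500"
# records ~1.5 s spent off-CPU; the 0 in the middle column, combined with
# --negate below, makes flamegraph.pl render the frame bright blue.
#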
wait
echo "done."
echo -n "Generating $latency_svg_file ... "
#
# We diff the stacks carefully to try to make stacks colored blue represent
# blocking or stalling for memory and stacks colored red represent efficient
# stacks. Intuitively, if a stack is more common in the freq stacks (measured
# ~1000x a second) than in the count stacks (measured every ~3M instructions
# on a ~3GHz machine), then it represents a "slower" stack. Since we
# normalize both counts, we compare only the proportion of total samples a
# stack constitutes in each profile; the absolute numbers are not important.
#
# By comparing the stacks in this order and passing `--negate` to
# flamegraph.pl below, stacks that are sampled (proportionately) more
# frequently in count.stacks than in freq.stacks are colored red (and vice
# versa are colored blue), with each stack sized according to the time spent
# on CPU.
#
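#
# Concrete example (numbers invented): if foo;bar makes up 1% of the
# normalized count samples but 2% of the freq samples, difffolded.pl emits a
# line whose second column exceeds its first; with --negate, flamegraph.pl
# paints that frame blue, flagging a likely stall. The reverse proportions
# would paint it red.
#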
cat <(difffolded.pl -n "$count_raw_stacks_file" "$freq_ms_stacks_file") \
    "$sched_diffed_stacks_file" | \
    flamegraph.pl --negate >"$latency_svg_file"
echo "done."
grep -q CDATA "$latency_svg_file" || \
    (echo "No data in flamegraph!" >&2; false) || \
    exit 22
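
# ===========================================================================
# Second file in this gist: collects the perf.data inputs consumed above.
# ===========================================================================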
#!/bin/bash
#
# Collects perf.data files needed for a "latency heat map".
#
workdir=$1
pids=$2
function die
{
    echo "$@" >&2
    exit 1
}
sudo -l | grep -q NOPASSWD || die "Cannot execute passwordless sudo"
sudo mkdir -p "$workdir" || die "Failure to create $workdir"
function finish
{
    [[ -z $perf_count_pid && -z $perf_freq_pid && -z $perf_sched_pid ]] || \
        sudo kill $perf_count_pid $perf_freq_pid $perf_sched_pid
    wait
}
# Install the cleanup handler only after it is defined, so a signal cannot
# fire before the function exists.
trap finish INT QUIT TERM PIPE HUP
# Aim to sample every ~3M instructions since modern CPUs are ~3GHz.
sudo perf record --output "$workdir/count.data" --pid=$pids --call-graph=fp \
    --event instructions:u --count $((2**10 * 997 * 3)) &
perf_count_pid=$!
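# Arithmetic check: 2^10 * 997 * 3 = 3,062,784 (~3M), so at ~3e9
# instructions/sec (a CPI of 1 on a ~3 GHz core) this fires roughly 1000
# times/sec, directly comparable to the 997 Hz timer-based profile below.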
# Sample at 997 Hz to avoid sampling in lockstep with something at 1000 Hz.
sudo perf record --output "$workdir/freq.data" --pid=$pids --call-graph=fp \
    --event cycles:u --freq 997 &
perf_freq_pid=$!
sudo perf record --output "$workdir/sched.data" --pid=$pids --call-graph=fp \
    --event sched:sched_switch --event sched:sched_stat_sleep \
    --event sched:sched_stat_blocked --event sched:sched_process_exit &
perf_sched_pid=$!
wait