Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ellimilial/fa97326d1221194e0016 to your computer and use it in GitHub Desktop.
Save ellimilial/fa97326d1221194e0016 to your computer and use it in GitHub Desktop.
A bash script to run Hadoop DFSIO tests multiple times with different file/batch sizes, averaging the results. Suitable for command line and SSH (Jenkins).
#!/bin/bash
#
# Run DFSIO write and read tests for multiple file size/count configurations. Get the average speed over RUNS_PER_CONFIG executions.
# The throughput calculation method assumes all tests are run in a single 'wave', i.e. BATCH_SIZE < total mapper tasks
#
# Replace hadoop/yarn in run commands as required.
#
# To run via ssh, say on Jenkins, wrap in:
# ssh namenode.server.com <<'ENDSSH'
# (... code ...)
# ENDSSH
# Path to the Hadoop MapReduce job-client tests jar that ships TestDFSIO.
readonly TEST_JAR="(...)/hadoop-mapreduce-client-jobclient-tests.jar"
# Per-file sizes to benchmark (passed to TestDFSIO -fileSize).
readonly FILE_SIZES=("10MB" "100MB" "1GB")
# Number of files (= mapper tasks) per run (passed to TestDFSIO -nrFiles).
# Keep below the cluster's total mapper slots so each run is a single 'wave'.
readonly BATCH_SIZES=("10" "100")
# How many times each size/count configuration is repeated; results are averaged.
readonly RUNS_PER_CONFIG=2
# Join all arguments after the first into one string, separated by the
# first argument (single character), e.g. `join , 1 2 3` -> "1,2,3".
join() {
  local IFS="$1"
  shift
  printf '%s\n' "$*"
}
# Print the integer (truncating) average of all numeric arguments.
# Returns 1 and prints to stderr when called with no arguments.
# Fix: total/count are now local and zero-initialized — the original used
# uninitialized globals, so repeated calls in the same shell accumulated
# state from earlier invocations.
avg() {
  local total=0 count=0 value
  for value in "$@"; do
    total=$((total + value))
    count=$((count + 1))
  done
  if [ "$count" -eq 0 ]; then
    echo "avg: no arguments" >&2
    return 1
  fi
  echo $((total / count))
}
# Run one DFSIO write test with $1 files of size $2 and print the aggregate
# throughput in MB/s (per-mapper throughput * file count — valid only while
# all mappers run in a single wave, i.e. $1 <= total mapper slots).
# Full job output is mirrored to stderr via tee for live visibility.
# Fix: name/var were global (missing local) and expansions were unquoted.
run_write_benchmark() {
  local name throughput
  name="DFSIO benchmark - write ${1} files of ${2}"
  throughput=$(yarn jar "$TEST_JAR" TestDFSIO -Dmapred.job.name="${name}" \
    -write -nrFiles "$1" -fileSize "$2" 2>&1 \
    | tee /dev/stderr \
    | grep "Throughput mb/sec:" | awk '{print $NF}')
  # Round the (possibly fractional) per-mapper figure before scaling.
  throughput=$(printf "%.0f" "$throughput")
  echo $((throughput * $1))
}
# Run one DFSIO read test with $1 files of size $2 and print the aggregate
# throughput in MB/s (per-mapper throughput * file count — valid only while
# all mappers run in a single wave, i.e. $1 <= total mapper slots).
# Full job output is mirrored to stderr via tee for live visibility.
# Fix: name/var were global (missing local) and expansions were unquoted.
run_read_benchmark() {
  local name throughput
  name="DFSIO benchmark - read ${1} files of ${2}"
  throughput=$(yarn jar "$TEST_JAR" TestDFSIO -Dmapred.job.name="${name}" \
    -read -nrFiles "$1" -fileSize "$2" 2>&1 \
    | tee /dev/stderr \
    | grep "Throughput mb/sec:" | awk '{print $NF}')
  # Round the (possibly fractional) per-mapper figure before scaling.
  throughput=$(printf "%.0f" "$throughput")
  echo $((throughput * $1))
}
# Drive the benchmark matrix: for every file size x batch size, run the
# write and read tests RUNS_PER_CONFIG times, average the results, and
# print a one-line summary per configuration at the end.
main() {
  local file_size batch_size config_id run line
  local avg_w avg_r joined_w joined_r
  local -a write_summary=() read_summary=() runs_w runs_r
  for file_size in "${FILE_SIZES[@]}"; do
    for batch_size in "${BATCH_SIZES[@]}"; do
      config_id="${batch_size} ${file_size} files"
      echo "Running tests for ${config_id}..."
      runs_w=()
      runs_r=()
      for ((run = 0; run < RUNS_PER_CONFIG; run++)); do
        runs_w+=("$(run_write_benchmark "$batch_size" "$file_size")")
        runs_r+=("$(run_read_benchmark "$batch_size" "$file_size")")
      done
      avg_w=$(avg "${runs_w[@]}")
      joined_w=$(join , "${runs_w[@]}")
      write_summary+=("Avg ${avg_w} MB/s - ${config_id}, all (MB/s): ${joined_w}")
      avg_r=$(avg "${runs_r[@]}")
      joined_r=$(join , "${runs_r[@]}")
      read_summary+=("Avg ${avg_r} MB/s - ${config_id}, all (MB/s): ${joined_r}")
    done
  done
  echo "--- Read throughput ---"
  for line in "${read_summary[@]}"; do
    echo "$line"
  done
  echo "--- Write throughput ---"
  for line in "${write_summary[@]}"; do
    echo "$line"
  done
}
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment