Last active
December 23, 2015 12:00
-
-
Save ellimilial/fa97326d1221194e0016 to your computer and use it in GitHub Desktop.
A bash script to run Hadoop DFSIO tests multiple times with different file/batch sizes, averaging the results. Suitable for command line and SSH (Jenkins).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Run DFSIO write and read tests for multiple file size/count configurations.
# Get the average speed over RUNS_PER_CONFIG executions.
#
# The throughput calculation method assumes all tests run in a single
# 'wave', i.e. BATCH_SIZE < total mapper slots available on the cluster.
#
# Replace hadoop/yarn in the run commands as required.
#
# To run via ssh, say on Jenkins, wrap in:
#   ssh namenode.server.com <<'ENDSSH'
#   (... code ...)
#   ENDSSH

# Path to the Hadoop jobclient tests jar that provides TestDFSIO.
readonly TEST_JAR="(...)/hadoop-mapreduce-client-jobclient-tests.jar"
# Per-file sizes to benchmark (passed to TestDFSIO -fileSize).
readonly FILE_SIZES=("10MB" "100MB" "1GB")
# Number of files written/read per run (passed to TestDFSIO -nrFiles).
readonly BATCH_SIZES=("10" "100")
# How many times each size/count configuration is repeated for averaging.
readonly RUNS_PER_CONFIG=2
join() { local IFS="$1"; shift; echo "$*"; } | |
avg() { for i in "$@"; do ((total+=$i)); ((count+=1)); done; echo $((total/count)); } | |
run_write_benchmark() { name="DFSIO benchmark - write ${1} files of ${2}"; var=$(yarn jar $TEST_JAR TestDFSIO -Dmapred.job.name="${name}" -write -nrFiles $1 -fileSize $2 2>&1 | tee /dev/stderr | grep "Throughput mb/sec:" | awk '{print $NF}'); var=$(printf "%.0f" $var); var=$(($var*$1)); echo $var; } | |
run_read_benchmark() { name="DFSIO benchmark - read ${1} files of ${2}"; var=$(yarn jar $TEST_JAR TestDFSIO -Dmapred.job.name="${name}" -read -nrFiles $1 -fileSize $2 2>&1 | tee /dev/stderr | grep "Throughput mb/sec:" | awk '{print $NF}'); var=$(printf "%.0f" $var); var=$(($var*$1)); echo $var; } | |
# Run every file-size/batch-size combination RUNS_PER_CONFIG times, then
# print per-configuration averaged read and write throughput summaries.
# Globals (read): FILE_SIZES, BATCH_SIZES, RUNS_PER_CONFIG
# Fixes: all variables made local; array expansions and helper arguments
# quoted (SC2068/SC2086); direct array iteration instead of index loops.
main() {
  local all_write_results=()
  local all_read_results=()
  local file_size batch_size config_id run line
  local results_write results_read
  local avg_write avg_read results_write_joined results_read_joined

  for file_size in "${FILE_SIZES[@]}"; do
    for batch_size in "${BATCH_SIZES[@]}"; do
      config_id="${batch_size} ${file_size} files"
      echo "Running tests for ${config_id}..."
      results_write=()
      results_read=()
      for (( run = 1; run <= RUNS_PER_CONFIG; run++ )); do
        results_write+=( "$(run_write_benchmark "$batch_size" "$file_size")" )
        results_read+=( "$(run_read_benchmark "$batch_size" "$file_size")" )
      done
      avg_write=$(avg "${results_write[@]}")
      results_write_joined=$(join , "${results_write[@]}")
      all_write_results+=("Avg ${avg_write} MB/s - ${config_id}, all (MB/s): ${results_write_joined}")
      avg_read=$(avg "${results_read[@]}")
      results_read_joined=$(join , "${results_read[@]}")
      all_read_results+=("Avg ${avg_read} MB/s - ${config_id}, all (MB/s): ${results_read_joined}")
    done
  done

  echo "--- Read throughput ---"
  for line in "${all_read_results[@]}"; do
    echo "$line"
  done
  echo "--- Write throughput ---"
  for line in "${all_write_results[@]}"; do
    echo "$line"
  done
}
main |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment