Skip to content

Instantly share code, notes, and snippets.

@mrpeverill
Last active May 16, 2023 15:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mrpeverill/645cd9a646119eb05544340e0418af01 to your computer and use it in GitHub Desktop.
Save mrpeverill/645cd9a646119eb05544340e0418af01 to your computer and use it in GitHub Desktop.
Compression Benchmarking Script using HTCondor
#!/bin/bash
# Compression benchmark, part 1: unpack a sample archive into a scratch
# directory, initialise the three result columns under tmp/, and time a plain
# (uncompressed) tar run that serves as the baseline for compression ratios.
#
# Usage: compresstest.sh ID
#   ID  suffix appended to the temporary archive name so that concurrent
#       runs do not overwrite each other's archive in $testdir.
set -e # quit on error

# Pick ONE input archive by leaving exactly one tarch= line uncommented.
# QC output from fmriprep:
#tarch="/projects/abcd_data/NDARINV02H7G2T6.tar.xz"
# BIDS input file:
#tarch="/projects/abcd_data/NDARINV07THBLHG-inputs.tar.xz"
# largest file (full fmriprep output):
tarch="/projects/abcd_data/NDARINVRHLEYZMW-full.tar.xz"

# Unpack the sample data; creating ./tmp/scratch/ also creates ./tmp, which
# holds the result columns written below.
scratchdir="./tmp/scratch/"
mkdir -p "$scratchdir"
echo "decompressing $tarch"
tar -xJf "$tarch" -C "$scratchdir"

# Setup our filenames
testdir="/projects/abcd_data/compresstesttmp"
id=$1
testarch="$testdir/tarch${id}"
# Make sure the archive destination exists so the first tar run cannot fail
# on a missing directory after the (slow) decompression above.
mkdir -p "$testdir"

# Set up our output: one header line per column file. The files are joined
# side by side with paste at the end of the script, so every benchmark must
# append exactly one line to each of them.
# GNU time format: %e = elapsed seconds, %M = peak resident memory (KB),
# %P = CPU percentage.
timeformat='%e \t %M \t %P'
echo "Mlabel" > tmp/methout.txt
printf 'realSeconds \t peakMem \t CPUperc\n' > tmp/timeout.txt
echo "Ratio" > tmp/ratout.txt

# Get a baseline: uncompressed tar in default file order. Its size (in du
# blocks) is the denominator of every later compression ratio, so its own
# ratio is recorded as 1.
method="gtar.df.ra"
echo "testing $method"
echo "$method" >> tmp/methout.txt
/usr/bin/time -f "$timeformat" -ao tmp/timeout.txt tar -cf "$testarch" "$scratchdir"
rawreadsize=$(du -h "$testarch")
echo "The real size of the tar file is $rawreadsize"
rawsize=$(du "$testarch" | cut -f1)
echo "1" >> tmp/ratout.txt
rm "$testarch"
#######################################
# Run one archiving command under /usr/bin/time and record its results.
# Appends exactly one line to each of tmp/methout.txt (label),
# tmp/timeout.txt (timing, written by time itself) and tmp/ratout.txt
# (archive size relative to the uncompressed baseline), then deletes the
# archive.
# Globals (read): timeformat, testarch, rawsize
# Arguments:
#   $1 - label written to the method column
#   $2 - command line, as a single string, that must create $testarch
# Outputs: progress messages on stdout
#######################################
test_comp () {
  local method=$1
  local cmd=$2
  local csize
  echo "testing $method"
  echo "command is $cmd"
  echo "$method" >> tmp/methout.txt
  # $cmd is deliberately left unquoted: callers pass the whole command as one
  # string and rely on word splitting to turn it into program + arguments.
  # shellcheck disable=SC2086
  /usr/bin/time -f "$timeformat" -ao tmp/timeout.txt $cmd
  csize=$(du "$testarch" | cut -f1)
  # Pass the sizes as awk variables rather than splicing them into the
  # program text.
  awk -v c="$csize" -v r="$rawsize" 'BEGIN {print (c / r)}' >> tmp/ratout.txt
  rm "$testarch"
}
# Benchmark matrix. Labels are gtar.<order>.<compressor>:
#   order:      df = tar's default order, in = --sort=inode,
#               nm = --sort=name, rv = explicit list sorted by reversed path
#   compressor: gz = gzip, xz = xz, x8 = xz with 8 threads,
#               xb = xz with 8 threads and 10 MiB blocks

# gzip
test_comp "gtar.df.gz" "tar -czf $testarch $scratchdir"
test_comp "gtar.in.gz" "tar --sort=inode -czf $testarch $scratchdir"
test_comp "gtar.nm.gz" "tar --sort=name -czf $testarch $scratchdir"

# Sorting the reversed paths groups files that share a filename ending
# (e.g. the same extension) next to each other in the archive.
echo "Making a reverse filelist"
find "$scratchdir" -type f > tmp/filelist
rev tmp/filelist | sort | rev > tmp/revfilelist
test_comp "gtar.rv.gz" "tar -czf $testarch -T tmp/revfilelist"

# XZ 1
test_comp "gtar.df.xz" "tar -cJf $testarch $scratchdir"
test_comp "gtar.in.xz" "tar --sort=inode -cJf $testarch $scratchdir"
test_comp "gtar.nm.xz" "tar --sort=name -cJf $testarch $scratchdir"
test_comp "gtar.rv.xz" "tar -cJf $testarch -T tmp/revfilelist"

# XZ 8 Core (xz picks up extra options from XZ_OPT when invoked by tar -J)
export XZ_OPT='-T8'
test_comp "gtar.df.x8" "tar -cJf $testarch $scratchdir"
test_comp "gtar.in.x8" "tar --sort=inode -cJf $testarch $scratchdir"
test_comp "gtar.nm.x8" "tar --sort=name -cJf $testarch $scratchdir"
test_comp "gtar.rv.x8" "tar -cJf $testarch -T tmp/revfilelist"

# XZ 8 core 10MiB blocks
# Use xz's size suffix instead of a hand-typed byte count: the previous
# literal (10486760) was not 10 MiB (10485760 bytes).
export XZ_OPT="-T8 --block-size=10MiB"
test_comp "gtar.df.xb" "tar -cJf $testarch $scratchdir"
test_comp "gtar.in.xb" "tar --sort=inode -cJf $testarch $scratchdir"
test_comp "gtar.nm.xb" "tar --sort=name -cJf $testarch $scratchdir"
test_comp "gtar.rv.xb" "tar -cJf $testarch -T tmp/revfilelist"

# Join the three column files side by side into one tab-separated report,
# shown on stdout and saved per run id.
echo "output:"
paste tmp/methout.txt tmp/timeout.txt tmp/ratout.txt | tee "./compressbenchmarks${id}.tsv"
# You may need to remove the % sign from the output tsv files with:
# sed -i 's/%//g' *tsv
# HTCondor submit description: runs compresstest.sh 20 times, passing each
# job its process number as the script's only argument.
universe = vanilla
executable = compresstest.sh
arguments = $(Process)
priority = 2

# Resources requested per job
request_cpus = 8
request_memory = 32GB
request_disk = 160GB
Requirements = (Target.HasCHTCProjects == true)

# File transfer. The setting below is the one marked "for interactive use";
# the alternative is:
#   should_transfer_files = NO
should_transfer_files = YES
transfer_input_files = compresstest.sh

# Per-job stdout, stderr and HTCondor event log, named by cluster and process
logdir = /home/groups/abcd_data/dwnld_preproc/tools/compressbenchmark/output/
output = $(logdir)$(Cluster).$(Process)_compresstest.out
error = $(logdir)$(Cluster).$(Process)_compresstest.err
log = $(logdir)$(Cluster).$(Process)_compresstest.log

queue 20
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment