Skip to content

Instantly share code, notes, and snippets.

@s5unty
Last active September 8, 2020 14:07
Show Gist options
  • Save s5unty/e636a1ca698c6817330825eba67941e7 to your computer and use it in GitHub Desktop.
Save s5unty/e636a1ca698c6817330825eba67941e7 to your computer and use it in GitHub Desktop.
Pack a huge directory (billions of files) into multiple tar files
#!/bin/bash
#
# Pack a huge directory tree into many tar archives.
#
# Usage: [PACK_*=value ...] <script> [-C root] [-n name] [-s size] [-j jobs] PATH
#
# Settings (each one can come from the environment or from an option):
#   PACK_ROOT  -C|--root  directory tar changes into before archiving (default /)
#   PACK_NAME  -n|--name  prefix of every generated file (default: 5 random
#                         lowercase letters)
#   PACK_SIZE  -s|--size  number of file-list lines per archive (default 10000)
#   PACK_JOBS  -j|--jobs  number of parallel sem jobs (default 16)

# Use a fixed, predictable search path for tar, split, sem and friends.
export PATH="/usr/local/bin:/usr/bin:/bin:/usr/local/sbin:/usr/sbin:/sbin"
export PACK_ROOT="${PACK_ROOT:-/}"
# shuf prints one letter per line; tr joins them into a single word.
# (NUL-separated output via `shuf -z` would be discarded by the command
# substitution, with an "ignored null byte" warning on recent bash.)
export PACK_NAME="${PACK_NAME:-$(shuf -er -n5 {a..z} | tr -d '\n')}"
export PACK_SIZE="${PACK_SIZE:-10000}"
export PACK_JOBS="${PACK_JOBS:-16}"
# Parse "option value" pairs from the command line. The loop stops as soon as
# only one argument is left, so the final argument is never treated as an
# option and remains available in $1 afterwards.
while [[ "$#" -gt 1 ]]; do
  case "$1" in
    -C|--root) PACK_ROOT="$2"; shift ;;
    -n|--name) PACK_NAME="$2"; shift ;;
    -s|--size) PACK_SIZE="$2"; shift ;;
    -j|--jobs) PACK_JOBS="$2"; shift ;;
    *)
      # Diagnostics belong on stderr so they are not mistaken for output.
      printf 'Unknown parameter passed: %s\n' "$1" >&2
      exit 1
      ;;
  esac
  # Each arm above already dropped the option name; this drops its value.
  shift
done
# The one argument left in $1 is the path to pack. Without it tar would be
# handed an empty member name, so stop early with a usage hint instead.
if [[ "$#" -ne 1 || -z "$1" ]]; then
  printf 'Usage: %s [-C root] [-n name] [-s size] [-j jobs] PATH\n' \
    "${0##*/}" >&2
  exit 2
fi

# Build the file lists. The archive itself is thrown away (-f /dev/null); only
# the verbose member listing is kept. tee stores the complete listing in
# "$PACK_NAME.index" and split cuts it into chunks of PACK_SIZE lines named
# "$PACK_NAME.NNNNN.tmp" (five-digit numeric suffix).
# --exclude-tag,,, https://stackoverflow.com/a/13296077/1355228
tar -c -v -P -f /dev/null --exclude-tag ".void" -C "$PACK_ROOT" "$1" \
  | tee "${PACK_NAME}.index" \
  | split -a 5 -d -l "$PACK_SIZE" --additional-suffix=".tmp" - "$PACK_NAME."
# Must be read right after the pipeline: any later command overwrites it.
stage_status=("${PIPESTATUS[@]}")

# A failed split means some file lists were never written, i.e. files would
# silently be missing from the archives, so that is fatal.
if (( stage_status[2] != 0 )); then
  printf '%s: split failed (status %d); file lists are incomplete\n' \
    "${0##*/}" "${stage_status[2]}" >&2
  exit 1
fi
# Listing or index problems are reported but do not stop the run, which keeps
# the previous best-effort behaviour for trees with unreadable entries.
if (( stage_status[0] != 0 )); then
  printf '%s: warning: tar exited with status %d while listing files\n' \
    "${0##*/}" "${stage_status[0]}" >&2
fi
if (( stage_status[1] != 0 )); then
  printf '%s: warning: tee exited with status %d writing %s\n' \
    "${0##*/}" "${stage_status[1]}" "${PACK_NAME}.index" >&2
fi
# Turn every file list "$PACK_NAME.NNNNN.tmp" into its own zstd-compressed
# archive, running up to PACK_JOBS tar processes at a time through `sem`.
# A list is removed once its archive has been written successfully.
for one in "${PACK_NAME}".*.tmp; do
  # Skips empty lists and also the literal pattern left by an unmatched glob.
  [[ -s "$one" ]] || continue

  num="${one%.tmp}" # some.thing.01234.tmp -> some.thing.01234
  num="${num##*.}"  # some.thing.01234 -> 01234
  [[ "$num" =~ ^[0-9]+$ ]] || continue

  # split thousands file into multiple sub-folders
  # 00034,00134,01234,01034,10134,...,99934 -> 34
  # 10# forces base 10: zero-padded values such as 00089 would otherwise be
  # rejected as invalid octal numbers.
  printf -v dir '%02d' "$(( 10#$num % 100 ))"
  mkdir -p -- "$dir"

  # NOTE(review): the list name is reused below the sub-folder, so this
  # assumes PACK_NAME has no directory component -- confirm with callers.
  log="$dir/${one%.tmp}.log"
  archive="$dir/${one%.tmp}.tar.zst"
  # A non-empty archive from an earlier run is treated as already done.
  [[ -s "$archive" ]] && continue

  # sem hands its command to a shell as a single string, so every argument is
  # shell-quoted with %q to survive that second round of parsing.
  pack=(
    tar -c --zstd --numeric-owner --no-recursion
    -P -V "$PACK_ROOT" -C "$PACK_ROOT"
    -vv "--index-file=$log" -T "$one" -f "$archive.part"
  )
  printf -v job '%q ' "${pack[@]}"
  # The archive is written under a ".part" name and renamed only on success,
  # so an interrupted run never leaves a truncated file that the -s check
  # above would mistake for a finished archive.
  printf -v job '%s&& mv -f -- %q %q && rm -f -- %q' \
    "$job" "$archive.part" "$archive" "$one"
  sem -j "$PACK_JOBS" "$job"
  # sem -j $PACK_JOBS "dar -Q -zlzop-3 -R $1 --include-from-file $one -c $PACK_NAME"
done
# Block until every queued job has finished.
sem --wait
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment