Skip to content

Instantly share code, notes, and snippets.

@taroved
Last active August 29, 2015 14:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save taroved/11269983 to your computer and use it in GitHub Desktop.
Save taroved/11269983 to your computer and use it in GitHub Desktop.
Bash script that silently compresses a directory containing millions of files into a directory structure like /target/directory/2014/12/millions.tar.gz (and deletes the source files!)
#!/bin/bash
#Example: archive the year 2013 of directory /path/to/source/dir/subdir with a 512k rate limit:
#>> ./archive_huge_dir.sh -S /path/to/source/dir -T /path/to/target/dir -L 512k -n subdir -y 2013
#Example: archive December 2013 of directory /path/to/source/dir/subdir with a 512k rate limit:
#>> ./archive_huge_dir.sh -S /path/to/source/dir -T /path/to/target/dir -L 512k -n subdir -y 2013 -m 12
#Example: archive several years (2009-2013) with a 512k rate limit, skipping the confirmation prompt (-Y):
#>> for year in `seq 2009 2013`; do ./archive_huge_dir.sh -S /path/to/source/dir -T /path/to/target/dir -L 512k -n subdir -y $year -Y ; done
# Print the current local date and time as "YYYY-MM-DD HH:MM:SS".
curtime() {
  date '+%Y-%m-%d %H:%M:%S'
}
# Write all arguments to stdout as a single line, prefixed with the timestamp
# produced by curtime and one space.
echotime() {
  local stamp
  stamp=$(curtime)
  printf '%s %s\n' "$stamp" "$*"
}
# Same as echotime, but the timestamped line goes to stderr instead of stdout.
echoerr() {
  echotime "$@" >&2
}
# Return success when "$1" names something `type` can resolve (an executable
# on PATH, a builtin, a function, ...). Nothing is printed either way.
execs() {
  local candidate=$1
  type "$candidate" &>/dev/null
}
# Print the command-line synopsis to stderr and terminate the script with
# exit status 1.
usage() {
  printf '%s\n' "Usage: $0 -S /path/to/source/dir -T /path/to/target/dir -L <optional: rate_limit> -n <subdirectory of /path/to/source/dir> -y <YEAR> -m <optional: month_number> -Y(skip confirm)" >&2
  exit 1
}
# Parse the command line into the globals used by the rest of the script:
#   -S <dir>   SOURCE_DIR   parent of the directory to archive
#   -T <dir>   TARGET_DIR   root under which archives are created
#   -L <rate>  MAX_RATE     optional rate limit handed to pv
#   -n <name>  SUBDIR       subdirectory of SOURCE_DIR to archive
#   -y <year>  YEAR         year to archive
#   -m <month> MONTH        optional month number
#   -Y         NOCONFIRM=1  skip the confirmation prompt
# Anything else falls through to usage.
while getopts 'S:T:L:n:y:m:Y' opt; do
  case "$opt" in
    S) SOURCE_DIR=$OPTARG ;;
    T) TARGET_DIR=$OPTARG ;;
    L) MAX_RATE=$OPTARG ;;
    n) SUBDIR=$OPTARG ;;
    y) YEAR=$OPTARG ;;
    m) MONTH=$OPTARG ;;
    Y) NOCONFIRM=1 ;;
    *) usage ;;
  esac
done
# Validate the parsed options. Every failure writes a diagnostic to stderr and
# then calls usage, which prints the synopsis and exits with status 1.
if [[ -z "$SOURCE_DIR" ]]; then echo "(S)ource dir is required." >&2; usage; fi
if [[ -z "$TARGET_DIR" ]]; then echo "(T)arget dir is required." >&2; usage; fi
if [[ -z "$SUBDIR" ]]; then echo "Subdirectory (n)ame is required." >&2; usage; fi
if [[ -z "$YEAR" ]]; then echo "(y)ear is required." >&2; usage; fi
# The year feeds the date arithmetic that selects which files are archived
# and then deleted, so anything that is not a plain number is rejected here
# instead of being turned into an unintended time window later on.
if [[ ! "$YEAR" =~ ^[0-9]+$ ]]; then
  echoerr "Year '$YEAR' is not a number."
  usage
fi
if [[ ! -d "$SOURCE_DIR/$SUBDIR" ]]; then
  echoerr "Directory $SOURCE_DIR/$SUBDIR not exists."
  usage
fi
# Derive the month range and the archive directory from the optional -m value.
#   MONTH empty: whole year, months 1..12, archive under $TARGET_DIR/$YEAR
#   MONTH set  : that single month, archive under $TARGET_DIR/$YEAR/<MM>
# START_MONTH / END_MONTH hold plain decimal numbers; START_MONTH00 /
# END_MONTH00 hold the same values zero-padded to two digits.
if [[ -z "$MONTH" ]]; then
  START_MONTH=1
  END_MONTH=12
else
  # Accept "8" as well as "08". Without the explicit 10# base a leading zero
  # makes printf and $(( )) treat the value as octal, so 08 and 09 fail.
  if [[ ! "$MONTH" =~ ^[0-9]{1,2}$ ]] || (( 10#$MONTH < 1 || 10#$MONTH > 12 )); then
    echoerr "Month '$MONTH' is not a number between 1 and 12."
    usage
  fi
  START_MONTH=$(( 10#$MONTH ))
  END_MONTH=$START_MONTH
fi
printf -v START_MONTH00 '%02d' "$START_MONTH"
printf -v END_MONTH00 '%02d' "$END_MONTH"
if [[ -z "$MONTH" ]]; then
  ARCHIVE_DIR=$TARGET_DIR/$YEAR
else
  ARCHIVE_DIR=$TARGET_DIR/$YEAR/$START_MONTH00
fi
# Directory the script was started from. The file list is created here and is
# later addressed as $CUR_DIR/$LIST_FILE from subshells that cd elsewhere.
CUR_DIR=$(pwd)
# Name of the temporary file list; the suffix is the current epoch second.
LIST_FILE=archive_huge_dir.current_files_list_$(date +%s).tmp
ARCHIVE_PATH=$ARCHIVE_DIR/$SUBDIR.tar.gz
# Refuse to touch an archive that already exists. The path is quoted so that
# a target containing whitespace or glob characters is tested as one word;
# unquoted, the test itself errors out and the guard is silently skipped.
if [[ -e "$ARCHIVE_PATH" ]]; then
  echoerr "Archive $ARCHIVE_PATH already exists. Exiting."
  exit 1
fi
#0. Check Pipe Viewer installation
# pv is part of the compression pipeline further down, so stop before any
# work is done when it cannot be found. execs is called directly: wrapping it
# in a command substitution would try to run whatever it printed as a command.
if ! execs pv; then
  echoerr "Pipe Viewer is not installed."
  exit 1
fi
#1. Define time interval
# The window is [START_TIME, END_TIME) in epoch seconds: 00:00 UTC on the
# first day of START_MONTH up to 00:00 UTC on the first day of the month that
# follows END_MONTH (January of the next year when END_MONTH is 12).
# A date that cannot be computed aborts via usage instead of silently
# becoming 0, because this window decides which files get deleted.
if ! start_epoch=$(date --utc --date="$YEAR-$START_MONTH-01" +%s); then
  echoerr "Cannot compute start time for year '$YEAR', month '$START_MONTH'."
  usage
fi
# 10# forces base 10 so that "08" and "09" are not rejected as invalid octal;
# an empty END_MONTH counts as 0 here.
end_month_num=$(( 10#${END_MONTH:-0} ))
if (( end_month_num == 12 )); then
  end_date="$(( YEAR + 1 ))-01-01"
else
  printf -v MONTH_AFTER_END00 '%02d' "$(( end_month_num + 1 ))"
  end_date="$YEAR-$MONTH_AFTER_END00-01"
fi
if ! end_epoch=$(date --utc --date="$end_date" +%s); then
  echoerr "Cannot compute end time from '$end_date'."
  usage
fi
declare -i START_TIME=$start_epoch
declare -i END_TIME=$end_epoch
echotime "Start time: $(date --utc --date="@$START_TIME" +'%F %T %Z')"
echotime "End time: $(date --utc --date="@$END_TIME" +'%F %T %Z')"
#2. Create files list (line format: <size> <relative_path>)
# Fields are TAB-separated. All later steps split on TAB only, so paths
# containing spaces survive intact (paths containing newlines are still not
# supported by this line-based format).
echotime "Creation of files list for directry '$SOURCE_DIR/$SUBDIR' for $START_MONTH-$END_MONTH monthes of $YEAR year..."
list_path=$CUR_DIR/$LIST_FILE
# Exit status of the whole run; set to 1 when archiving or deleting fails.
exit_status=0
if ! (
  # Never run find from the wrong directory if the cd fails.
  cd "$SOURCE_DIR/$SUBDIR" || exit 1
  # find emits "<mtime>\t<size>\t<path>"; awk keeps the lines whose mtime is
  # inside [START_TIME, END_TIME) and strips the leading mtime field.
  find . -type f -printf '%T@\t%s\t%p\n' \
    | awk -F '\t' -v lo="$START_TIME" -v hi="$END_TIME" \
        '($1 + 0) >= (lo + 0) && ($1 + 0) < (hi + 0) { sub(/^[^\t]*\t/, ""); print }'
) > "$list_path"; then
  echoerr "Cannot build the files list for $SOURCE_DIR/$SUBDIR."
  rm -f -- "$list_path"
  exit 1
fi
files_count=$(( $(wc -l < "$list_path") ))
if (( files_count > 0 )); then
  echotime "Found $files_count files."
  echotime "Files list saved into $LIST_FILE"
  if [[ -z "$NOCONFIRM" ]]; then
    read -p "Are you sure? " -n 1 -r
    echo
  fi
  if [[ -n "$NOCONFIRM" || $REPLY =~ ^[Yy]$ ]]; then
    #3. Archive files into single tar.gz file
    # %.0f keeps large sums in plain digits; some awk implementations print
    # big numbers in exponent notation, which pv -s cannot use.
    files_size_sum=$(awk -F '\t' '{ sum += $1 } END { printf "%.0f\n", sum }' "$list_path")
    mkdir -p -- "$ARCHIVE_DIR"
    echotime "Compression started (size sum is $files_size_sum bytes, max rate is $MAX_RATE per second):"
    # pv always gets the expected size; the rate limit only when -L was given.
    pv_args=(-s "$files_size_sum")
    if [[ -n "$MAX_RATE" ]]; then
      pv_args+=(-L "$MAX_RATE")
    fi
    if (
      cd "$SOURCE_DIR/$SUBDIR" || exit 1
      # pipefail: a failure of cut, tar or pv must fail the whole pipeline,
      # not only a failure of the final gzip. noclobber: never truncate an
      # archive that already exists.
      set -o pipefail
      set -o noclobber
      # -f - writes the archive to stdout explicitly instead of relying on
      # tar's default archive device.
      cut -f 2- "$list_path" | tar -c -f - -T - | pv "${pv_args[@]}" | gzip > "$ARCHIVE_PATH"
    ); then
      #4. Remove archived files if archived successfully
      echotime "Compressed successfully! Archive path: $ARCHIVE_PATH"
      echotime "Remove archived files..."
      if (
        cd "$SOURCE_DIR/$SUBDIR" || exit 1
        # One path per line, taken literally; xargs batches the removals
        # instead of starting one rm process per file.
        cut -f 2- "$list_path" | xargs -d '\n' -r rm -f --
      ); then
        echotime "Archived files removed."
      else
        echoerr "Some archived files could not be removed."
        exit_status=1
      fi
    else
      # The sources are only deleted after a fully successful archive run.
      echoerr "Compression failed; source files were left untouched. Check $ARCHIVE_PATH before retrying."
      exit_status=1
    fi
  fi
else
  echoerr "No files for the period."
fi
#5. Remove files list file
rm -f -- "$list_path"
echotime "$LIST_FILE removed"
exit "$exit_status"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment