Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@tanema
Last active March 4, 2022 09:23
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save tanema/2c752d3c9725c7ffea94 to your computer and use it in GitHub Desktop.
migrate files from gridfs to aws s3
#! /bin/bash
###################### USAGE ######################################
# Print command-line usage to stdout.
# Fixed typo in help text: "collecthe" -> "collect the".
usage() {
  echo "
Usage: mongotos3 [-t n] mongo_host mongo_collection s3_bucket
-t : number of parallel processes to use
mongo_host : the host of the mongodb server
mongo_collection : the collection to collect the gridfs data from
s3_bucket : the name of the bucket you want to cp the files to
"
}
###################### END USAGE ##################################
# how many parallel workers to split the file list across
thread_count=8
# pids of background workers, used for cleanup and waiting
_worker_pids=()
# per-worker progress counter (each worker runs in its own subshell)
_current_file=1
# parse options: -t sets the worker count.
# NOTE: the original optstring 't:*:' mistakenly declared '*' as an option
# taking an argument; only 't:' is real — the '*)' arm catches everything else.
while getopts 't:' opt; do
  case $opt in
    t) thread_count=$OPTARG ;;
    *)
      usage
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))
# exactly three positional arguments are required; exit after printing usage
# (the original fell through and relied on the ${n:?} expansions below)
if [ "$#" -ne 3 ]; then
  usage
  exit 1
fi
# mongo host (abort with a message if missing)
_host="${1:?Mongo Host Required}"
# mongo collection to pull grid_fs data from
_db="${2:?Mongo Collection required}"
# s3 bucket for everything to be synced to
_bucket="${3:?AWS Bucket Required}"
# full listing of GridFS files; each line is "<filename>\t<size>"
# (quoted expansions so hosts/db names with unusual characters survive)
_files_list=$(mongofiles -h "$_host" -db "$_db" list)
# total files to be synced (awk strips the whitespace some wc builds pad with)
_total_files=$(echo "$_files_list" | wc -l | awk '{print $1}')
# lines per worker, rounded up so every file is covered
((lines_per_file = (_total_files + thread_count - 1) / thread_count))
###################### LOGGING ####################################
# Colour escape codes. Only query tput when stdout is a terminal, so log
# output redirected to a file or pipe stays free of escape sequences and
# tput doesn't error out when TERM is unset.
if [ -t 1 ]; then
  RED=$(tput setaf 1)
  GREEN=$(tput setaf 2)
  NORMAL=$(tput sgr0)
else
  RED=''
  GREEN=''
  NORMAL=''
fi
# Print the message with a green [OK] tag right-aligned at the terminal edge.
log_ok() {
  # width = terminal columns, minus the message, plus the invisible escape codes
  COL=$(( $(tput cols) - ${#1} + ${#GREEN} + ${#NORMAL} ))
  printf "%s%${COL}s" "$1" "$GREEN[OK]$NORMAL"
}
# Print the message with a red [FAIL] tag right-aligned at the terminal edge.
log_fail() {
  # width = terminal columns, minus the message, plus the invisible escape codes
  COL=$(( $(tput cols) - ${#1} + ${#RED} + ${#NORMAL} ))
  printf "%s%${COL}s" "$1" "$RED[FAIL]$NORMAL"
}
###################### END LOGGING ################################
###################### METHOD DEFINITIONS #########################
# param $1: filepath from mongo
# param $2: worker identity number
syncfile () {
status="(worker $2) $_current_file/$lines_per_file $_bucket/$1"
((_current_file++))
#check if file is already on the server
file_count=$((0+$(aws s3 ls $_bucket/$1 | wc -l)))
if [[ $file_count -gt 0 ]]; then
log_ok "$status Already on server"
else
filename="_migration-$_current_file-$(uuidgen)"
#get file from gridfs and create a temp file of it
mongofiles -h $_host -db $_db get --local $filename $1 > /dev/null 2>&1
#get file succeeded
if [ $? -eq 0 ]; then
#send it to s3
aws s3 cp $filename s3://$_bucket/$1 --dryrun --quiet
#send file status and if this file migration succeeded
if [ $? -eq 0 ]; then
log_ok "$status"
else
log_fail "$status"
fi
#rm temp file gotten from gridfs
rm $filename
else
log_fail "$status Get from db failed"
fi
fi
}
# Process one worker's slice of the global file list.
# param: $1 worker identity number
# param: $2 starting line offset (0-based) into $_files_list
process_lines () {
  # Take exactly lines $2+1 .. $2+lines_per_file of the list.
  # BUGFIX: the original 'head -n $((start+lpf)) | tail -n lpf' re-read lines
  # from the previous chunk for the final worker whenever the list didn't
  # divide evenly, double-processing those files.
  while IFS= read -r line; do
    # first tab-separated field is the filename (may itself contain spaces)
    file=$(echo "$line" | awk -F'\t' '{ print $1 }')
    # skip the "connected to ..." banner mongofiles prints
    [[ $file == 'connected to'* ]] && continue
    # sync the file with the server (quoted so spaced filenames stay one arg)
    syncfile "$file" "$1"
  done < <(echo "$_files_list" | tail -n +"$(($2 + 1))" | head -n "$lines_per_file")
}
# Signal handler: terminate every background worker and clean up temp files.
kill_all_workers () {
  echo 'killing all workers'
  # BUGFIX: original bound was 'i <= len', which read one slot past the end
  for ((i = 0; i < ${#_worker_pids[@]}; ++i)); do
    kill -6 "${_worker_pids[i]}" > /dev/null 2>&1
  done
  echo 'migration aborted'
  # remove any files whose download was interrupted (-f: no error if none)
  rm -f -- _migration-* > /dev/null 2>&1
}
###################### END METHOD DEFINITIONS #####################
# let ctrl-c / hangup / terminate abort all workers cleanly
trap 'kill_all_workers' SIGINT SIGHUP SIGTERM
for ((i = 0; i < thread_count; ++i)); do
  echo "starting worker $i"
  # process this worker's chunk of the file list in the background
  process_lines "$i" "$((lines_per_file * i))" &
  # record the pid for cleanup and waiting
  _worker_pids+=($!)
done
# wait for every worker and remember whether any of them failed.
# BUGFIX: the original looped one past the end of the array and then tested
# only the LAST wait's $?, so earlier worker failures were reported as DONE.
_failed=0
for ((i = 0; i < ${#_worker_pids[@]}; ++i)); do
  wait "${_worker_pids[i]}" > /dev/null 2>&1 || _failed=1
done
# only report success when every worker exited cleanly
if [ "$_failed" -eq 0 ]; then
  echo DONE
fi
@simpleshadow
Copy link

Thanks! Just successfully used this (with a few minor updates to the mongofiles options for latest version).

@lspellman
Copy link

Thank you, this was very useful. I had to make some mods as well, primarily dealing with the fact that my filenames had spaces in them. When syncfile routine passes file over as an argument, the name gets split up and the second part of the filename becomes argument 2, etc... Basically putting quotes around $1 in most places takes care of it.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment