Skip to content

Instantly share code, notes, and snippets.

@lytefast
Last active August 29, 2015 14:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lytefast/234f147cd1dbe13a07e2 to your computer and use it in GitHub Desktop.
Save lytefast/234f147cd1dbe13a07e2 to your computer and use it in GitHub Desktop.
#!/bin/bash
# Shuffle a large input that is spread across many files.
#
# Lists every entry of {input_dir} in random order into ./input.list. The rest
# of the script walks that list in batches of {batch_size} entries and writes
# one shuffled output file per batch, named {result_prefix}<first batch index>.
#
# Defaults: batch_size=50, result_prefix="results.part."
USAGE_MSG="Usage: randomize_input {input_dir} {batch_size} {result_prefix}"
# Set parameters
if [[ $# -eq 0 ]]; then
  echo "$USAGE_MSG" >&2
  exit 255 # the status bash yields for `exit -1`, spelled portably
fi
INPUT_DIR=$1
echo "input dir = $INPUT_DIR"
BATCH_SIZE=${2:-50}
echo "batch size = $BATCH_SIZE"
PREFIX=${3:-"results.part."}
echo "prefix = $PREFIX"

if [[ ! -d "$INPUT_DIR" ]]; then
  echo "error: input dir '$INPUT_DIR' is not a directory" >&2
  exit 1
fi
# A zero or non-numeric batch size would never advance the batch counter,
# so the batching loop would spin forever; reject it up front.
if [[ ! "$BATCH_SIZE" =~ ^[1-9][0-9]*$ ]]; then
  echo "error: batch size must be a positive integer, got '$BATCH_SIZE'" >&2
  echo "$USAGE_MSG" >&2
  exit 1
fi

# Randomize the order of the files themselves, since shuf can't be run over
# the whole (large) input at once.
# nullglob makes an empty directory expand to nothing instead of the literal
# pattern, so the emptiness check below is reliable.
shopt -s nullglob
input_files=("$INPUT_DIR"/*)
shopt -u nullglob
if ((${#input_files[@]} == 0)); then
  echo "error: no files found in '$INPUT_DIR'" >&2
  exit 1
fi
# Feed the paths on stdin (printf is a builtin) rather than as arguments, so a
# directory with very many files cannot overflow the argument-length limit.
printf '%s\n' "${input_files[@]}" | shuf > input.list
#######################################
# Exit-time cleanup: delete the scratch files this script creates.
# Globals:
#   TEMP_FILE (read) - path of the mktemp scratch file; may be unset/empty
# Arguments:
#   None
# Returns:
#   0; files that are already gone are not treated as an error.
#######################################
function finish {
  rm -f -- input.list
  # TEMP_FILE is only set once mktemp has run; skip it if we exit earlier.
  if [[ -n "${TEMP_FILE:-}" ]]; then
    rm -f -- "$TEMP_FILE"
  fi
}
# From here on, run the finish cleanup on every exit path.
trap finish EXIT
# merge files & randomize each batch of files
# wc prints "<count> <name>"; read keeps the count and discards the rest.
read -r num_files _ < <(wc -l input.list)
echo "# files = $num_files"
# Scratch file that holds one concatenated batch before it is shuffled.
TEMP_FILE=$(mktemp) || {
  echo "error: could not create temp file" >&2
  exit 1
}
echo "temp file= $TEMP_FILE"
#######################################
# Concatenate one batch of listed files and write their lines out shuffled.
# Globals:
#   BATCH_SIZE (read) - number of list entries per batch
#   PREFIX     (read) - output file name prefix
#   TEMP_FILE  (read) - scratch file path; its contents are overwritten
# Arguments:
#   $1 - list file with one input path per line
#   $2 - 1-based line number (in the list) of the batch's first entry
# Outputs:
#   Writes the shuffled batch to "${PREFIX}${2}".
# Returns:
#   Non-zero if concatenating the batch or shuffling it fails.
#######################################
function write_batch {
  local input_list=$1
  local start_index=$2
  local end_index=$((start_index + BATCH_SIZE - 1))
  local filename=$PREFIX$start_index
  # Convert the selected lines to NUL-delimited records so xargs passes each
  # path through verbatim (no splitting on spaces, no quote/backslash
  # interpretation). shuf only runs when the concatenation succeeded.
  sed -n "${start_index},${end_index}p" < "$input_list" \
    | tr '\n' '\0' \
    | xargs -0 cat -- > "$TEMP_FILE" \
    && shuf -- "$TEMP_FILE" > "$filename"
}
# Walk input.list in strides of BATCH_SIZE lines; COUNTER is the 1-based line
# number of the first file in the batch being written.
COUNTER=1
while ((COUNTER <= num_files)); do
  echo "The counter is $COUNTER"
  write_batch input.list "$COUNTER"
  COUNTER=$((COUNTER + BATCH_SIZE))
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment