#!/usr/bin/env bash
set -o xtrace  # trace each command as it executes
set -o errexit # exit immediately when a command fails
set -o nounset # treat use of undeclared variables as an error
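# Assumed prerequisites (not checked by this script): httpie (the `http`
# command), docker-machine with an active machine, and an Elasticsearch
# container listening on port 9200 of that machine.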
__dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DATA_DIR="${__dir}/containers/elasticsearch/data"
DOCKER_MACHINE_IP="$(docker-machine ip $(docker-machine active))"
TMP_DATA_DIR="${DATA_DIR}/tmp"
ES_URL="http://${DOCKER_MACHINE_IP}:9200"
MAX_SPLIT_LINES=10000
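# The archives under ${DATA_DIR}/bulk are assumed to contain standard
# Elasticsearch bulk-API payloads: alternating action and source lines, e.g.
#   {"index": {"_type": "film", "_id": "123"}}
#   {"title": "Example Film", "year": 1999}
# Splitting on an even line count (10000) keeps those pairs intact across chunks.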
mkdir -p "${TMP_DATA_DIR}"
create_index () {
local index="${1}"
local data_file="${2}"
echo "Creating ${index} index"
# Clear and recreate the index
http --ignore-stdin DELETE "${ES_URL}/${index}"
http --ignore-stdin POST "${ES_URL}/${index}"
# Slow the refresh interval to 120s while doing bulk updates
echo '{"index": {"refresh_interval": "120s"}}' | http --ignore-stdin PUT "${ES_URL}/${index}/_settings"
cd "${TMP_DATA_DIR}"
tar -xzvf "${DATA_DIR}/bulk/${data_file}.json.bulk.tar.gz"
# Split the bulk file into 10k-line pieces, each submitted to the bulk API below
split -a 5 -l "${MAX_SPLIT_LINES}" "${TMP_DATA_DIR}/${data_file}.json.bulk" "${data_file}.json.bulk."
for f in "${TMP_DATA_DIR}/${data_file}.json.bulk."*
do
http --ignore-stdin --timeout 60 -h POST "${ES_URL}/${index}/_bulk" "@${f}"
done
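# Note: with httpie's -h only response headers are printed, so per-item errors
# reported in the bulk response body go unnoticed; drop -h if you need to
# inspect indexing failures.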
# Force a refresh and optimize (to merge immediately)
http --ignore-stdin POST "${ES_URL}/${index}/_refresh"
http --ignore-stdin POST "${ES_URL}/${index}/_optimize"
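# Note: _optimize was renamed to _forcemerge in Elasticsearch 2.1; use that
# endpoint on newer clusters.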
# Restore the default 1s refresh interval now that bulk indexing is done
echo '{"index": {"refresh_interval": "1s"}}' | http --ignore-stdin PUT "${ES_URL}/${index}/_settings"
}
create_index "freebase" "films"
cd "${__dir}"