#!/usr/bin/env bash
set -o xtrace  # trace each command as it executes
set -o errexit # exit immediately when a command fails
set -o nounset # treat use of undeclared variables as an error
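# Assumed prerequisites (not checked by this script): httpie (the `http`
# command), docker-machine with an active machine, and an Elasticsearch
# container listening on port 9200 of that machine.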
__dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DATA_DIR="${__dir}/containers/elasticsearch/data"
DOCKER_MACHINE_IP="$(docker-machine ip $(docker-machine active))"
TMP_DATA_DIR="${DATA_DIR}/tmp"
ES_URL="http://${DOCKER_MACHINE_IP}:9200"
MAX_SPLIT_LINES=10000
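# The archives under ${DATA_DIR}/bulk are assumed to contain standard
# Elasticsearch bulk-API payloads: alternating action and source lines, e.g.
#   {"index": {"_type": "film", "_id": "123"}}
#   {"title": "Example Film", "year": 1999}
# Splitting on an even line count (10000) keeps those pairs intact across chunks.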
mkdir -p "${TMP_DATA_DIR}"
create_index () {
local index="${1}"
local data_file="${2}"
echo "Creating ${index} index"
# Clear and recreate the index
http --ignore-stdin DELETE "${ES_URL}/${index}"
http --ignore-stdin POST "${ES_URL}/${index}"
# Slow the refresh interval to 120s while doing bulk updates
echo '{"index": {"refresh_interval": "120s"}}' | http --ignore-stdin PUT "${ES_URL}/${index}/_settings"
cd "${TMP_DATA_DIR}"
tar -xzvf "${DATA_DIR}/bulk/${data_file}.json.bulk.tar.gz"
# Split the bulk file into 10k-line pieces, each submitted to the bulk API below
split -a 5 -l "${MAX_SPLIT_LINES}" "${TMP_DATA_DIR}/${data_file}.json.bulk" "${data_file}.json.bulk."
for f in "${TMP_DATA_DIR}/${data_file}.json.bulk."*
do
http --ignore-stdin --timeout 60 -h POST "${ES_URL}/${index}/_bulk" "@${f}"
done
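# Note: with httpie's -h only response headers are printed, so per-item errors
# reported in the bulk response body go unnoticed; drop -h if you need to
# inspect indexing failures.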
# Force a refresh and optimize (to merge immediately)
http --ignore-stdin POST "${ES_URL}/${index}/_refresh"
http --ignore-stdin POST "${ES_URL}/${index}/_optimize"
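# Note: _optimize was renamed to _forcemerge in Elasticsearch 2.1; use that
# endpoint on newer clusters.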
# Restore the default 1s refresh interval now that bulk indexing is done
echo '{"index": {"refresh_interval": "1s"}}' | http --ignore-stdin PUT "${ES_URL}/${index}/_settings"
}
create_index "freebase" "films"
cd "${__dir}"