#!/bin/bash
# Gist by @LowWeiLin, last active December 27, 2022
# https://www.elastic.co/blog/loading-wikipedia
# https://github.com/wikimedia/search-extra
# bin/elasticsearch-plugin install analysis-icu
# bin/elasticsearch-plugin install org.wikimedia.search:extra:6.3.1.2
# Variables
es="host.docker.internal:9200"
site="en.wikipedia.org"
index="enwiki"
indexDate="20190422"
indexType="content"
dump="$index-$indexDate-cirrussearch-$indexType.json.gz"
# Install tools
# apt-get update && apt-get install wget curl jq
# Download the data
# wget "https://dumps.wikimedia.org/other/cirrussearch/$indexDate/$dump"
# Download index
if [ ! -f "$site.settings.json" ]; then
    wget -O "$site.settings.json" "https://$site/w/api.php?action=cirrus-settings-dump&format=json&formatversion=2"
fi
# Download mapping
if [ ! -f "$site.mapping.json" ]; then
    wget -O "$site.mapping.json" "https://$site/w/api.php?action=cirrus-mapping-dump&format=json&formatversion=2"
fi
# Note: Update the mappings as required before uploading
# (e.g. add keyword fields where filtering, aggregations, or sorting are needed)
# useful: category, external_link, outgoing_link, template
# maybe: heading
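# A sketch of one way to do this with jq (an assumption, not part of the original
# pipeline): add a "raw" keyword sub-field to a text field such as "category",
# so it can be used for exact-match filters and aggregations. This assumes the
# mapping, after `jq .content.page`, exposes the field under .properties.
add_keyword_subfield() {
    # $1 is the field name; assignment creates the .fields object if missing
    jq '.properties.'"$1"'.fields.raw = {type: "keyword"}'
}
# Example: cat "$site.mapping.json" | jq .content.page | add_keyword_subfield category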
# Delete previous index
curl -XDELETE "$es/$index?pretty"
# Create the new index; jq selects the relevant settings from the dump.
# refresh_interval is set to -1 (disabled) for faster bulk indexing.
cat "$site.settings.json" |
jq '{
  settings: {
    index: {
      refresh_interval: "-1",
      query: {
        default_field: "all"
      },
      analysis: .content.page.index.analysis,
      similarity: .content.page.index.similarity
    }
  }
}' |
curl -H 'Content-Type: application/json' -XPUT "$es/$index?pretty" -d @-
# Add the new mapping; the sed commands replace keys that were renamed in
# newer Elasticsearch versions.
cat "$site.mapping.json" |
jq .content.page |
sed 's/"index_analyzer"/"analyzer"/' |
sed 's/"position_offset_gap"/"position_increment_gap"/' |
curl -H 'Content-Type: application/json' -XPUT "$es/$index/_mapping/page?pretty" -d @-
# Split the dump into chunks of 500 lines (250 documents: in the bulk format,
# each document is an action line followed by a source line)
if [ ! -d './chunks' ]; then
    echo "Chunking data..."
    mkdir ./chunks
    cd ./chunks
    zcat "../$dump" | split -a 10 -l 500 - "$index"
    cd ../
fi
# Upload into elasticsearch
echo "Bulk indexing data..."
if [ ! -d './chunks-indexed' ]; then
    mkdir ./chunks-indexed
fi
cd ./chunks
for file in *; do
    echo -n "${file}: "
    took=$(curl -s -H 'Content-Type: application/x-ndjson' -XPOST "$es/$index/_bulk?pretty" --data-binary @"$file" |
        grep took | cut -d':' -f 2 | cut -d',' -f 1)
    printf '%7s\n' "$took"
    # Move successfully indexed chunks aside so a re-run retries only the failures
    [ "x$took" = "x" ] || mv "$file" ../chunks-indexed
done
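# The grep on "took" above only checks that a response came back; it does not
# catch per-item failures. A hedged alternative (an assumption, requires jq):
# inspect the bulk response's top-level "errors" flag instead.
bulk_ok() {
    # exits 0 only when the response reports "errors": false
    jq -e '.errors == false' > /dev/null
}
# Example use inside the loop:
# curl -s -H 'Content-Type: application/x-ndjson' -XPOST "$es/$index/_bulk" \
#     --data-binary @"$file" | bulk_ok && mv "$file" ../chunks-indexed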
# Remove chunked data
# rm -rf ./chunks
# rm -rf ./chunks-indexed
# Refresh index
# curl -H 'Content-Type: application/json' -XPOST $es/$index/_refresh
# Note: Reset index.refresh_interval if required
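# For example (a sketch using the _settings API; the "1s" value is an
# assumption, "null" would restore the cluster default), then verify the
# document count:
# curl -H 'Content-Type: application/json' -XPUT "$es/$index/_settings" \
#     -d '{"index": {"refresh_interval": "1s"}}'
# curl "$es/$index/_count?pretty"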