Skip to content

Instantly share code, notes, and snippets.

@crashangelbr
Last active July 11, 2018 17:19
Show Gist options
  • Save crashangelbr/37cb29288a74638dcd49db956b34bfca to your computer and use it in GitHub Desktop.
Save crashangelbr/37cb29288a74638dcd49db956b34bfca to your computer and use it in GitHub Desktop.
Import Wikipedia into Elasticsearch
#!/bin/bash
# https://www.elastic.co/blog/loading-wikipedia
# https://dumps.wikimedia.org/other/cirrussearch/current/
# Delete the target index so a fresh import starts from nothing.
# Globals:
#   es    (read) - Elasticsearch address, e.g. host:port
#   index (read) - name of the index to delete
# Outputs: whatever curl prints for the DELETE request (stdout).
# Returns: curl's exit status.
DeleteAll() {
  # The URL is quoted on purpose: it contains '?', which an unquoted
  # expansion would treat as a glob character, and unquoted $es/$index
  # would also be subject to word splitting.
  curl -XDELETE "${es}/${index}?pretty"
}
# Create the target index and install its "page" mapping, both derived from
# what the wiki itself publishes through its api.php endpoint.
# Globals:
#   site  (read) - wiki host name used to build https://<site>/w/api.php
#   es    (read) - Elasticsearch address, e.g. host:port
#   index (read) - name of the index to create
# Outputs: the responses of the two PUT requests (stdout).
# Returns: exit status of the final (mapping) PUT.
CreateIndex() {
  local api="https://${site}/w/api.php"
  local json_header='Content-Type: application/json'

  # Step 1: fetch the settings dump, keep only the analysis section, force a
  # single shard with no replicas, and create the index with that body.
  # All URLs are quoted: they contain '?' and '&' and must reach curl intact.
  curl -s "${api}?action=cirrus-settings-dump&format=json&formatversion=2" \
    -H "$json_header" |
    jq '{
      analysis: .content.page.index.analysis,
      number_of_shards: 1,
      number_of_replicas: 0
    }' |
    curl -XPUT "${es}/${index}?pretty" -H "$json_header" -d @-

  # Step 2: fetch the mapping dump and rename two keys before installing it
  # as the "page" mapping (presumably the older key names are rejected by the
  # target Elasticsearch version - confirm against the version in use).
  curl -s "${api}?action=cirrus-mapping-dump&format=json&formatversion=2" \
    -H "$json_header" |
    jq .content |
    sed -e 's/"index_analyzer"/"analyzer"/g' \
      -e 's/"position_offset_gap"/"position_increment_gap"/g' |
    curl -XPUT "${es}/${index}/_mapping/page?pretty" -H "$json_header" -d @-
}
# Decompress the dump and cut it into 500-line pieces under ./chunks, each
# named <index> followed by a 10-letter suffix (aaaaaaaaaa, aaaaaaaaab, ...).
# Globals:
#   dump  (read) - path of the gzip-compressed dump file
#   index (read) - used as the file-name prefix of every chunk
# Returns: non-zero if the chunks directory cannot be created, otherwise the
#   exit status of split.
# The working directory is left untouched, so the function can be called
# repeatedly and with relative or absolute dump paths.
SplitFiles() {
  # -p: the directory is expected to survive from a previous import.
  mkdir -p chunks || return 1
  # gzip -dc is the portable spelling of zcat; the chunk prefix carries the
  # directory so no cd is needed.
  gzip -dc -- "$dump" | split -a 10 -l 500 - "chunks/${index}"
}
# POST every chunk of the current index to the Elasticsearch _bulk endpoint,
# printing "<file>: <took>" per chunk and deleting each chunk for which the
# response contained a "took" value.
# Globals:
#   es    (read) - Elasticsearch address, e.g. host:port
#   index (read) - target index; also the file-name prefix of its chunks
# Outputs: one progress line per chunk (stdout); diagnostics on stderr.
# Returns: 1 if no chunks directory can be located, otherwise the status of
#   the final "cd ..".
# On return the working directory is the parent of the chunks directory.
Dump() {
  local file took

  # Work inside ./chunks. If it does not exist we may already be standing in
  # it (a caller can leave us there); anywhere else we must stop, because
  # the loop below deletes the files it uploads.
  if [[ -d chunks ]]; then
    cd chunks || return 1
  elif [[ "${PWD##*/}" != chunks ]]; then
    printf 'Dump: no chunks directory found in %s\n' "$PWD" >&2
    return 1
  fi

  # Only this index's chunks: leftovers from another index that failed to
  # upload earlier must not be posted into the current index.
  for file in "$index"*; do
    # Skips the literal pattern when nothing matches.
    [[ -f "$file" ]] || continue
    printf '%s: ' "$file"
    # The response is requested with ?pretty, so the "took" field sits on a
    # line of its own; pull out just its number.
    took=$(curl -s -XPOST "${es}/${index}/_bulk?pretty" \
      -H 'Content-Type: application/json' --data-binary "@${file}" |
      sed -n 's/^[[:space:]]*"took"[[:space:]]*:[[:space:]]*\([0-9][0-9]*\).*/\1/p')
    # Keep the first match only, in case a nested line also matched.
    took=${took%%$'\n'*}
    printf '%7s\n' "$took"
    # An empty value means the request failed; keep the chunk for a retry.
    if [[ -n "$took" ]]; then
      rm -- "$file"
    fi
  done
  cd ..
}
# Import the Portuguese Wikipedia "content" dump into the
# ptwikipedia_content index.
# Globals: site, index, dump (written and exported for the steps below).
Execute_Wikipedia_Content() {
  site=pt.wikipedia.org
  index=ptwikipedia_content
  dump=ptwiki-20180702-cirrussearch-content.json.gz
  export site index dump

  # Run the pipeline in order: drop the index, recreate it, split the dump,
  # upload the pieces.
  local step
  for step in DeleteAll CreateIndex SplitFiles Dump; do
    "$step"
  done
}
# Import the Portuguese Wikipedia "general" dump into the
# ptwikipedia_general index.
# Globals: site, index, dump (written and exported for the steps below).
Execute_Wikipedia_General() {
  site=pt.wikipedia.org
  index=ptwikipedia_general
  dump=ptwiki-20180702-cirrussearch-general.json.gz
  export site index dump

  # Run the pipeline in order: drop the index, recreate it, split the dump,
  # upload the pieces.
  local step
  for step in DeleteAll CreateIndex SplitFiles Dump; do
    "$step"
  done
}
# Import the Portuguese Wiktionary "content" dump into the
# ptwikionary_content index.
# Globals: site, index, dump (written and exported for the steps below).
# The function and index names keep their historical spelling ("wikionary")
# so existing callers and queries keep working; only the host name, which
# has to resolve to the real wiki, is spelled correctly.
Execute_Wikionary_Content() {
  # Host corrected from "pt.wikionary.org": the dump below is the
  # ptwiktionary dump, so settings and mappings must come from Wiktionary.
  export site=pt.wiktionary.org
  export index=ptwikionary_content
  export dump=ptwiktionary-20180702-cirrussearch-content.json.gz
  DeleteAll
  CreateIndex
  SplitFiles
  Dump
}
# Import the Portuguese Wiktionary "general" dump into the
# ptwikionary_general index.
# Globals: site, index, dump (written and exported for the steps below).
# The function and index names keep their historical spelling ("wikionary")
# so existing callers and queries keep working; only the host name, which
# has to resolve to the real wiki, is spelled correctly.
Execute_Wikionary_General() {
  # Host corrected from "pt.wikionary.org": the dump below is the
  # ptwiktionary dump, so settings and mappings must come from Wiktionary.
  export site=pt.wiktionary.org
  export index=ptwikionary_general
  export dump=ptwiktionary-20180702-cirrussearch-general.json.gz
  DeleteAll
  CreateIndex
  SplitFiles
  Dump
}
# Elasticsearch address used by every request in this script.
es=localhost:9200
export es

# Run the four imports one after another, in this fixed order.
for import_job in \
  Execute_Wikipedia_Content \
  Execute_Wikipedia_General \
  Execute_Wikionary_Content \
  Execute_Wikionary_General; do
  "$import_job"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment