Skip to content

Instantly share code, notes, and snippets.

@korczis
Forked from lukas-vlcek/gist:4673027
Created August 30, 2016 09:37
Show Gist options
  • Save korczis/20561d3c49522eab2b689185b425e4ae to your computer and use it in GitHub Desktop.
Save korczis/20561d3c49522eab2b689185b425e4ae to your computer and use it in GitHub Desktop.
Čeština v elasticsearch
#!/bin/sh
# Downloads Elasticsearch 0.20.4, starts a fresh node from the unpacked
# directory and waits until its HTTP endpoint on localhost:9200 answers.
#
# Abort on the first failing command or unset variable, so a failed
# download/unzip never lets the later steps run against the wrong state.
set -eu

es_release="elasticsearch-0.20.4"

# download
wget "http://download.elasticsearch.org/elasticsearch/elasticsearch/${es_release}.zip"
# unzip and start
unzip "${es_release}.zip"
# Stop if the directory is missing: the rm below must never run from the
# caller's working directory.
cd "${es_release}" || exit 1
# remove data in case you have defined some analyzers in the past (e.g. stop/start)
rm -rf -- data/
# NOTE(review): the rest of the script only runs once this launcher returns,
# i.e. it presumably backgrounds the node in this release — confirm.
./bin/elasticsearch
# give it some time to start up: poll for up to ~30 s instead of sleeping a
# fixed amount that may be too short (or needlessly long)
tries=0
until curl -s -o /dev/null localhost:9200; do
  tries=$((tries + 1))
  if [ "$tries" -ge 30 ]; then
    echo "elasticsearch did not answer on localhost:9200" >&2
    exit 1
  fi
  sleep 1
done
# is it running?
curl localhost:9200
# Setup analyzers:
# Creates the index "test" with four analyzers to compare:
#   cestina1 - the built-in "czech" analyzer type
#   cestina2 - custom: standard tokenizer + lowercase + "stemmer" filter
#              configured with name "czech" (czech_stemmer1)
#   cestina3 - custom: same chain, but using the "czech_stem" filter type
#              (czech_stemmer2)
#   cestina4 - custom: like cestina3, with the "_czech_" stop word list
#              (czech_stop) applied before stemming
# The body is JSON, so say so explicitly: without the header curl -d labels
# the request as application/x-www-form-urlencoded.
curl -X PUT -H 'Content-Type: application/json' localhost:9200/test -d '
{
"settings" : {
"analysis" : {
"analyzer" : {
"cestina1" : {
"type": "czech"
},
"cestina2" : {
"type" : "custom",
"tokenizer" : "standard",
"filter" : [ "standard", "lowercase", "czech_stemmer1" ]
},
"cestina3" : {
"type" : "custom",
"tokenizer" : "standard",
"filter" : [ "standard", "lowercase", "czech_stemmer2" ]
},
"cestina4" : {
"type" : "custom",
"tokenizer" : "standard",
"filter" : [ "standard", "lowercase", "czech_stop", "czech_stemmer2" ]
}
},
"filter" : {
"czech_stemmer1" : {
"type" : "stemmer",
"name" : "czech"
},
"czech_stemmer2" : {
"type" : "czech_stem"
},
"czech_stop" : {
"type" : "stop",
"stopwords" : ["_czech_"]
}
}
}
}
}'
# Run the same phrase through every analyzer defined on the "test" index.
# Phrase: "Bankovní poplatky jsou nehorázné" (URL-encoded below)
#   cestina1: preconfigured czech analyzer
#   cestina2: custom analyzer using the czech stemmer
#   cestina3: custom analyzer using the czech stemmer (slightly shorter notation)
#   cestina4: cestina2 and cestina3 do not exclude stop words, so this one adds
#             the czech stop word list. According to the original author, this
#             custom analyzer is in fact the same as what cestina1 is
#             preconfigured with under the hood.
encoded_phrase='Bankovn%C3%AD%20poplatky%20jsou%20nehor%C3%A1zn%C3%A9'
for analyzer_name in cestina1 cestina2 cestina3 cestina4; do
  curl "localhost:9200/test/_analyze?pretty=1&analyzer=${analyzer_name}&text=${encoded_phrase}"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment