Skip to content

Instantly share code, notes, and snippets.

@benoit-intrw
Created August 23, 2012 10:06
Show Gist options
  • Save benoit-intrw/3435048 to your computer and use it in GitHub Desktop.
Elasticsearch: test analyser for text like 'R&D' or 'Canal+'
#!/bin/bash
# Script and configuration to test an Elasticsearch analyzer
# against text like 'R&D' or 'Canal+'.
#
# Usage: test_analyzer hostname indexname
#
# Check arguments
hostname=$1
indexname=$2
if [ -z "$hostname" ] || [ -z "$indexname" ]
then
    # Usage/diagnostic text belongs on stderr, not stdout.
    echo "Usage: test_analyzer hostname indexname" >&2
    echo " with:" >&2
    echo " - hostname: IP or hostname (port 9200 is added)" >&2
    echo " - indexname: name of the index (example test)" >&2
    exit 1
fi
# Test string; note: the HEREDOC makes it easy to test characters
# that are hard to escape in bash (e.g. '&', '+', '!').
teststring=$( cat <<EOF
Canal+ annonce 3 ans de R&D !
EOF
)
# Build the base URI of the index; all curl targets derive from it.
baseuri="http://$hostname:9200/$indexname"
# Confirm before delete: the index is dropped and recreated below,
# so give the operator a chance to abort.
#
echo -n "Index $baseuri will be deleted. Continue ? [yn] "
# -r: do not let read mangle backslashes in the answer.
read -r are_you_sure
if [ "$are_you_sure" != "y" ]
then
    echo "Cancelled!"
    exit 2
fi
# Delete index (quoted URI; a 404 from a missing index is harmless here)
echo -n "Deleting ... "
curl -XDELETE "$baseuri/"
echo ""
# Load settings
echo -n "Loading settings ... "
curl -XPUT "$baseuri/" -d '
{
"settings": {
"index": {
"analysis": {
"analyzer": {
"francais": {
"filter": [
"lowercase",
"stop_francais",
"fr_stemmer",
"asciifolding",
"elision"
],
"tokenizer": "standard",
"type": "custom"
} ,
"test1": {
"tokenizer": "standard",
"type": "custom"
},
"test2": {
"tokenizer": "whitespace",
"type": "custom"
},
"test3": {
"filter": [
"lowercase",
"stop_francais",
"fr_stemmer",
"asciifolding",
"elision"
],
"tokenizer": "whitespace",
"type": "custom"
},
"test4": {
"filter": [
"lowercase",
"stop_francais",
"fr_stemmer",
"asciifolding",
"elision"
],
"tokenizer": "standard",
"char_filter" : ["my_mapping"],
"type": "custom"
}
},
"filter": {
"elision": {
"articles": [ "l", "m", "t", "qu", "n", "s", "j", "d" ],
"type": "elision"
},
"fr_stemmer": {
"name": "french",
"type": "stemmer"
},
"stop_francais": {
"stopwords": [
"_french_"
],
"type": "stop"
}
},
"char_filter" : {
"my_mapping" : {
"type" : "mapping",
"mappings" : ["&=>et", "+=>plus"]
}
}
},
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}'
# Run the test string through one analyzer and print the token output.
# Arguments: $1 - analyzer name (as declared in the index settings)
#            $2 - human-readable label for the report
# Uses globals: baseuri (index URI), teststring (text to analyze)
run_analyzer() {
  local analyzer=$1
  local label=$2
  echo ""
  echo "$label ($analyzer)"
  # NOTE(review): 'analyzer=' as a query parameter is the pre-5.x
  # _analyze API form — confirm against the target ES version.
  curl -XGET "$baseuri/_analyze?pretty=true&analyzer=$analyzer" -d "$teststring"
}

echo ""
echo ""
echo "Test string : $teststring"
run_analyzer "francais" "Analyzer custom français"
run_analyzer "test1" "Tokenizer standard seul"
run_analyzer "test2" "Tokenizer whitespace seul"
run_analyzer "test3" "Analyzer custom français avec tokenizer whitespace"
run_analyzer "test4" "Analyzer custom français avec tokenizer standard et mapping"
echo ""
# End.
echo "Done!"
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment