Skip to content

Instantly share code, notes, and snippets.

@benoit-intrw
Created February 22, 2013 16:29
Show Gist options
  • Save benoit-intrw/5014658 to your computer and use it in GitHub Desktop.
Save benoit-intrw/5014658 to your computer and use it in GitHub Desktop.
Elasticsearch: test french stemmer
#!/bin/bash
# Script and configuration to test stemmers.
#
# Usage: test_stemmer hostname indexname
#   hostname   IP or host name of the server (port 9200 is appended later)
#   indexname  name of the index the script works on
#
# Both positional arguments are mandatory; ${N:-} keeps the expansion safe
# when an argument is missing.
hostname=${1:-}
indexname=${2:-}
if [[ -z "$hostname" || -z "$indexname" ]]; then
  # Usage errors are diagnostics, so they go to stderr rather than stdout.
  cat >&2 <<'USAGE'
Usage: test_stemmer hostname indexname
 with:
 - hostname: IP or hostname (port 9200 is added)
 - indexname: name of the index (example test)
USAGE
  exit 1
fi
# Test string. The here-document makes it possible to include characters
# that are hard to escape in bash (quotes, apostrophes, accents). The
# delimiter is quoted so the text is taken literally: no parameter, command
# or backslash expansion is performed on it.
teststring=$(
  cat <<'EOF'
"En sa totalité l'archive n'est pas descriptible, et elle est incontournable en son actualité." - Michel Foucault, L'Archéologie du Savoir
La Martinique est une île faisant partie de l'archipel des Antilles, elle est située dans la mer des Caraïbes.
En linguistique, la racinisation (ou désuffixation, ou stemming en anglais) est le nom donné au procédé qui vise à transformer les flexions en leur radical ou stemme.
EOF
)
# Base URI of the index: http://<hostname>:9200/<indexname>
printf -v baseuri 'http://%s:9200/%s' "$hostname" "$indexname"
# Confirm before delete.
# Only the exact answer "y" continues; anything else (including an empty
# answer) cancels the run with exit status 2.
printf '%s' "Index $baseuri will be deleted. Continue ? [yn] "
# -r: take the answer literally instead of treating backslashes as escapes.
read -r are_you_sure
if [[ "$are_you_sure" != "y" ]]; then
  echo "Cancelled!"
  exit 2
fi
# Delete index.
# The URI is quoted so that a host or index name containing whitespace or
# glob characters is passed to curl as a single, unmodified argument.
# curl's output is printed as-is and its exit status is not checked, so the
# script carries on whatever the outcome of the request.
printf '%s' "Deleting ... "
curl -XDELETE "$baseuri/"
echo ""
# Create the index with its analysis settings.
# Three custom analyzers are declared, each combining the standard tokenizer
# with one stemmer filter (french, light_french, minimal_french); the index
# uses a single shard and no replicas.
printf '%s' "Loading settings ... "
index_settings='
{
"settings": {
"index": {
"analysis": {
"analyzer": {
"french_stemmer": {
"filter": [
"french_stemmer"
],
"tokenizer": "standard",
"type": "custom"
} ,
"light_french_stemmer": {
"filter": [
"light_french_stemmer"
],
"tokenizer": "standard",
"type": "custom"
} ,
"minimal_french_stemmer": {
"filter": [
"minimal_french_stemmer"
],
"tokenizer": "standard",
"type": "custom"
}
},
"filter": {
"french_stemmer": {
"name": "french",
"type": "stemmer"
},
"light_french_stemmer": {
"name": "light_french",
"type": "stemmer"
},
"minimal_french_stemmer": {
"name": "minimal_french",
"type": "stemmer"
}
}
},
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}'
curl --request PUT --data "$index_settings" "${baseuri}/"
printf '\n'
#######################################
# Run the test string through one analyzer of the index and print the
# resulting tokens on a single line, separated by spaces.
# Globals:   baseuri (read), teststring (read)
# Arguments: $1 - label printed after "Stemmer " in the heading
#            $2 - name of the analyzer passed to the _analyze endpoint
# Outputs:   heading line, then the token line, on stdout
#######################################
print_stems() {
  local label=$1
  local analyzer=$2
  printf '%s\n' "Stemmer ${label}"
  # The request goes through $baseuri so that the scheme is explicit;
  # without one, curl guesses the protocol from the host name.
  # Keep only the "token" lines of the pretty-printed response, extract the
  # value (4th double-quote-delimited field) and join the values with spaces.
  curl -s -XGET "${baseuri}/_analyze?pretty=true&analyzer=${analyzer}" -d "$teststring" \
    | grep '"token"' \
    | cut -d '"' -f 4 \
    | tr '\n' ' '
  printf '\n'
}

echo ""
printf '%s\n' "Test string : $teststring"
echo ""
print_stems "french" "french_stemmer"
print_stems "light_french_stemmer" "light_french_stemmer"
print_stems "minimal_french_stemmer" "minimal_french_stemmer"
# All steps have run: report completion and exit with a success status.
printf '%s\n' "Done!"
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment