Skip to content

Instantly share code, notes, and snippets.

@benoit-intrw
Created August 23, 2012 10:06
Show Gist options
  • Save benoit-intrw/3435048 to your computer and use it in GitHub Desktop.
Elasticsearch: test analyser for text like 'R&D' or 'Canal+'
#!/bin/bash
# Script and configuration to test an Elasticsearch analyzer
# against text like 'R&D' or 'Canal+'.
#
# Usage: test_analyzer hostname indexname
#
# Check arguments
hostname=$1
indexname=$2
if [ -z "$hostname" ] || [ -z "$indexname" ]
then
    # Usage/diagnostic text belongs on stderr, not stdout.
    echo "Usage: test_analyzer hostname indexname" >&2
    echo " with:" >&2
    echo " - hostname: IP or hostname (port 9200 is added)" >&2
    echo " - indexname: name of the index (example test)" >&2
    exit 1
fi
# Test string; note: the HEREDOC makes it easy to test characters
# that are hard to escape in bash (e.g. '&', '+', '!').
teststring=$( cat <<EOF
Canal+ annonce 3 ans de R&D !
EOF
)
# Build the base URI of the index; all curl targets derive from it.
baseuri="http://$hostname:9200/$indexname"
# Confirm before delete: the index is dropped and recreated below,
# so give the operator a chance to abort.
#
echo -n "Index $baseuri will be deleted. Continue ? [yn] "
# -r: do not let read mangle backslashes in the answer.
read -r are_you_sure
if [ "$are_you_sure" != "y" ]
then
    echo "Cancelled!"
    exit 2
fi
# Delete index (quoted URI; a 404 from a missing index is harmless here)
echo -n "Deleting ... "
curl -XDELETE "$baseuri/"
echo ""
# Load settings
echo -n "Loading settings ... "
curl -XPUT "$baseuri/" -d '
{
"settings": {
"index": {
"analysis": {
"analyzer": {
"francais": {
"filter": [
"lowercase",
"stop_francais",
"fr_stemmer",
"asciifolding",
"elision"
],
"tokenizer": "standard",
"type": "custom"
} ,
"test1": {
"tokenizer": "standard",
"type": "custom"
},
"test2": {
"tokenizer": "whitespace",
"type": "custom"
},
"test3": {
"filter": [
"lowercase",
"stop_francais",
"fr_stemmer",
"asciifolding",
"elision"
],
"tokenizer": "whitespace",
"type": "custom"
},
"test4": {
"filter": [
"lowercase",
"stop_francais",
"fr_stemmer",
"asciifolding",
"elision"
],
"tokenizer": "standard",
"char_filter" : ["my_mapping"],
"type": "custom"
}
},
"filter": {
"elision": {
"articles": [ "l", "m", "t", "qu", "n", "s", "j", "d" ],
"type": "elision"
},
"fr_stemmer": {
"name": "french",
"type": "stemmer"
},
"stop_francais": {
"stopwords": [
"_french_"
],
"type": "stop"
}
},
"char_filter" : {
"my_mapping" : {
"type" : "mapping",
"mappings" : ["&=>et", "+=>plus"]
}
}
},
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}'
# Run the test string through one analyzer and print the token output.
# Arguments: $1 - analyzer name (as declared in the index settings)
#            $2 - human-readable label for the report
# Uses globals: baseuri (index URI), teststring (text to analyze)
run_analyzer() {
  local analyzer=$1
  local label=$2
  echo ""
  echo "$label ($analyzer)"
  # NOTE(review): 'analyzer=' as a query parameter is the pre-5.x
  # _analyze API form — confirm against the target ES version.
  curl -XGET "$baseuri/_analyze?pretty=true&analyzer=$analyzer" -d "$teststring"
}

echo ""
echo ""
echo "Test string : $teststring"
run_analyzer "francais" "Analyzer custom français"
run_analyzer "test1" "Tokenizer standard seul"
run_analyzer "test2" "Tokenizer whitespace seul"
run_analyzer "test3" "Analyzer custom français avec tokenizer whitespace"
run_analyzer "test4" "Analyzer custom français avec tokenizer standard et mapping"
echo ""
# End.
echo "Done!"
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment