Created
June 23, 2011 09:42
-
-
Save ofavre/1042252 to your computer and use it in GitHub Desktop.
Problem with highlighting when using stemming analyzers with mostly default parameters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
curl -XDELETE 'localhost:9200/index' | |
{"ok":true,"acknowledged":true} | |
curl -XPUT 'localhost:9200/index' -d '{"settings":{"index":{"number_of_shards":1,"number_of_replicas":0}}}' | |
{"ok":true,"acknowledged":true} | |
# The mapping I would like to use | |
curl -XPUT 'localhost:9200/index/type/_mapping' -d '{ | |
"type":{ | |
"_source":{ "enabled":false }, | |
"_analyzer":{ "path":"lang" }, | |
"properties":{ | |
"text":{ | |
"type":"string", | |
"store":true, | |
"index":"analyzed" | |
}, | |
"lang":{ | |
"type":"string", | |
"store":true, | |
"index":"not_analyzed" | |
} | |
} | |
} | |
}' | |
{"ok":true,"acknowledged":true} | |
curl -XGET 'localhost:9200/index/_analyze?pretty=1&analyzer=spanish' -d 'lola' | |
{ | |
"tokens" : [ { | |
"token" : "lol", | |
"start_offset" : 0, | |
"end_offset" : 4, | |
"type" : "<ALPHANUM>", | |
"position" : 1 | |
} ] | |
} | |
curl -XPUT 'localhost:9200/index/type/docspanish' -d '{ | |
"lang":"spanish", | |
"text":"spanish lola stuff" | |
}' | |
{"ok":true,"_index":"index","_type":"type","_id":"docspanish","_version":1} | |
curl -XGET 'localhost:9200/index/_analyze?pretty=1&analyzer=english' -d 'lol' | |
{ | |
"tokens" : [ { | |
"token" : "lol", | |
"start_offset" : 0, | |
"end_offset" : 3, | |
"type" : "<ALPHANUM>", | |
"position" : 1 | |
} ] | |
} | |
curl -XPUT 'localhost:9200/index/type/docenglish' -d '{ | |
"lang":"english", | |
"text":"english lol stuff" | |
}' | |
{"ok":true,"_index":"index","_type":"type","_id":"docenglish","_version":1} | |
curl -XPOST 'localhost:9200/index/_optimize?refresh=true&flush=true&wait_for_merge=true' | |
{"ok":true,"_shards":{"total":1,"successful":1,"failed":0}} | |
curl -XGET 'localhost:9200/index/type/_search?pretty=1&fields=*' -d '{ | |
"query":{ | |
"term":{ | |
"text":"lol" | |
} | |
} | |
}' | |
{ | |
"took" : 1, | |
"timed_out" : false, | |
"_shards" : { | |
"total" : 1, | |
"successful" : 1, | |
"failed" : 0 | |
}, | |
"hits" : { | |
"total" : 2, | |
"max_score" : 0.2972674, | |
"hits" : [ { | |
"_index" : "index", | |
"_type" : "type", | |
"_id" : "docspanish", | |
"_score" : 0.2972674, | |
"fields" : { | |
"lang" : "spanish", | |
"text" : "spanish lola stuff" | |
} | |
}, { | |
"_index" : "index", | |
"_type" : "type", | |
"_id" : "docenglish", | |
"_score" : 0.2972674, | |
"fields" : { | |
"lang" : "english", | |
"text" : "english lol stuff" | |
} | |
} ] | |
} | |
} | |
# Highlight is not returned for the stemmed word "lola": the docspanish hit below has no "highlight" entry | |
curl -XGET 'localhost:9200/index/type/_search?pretty=1&fields=*' -d '{ | |
"query":{ | |
"term":{ | |
"text": "lol" | |
} | |
}, | |
"highlight":{ | |
"fields":{ | |
"text":{ "number_of_fragments":0 } | |
} | |
} | |
}' | |
{ | |
"took" : 1, | |
"timed_out" : false, | |
"_shards" : { | |
"total" : 1, | |
"successful" : 1, | |
"failed" : 0 | |
}, | |
"hits" : { | |
"total" : 2, | |
"max_score" : 0.2972674, | |
"hits" : [ { | |
"_index" : "index", | |
"_type" : "type", | |
"_id" : "docspanish", | |
"_score" : 0.2972674, | |
"fields" : { | |
"lang" : "spanish", | |
"text" : "spanish lola stuff" | |
} | |
}, { | |
"_index" : "index", | |
"_type" : "type", | |
"_id" : "docenglish", | |
"_score" : 0.2972674, | |
"fields" : { | |
"lang" : "english", | |
"text" : "english lol stuff" | |
}, | |
"highlight" : { | |
"text" : [ "english <em>lol</em> stuff" ] | |
} | |
} ] | |
} | |
} | |
# test with _source disabled, term_vector=with_positions_offsets, fields stored | |
curl -XDELETE 'localhost:9200/index' | |
{"ok":true,"acknowledged":true} | |
curl -XPUT 'localhost:9200/index' -d '{"settings":{"index":{"number_of_shards":1,"number_of_replicas":0}}}' | |
{"ok":true,"acknowledged":true} | |
curl -XPUT 'localhost:9200/index/type/_mapping' -d '{ | |
"type":{ | |
"_source":{ "enabled":false }, | |
"_analyzer":{ "path":"lang" }, | |
"properties":{ | |
"text":{ | |
"type":"string", | |
"store":true, | |
"index":"analyzed", | |
"term_vector":"with_positions_offsets" | |
}, | |
"lang":{ | |
"type":"string", | |
"store":true, | |
"index":"not_analyzed" | |
} | |
} | |
} | |
}' | |
{"ok":true,"acknowledged":true} | |
curl -XGET 'localhost:9200/index/_analyze?pretty=1&analyzer=spanish' -d 'lola' | |
{ | |
"tokens" : [ { | |
"token" : "lol", | |
"start_offset" : 0, | |
"end_offset" : 4, | |
"type" : "<ALPHANUM>", | |
"position" : 1 | |
} ] | |
} | |
curl -XPUT 'localhost:9200/index/type/docspanish' -d '{ | |
"lang":"spanish", | |
"text":"spanish lola stuff" | |
}' | |
{"ok":true,"_index":"index","_type":"type","_id":"docspanish","_version":1} | |
curl -XPUT 'localhost:9200/index/type/docenglish' -d '{ | |
"lang":"english", | |
"text":"english lol stuff" | |
}' | |
{"ok":true,"_index":"index","_type":"type","_id":"docenglish","_version":1} | |
curl -XPOST 'localhost:9200/index/_optimize?refresh=true&flush=true&wait_for_merge=true' | |
{"ok":true,"_shards":{"total":1,"successful":1,"failed":0}} | |
curl -XGET 'localhost:9200/index/type/_search?pretty=1&fields=*' -d '{ | |
"query":{ | |
"term":{ | |
"text": "lol" | |
} | |
}, | |
"highlight":{ | |
"fields":{ | |
"text":{ "number_of_fragments":0 } | |
} | |
} | |
}' | |
{ | |
"took" : 1, | |
"timed_out" : false, | |
"_shards" : { | |
"total" : 1, | |
"successful" : 1, | |
"failed" : 0 | |
}, | |
"hits" : { | |
"total" : 2, | |
"max_score" : 0.2972674, | |
"hits" : [ { | |
"_index" : "index", | |
"_type" : "type", | |
"_id" : "docspanish", | |
"_score" : 0.2972674, | |
"fields" : { | |
"lang" : "spanish", | |
"text" : "spanish lola stuff" | |
}, | |
"highlight" : { | |
"text" : [ "spanish <em>lola</em> stuff " ] | |
} | |
}, { | |
"_index" : "index", | |
"_type" : "type", | |
"_id" : "docenglish", | |
"_score" : 0.2972674, | |
"fields" : { | |
"lang" : "english", | |
"text" : "english lol stuff" | |
}, | |
"highlight" : { | |
"text" : [ "english <em>lol</em> stuff " ] | |
} | |
} ] | |
} | |
} | |
# test with _source enabled, term_vector=with_positions_offsets, fields stored | |
curl -XDELETE 'localhost:9200/index' | |
{"ok":true,"acknowledged":true} | |
curl -XPUT 'localhost:9200/index' -d '{"settings":{"index":{"number_of_shards":1,"number_of_replicas":0}}}' | |
{"ok":true,"acknowledged":true} | |
curl -XPUT 'localhost:9200/index/type/_mapping' -d '{ | |
"type":{ | |
"_source":{ "enabled":true }, | |
"_analyzer":{ "path":"lang" }, | |
"properties":{ | |
"text":{ | |
"type":"string", | |
"store":true, | |
"index":"analyzed", | |
"term_vector":"with_positions_offsets" | |
}, | |
"lang":{ | |
"type":"string", | |
"store":true, | |
"index":"not_analyzed" | |
} | |
} | |
} | |
}' | |
{"ok":true,"acknowledged":true} | |
curl -XPUT 'localhost:9200/index/type/docspanish' -d '{ | |
"lang":"spanish", | |
"text":"spanish lola stuff" | |
}' | |
{"ok":true,"_index":"index","_type":"type","_id":"docspanish","_version":1} | |
curl -XPUT 'localhost:9200/index/type/docenglish' -d '{ | |
"lang":"english", | |
"text":"english lol stuff" | |
}' | |
{"ok":true,"_index":"index","_type":"type","_id":"docenglish","_version":1} | |
curl -XPOST 'localhost:9200/index/_optimize?refresh=true&flush=true&wait_for_merge=true' | |
{"ok":true,"_shards":{"total":1,"successful":1,"failed":0}} | |
curl -XGET 'localhost:9200/index/type/_search?pretty=1&fields=*' -d '{ | |
"query":{ | |
"term":{ | |
"text": "lol" | |
} | |
}, | |
"highlight":{ | |
"fields":{ | |
"text":{ "number_of_fragments":0 } | |
} | |
} | |
}' | |
{ | |
"took" : 1, | |
"timed_out" : false, | |
"_shards" : { | |
"total" : 1, | |
"successful" : 1, | |
"failed" : 0 | |
}, | |
"hits" : { | |
"total" : 2, | |
"max_score" : 0.2972674, | |
"hits" : [ { | |
"_index" : "index", | |
"_type" : "type", | |
"_id" : "docspanish", | |
"_score" : 0.2972674, | |
"fields" : { | |
"lang" : "spanish", | |
"text" : "spanish lola stuff" | |
}, | |
"highlight" : { | |
"text" : [ "spanish <em>lola</em> stuff " ] | |
} | |
}, { | |
"_index" : "index", | |
"_type" : "type", | |
"_id" : "docenglish", | |
"_score" : 0.2972674, | |
"fields" : { | |
"lang" : "english", | |
"text" : "english lol stuff" | |
}, | |
"highlight" : { | |
"text" : [ "english <em>lol</em> stuff " ] | |
} | |
} ] | |
} | |
} | |
# test with _source disabled, term_vector=no, fields stored | |
curl -XDELETE 'localhost:9200/index' | |
{"ok":true,"acknowledged":true} | |
curl -XPUT 'localhost:9200/index' -d '{"settings":{"index":{"number_of_shards":1,"number_of_replicas":0}}}' | |
{"ok":true,"acknowledged":true} | |
curl -XPUT 'localhost:9200/index/type/_mapping' -d '{ | |
"type":{ | |
"_source":{ "enabled":false }, | |
"_analyzer":{ "path":"lang" }, | |
"properties":{ | |
"text":{ | |
"type":"string", | |
"store":true, | |
"index":"analyzed", | |
"term_vector":"no" | |
}, | |
"lang":{ | |
"type":"string", | |
"store":true, | |
"index":"not_analyzed" | |
} | |
} | |
} | |
}' | |
{"ok":true,"acknowledged":true} | |
curl -XGET 'localhost:9200/index/_analyze?pretty=1&analyzer=spanish' -d 'lola' | |
{ | |
"tokens" : [ { | |
"token" : "lol", | |
"start_offset" : 0, | |
"end_offset" : 4, | |
"type" : "<ALPHANUM>", | |
"position" : 1 | |
} ] | |
} | |
curl -XPUT 'localhost:9200/index/type/docspanish' -d '{ | |
"lang":"spanish", | |
"text":"spanish lola stuff" | |
}' | |
{"ok":true,"_index":"index","_type":"type","_id":"docspanish","_version":1} | |
curl -XPUT 'localhost:9200/index/type/docenglish' -d '{ | |
"lang":"english", | |
"text":"english lol stuff" | |
}' | |
{"ok":true,"_index":"index","_type":"type","_id":"docenglish","_version":1} | |
curl -XPOST 'localhost:9200/index/_optimize?refresh=true&flush=true&wait_for_merge=true' | |
{"ok":true,"_shards":{"total":1,"successful":1,"failed":0}} | |
curl -XGET 'localhost:9200/index/type/_search?pretty=1&fields=*' -d '{ | |
"query":{ | |
"term":{ | |
"text": "lol" | |
} | |
}, | |
"highlight":{ | |
"fields":{ | |
"text":{ "number_of_fragments":0 } | |
} | |
} | |
}' | |
{ | |
"took" : 2, | |
"timed_out" : false, | |
"_shards" : { | |
"total" : 1, | |
"successful" : 1, | |
"failed" : 0 | |
}, | |
"hits" : { | |
"total" : 2, | |
"max_score" : 0.2972674, | |
"hits" : [ { | |
"_index" : "index", | |
"_type" : "type", | |
"_id" : "docspanish", | |
"_score" : 0.2972674, | |
"fields" : { | |
"lang" : "spanish", | |
"text" : "spanish lola stuff" | |
} | |
}, { | |
"_index" : "index", | |
"_type" : "type", | |
"_id" : "docenglish", | |
"_score" : 0.2972674, | |
"fields" : { | |
"lang" : "english", | |
"text" : "english lol stuff" | |
}, | |
"highlight" : { | |
"text" : [ "english <em>lol</em> stuff" ] | |
} | |
} ] | |
} | |
} | |
# test with _source enabled, term_vector=no, fields stored | |
curl -XDELETE 'localhost:9200/index' | |
{"ok":true,"acknowledged":true} | |
curl -XPUT 'localhost:9200/index' -d '{"settings":{"index":{"number_of_shards":1,"number_of_replicas":0}}}' | |
{"ok":true,"acknowledged":true} | |
curl -XPUT 'localhost:9200/index/type/_mapping' -d '{ | |
"type":{ | |
"_source":{ "enabled":true }, | |
"_analyzer":{ "path":"lang" }, | |
"properties":{ | |
"text":{ | |
"type":"string", | |
"store":true, | |
"index":"analyzed", | |
"term_vector":"no" | |
}, | |
"lang":{ | |
"type":"string", | |
"store":true, | |
"index":"not_analyzed" | |
} | |
} | |
} | |
}' | |
{"ok":true,"acknowledged":true} | |
curl -XPUT 'localhost:9200/index/type/docspanish' -d '{ | |
"lang":"spanish", | |
"text":"spanish lola stuff" | |
}' | |
{"ok":true,"_index":"index","_type":"type","_id":"docspanish","_version":1} | |
curl -XPUT 'localhost:9200/index/type/docenglish' -d '{ | |
"lang":"english", | |
"text":"english lol stuff" | |
}' | |
{"ok":true,"_index":"index","_type":"type","_id":"docenglish","_version":1} | |
curl -XPOST 'localhost:9200/index/_optimize?refresh=true&flush=true&wait_for_merge=true' | |
{"ok":true,"_shards":{"total":1,"successful":1,"failed":0}} | |
curl -XGET 'localhost:9200/index/type/_search?pretty=1&fields=*' -d '{ | |
"query":{ | |
"term":{ | |
"text": "lol" | |
} | |
}, | |
"highlight":{ | |
"fields":{ | |
"text":{ "number_of_fragments":0 } | |
} | |
} | |
}' | |
{ | |
"took" : 2, | |
"timed_out" : false, | |
"_shards" : { | |
"total" : 1, | |
"successful" : 1, | |
"failed" : 0 | |
}, | |
"hits" : { | |
"total" : 2, | |
"max_score" : 0.2972674, | |
"hits" : [ { | |
"_index" : "index", | |
"_type" : "type", | |
"_id" : "docspanish", | |
"_score" : 0.2972674, | |
"fields" : { | |
"lang" : "spanish", | |
"text" : "spanish lola stuff" | |
} | |
}, { | |
"_index" : "index", | |
"_type" : "type", | |
"_id" : "docenglish", | |
"_score" : 0.2972674, | |
"fields" : { | |
"lang" : "english", | |
"text" : "english lol stuff" | |
}, | |
"highlight" : { | |
"text" : [ "english <em>lol</em> stuff" ] | |
} | |
} ] | |
} | |
} | |
# Highlighting of the stemmed word works only with term_vector=with_positions_offsets, whether _source is enabled or not (field stored) | |
set +v |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Conclusion: