Skip to content

Instantly share code, notes, and snippets.

@ofavre
Created June 23, 2011 09:42
Show Gist options
  • Save ofavre/1042252 to your computer and use it in GitHub Desktop.
Save ofavre/1042252 to your computer and use it in GitHub Desktop.
Problem in highlighting with stemming analyzers using most default parameters
curl -XDELETE 'localhost:9200/index'
{"ok":true,"acknowledged":true}
curl -XPUT 'localhost:9200/index' -d '{"settings":{"index":{"number_of_shards":1,"number_of_replicas":0}}}'
{"ok":true,"acknowledged":true}
# The mapping I would like
curl -XPUT 'localhost:9200/index/type/_mapping' -d '{
"type":{
"_source":{ "enabled":false },
"_analyzer":{ "path":"lang" },
"properties":{
"text":{
"type":"string",
"store":true,
"index":"analyzed"
},
"lang":{
"type":"string",
"store":true,
"index":"not_analyzed"
}
}
}
}'
{"ok":true,"acknowledged":true}
curl -XGET 'localhost:9200/index/_analyze?pretty=1&analyzer=spanish' -d 'lola'
{
"tokens" : [ {
"token" : "lol",
"start_offset" : 0,
"end_offset" : 4,
"type" : "<ALPHANUM>",
"position" : 1
} ]
}
curl -XPUT 'localhost:9200/index/type/docspanish' -d '{
"lang":"spanish",
"text":"spanish lola stuff"
}'
{"ok":true,"_index":"index","_type":"type","_id":"docspanish","_version":1}
curl -XGET 'localhost:9200/index/_analyze?pretty=1&analyzer=english' -d 'lol'
{
"tokens" : [ {
"token" : "lol",
"start_offset" : 0,
"end_offset" : 3,
"type" : "<ALPHANUM>",
"position" : 1
} ]
}
curl -XPUT 'localhost:9200/index/type/docenglish' -d '{
"lang":"english",
"text":"english lol stuff"
}'
{"ok":true,"_index":"index","_type":"type","_id":"docenglish","_version":1}
curl -XPOST 'localhost:9200/index/_optimize?refresh=true&flush=true&wait_for_merge=true'
{"ok":true,"_shards":{"total":1,"successful":1,"failed":0}}
curl -XGET 'localhost:9200/index/type/_search?pretty=1&fields=*' -d '{
"query":{
"term":{
"text":"lol"
}
}
}'
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"hits" : {
"total" : 2,
"max_score" : 0.2972674,
"hits" : [ {
"_index" : "index",
"_type" : "type",
"_id" : "docspanish",
"_score" : 0.2972674,
"fields" : {
"lang" : "spanish",
"text" : "spanish lola stuff"
}
}, {
"_index" : "index",
"_type" : "type",
"_id" : "docenglish",
"_score" : 0.2972674,
"fields" : {
"lang" : "english",
"text" : "english lol stuff"
}
} ]
}
}
# No highlight is returned for the Spanish document, whose word "lola" was stemmed to "lol"
curl -XGET 'localhost:9200/index/type/_search?pretty=1&fields=*' -d '{
"query":{
"term":{
"text": "lol"
}
},
"highlight":{
"fields":{
"text":{ "number_of_fragments":0 }
}
}
}'
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"hits" : {
"total" : 2,
"max_score" : 0.2972674,
"hits" : [ {
"_index" : "index",
"_type" : "type",
"_id" : "docspanish",
"_score" : 0.2972674,
"fields" : {
"lang" : "spanish",
"text" : "spanish lola stuff"
}
}, {
"_index" : "index",
"_type" : "type",
"_id" : "docenglish",
"_score" : 0.2972674,
"fields" : {
"lang" : "english",
"text" : "english lol stuff"
},
"highlight" : {
"text" : [ "english <em>lol</em> stuff" ]
}
} ]
}
}
# test with _source disabled, term_vector=with_positions_offsets, fields stored
curl -XDELETE 'localhost:9200/index'
{"ok":true,"acknowledged":true}
curl -XPUT 'localhost:9200/index' -d '{"settings":{"index":{"number_of_shards":1,"number_of_replicas":0}}}'
{"ok":true,"acknowledged":true}
curl -XPUT 'localhost:9200/index/type/_mapping' -d '{
"type":{
"_source":{ "enabled":false },
"_analyzer":{ "path":"lang" },
"properties":{
"text":{
"type":"string",
"store":true,
"index":"analyzed",
"term_vector":"with_positions_offsets"
},
"lang":{
"type":"string",
"store":true,
"index":"not_analyzed"
}
}
}
}'
{"ok":true,"acknowledged":true}
curl -XGET 'localhost:9200/index/_analyze?pretty=1&analyzer=spanish' -d 'lola'
{
"tokens" : [ {
"token" : "lol",
"start_offset" : 0,
"end_offset" : 4,
"type" : "<ALPHANUM>",
"position" : 1
} ]
}
curl -XPUT 'localhost:9200/index/type/docspanish' -d '{
"lang":"spanish",
"text":"spanish lola stuff"
}'
{"ok":true,"_index":"index","_type":"type","_id":"docspanish","_version":1}
curl -XPUT 'localhost:9200/index/type/docenglish' -d '{
"lang":"english",
"text":"english lol stuff"
}'
{"ok":true,"_index":"index","_type":"type","_id":"docenglish","_version":1}
curl -XPOST 'localhost:9200/index/_optimize?refresh=true&flush=true&wait_for_merge=true'
{"ok":true,"_shards":{"total":1,"successful":1,"failed":0}}
curl -XGET 'localhost:9200/index/type/_search?pretty=1&fields=*' -d '{
"query":{
"term":{
"text": "lol"
}
},
"highlight":{
"fields":{
"text":{ "number_of_fragments":0 }
}
}
}'
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"hits" : {
"total" : 2,
"max_score" : 0.2972674,
"hits" : [ {
"_index" : "index",
"_type" : "type",
"_id" : "docspanish",
"_score" : 0.2972674,
"fields" : {
"lang" : "spanish",
"text" : "spanish lola stuff"
},
"highlight" : {
"text" : [ "spanish <em>lola</em> stuff " ]
}
}, {
"_index" : "index",
"_type" : "type",
"_id" : "docenglish",
"_score" : 0.2972674,
"fields" : {
"lang" : "english",
"text" : "english lol stuff"
},
"highlight" : {
"text" : [ "english <em>lol</em> stuff " ]
}
} ]
}
}
# test with _source enabled, term_vector=with_positions_offsets, fields stored
curl -XDELETE 'localhost:9200/index'
{"ok":true,"acknowledged":true}
curl -XPUT 'localhost:9200/index' -d '{"settings":{"index":{"number_of_shards":1,"number_of_replicas":0}}}'
{"ok":true,"acknowledged":true}
curl -XPUT 'localhost:9200/index/type/_mapping' -d '{
"type":{
"_source":{ "enabled":true },
"_analyzer":{ "path":"lang" },
"properties":{
"text":{
"type":"string",
"store":true,
"index":"analyzed",
"term_vector":"with_positions_offsets"
},
"lang":{
"type":"string",
"store":true,
"index":"not_analyzed"
}
}
}
}'
{"ok":true,"acknowledged":true}
curl -XPUT 'localhost:9200/index/type/docspanish' -d '{
"lang":"spanish",
"text":"spanish lola stuff"
}'
{"ok":true,"_index":"index","_type":"type","_id":"docspanish","_version":1}
curl -XPUT 'localhost:9200/index/type/docenglish' -d '{
"lang":"english",
"text":"english lol stuff"
}'
{"ok":true,"_index":"index","_type":"type","_id":"docenglish","_version":1}
curl -XPOST 'localhost:9200/index/_optimize?refresh=true&flush=true&wait_for_merge=true'
{"ok":true,"_shards":{"total":1,"successful":1,"failed":0}}
curl -XGET 'localhost:9200/index/type/_search?pretty=1&fields=*' -d '{
"query":{
"term":{
"text": "lol"
}
},
"highlight":{
"fields":{
"text":{ "number_of_fragments":0 }
}
}
}'
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"hits" : {
"total" : 2,
"max_score" : 0.2972674,
"hits" : [ {
"_index" : "index",
"_type" : "type",
"_id" : "docspanish",
"_score" : 0.2972674,
"fields" : {
"lang" : "spanish",
"text" : "spanish lola stuff"
},
"highlight" : {
"text" : [ "spanish <em>lola</em> stuff " ]
}
}, {
"_index" : "index",
"_type" : "type",
"_id" : "docenglish",
"_score" : 0.2972674,
"fields" : {
"lang" : "english",
"text" : "english lol stuff"
},
"highlight" : {
"text" : [ "english <em>lol</em> stuff " ]
}
} ]
}
}
# test with _source disabled, term_vector=no, fields stored
curl -XDELETE 'localhost:9200/index'
{"ok":true,"acknowledged":true}
curl -XPUT 'localhost:9200/index' -d '{"settings":{"index":{"number_of_shards":1,"number_of_replicas":0}}}'
{"ok":true,"acknowledged":true}
curl -XPUT 'localhost:9200/index/type/_mapping' -d '{
"type":{
"_source":{ "enabled":false },
"_analyzer":{ "path":"lang" },
"properties":{
"text":{
"type":"string",
"store":true,
"index":"analyzed",
"term_vector":"no"
},
"lang":{
"type":"string",
"store":true,
"index":"not_analyzed"
}
}
}
}'
{"ok":true,"acknowledged":true}
curl -XGET 'localhost:9200/index/_analyze?pretty=1&analyzer=spanish' -d 'lola'
{
"tokens" : [ {
"token" : "lol",
"start_offset" : 0,
"end_offset" : 4,
"type" : "<ALPHANUM>",
"position" : 1
} ]
}
curl -XPUT 'localhost:9200/index/type/docspanish' -d '{
"lang":"spanish",
"text":"spanish lola stuff"
}'
{"ok":true,"_index":"index","_type":"type","_id":"docspanish","_version":1}
curl -XPUT 'localhost:9200/index/type/docenglish' -d '{
"lang":"english",
"text":"english lol stuff"
}'
{"ok":true,"_index":"index","_type":"type","_id":"docenglish","_version":1}
curl -XPOST 'localhost:9200/index/_optimize?refresh=true&flush=true&wait_for_merge=true'
{"ok":true,"_shards":{"total":1,"successful":1,"failed":0}}
curl -XGET 'localhost:9200/index/type/_search?pretty=1&fields=*' -d '{
"query":{
"term":{
"text": "lol"
}
},
"highlight":{
"fields":{
"text":{ "number_of_fragments":0 }
}
}
}'
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"hits" : {
"total" : 2,
"max_score" : 0.2972674,
"hits" : [ {
"_index" : "index",
"_type" : "type",
"_id" : "docspanish",
"_score" : 0.2972674,
"fields" : {
"lang" : "spanish",
"text" : "spanish lola stuff"
}
}, {
"_index" : "index",
"_type" : "type",
"_id" : "docenglish",
"_score" : 0.2972674,
"fields" : {
"lang" : "english",
"text" : "english lol stuff"
},
"highlight" : {
"text" : [ "english <em>lol</em> stuff" ]
}
} ]
}
}
# test with _source enabled, term_vector=no, fields stored
curl -XDELETE 'localhost:9200/index'
{"ok":true,"acknowledged":true}
curl -XPUT 'localhost:9200/index' -d '{"settings":{"index":{"number_of_shards":1,"number_of_replicas":0}}}'
{"ok":true,"acknowledged":true}
curl -XPUT 'localhost:9200/index/type/_mapping' -d '{
"type":{
"_source":{ "enabled":true },
"_analyzer":{ "path":"lang" },
"properties":{
"text":{
"type":"string",
"store":true,
"index":"analyzed",
"term_vector":"no"
},
"lang":{
"type":"string",
"store":true,
"index":"not_analyzed"
}
}
}
}'
{"ok":true,"acknowledged":true}
curl -XPUT 'localhost:9200/index/type/docspanish' -d '{
"lang":"spanish",
"text":"spanish lola stuff"
}'
{"ok":true,"_index":"index","_type":"type","_id":"docspanish","_version":1}
curl -XPUT 'localhost:9200/index/type/docenglish' -d '{
"lang":"english",
"text":"english lol stuff"
}'
{"ok":true,"_index":"index","_type":"type","_id":"docenglish","_version":1}
curl -XPOST 'localhost:9200/index/_optimize?refresh=true&flush=true&wait_for_merge=true'
{"ok":true,"_shards":{"total":1,"successful":1,"failed":0}}
curl -XGET 'localhost:9200/index/type/_search?pretty=1&fields=*' -d '{
"query":{
"term":{
"text": "lol"
}
},
"highlight":{
"fields":{
"text":{ "number_of_fragments":0 }
}
}
}'
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"hits" : {
"total" : 2,
"max_score" : 0.2972674,
"hits" : [ {
"_index" : "index",
"_type" : "type",
"_id" : "docspanish",
"_score" : 0.2972674,
"fields" : {
"lang" : "spanish",
"text" : "spanish lola stuff"
}
}, {
"_index" : "index",
"_type" : "type",
"_id" : "docenglish",
"_score" : 0.2972674,
"fields" : {
"lang" : "english",
"text" : "english lol stuff"
},
"highlight" : {
"text" : [ "english <em>lol</em> stuff" ]
}
} ]
}
}
# Works with term_vector=with_positions_offsets, _source enabled or not (field stored)
set +v
@ofavre
Copy link
Author

ofavre commented Jun 23, 2011

Conclusion:

  • either _source or the field's "store" option must be enabled
  • for highlighting to work properly with stemming, term_vector=with_positions_offsets is required (no lesser setting is enough)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment