Created
July 5, 2011 12:29
-
-
Save ofavre/1064746 to your computer and use it in GitHub Desktop.
Highlight not working for multi-fields if each subfield is not stored, "main subfield" stored, and _source disabled
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print each script line as the shell reads it | |
set -v | |
# Delete any index left over from a previous run | |
curl -XDELETE 'localhost:9200/bugindex' | |
# Create the "bugindex" index with a single shard and no replicas | |
curl -XPUT 'localhost:9200/bugindex' -d '{ | |
"settings" : { | |
"index" : { | |
"number_of_shards" : 1, | |
"number_of_replicas" : 0 | |
} | |
} | |
}' | |
# Define the mapping of the "bugtype" type: | |
# - _source is disabled and _all is enabled | |
# - "in_all" and "simple" are analyzed and stored | |
# - "multi" is a multi_field whose main subfield "multi" is stored but not indexed, | |
#   while its "exact" and "english" subfields are analyzed, not stored, | |
#   and keep term vectors with positions and offsets | |
curl -XPUT 'localhost:9200/bugindex/bugtype/_mapping' -d '{ | |
"bugtype" : { | |
"_source" : { "enabled" : false }, | |
"_all" : { "enabled" : true }, | |
"properties" : { | |
"_all" : { "type" : "string" , "index" : "analyzed" , "store" : "no" , "analyzer" : "whitespace" }, | |
"in_all" : { "type" : "string" , "index" : "analyzed" , "store" : "yes" , "analyzer" : "keyword" }, | |
"simple" : { "type" : "string" , "index" : "analyzed" , "store" : "yes" , "analyzer" : "whitespace" }, | |
"multi" : { "type" : "multi_field" , | |
"fields" : { | |
"multi" : { "type" : "string" , "index" : "no" , "store" : "yes" , "include_in_all" : "no" }, | |
"exact" : { "type" : "string" , "index" : "analyzed" , "store" : "no" , "analyzer" : "whitespace" , "term_vector" : "with_positions_offsets" }, | |
"english" : { "type" : "string" , "index" : "analyzed" , "store" : "no" , "analyzer" : "english" , "term_vector" : "with_positions_offsets" } | |
} | |
} | |
} | |
} | |
}' | |
# Index a single test document (id 1) with a value for each of the three mapped fields | |
curl -XPUT 'localhost:9200/bugindex/bugtype/1' -d '{ | |
"in_all" : "key word 1", | |
"simple" : "ab bc cd", | |
"multi" : "out of the box" | |
}' | |
# Make it available for search (optimize the index, with refresh and flush requested) | |
curl -XPOST 'localhost:9200/bugindex/_optimize?refresh=true&flush=true' | |
# Works as intended: | |
# term query on the stored field "simple", highlighting that same field | |
curl -XGET 'localhost:9200/bugindex/bugtype/_search?pretty=1&fields=*' -d '{ | |
"query" : { | |
"term" : { | |
"simple" : "bc" | |
} | |
}, | |
"highlight" : { | |
"fields" : { | |
"simple" : { | |
"number_of_fragments" : 0 | |
} | |
} | |
} | |
}' | |
# Works as intended: | |
# term query on the stored, keyword-analyzed field "in_all", highlighting that same field | |
curl -XGET 'localhost:9200/bugindex/bugtype/_search?pretty=1&fields=*' -d '{ | |
"query" : { | |
"term" : { | |
"in_all" : "key word 1" | |
} | |
}, | |
"highlight" : { | |
"fields" : { | |
"in_all" : { | |
"number_of_fragments":0 | |
} | |
} | |
} | |
}' | |
# Works only partially (depending on the following): the highlight works only if | |
# - either _source is enabled | |
# - or multi.exact is stored | |
# Neither of the two is acceptable: the index size is already a bit too large, | |
# and we run with _source disabled and all fields stored instead, | |
# as disk seeks are not so problematic with SSDs. | |
# Here both the query and the highlight target the non-stored subfield "multi.exact". | |
curl -XGET 'localhost:9200/bugindex/bugtype/_search?pretty=1&fields=*' -d '{ | |
"query" : { | |
"term" : { | |
"multi.exact" : "box" | |
} | |
}, | |
"highlight" : { | |
"fields" : { | |
"multi.exact" : { | |
"number_of_fragments":0 | |
} | |
} | |
} | |
}' | |
# Search OK, but no highlight is returned. | |
# The query targets "multi.exact" while the highlight targets the stored main subfield "multi". | |
# Searching against "multi" itself (the same field as the highlighted one) | |
# won't work because it's not indexed. | |
curl -XGET 'localhost:9200/bugindex/bugtype/_search?pretty=1&fields=*' -d '{ | |
"query" : { | |
"term" : { | |
"multi.exact" : "box" | |
} | |
}, | |
"highlight" : { | |
"fields" : { | |
"multi" : { | |
"number_of_fragments":0 | |
} | |
} | |
} | |
}' | |
# It feels like we should: | |
# - Use a multi-field to analyze a single field in different manners | |
# - Index only the subfields | |
# - Store only the "main" subfield, | |
# as it would result in storing multiple times the exact same data otherwise | |
# - Query against one subfield (not the "main" subfield) | |
# However with highlighting we would like to | |
# - highlight against the multi-field (ie the "main" subfield) | |
# - group highlights of searches against all the subfields | |
# (I agree that not everybody may wish for this, though) | |
# A solution would consist of either: | |
# - Accessing the "main" field's value (either stored or from source) | |
# when trying to access the subfield's value. | |
# - Not using multi-fields and using special analyzers that would | |
# multiplex the terms from multiple sub-analyzers | |
# Implementing this last solution may be beneficial also for: | |
# - Text search where you may wish to index exact, normalized and stemmed | |
# terms in the same field index. | |
# - Group analyzed terms from multiple fields into a kind of _all_terms field | |
# which would leverage differentiated analyzers and the speed of searching | |
# against a single field index. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment