Created
February 21, 2013 15:22
-
-
Save lukas-vlcek/5005428 to your computer and use it in GitHub Desktop.
Elasticsearch: Highlighting with nGrams (possible issue?)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
curl -X PUT localhost:9200/myindex -d ' | |
{ | |
"settings" : { | |
"index":{ | |
"number_of_replicas":0, | |
"number_of_shards":1, | |
"analysis":{ | |
"analyzer":{ | |
"default":{ | |
"type":"custom", | |
"tokenizer":"keyword", | |
"filter":[ | |
"lowercase", | |
"my_ngram" | |
] | |
} | |
}, | |
"filter":{ | |
"my_ngram":{ | |
"type":"nGram", | |
"min_gram":1, | |
"max_gram":20 | |
} | |
} | |
} | |
} | |
}, | |
"mappings" : { | |
"product" : { | |
"properties" : { | |
"code" : { | |
"type" : "multi_field", | |
"fields" : { | |
"code" : { | |
"type" : "string", | |
"analyzer" : "default", | |
"store" : "yes" | |
}, | |
"code.ngram" : { | |
"type" : "string", | |
"analyzer" : "default", | |
"store" : "yes", | |
"term_vector":"with_positions_offsets" | |
} | |
} | |
} | |
} | |
} | |
} | |
}' | |
# Now, index data | |
curl -X POST 'localhost:9200/myindex/product' -d '{ | |
"code" : "Samsung Galaxy i7500" | |
}' | |
curl -X POST 'localhost:9200/myindex/product' -d '{ | |
"code" : "Samsung Galaxy 5 Europa" | |
}' | |
curl -X POST 'localhost:9200/myindex/product' -d '{ | |
"code" : "Samsung Galaxy Mini" | |
}' | |
# ============================================ | |
# After a refresh (e.g. curl -X POST 'localhost:9200/myindex/_refresh'), do some queries | |
# First, query just for a single character 'i' | |
# ============================================ | |
curl -X GET 'localhost:9200/myindex/product/_search?pretty' -d '{ | |
"fields" : [ "code" ], | |
"query" : { | |
"term" : { | |
"code" : "i" | |
} | |
}, | |
"highlight" : { | |
"number_of_fragments" : 0, | |
"fields" : { | |
"code.ngram":{} | |
} | |
} | |
}' | |
# Works fine | |
... hit #1 | |
"fields" : { | |
"code" : "Samsung Galaxy Mini" | |
}, | |
"highlight" : { | |
"code.ngram" : [ "Samsung Galaxy M<em>i</em>n<em>i</em>" ] | |
} | |
... hit #2 | |
"fields" : { | |
"code" : "Samsung Galaxy i7500" | |
}, | |
"highlight" : { | |
"code.ngram" : [ "Samsung Galaxy <em>i</em>7500" ] | |
} | |
# Seems to work fine for longer strings as well | |
curl -X GET 'localhost:9200/myindex/product/_search?pretty' -d '{ | |
"fields" : [ "code" ], | |
"query" : { | |
"term" : { | |
"code" : "galaxy 5" | |
} | |
}, | |
"highlight" : { | |
"number_of_fragments" : 0, | |
"fields" : { | |
"code.ngram":{} | |
} | |
} | |
}' | |
... hit #1 | |
"fields" : { | |
"code" : "Samsung Galaxy 5 Europa" | |
}, | |
"highlight" : { | |
"code.ngram" : [ "Samsung <em>Galaxy 5</em> Europa" ] | |
} | |
# ============================================ | |
# The previous query works only if the user provides the search terms in the correct order. | |
# Now, let us try something fancy! Use query_string with whitespace analyzer. | |
# Search for 'mini sam' | |
# ============================================ | |
curl -X GET 'localhost:9200/myindex/product/_search?pretty' -d '{ | |
"fields" : [ "code" ], | |
"query" : { | |
"query_string" : { | |
"default_field" : "code.ngram", | |
"query" : "mini sam", | |
"analyzer" : "whitespace" | |
} | |
}, | |
"highlight" : { | |
"number_of_fragments" : 0, | |
"fields" : { | |
"code.ngram":{} | |
} | |
} | |
}' | |
... hit #1 | |
"fields" : { | |
"code" : "Samsung Galaxy Mini" | |
}, | |
"highlight" : { | |
"code.ngram" : [ "<em>Sam</em>sung Galaxy <em>Mini</em>" ] | |
} | |
... hit #2 | |
"fields" : { | |
"code" : "Samsung Galaxy i7500" | |
}, | |
"highlight" : { | |
"code.ngram" : [ "<em>Sam</em>sung Galaxy i7500" ] | |
} | |
... hit #3 | |
"fields" : { | |
"code" : "Samsung Galaxy 5 Europa" | |
}, | |
"highlight" : { | |
"code.ngram" : [ "<em>Sam</em>sung Galaxy 5 Europa" ] | |
} | |
# Awesome! | |
# ============================================ | |
# But why does the query for 'sam xy' not work? And why does it throw an exception? | |
# ============================================ | |
curl -X GET 'localhost:9200/myindex/product/_search?pretty' -d '{ | |
"fields" : [ "code" ], | |
"query" : { | |
"query_string" : { | |
"default_field" : "code.ngram", | |
"query" : "sam xy", | |
"analyzer" : "whitespace" | |
} | |
}, | |
"highlight" : { | |
"number_of_fragments" : 0, | |
"fields" : { | |
"code.ngram":{} | |
} | |
} | |
}' | |
# Response | |
{ | |
"error" : "SearchPhaseExecutionException[Failed to execute phase [query_fetch], total failure; shardFailures {[YsXZQqRWRP2rcyx6_eR_dQ][myindex][0]: FetchPhaseExecutionException[[myindex][0]: query[filtered(code.code.ngram:sam code.code.ngram:xy)->cache(_type:product)],from[0],size[10]: Fetch Failed [Failed to highlight field [code.ngram]]]; nested: StringIndexOutOfBoundsException[String index out of range: -14]; }]", | |
"status" : 500 | |
} | |
# Server log | |
[2013-02-21 16:21:45,160][DEBUG][action.search.type ] [Arachne] [myindex][0], node[YsXZQqRWRP2rcyx6_eR_dQ], [P], s[STARTED]: Failed to execute [org.elasticsearch.action.search.SearchRequest@7885a30c] | |
org.elasticsearch.search.fetch.FetchPhaseExecutionException: [myindex][0]: query[filtered(code.code.ngram:sam code.code.ngram:xy)->cache(_type:product)],from[0],size[10]: Fetch Failed [Failed to highlight field [code.ngram]] | |
at org.elasticsearch.search.highlight.HighlightPhase.hitExecute(HighlightPhase.java:335) | |
at org.elasticsearch.search.fetch.FetchPhase.execute(FetchPhase.java:250) | |
at org.elasticsearch.search.SearchService.executeFetchPhase(SearchService.java:326) | |
at org.elasticsearch.search.action.SearchServiceTransportAction.sendExecuteFetch(SearchServiceTransportAction.java:243) | |
at org.elasticsearch.action.search.type.TransportSearchQueryAndFetchAction$AsyncAction.sendExecuteFirstPhase(TransportSearchQueryAndFetchAction.java:75) | |
at org.elasticsearch.action.search.type.TransportSearchTypeAction$BaseAsyncAction.performFirstPhase(TransportSearchTypeAction.java:205) | |
at org.elasticsearch.action.search.type.TransportSearchTypeAction$BaseAsyncAction.performFirstPhase(TransportSearchTypeAction.java:192) | |
at org.elasticsearch.action.search.type.TransportSearchTypeAction$BaseAsyncAction$2.run(TransportSearchTypeAction.java:178) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918) | |
at java.lang.Thread.run(Thread.java:680) | |
Caused by: java.lang.StringIndexOutOfBoundsException: String index out of range: -14 | |
at java.lang.String.substring(String.java:1937) | |
at org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder.makeFragment(BaseFragmentsBuilder.java:166) | |
at org.apache.lucene.search.vectorhighlight.AbstractFragmentsBuilder.createFragments(AbstractFragmentsBuilder.java:84) | |
at org.apache.lucene.search.vectorhighlight.FastVectorHighlighter.getBestFragments(FastVectorHighlighter.java:186) | |
at org.elasticsearch.search.highlight.HighlightPhase.hitExecute(HighlightPhase.java:327) | |
... 10 more |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I'm having this same problem. Did you ever figure out what the problem was?