# Delete the "files" index left over from previous test runs.
curl -XDELETE 'http://127.0.0.1:9200/files/?pretty=1'
# Setup: create the "files" index.
#   - "filename_analyzer" combines the custom "filename_tokenizer" with the
#     "lowercase" filter.
#   - "filename_tokenizer" is an NGram tokenizer emitting grams of 2 to 100
#     characters.
#   - The "filename" property of the "file" mapping is analyzed with
#     "filename_analyzer".
curl -XPUT 'http://127.0.0.1:9200/files/?pretty=1' -d '
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "filename_analyzer" : {
          "tokenizer" : "filename_tokenizer",
          "filter" : ["lowercase"]
        }
      },
      "tokenizer" : {
        "filename_tokenizer" : {
          "type" : "NGram",
          "max_gram" : 100,
          "min_gram" : 2
        }
      }
    }
  },
  "mappings" : {
    "file" : {
      "properties" : {
        "filename" : {
          "type" : "string",
          "analyzer" : "filename_analyzer"
        }
      }
    }
  }
}
'
# Insert some documents (one POST per filename), then refresh the index so
# they are visible to the searches below.
for filename in \
  'My_file_2012.01.12.txt' \
  'My_file_2012.01.05.txt' \
  'My_file_2012.05.01.txt' \
  'My_file_2012.08.27.txt' \
  'My_file_2012.12.12.txt' \
  'My_file_2011.12.12.txt' \
  'file_01_2012.09.09.txt'; do
  curl -X POST 'http://localhost:9200/files/file' \
    -d "{ \"filename\" : \"${filename}\" }"
done
curl -X POST 'http://localhost:9200/files/_refresh'
# Find all documents except "My_file_2011.12.12.txt" (the only one whose
# filename contains "2011" instead of "2012"):
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
  "query" : {
    "span_near" : {
      "clauses" : [
        { "span_term" : { "filename" : "2012" } }
      ],
      "slop" : 100,
      "in_order" : true
    }
  }
}
'
# Find all documents which contain "12" followed by "01" (the first three
# documents):
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
  "query" : {
    "span_near" : {
      "clauses" : [
        { "span_term" : { "filename" : "12" } },
        { "span_term" : { "filename" : "01" } }
      ],
      "slop" : 100,
      "in_order" : true
    }
  }
}
'
# BUT this search ("2012" followed by "01", in order) does not work:
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
  "query" : {
    "span_near" : {
      "clauses" : [
        { "span_term" : { "filename" : "2012" } },
        { "span_term" : { "filename" : "01" } }
      ],
      "slop" : 100,
      "in_order" : true
    }
  }
}
'
# However with "in_order" set to false it works, but it returns the unwanted
# "file_01_2012.09.09.txt" (where "01" comes before "2012"):
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
  "query" : {
    "span_near" : {
      "clauses" : [
        { "span_term" : { "filename" : "2012" } },
        { "span_term" : { "filename" : "01" } }
      ],
      "slop" : 100,
      "in_order" : false
    }
  }
}
'
# I think the "2012 followed by 01" query does not work because of how the
# nGram tokenizer determines each token's position, which the "span_near"
# query uses to determine the order of the terms:
curl -XGET 'http://127.0.0.1:9200/files/_analyze?pretty=1&text=My_file_2012.01.05.txt&analyzer=filename_analyzer'
# Excerpt of the response (kept as comments so this file stays valid shell):
# ...
# {
#   "token" : "01",          ------------> "01" inside "2012"
#   "start_offset" : 9,
#   "end_offset" : 11,
#   "type" : "word",
#   "position" : 10
# },
# {
#   "token" : "12",
#   "start_offset" : 10,
#   "end_offset" : 12,
#   "type" : "word",
#   "position" : 11
# },
# ...
# {
#   "token" : "01",          ------------> "01" inside ".01."
#   "start_offset" : 13,
#   "end_offset" : 15,
#   "type" : "word",
#   "position" : 14
# }
# ...
# {
#   "token" : "2012",
#   "start_offset" : 8,
#   "end_offset" : 12,
#   "type" : "word",
#   "position" : 50
# }
# ...
# The nGram tokenizer just increments the position for each token it emits:
# first it generates the two-character tokens, which get "position" values
# 1 to 21. Then it generates the three-character tokens (positions 22 to 41),
# and so on.
# So when searching for "2012" and "01", the position values 50 and 10 are
# used. These are not in order, so the files are not found. But when using
# "12" and "01", the values 11 and 14 are used, which are in order.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment