ElasticSearch - filename search using nGram

# Delete previous tests
 
curl -XDELETE 'http://127.0.0.1:9200/files/?pretty=1'
 
 
# Setup
 
curl -XPUT 'http://127.0.0.1:9200/files/?pretty=1' -d '
{
    "settings" : {
        "analysis" : {
            "analyzer" : {
                "filename_analyzer" : {
                    "tokenizer" : "filename_tokenizer",
                    "filter" : ["lowercase"]
                }
            },
            "tokenizer" : {
                "filename_tokenizer" : {
                    "type" : "nGram",
                    "max_gram" : 100,
                    "min_gram" : 2
                }
            }
        }
    },
    "mappings" : {
        "file" : {
            "properties" : {
                "filename" : {
                    "type" : "string",
                    "analyzer" : "filename_analyzer"
                }
            }
        }
    }
}
'
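
# (Not part of the original test, but as a quick sanity check the analyzer can
# be inspected with the _analyze API, assuming the index above was created
# successfully:)

curl -XGET 'http://127.0.0.1:9200/files/_analyze?pretty=1&text=My_file_2012.01.12.txt&analyzer=filename_analyzer'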
 
 
# Insert some documents:
 
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.01.12.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.01.05.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.05.01.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.08.27.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.12.12.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2011.12.12.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "file_01_2012.09.09.txt" }'
curl -X POST 'http://localhost:9200/files/_refresh'
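
# (Optional sanity check, not in the original gist: the _count API should
# report the seven documents indexed above.)

curl -XGET 'http://127.0.0.1:9200/files/file/_count?pretty=1'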
 
 
# Find all documents except the last one (which contains "2011"):
 
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
    "query" : {
        "span_near" : {
            "clauses" : [
                { "span_term" : { "filename" : "2012" } }
            ],
            "slop" : 100,
            "in_order" : true
        }
    }
}
'
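
# (For comparison only, not part of the original test: since "2012" is itself
# one of the nGram tokens emitted by the analyzer, a plain term query should
# return the same six documents.)

curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
    "query" : {
        "term" : { "filename" : "2012" }
    }
}
'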
 
 
# Find all documents which contain "12" followed by "01" (the first three
# documents):
 
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
    "query" : {
        "span_near" : {
            "clauses" : [
                { "span_term" : { "filename" : "12" } },
                { "span_term" : { "filename" : "01" } }
            ],
            "slop" : 100,
            "in_order" : true
        }
    }
}
'
 
 
# BUT this search does not work:
 
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
    "query" : {
        "span_near" : {
            "clauses" : [
                { "span_term" : { "filename" : "2012" } },
                { "span_term" : { "filename" : "01" } }
            ],
            "slop" : 100,
            "in_order" : true
        }
    }
}
'
 
 
# However with "in_order" set to false it works, but it returns the unwanted
# "file_01_2012.09.09.txt":
 
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
    "query" : {
        "span_near" : {
            "clauses" : [
                { "span_term" : { "filename" : "2012" } },
                { "span_term" : { "filename" : "01" } }
            ],
            "slop" : 100,
            "in_order" : false
        }
    }
}
'
 
 
# I think the "2012 followed by 01" query does not work because of how the
# nGram tokenizer assigns each token's position, which the "span_near" query
# uses to determine the order of the terms:
 
curl -XGET 'http://127.0.0.1:9200/files/_analyze?pretty=1&text=My_file_2012.01.05.txt&analyzer=filename_analyzer'
...
{
    "token" : "01", ------------> "01" inside "2012"
    "start_offset" : 9,
    "end_offset" : 11,
    "type" : "word",
    "position" : 10
},
{
    "token" : "12",
    "start_offset" : 10,
    "end_offset" : 12,
    "type" : "word",
    "position" : 11
},
...
{
    "token" : "01", ------------> "01" inside ".01."
    "start_offset" : 13,
    "end_offset" : 15,
    "type" : "word",
    "position" : 14
}
...
{
    "token" : "2012",
    "start_offset" : 8,
    "end_offset" : 12,
    "type" : "word",
    "position" : 50
}
...
 
# The nGram tokenizer simply increments the position for each token it emits:
# first it generates the tokens with two characters, which get "position" values
# from 1 to 21, then the tokens with three characters (positions 22 to 41), and
# so on.
# So when searching for "2012" and "01", the position values 50 and 10 are used,
# which are not in order, and the files are not found. But when searching for
# "12" and "01", the values 11 and 14 are used, which are in order.
