-
-
Save derryos/2218785ca960e3a4f30f to your computer and use it in GitHub Desktop.
Trying to do language specific searching with multi_field in ES
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create an index with some language support: | |
http://localhost:9200/items/_settings | |
{ | |
items: { | |
settings: { | |
index.version.created: "200499", | |
index.number_of_replicas: "1", | |
index.analysis.analyzer.en.type: "english", | |
index.number_of_shards: "5", | |
index.analysis.analyzer.null.type: "standard", | |
index.analysis.analyzer.fr.type: "french", | |
index.analysis.analyzer.de.type: "german" | |
} | |
} | |
} | |
#Test1: | |
curl -XPUT 'http://localhost:9200/items/testLanguage/_mapping' -d'{ | |
"testLanguage" : { | |
"_analyzer" : { | |
"path" : "language" | |
}, | |
"properties" : { | |
"name" : { | |
"type" : "multi_field", | |
"fields" : { | |
"name" : {"type" : "string", "index" : "analyzed"}, | |
"untouched" : {"type" : "string", "index" : "not_analyzed"} | |
} | |
}, | |
"language" : { | |
"type" : "string", | |
"index":"not_analyzed" | |
} | |
} | |
} | |
}' | |
# When I check the mapping, I get: | |
{ | |
"testLanguage":{ | |
"_analyzer":{ | |
"path":"language" | |
}, | |
"properties":{ | |
"language":{ | |
"type":"string", | |
"index":"not_analyzed", | |
"omit_norms":true, | |
"index_options":"docs" | |
}, | |
"name":{ | |
"type":"multi_field", | |
"fields":{ | |
"name":{ | |
"type":"string" | |
}, | |
"untouched":{ | |
"type":"string", | |
"index":"not_analyzed", | |
"omit_norms":true, | |
"index_options":"docs", | |
"include_in_all":false | |
} | |
} | |
} | |
} | |
} | |
} | |
# I spotted the extra options (omit_norms, index_options, include_in_all) that were added by ES | |
# Add some content and try to expose different results using certain analyzers | |
curl -XPOST 'http://localhost:9200/items/testLanguage/' -d'{"language":"en","name":"The boy went to die in the shop"}' | |
curl -XPOST 'http://localhost:9200/items/testLanguage/' -d'{"language":"de","name":"The boy went to die in the shop"}' | |
curl -XPOST 'http://localhost:9200/items/testLanguage/' -d'{"language":"null","name":"The boy went to die in the shop"}' | |
http://localhost:9200/items/testLanguage/_search?q=die (i.e. standard analyzer) | |
#gives english/null(standard) results | |
http://localhost:9200/items/testLanguage/_search?q=die&analyzer=english | |
#(same results as above) | |
http://localhost:9200/items/testLanguage/_search?q=die&analyzer=german | |
#(or analyzer=de) gets 0 results | |
http://localhost:9200/items/testLanguage/_search?q=the%20boy%20went%20to%20die%20in%20the%20shop | |
#All 3 results in the order standard, english, german | |
http://localhost:9200/items/testLanguage/_search?q=the%20boy%20went%20to%20die%20in%20the%20shop&analyzer=en | |
#All 3 results in the order english, standard, german | |
http://localhost:9200/items/testLanguage/_search?q=the%20boy%20went%20to%20die%20in%20the%20shop&analyzer=german | |
#All 3 results in the order german, standard, english | |
curl -XPOST 'http://localhost:9200/items/testLanguage/' -d'{"language":"en","name":"How to create a sales plan for people who like shopping"}' | |
curl -XPOST 'http://localhost:9200/items/testLanguage/' -d'{"language":"de","name":"How to create a sales plan for people who like shopping"}' | |
curl -XPOST 'http://localhost:9200/items/testLanguage/' -d'{"language":"null","name":"How to create a sales plan for people who like shopping"}' | |
http://localhost:9200/items/testLanguage/_search?q=shopping | |
#returns only german and standard (no english) | |
http://localhost:9200/items/testLanguage/_search?q=shopping&analyzer=de | |
#Same as above | |
http://localhost:9200/items/testLanguage/_search?q=shopping&analyzer=english | |
#Returns only the english result for "How to create a sales plan for people who like shopping" but matches all 3 results of the previously inserted data ("The boy went to die in the shop") | |
http://localhost:9200/items/testLanguage/_search?q=sales | |
#Returns only the standard null result | |
http://localhost:9200/items/testLanguage/_search?q=sales&analyzer=de | |
#Returns only the german result | |
http://localhost:9200/items/testLanguage/_search?q=sales&analyzer=english | |
#Returns only the english result | |
# Now try and use the untouched field | |
curl -XPOST "http://localhost:9200/items/testLanguage/_search?pretty=true" -d ' | |
{ | |
"query":{ | |
"term":{ | |
"name.untouched":"shop" | |
} | |
} | |
} | |
' | |
# Gives 0 results | |
# I looked at kimchy's example (https://gist.github.com/kimchy/1296043) | |
# When i run it with: | |
curl -XGET localhost:9200/test/_search?q=name.untouched:test | |
# i get 0 results. | |
# I understand that the issue is to do with what is being stored/indexed, but I've tried changing some of the indexing variables: | |
curl -XPUT 'http://localhost:9200/items/testLanguage1/_mapping' -d'{"testLanguage":{"_analyzer":{"path":"language"},"properties":{"language":{"type":"string","index":"not_analyzed","omit_norms":true,"index_options":"docs"},"name":{"type":"multi_field","fields":{"name":{"type":"string"},"untouched":{"type":"string","index":"not_analyzed","omit_norms":false,"index_options":"freqs","include_in_all":true}}}}}}' | |
#FYI, i changed omit_norms, index_options and include_in_all | |
# When i ask ES for the mapping, i get: | |
{ | |
testLanguage1: { | |
_analyzer: { | |
path: "language" | |
}, | |
properties: { | |
language: { | |
type: "string", | |
index: "not_analyzed", | |
omit_norms: true, | |
index_options: "docs" | |
}, | |
name: { | |
type: "multi_field", | |
fields: { | |
name: { | |
type: "string" | |
}, | |
untouched: { | |
type: "string", | |
index: "not_analyzed", | |
omit_norms: true, | |
index_options: "freqs", | |
include_in_all: false | |
} | |
} | |
} | |
} | |
} | |
} | |
# The value of index_options has changed, yet all the other values have reverted. | |
# All I want is the ability to search with terms over the not_analyzed multi_field so that, in the examples above, I can do a language-specific and then a language-ambiguous search. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment