Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Trying to do language-specific searching with multi_field in ES
# Create an index with some language support:
http://localhost:9200/items/_settings
{
items: {
settings: {
index.version.created: "200499",
index.number_of_replicas: "1",
index.analysis.analyzer.en.type: "english",
index.number_of_shards: "5",
index.analysis.analyzer.null.type: "standard",
index.analysis.analyzer.fr.type: "french",
index.analysis.analyzer.de.type: "german"
}
}
}
#Test1:
curl -XPUT 'http://localhost:9200/items/testLanguage/_mapping' -d'{
"testLanguage" : {
"_analyzer" : {
"path" : "language"
},
"properties" : {
"name" : {
"type" : "multi_field",
"fields" : {
"name" : {"type" : "string", "index" : "analyzed"},
"untouched" : {"type" : "string", "index" : "not_analyzed"}
}
},
"language" : {
"type" : "string",
"index":"not_analyzed"
}
}
}
}'
# When I go to check the mapping, I get:
{
"testLanguage":{
"_analyzer":{
"path":"language"
},
"properties":{
"language":{
"type":"string",
"index":"not_analyzed",
"omit_norms":true,
"index_options":"docs"
},
"name":{
"type":"multi_field",
"fields":{
"name":{
"type":"string"
},
"untouched":{
"type":"string",
"index":"not_analyzed",
"omit_norms":true,
"index_options":"docs",
"include_in_all":false
}
}
}
}
}
}
# I spotted the extra options (omit_norms, index_options, include_in_all) that were added by ES
# Add some content and try to expose different results using certain analyzers
curl -XPOST 'http://localhost:9200/items/testLanguage/' -d'{"language":"en","name":"The boy went to die in the shop"}'
curl -XPOST 'http://localhost:9200/items/testLanguage/' -d'{"language":"de","name":"The boy went to die in the shop"}'
curl -XPOST 'http://localhost:9200/items/testLanguage/' -d'{"language":"null","name":"The boy went to die in the shop"}'
http://localhost:9200/items/testLanguage/_search?q=die (i.e. the default standard analyzer)
# Gives the english and null (standard) results
http://localhost:9200/items/testLanguage/_search?q=die&analyzer=english
#(same results as above)
http://localhost:9200/items/testLanguage/_search?q=die&analyzer=german
#(or analyzer=de) gets 0 results
http://localhost:9200/items/testLanguage/_search?q=the%20boy%20went%20to%20die%20in%20the%20shop
#All 3 results in the order standard, english, german
http://localhost:9200/items/testLanguage/_search?q=the%20boy%20went%20to%20die%20in%20the%20shop&analyzer=en
#All 3 results in the order english, standard, german
http://localhost:9200/items/testLanguage/_search?q=the%20boy%20went%20to%20die%20in%20the%20shop&analyzer=german
#All 3 results in the order german, standard, english
curl -XPOST 'http://localhost:9200/items/testLanguage/' -d'{"language":"en","name":"How to create a sales plan for people who like shopping"}'
curl -XPOST 'http://localhost:9200/items/testLanguage/' -d'{"language":"de","name":"How to create a sales plan for people who like shopping"}'
curl -XPOST 'http://localhost:9200/items/testLanguage/' -d'{"language":"null","name":"How to create a sales plan for people who like shopping"}'
http://localhost:9200/items/testLanguage/_search?q=shopping
#returns only german and standard (no english)
http://localhost:9200/items/testLanguage/_search?q=shopping&analyzer=de
#Same as above
http://localhost:9200/items/testLanguage/_search?q=shopping&analyzer=english
#Returns only the english result for "How to create a sales plan for people who like shopping" but matches all 3 results of the previously inserted data ("The boy went to die in the shop")
http://localhost:9200/items/testLanguage/_search?q=sales
#Returns only the standard null result
http://localhost:9200/items/testLanguage/_search?q=sales&analyzer=de
#Returns only the german result
http://localhost:9200/items/testLanguage/_search?q=sales&analyzer=english
#Returns only the english result
# Now try and use the untouched field
curl -XPOST "http://localhost:9200/items/testLanguage/_search?pretty=true" -d '
{
"query":{
"term":{
"name.untouched":"shop"
}
}
}
'
# Gives 0 results
# I looked at kimchy's example (https://gist.github.com/kimchy/1296043)
# When I run it with:
curl -XGET localhost:9200/test/_search?q=name.untouched:test
# I get 0 results.
# I understand that the issue is to do with what is being stored/indexed, but I've tried changing some of the indexing variables:
curl -XPUT 'http://localhost:9200/items/testLanguage1/_mapping' -d'{"testLanguage":{"_analyzer":{"path":"language"},"properties":{"language":{"type":"string","index":"not_analyzed","omit_norms":true,"index_options":"docs"},"name":{"type":"multi_field","fields":{"name":{"type":"string"},"untouched":{"type":"string","index":"not_analyzed","omit_norms":false,"index_options":"freqs","include_in_all":true}}}}}}'
# FYI, I changed omit_norms, index_options and include_in_all
# When I ask ES for the mapping, I get:
{
testLanguage1: {
_analyzer: {
path: "language"
},
properties: {
language: {
type: "string",
index: "not_analyzed",
omit_norms: true,
index_options: "docs"
},
name: {
type: "multi_field",
fields: {
name: {
type: "string"
},
untouched: {
type: "string",
index: "not_analyzed",
omit_norms: true,
index_options: "freqs",
include_in_all: false
}
}
}
}
}
}
# The value of index_options has changed, yet all the other values have reverted.
# All I want is the ability to search with terms over the not_analyzed multi_field sub-field so that, in the examples above, I can do a language-specific search and then a language-agnostic search.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment