-
-
Save polyfractal/4542494 to your computer and use it in GitHub Desktop.
# Original gist: shingle analyzer demo for old Elasticsearch (pre-2.x).
# NOTE(review): "standard" token filter, enable_position_increments,
# index_analyzer and the "string" type are all gone in modern ES — see the
# updated versions further down this page.
# (Stray " | |" table-extraction artifacts removed from every line so the
# script is runnable as pasted.)

#create a test index with shingle mapping
curl -XPUT localhost:9200/test -d '{
   "settings":{
      "index":{
         "analysis":{
            "analyzer":{
               "analyzer_shingle":{
                  "tokenizer":"standard",
                  "filter":["standard", "lowercase", "filter_stop", "filter_shingle"]
               }
            },
            "filter":{
               "filter_shingle":{
                  "type":"shingle",
                  "max_shingle_size":5,
                  "min_shingle_size":2,
                  "output_unigrams":"true"
               },
               "filter_stop":{
                  "type":"stop",
                  "enable_position_increments":"false"
               }
            }
         }
      }
   },
   "mappings":{
      "product":{
         "properties":{
            "title":{
               "search_analyzer":"analyzer_shingle",
               "index_analyzer":"analyzer_shingle",
               "type":"string"
            }
         }
      }
   }
}'

#Add some docs to the index
curl -XPOST localhost:9200/test/product/1 -d '{"title" : "Sample product title for shingles"}'
curl -XPOST localhost:9200/test/product/2 -d '{"title" : "Another title"}'
curl -XPOST localhost:9200/test/product/3 -d '{"title" : "Shingles is a viral disease"}'

#Analyze API to check out shingling
curl -XGET 'localhost:9200/test/_analyze?analyzer=analyzer_shingle&pretty' -d 'Test text to see shingles' | grep token

#Sample search
curl -XGET 'localhost:9200/test/product/_search?q=title:product+title&pretty'

#this one won't return anything, because of the stop filter
curl -XGET 'localhost:9200/test/product/_search?q=title:is+a&pretty'

#while this one will, because we emit unigrams
curl -XGET 'localhost:9200/test/product/_search?q=title:is+a+viral&pretty'
Working gist for Elasticsearch v5.6:
# Index definition updated for Elasticsearch 5.6:
#  - index_analyzer was removed; "analyzer" now sets the index-time analyzer
#    (search_analyzer kept explicitly, though it defaults to "analyzer").
#  - the "string" mapping type is replaced by "text".
#  - enable_position_increments is dropped from the stop filter (no longer supported).
# NOTE(review): the "standard" token filter is a no-op here and is removed in ES 7 —
# delete it from the filter chain before upgrading.
curl -XPUT localhost:9200/test -d '{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "analyzer_shingle": {
            "tokenizer": "standard",
            "filter": [
              "standard",
              "lowercase",
              "filter_stop",
              "filter_shingle"
            ]
          }
        },
        "filter": {
          "filter_shingle": {
            "type": "shingle",
            "max_shingle_size": 5,
            "min_shingle_size": 2,
            "output_unigrams": "true"
          },
          "filter_stop": {
            "type": "stop"
          }
        }
      }
    }
  },
  "mappings": {
    "product": {
      "properties": {
        "title": {
          "search_analyzer": "analyzer_shingle",
          "analyzer": "analyzer_shingle",
          "type": "text"
        }
      }
    }
  }
}'
I use Elasticsearch 6.1, and I have an issue when querying the shingle field with fuzziness.
The following query works as expected:
GET test/product/_search
{
"query": {
"match": {
"title": {
"query": "Shingles is",
"fuzziness": "AUTO"
}
}
}
}
I would expect that replacing "Shingles is" with "Shingle is" would return the same result due to the fuzziness, however the result is empty. Any help?
version of gist updated for 6.3
# Gist updated for Elasticsearch 6.3 (Content-Type header is mandatory from 6.0).
# Fix: the "standard" token filter is removed from the analyzer chain — it is a
# deprecated no-op in 6.x and is rejected outright in ES 7, so dropping it is
# behavior-preserving and keeps the script upgrade-safe.
curl -XPUT localhost:9200/test -H 'Content-Type: application/json' -d '{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "analyzer_shingle": {
            "tokenizer": "standard",
            "filter": [
              "lowercase",
              "filter_stop",
              "filter_shingle"
            ]
          }
        },
        "filter": {
          "filter_shingle": {
            "type": "shingle",
            "max_shingle_size": 5,
            "min_shingle_size": 2,
            "output_unigrams": "true"
          },
          "filter_stop": {
            "type": "stop"
          }
        }
      }
    }
  },
  "mappings": {
    "product": {
      "properties": {
        "title": {
          "search_analyzer": "analyzer_shingle",
          "analyzer": "analyzer_shingle",
          "type": "text"
        }
      }
    }
  }
}'

#Add some docs to the index
curl -H 'Content-Type: application/json' -XPOST localhost:9200/test/product/1 -d '{"title" : "Sample product title for shingles"}'
curl -H 'Content-Type: application/json' -XPOST localhost:9200/test/product/2 -d '{"title" : "Another title"}'
curl -H 'Content-Type: application/json' -XPOST localhost:9200/test/product/3 -d '{"title" : "Shingles is a viral disease"}'

#Analyze API to check out shingling (6.x takes analyzer/text in a JSON body)
curl -H 'Content-Type: application/json' -XGET 'localhost:9200/test/_analyze?pretty' -d '{ "analyzer": "analyzer_shingle", "text":"Test text to see shingles"}'

#Sample search
curl -XGET 'localhost:9200/test/product/_search?q=title:product+title&pretty'

#this one won't return anything, because of the stop filter
curl -XGET 'localhost:9200/test/product/_search?q=title:is+a&pretty'

#while this one will, because we emit unigrams
curl -XGET 'localhost:9200/test/product/_search?q=title:is+a+viral&pretty'
Hi, I am trying to create an index with the custom analyzer given below. When we push data to this index using a spark-submit job, the job fails with a nodes-not-available exception:
Index mapping :
"settings": {
  "index": {
    "analysis": {
      "analyzer": {
        "analyzer_shingle": {
          "tokenizer": "standard",
          "filter": [
            "english_possessive_stemmer",
            "lowercase",
            "english_stop",
            "filter_stop",
            "english_stemmer",
            "filter_shingle"
          ]
        }
      },
      "filter": {
        "filter_shingle": {
          "type": "shingle",
          "max_shingle_size": 4,
          "min_shingle_size": 2,
          "output_unigrams": "true"
        },
        "english_stemmer": {
          "type": "stemmer",
          "language": "english"
        },
        "english_possessive_stemmer": {
          "type": "stemmer",
          "language": "possessive_english"
        },
        "english_stop": {
          "type": "stop",
          "stopwords": "_english_"
        },
        "filter_stop": {
          "type": "stop",
          "stopwords": [
            "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
            "what", "which", "who", "whom", "this", "that", "these", "those",
            "am", "is", "are", "was", "were", "be", "been", "being",
            "have", "has", "had", "having", "do", "does", "did", "doing",
            "a", "an", "the", "and", "but", "if", "or", "because", "as",
            "until", "while", "of", "at", "by", "for", "with", "about",
            "between", "into", "through", "during", "before", "after",
            "above", "below", "to", "from", "up", "down", "in", "out",
            "on", "off", "over", "under", "again", "then", "once",
            "here", "there", "when", "where", "why", "how",
            "all", "any", "both", "each", "few", "more", "most", "other",
            "some", "such", "no", "nor", "not", "only", "same", "so",
            "than", "too", "very", "s", "t", "can", "will", "just",
            "don", "should", "now",
            "apparatus", "embodiments", "technique", "operation", "operations"
          ]
        }
      }
    }
  }
},
"mappings": {
  "-indexName1": {
    "properties": {
      "cpc": {
        "type": "text",
        "analyzer": "standard"
      },
      "definition": {
        "type": "text",
        "search_analyzer": "analyzer_shingle",
        "analyzer": "analyzer_shingle"
      }
    }
  }
}
exception :
ERROR Executor: Exception in task 8.0 in stage 0.0 (TID 2)
org.elasticsearch.hadoop.rest.EsHadoopNoNodesLeftException: Connection error (check network and/or proxy settings)- all nodes failed; tried [[]]
at org.elasticsearch.hadoop.rest.NetworkClient.execute(NetworkClient.java:149)
at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:461)
at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:445)
at org.elasticsearch.hadoop.rest.RestClient.bulk(RestClient.java:186)
at org.elasticsearch.hadoop.rest.RestRepository.tryFlush(RestRepository.java:222)
at org.elasticsearch.hadoop.rest.RestRepository.flush(RestRepository.java:244)
at org.elasticsearch.hadoop.rest.RestRepository.doWriteToIndex(RestRepository.java:198)
at org.elasticsearch.hadoop.rest.RestRepository.writeToIndex(RestRepository.java:161)
at org.elasticsearch.spark.rdd.EsRDDWriter.write(EsRDDWriter.scala:67)
at org.elasticsearch.spark.rdd.EsSpark$$anonfun$doSaveToEs$1.apply(EsSpark.scala:107)
at org.elasticsearch.spark.rdd.EsSpark$$anonfun$doSaveToEs$1.apply(EsSpark.scala:107)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
ES 7.5.2
1st error:
{"error":"Content-Type header [application/x-www-form-urlencoded] is not supported","status":406}
and 2nd, after adding -H 'Content-Type: application/json' :
{"error":{"root_cause":[{"type":"illegal_argument_exception","reason":"enable_position_increments is not supported anymore. Please fix your analysis chain"}],"type":"illegal_argument_exception","reason":"enable_position_increments is not supported anymore. Please fix your analysis chain"},"status":400}
This one is working with the latest ES version 7 -
PUT /moviesdb
{
    "settings": {
        "index": {
            "number_of_shards": 1,
            "analysis": {
                "filter": {
                    "filter_stop": { "type": "stop" },
                    "filter_shingle": {
                        "type": "shingle",
                        "min_shingle_size": 2,
                        "max_shingle_size": 4,
                        "output_unigrams": "true"
                    }
                },
                "analyzer": {
                    "analyzer_shingle": {
                        "tokenizer": "standard",
                        "filter": ["lowercase", "filter_stop", "filter_shingle"]
                    }
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "title": {
                "type": "text",
                "analyzer": "analyzer_shingle"
            }
        }
    }
}
You can simply copy it since I have gone through 3-4 posts to get rid of all my errors.
I have the same question as renatalucia. Can we do fuzzy searches with this? How?
index_analyzer was removed, so you can change it to "analyzer"; and in this case, since it's the same as the search_analyzer, you can also drop search_analyzer from the mapping. Also, the field needs to be changed from the "string" to the "text" mapping type.