#create a test index with shingle mapping
curl -XPUT localhost:9200/test -d '{
  "settings":{
    "index":{
      "analysis":{
        "analyzer":{
          "analyzer_shingle":{
            "tokenizer":"standard",
            "filter":["standard", "lowercase", "filter_stop", "filter_shingle"]
          }
        },
        "filter":{
          "filter_shingle":{
            "type":"shingle",
            "max_shingle_size":5,
            "min_shingle_size":2,
            "output_unigrams":"true"
          },
          "filter_stop":{
            "type":"stop",
            "enable_position_increments":"false"
          }
        }
      }
    }
  },
  "mappings":{
    "product":{
      "properties":{
        "title":{
          "search_analyzer":"analyzer_shingle",
          "index_analyzer":"analyzer_shingle",
          "type":"string"
        }
      }
    }
  }
}'
#Add some docs to the index
curl -XPOST localhost:9200/test/product/1 -d '{"title" : "Sample product title for shingles"}'
curl -XPOST localhost:9200/test/product/2 -d '{"title" : "Another title"}'
curl -XPOST localhost:9200/test/product/3 -d '{"title" : "Shingles is a viral disease"}'
#Analyze API to check out shingling
curl -XGET 'localhost:9200/test/_analyze?analyzer=analyzer_shingle&pretty' -d 'Test text to see shingles' | grep token
#Sample search
curl -XGET 'localhost:9200/test/product/_search?q=title:product+title&pretty'
#this one won't return anything, because of the stop filter
curl -XGET 'localhost:9200/test/product/_search?q=title:is+a&pretty'
#while this one will, because we emit unigrams
curl -XGET 'localhost:9200/test/product/_search?q=title:is+a+viral&pretty'
Version of the gist, updated for ES 6.3:
curl -XPUT localhost:9200/test -H 'Content-Type: application/json' -d '{
"settings": {
"index": {
"analysis": {
"analyzer": {
"analyzer_shingle": {
"tokenizer": "standard",
"filter": [
"standard",
"lowercase",
"filter_stop",
"filter_shingle"
]
}
},
"filter": {
"filter_shingle": {
"type": "shingle",
"max_shingle_size": 5,
"min_shingle_size": 2,
"output_unigrams": "true"
},
"filter_stop": {
"type": "stop"
}
}
}
}
},
"mappings": {
"product": {
"properties": {
"title": {
"search_analyzer": "analyzer_shingle",
"analyzer": "analyzer_shingle",
"type": "text"
}
}
}
}
}'
#Add some docs to the index
curl -H 'Content-Type: application/json' -XPOST localhost:9200/test/product/1 -d '{"title" : "Sample product title for shingles"}'
curl -H 'Content-Type: application/json' -XPOST localhost:9200/test/product/2 -d '{"title" : "Another title"}'
curl -H 'Content-Type: application/json' -XPOST localhost:9200/test/product/3 -d '{"title" : "Shingles is a viral disease"}'
#Analyze API to check out shingling
curl -H 'Content-Type: application/json' -XGET 'localhost:9200/test/_analyze?pretty' -d '{ "analyzer": "analyzer_shingle", "text":"Test text to see shingles"}'
#Sample search
curl -XGET 'localhost:9200/test/product/_search?q=title:product+title&pretty'
#this one won't return anything, because of the stop filter
curl -XGET 'localhost:9200/test/product/_search?q=title:is+a&pretty'
#while this one will, because we emit unigrams
curl -XGET 'localhost:9200/test/product/_search?q=title:is+a+viral&pretty'
Hi, I am trying to create an index with the custom analyzer given below. When we push data to this index using a spark-submit job, the job fails with a no-nodes-available exception.
Index mapping:
"settings": {
"index": {
"analysis": {
"analyzer": {
"analyzer_shingle": {
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
"lowercase",
"english_stop",
"filter_stop",
"english_stemmer",
"filter_shingle"
]
}
},
"filter": {
"filter_shingle": {
"type": "shingle",
"max_shingle_size": 4,
"min_shingle_size": 2,
"output_unigrams": "true"
},
"english_stemmer": {
"type": "stemmer",
"language": "english"
},
"english_possessive_stemmer": {
"type": "stemmer",
"language": "possessive_english"
},
"english_stop": {
"type": "stop",
"stopwords": "english"
},
"filter_stop": {
"type": "stop",
"stopwords": ["it",
"its",
"itself",
"they",
"them",
"their",
"theirs",
"themselves",
"what",
"which",
"who",
"whom",
"this",
"that",
"these",
"those",
"am",
"is",
"are",
"was",
"were",
"be",
"been",
"being",
"have",
"has",
"had",
"having",
"do",
"does",
"did",
"doing",
"a",
"an",
"the",
"and",
"but",
"if",
"or",
"because",
"as",
"until",
"while",
"of",
"at",
"by",
"for",
"with",
"about",
"between",
"into",
"through",
"during",
"before",
"after",
"above",
"below",
"to",
"from",
"up",
"down",
"in",
"out",
"on",
"off",
"over",
"under",
"again",
"then",
"once",
"here",
"there",
"when",
"where",
"why",
"how",
"all",
"any",
"both",
"each",
"few",
"more",
"most",
"other",
"some",
"such",
"no",
"nor",
"not",
"only",
"same",
"so",
"than",
"too",
"very",
"s",
"t",
"can",
"will",
"just",
"don",
"should",
"now",
"apparatus",
"embodiments",
"embodiments",
"technique",
"operation",
"operations"]
}
}
}
}
},
"mappings": {
"-indexName1": {
"properties": {
"cpc": {
"type": "text",
"analyzer": "standard"
},
"definition": {
"type": "text",
"search_analyzer": "analyzer_shingle",
"analyzer": "analyzer_shingle"
}
}
}
  }
}
Exception:
ERROR Executor: Exception in task 8.0 in stage 0.0 (TID 2)
org.elasticsearch.hadoop.rest.EsHadoopNoNodesLeftException: Connection error (check network and/or proxy settings)- all nodes failed; tried [[]]
at org.elasticsearch.hadoop.rest.NetworkClient.execute(NetworkClient.java:149)
at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:461)
at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:445)
at org.elasticsearch.hadoop.rest.RestClient.bulk(RestClient.java:186)
at org.elasticsearch.hadoop.rest.RestRepository.tryFlush(RestRepository.java:222)
at org.elasticsearch.hadoop.rest.RestRepository.flush(RestRepository.java:244)
at org.elasticsearch.hadoop.rest.RestRepository.doWriteToIndex(RestRepository.java:198)
at org.elasticsearch.hadoop.rest.RestRepository.writeToIndex(RestRepository.java:161)
at org.elasticsearch.spark.rdd.EsRDDWriter.write(EsRDDWriter.scala:67)
at org.elasticsearch.spark.rdd.EsSpark$$anonfun$doSaveToEs$1.apply(EsSpark.scala:107)
at org.elasticsearch.spark.rdd.EsSpark$$anonfun$doSaveToEs$1.apply(EsSpark.scala:107)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
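For reference, the tried [[]] at the end of that exception is an empty node list, which usually means the elasticsearch-hadoop connector never received a usable es.nodes value, rather than anything being wrong with the analyzer. A minimal sketch of the connection settings worth double-checking (the host, port, and jar name are placeholders, not taken from the post; es-hadoop accepts its keys with the spark. prefix shown here):
#es-hadoop connection settings passed via spark-submit (placeholder values)
spark-submit \
  --conf spark.es.nodes=my-es-host \
  --conf spark.es.port=9200 \
  --conf spark.es.nodes.wan.only=true \
  my-job.jar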
ES 7.5.2
1st error:
{"error":"Content-Type header [application/x-www-form-urlencoded] is not supported","status":406}
and the 2nd, after adding -H 'Content-Type: application/json':
{"error":{"root_cause":[{"type":"illegal_argument_exception","reason":"enable_position_increments is not supported anymore. Please fix your analysis chain"}],"type":"illegal_argument_exception","reason":"enable_position_increments is not supported anymore. Please fix your analysis chain"},"status":400}
This one works with the latest ES version 7:
PUT /moviesdb
{
"settings": {
"index": {
"number_of_shards": 1,
"analysis": {
"analyzer": {
"analyzer_shingle": {
"tokenizer": "standard",
"filter": [
"lowercase",
"filter_stop",
"filter_shingle"
]
}
},
"filter": {
"filter_shingle": {
"type": "shingle",
"min_shingle_size": 2,
"max_shingle_size": 4,
"output_unigrams": "true"
},
"filter_stop": {
"type": "stop"
}
}
}
}
},
"mappings": {
"properties": {
"title": {
"analyzer": "analyzer_shingle",
"type": "text"
}
}
}
}
You can simply copy it; I went through 3-4 posts to get rid of all the errors.
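As a quick sanity check against that index, here are a sample doc and search reused from the original gist, rewritten for the typeless _doc API on 7.x (moviesdb is the index name from the mapping above):
curl -H 'Content-Type: application/json' -XPOST localhost:9200/moviesdb/_doc/1 -d '{"title" : "Shingles is a viral disease"}'
curl -XGET 'localhost:9200/moviesdb/_search?q=title:is+a+viral&pretty'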
I have the same question as renatalucia. Can we do fuzzy searches with this? How?
I use Elasticsearch 6.1 and I have an issue when querying the shingle field with fuzziness.
The following query works as expected:
I would expect that replacing "Shingles is" with "Shingle is" would return the same result due to the fuzziness; however, the result is empty. Any help?
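For context, a sketch of the kind of fuzzy query being described, assuming the test index and title field from the original gist (the actual query body did not come through in the comment):
curl -H 'Content-Type: application/json' -XGET 'localhost:9200/test/product/_search?pretty' -d '{
  "query": {
    "match": {
      "title": {
        "query": "Shingle is",
        "fuzziness": "AUTO"
      }
    }
  }
}'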