Skip to content

Instantly share code, notes, and snippets.

@dacr
Last active April 2, 2023 10:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dacr/2265ba5e80cdaa12225e616dbd81948e to your computer and use it in GitHub Desktop.
Save dacr/2265ba5e80cdaa12225e616dbd81948e to your computer and use it in GitHub Desktop.
example based elasticsearch training lab content for use within kibana developer console / published by https://github.com/dacr/code-examples-manager #bf08e514-f207-46df-9db9-c77dc60a350c/5771cf9a8152eceb95afbf04c029cbfab4d790c2
## summary : example based elasticsearch training lab content for use within kibana developer console
## keywords : cem, code-examples-manager, elasticsearch, configuration, lab, index-mapping, query
## publish : gist
## authors : David Crosson
## license : Apache NON-AI License Version 2.0 (https://raw.githubusercontent.com/non-ai-licenses/non-ai-licenses/main/NON-AI-APACHE2)
## id : bf08e514-f207-46df-9db9-c77dc60a350c
## created-on : 2021-04-29T16:52:54Z
## managed-by : https://github.com/dacr/code-examples-manager
# ===============================================================
# LAB RESET
# DO NOT EXECUTE IF USING THE COMMON ELASTICSEARCH
# Drops the cem_mapping_template template and every index matching
# cem-*, tmp-cem-* and tmp-exec-*.
# NOTE(review): wildcard deletes are rejected when the cluster setting
# action.destructive_requires_name is true - confirm the lab cluster allows them.
DELETE _template/cem_mapping_template
DELETE cem-*
DELETE tmp-cem-*
DELETE tmp-exec-*

# INJECT DATA
# EXECUTE inserted-documents-default/feed.sh
# => THEN CREATE A KIBANA INDEX PATTERN FOR : cem-*
# ===============================================================
# USE the basic index naming "cem-default"

# --------------------- EX1-1
# List the indices
GET _cat/indices

# --------------------- EX1-2
# Count the documents of the index
GET cem-default/_count

# --------------------- EX1-3
# Search without any criteria
GET cem-default/_search

# --------------------- EX1-4
# Search with the criteria given directly in the URI (q parameter)
GET cem-default/_search?q='fractal mandelbrot'
# Lucene query string syntax : a AND b, a OR b, ...

# --------------------- EX1-5
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "fractal OR mandelbrot"
    }
  }
}
# same results & score as previous
# uses the query string "mini-language" : a AND b, a OR b, field:value, (), _exists_:field, "that", th*, th?t, ...

# --------------------- EX1-6
GET cem-default/_search
{
  "query": {
    "simple_query_string": {
      "query": "fractal|mandelbrot"
    }
  }
}
# same results & score as previous
# uses the simple query string syntax : a+b, a|b, -b, "that", th*, ()
# --------------------- EX1-7
GET cem-default/_search
{
  "query": {
    "match_all": {}
  }
}
# matches every document, each hit comes with its full content

# --------------------- EX1-8
GET cem-default/_search
{
  "_source": ["filename", "summary"],
  "query": {
    "match_all": {}
  }
}
# matches every document, each hit content is limited to the given fields

# --------------------- EX1-9
GET cem-default/_search
{
  "_source": ["summary"],
  "query": {
    "match": {
      "summary": "caching operations"
    }
  }
}
# by default an OR is applied between the words of the given text, once it has been analyzed
# no dedicated syntax, no ordering
# but many parameters are available to change the match behavior : operator, analyzer, ...

# --------------------- EX1-10
GET cem-default/_search
{
  "_source": ["summary"],
  "query": {
    "match_phrase": {
      "summary": "get current user"
    }
  }
}
# words must match in sequence => must be the exact phrase
# we'll see how to improve that later
# --------------------- EX1-11
# Hyphenated word, searched on the default fields
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "logging-tips"
    }
  }
}

# --------------------- EX1-12
# Same hyphenated word, search restricted to the summary field
GET cem-default/_search
{
  "query": {
    "query_string": {
      "fields": ["summary"],
      "query": "logging-tips"
    }
  }
}

# --------------------- EX1-13
# Wildcard search
GET cem-default/_search
{
  "_source": ["filename", "summary"],
  "query": {
    "query_string": {
      "query": "logg*"
    }
  }
}

# --------------------- EX1-14
# Email address given as is
GET cem-default/_search
{
  "_source": ["filename", "summary"],
  "query": {
    "simple_query_string": {
      "query": "crosson.david@gmail.com"
    }
  }
}

# --------------------- EX1-15
# Same email address, this time enclosed in double quotes
GET cem-default/_search
{
  "_source": ["filename", "summary"],
  "query": {
    "simple_query_string": {
      "query": "\"crosson.david@gmail.com\""
    }
  }
}

# --------------------- EX1-16
GET cem-default/_search
{
  "query": {
    "simple_query_string": {
      "query": "unsafeRun"
    }
  }
}
# returns NO results

# --------------------- EX1-17
GET cem-default/_search
{
  "query": {
    "simple_query_string": {
      "query": "Runtime.default.unsafeRun"
    }
  }
}
# returns many results with a high score
# because for the standard analyzer "Runtime.default.unsafeRun" is one single word

# --------------------- EX1-18
# Version number like search
GET cem-default/_search
{
  "query": {
    "simple_query_string": {
      "query": "2.6.13"
    }
  }
}
# ===============================================================
# Evaluating analyzers
# Each request returns the tokens produced by the given analyzer for the given text

# --------------------- EX2-1
# English sentence / standard analyzer
POST _analyze
{
  "analyzer": "standard",
  "text": "I sold some stocks yesterday, people are sicks"
}

# --------------------- EX2-2
# Same English sentence / english analyzer
POST _analyze
{
  "analyzer": "english",
  "text": "I sold some stocks yesterday, people are sicks"
}

# --------------------- EX2-3
POST _analyze
{
  "analyzer": "english",
  "text": "I'm getting sick"
}

# --------------------- EX2-4
# French sentence / standard analyzer
POST _analyze
{
  "analyzer": "standard",
  "text": "j'ai attrapé la grippe hier à cause de personnes contagieuses"
}

# --------------------- EX2-5
# Same French sentence / french analyzer
POST _analyze
{
  "analyzer": "french",
  "text": "j'ai attrapé la grippe hier à cause de personnes contagieuses"
}

# --------------------- EX2-6
# Dotted word / standard analyzer
POST _analyze
{
  "analyzer": "standard",
  "text": "truc.much there and then."
}

# --------------------- EX2-7
# Same dotted word / simple analyzer
POST _analyze
{
  "analyzer": "simple",
  "text": "truc.much there and then."
}

# --------------------- EX2-8
# Email address / standard analyzer
POST _analyze
{
  "analyzer": "standard",
  "text": "crosson.david@gmail.com"
}

# --------------------- EX2-9
# Email address / simple analyzer
POST _analyze
{
  "analyzer": "simple",
  "text": "crosson.david@gmail.com"
}

# --------------------- EX2-10
# Email address / whitespace analyzer
POST _analyze
{
  "analyzer": "whitespace",
  "text": "crosson.david@gmail.com"
}
# ===============================================================
# Enhancing search and starting more complex queries
# DO NOT EXECUTE IF USING THE COMMON ELASTICSEARCH
DELETE _template/cem_mapping_template
DELETE cem-*

# --------------------- EX3-1
# Use the simple analyzer as the index default analyzer, instead of the standard one
PUT cem-default
{
  "settings": {
    "index": {
      "number_of_replicas": 0
    },
    "analysis": {
      "analyzer": {
        "default": {
          "type": "simple"
        }
      }
    }
  }
}
# INJECT DATA
# EXECUTE inserted-documents-default/feed.sh

# --------------------- EX3-2
GET _cat/indices
# now our index is green !
# (presumably because number_of_replicas is now 0 - no replica left unassigned)
# --------------------- EX3-3
GET cem-default/_search
{
  "_source": ["category", "filename", "summary"],
  "query": {
    "query_string": {
      "query": "unsafeRun"
    }
  }
}
# NOW RETURNS MANY RESULTS

# --------------------- EX3-4
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "2.6.13"
    }
  }
}
# NOW RETURNS NO RESULTS

# --------------------- EX3-5
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "backoff"
    }
  }
}
# RETURNS no results because camelcase is, of course, not taken into account

# --------------------- EX3-6
# The full camelcase word
GET cem-default/_search
{
  "query": {
    "simple_query_string": {
      "query": "ExponentialBackoffRetry"
    }
  }
}
# --------------------- EX3-7
# One word
GET cem-default/_search
{
  "_source": ["filename", "summary"],
  "query": {
    "query_string": {
      "query": "scala"
    }
  }
}

# --------------------- EX3-8
# Two words, no operator
GET cem-default/_search
{
  "_source": ["filename", "summary"],
  "query": {
    "query_string": {
      "query": "scala zio"
    }
  }
}
# implicit OR

# --------------------- EX3-9
# Two words, explicit OR
GET cem-default/_search
{
  "_source": ["filename", "summary"],
  "query": {
    "query_string": {
      "query": "scala OR zio"
    }
  }
}

# --------------------- EX3-10
# Two words, explicit AND
GET cem-default/_search
{
  "_source": ["filename", "summary"],
  "query": {
    "query_string": {
      "query": "scala AND zio"
    }
  }
}

# --------------------- EX3-11
# bool query, "should" clause on the keywords field
GET cem-default/_search
{
  "_source": ["filename", "summary"],
  "query": {
    "bool": {
      "should": [
        { "match": { "keywords": "scala zio" } }
      ]
    }
  }
}

# --------------------- EX3-12
# bool query, two "must" clauses on the keywords field
GET cem-default/_search
{
  "_source": ["filename", "summary"],
  "query": {
    "bool": {
      "must": [
        { "match": { "keywords": "scala" } },
        { "match": { "keywords": "zio" } }
      ]
    }
  }
}

# --------------------- EX3-13
# The good way to give more importance to specific fields (field^boost)
GET cem-default/_search
{
  "query": {
    "multi_match": {
      "query": "get user",
      "operator": "and",
      "type": "most_fields",
      "fields": [
        "keywords^1",
        "summary^1",
        "content^5"
      ]
    }
  }
}
# ===============================================================
# Fixing again the search capabilities
# Check the default generated mapping
GET cem-default/_mapping

# DO NOT EXECUTE IF USING THE COMMON ELASTICSEARCH
DELETE _template/cem_mapping_template
DELETE cem-*

# --------------------- EX4-1
# Use the simple analyzer instead of the standard one
# and add support for camelcase through a dedicated pattern analyzer.
# summary and content are declared with sub-fields (multi-fields),
# each sub-field having its own analyzer.
PUT cem-default
{
  "settings": {
    "index": {
      "number_of_replicas": 0,
      "number_of_shards": 2
    },
    "analysis": {
      "analyzer": {
        "default": {
          "type": "simple"
        },
        "camelcase": {
          "type": "pattern",
          "pattern": "([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "summary": {
        "type": "text",
        "fields": {
          "summary-english": {
            "type": "text",
            "analyzer": "english"
          }
        }
      },
      "content": {
        "type": "text",
        "fields": {
          "content-standard": {
            "type": "text",
            "analyzer": "standard"
          },
          "content-english": {
            "type": "text",
            "analyzer": "english"
          },
          "content-camelcase": {
            "type": "text",
            "analyzer": "camelcase",
            "search_analyzer": "standard"
          }
        }
      }
    }
  }
}
# INJECT DATA
# EXECUTE inserted-documents-default/feed.sh
# --------------------- EX4-2
# Tokens produced by the analyzer attached to the content field
POST cem-default/_analyze
{
  "field": "content",
  "text": "coolRaoul42, trucBidule, joe_doe"
}

# --------------------- EX4-3
# Same text, tokens produced by the analyzer attached to the content-camelcase sub-field
POST cem-default/_analyze
{
  "field": "content.content-camelcase",
  "text": "coolRaoul42, trucBidule, joe_doe"
}

# --------------------- EX4-4
# A source code line through the content-camelcase sub-field analyzer
POST cem-default/_analyze
{
  "field": "content.content-camelcase",
  "text": "import org.apache.curator.retry.{ExponentialBackoffRetry, RetryNTimes}"
}
# --------------------- EX4-5
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "unsafeRun"
    }
  }
}
# NOW RETURNS THE RIGHT NUMBER OF RESULTS
# ALSO BECAUSE search_analyzer has been set to "standard" on content-camelcase

# --------------------- EX4-6
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "unsafe"
    }
  }
}
# And of course we can now use just a part of a camelcase word

# --------------------- EX4-7
GET cem-default/_search
{
  "query": {
    "simple_query_string": {
      "query": "2.6.13"
    }
  }
}
# NOW RETURNS THE RIGHT NUMBER OF RESULTS
# ALSO BECAUSE search_analyzer has been set to "standard" on content-camelcase

# --------------------- EX4-8
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "backoff"
    }
  }
}

# --------------------- EX4-9
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "exponential AND backoff"
    }
  }
}

# --------------------- EX4-10
# Quoted text inside the query string
GET cem-default/_search
{
  "_source": ["content"],
  "query": {
    "query_string": {
      "query": "\"async.AsyncCuratorFramework\""
    }
  }
}
# Phrase prefix search on the camelcase sub-field
# match_phrase_prefix expects the target FIELD NAME as its key, the text being
# given either directly or through a nested "query" entry. With "query" used as
# the key, Elasticsearch looks for a field named "query" and the text is never
# matched against the document content.
# NOTE(review): content.content-camelcase is presumably the intended field, as its
# analyzer splits camelcase words (see EX4-3 / EX4-4) - confirm.
GET cem-default/_search
{
  "_source": ["content"],
  "query": {
    "match_phrase_prefix": {
      "content.content-camelcase": {
        "query": "async curator framework"
      }
    }
  }
}
# --------------------- EX4-11
# IT IS POSSIBLE TO HIGHLIGHT THE MATCH !
# The "highlight" section lists the fields for which highlighted fragments are requested
GET cem-default/_search
{
  "_source": ["summary"],
  "query": {
    "match": {
      "summary": "snippets user"
    }
  },
  "highlight": {
    "fields": {
      "summary": {
        "force_source": true
      }
    }
  }
}
# ===============================================================
# Optimizing mapping and playing with tokens
# DO NOT EXECUTE IF USING THE COMMON ELASTICSEARCH
DELETE _template/cem_mapping_template
DELETE cem-*

# --------------------- EX5-1
# EXECUTE PRIOR TO DATA INJECTION
# Template applied to every index whose name matches cem-* :
#  - settings : custom token filters (stop words, keyword marker, stemmers, synonyms)
#               and analyzers (default, camelcase, rebuilt_english)
#  - mappings : explicit field types, with multi-fields for summary and content
# The synonyms are read from the synonyms.txt file (synonyms_path).
PUT _template/cem_mapping_template
{
  "index_patterns": ["cem-*"],
  "settings": {
    "index": {
      "number_of_replicas": 0
    },
    "analysis": {
      "filter": {
        "english_stop": {
          "type": "stop",
          "stopwords": "_english_"
        },
        "english_keywords": {
          "type": "keyword_marker",
          "keywords": ["example"]
        },
        "english_stemmer": {
          "type": "stemmer",
          "language": "english"
        },
        "english_possessive_stemmer": {
          "type": "stemmer",
          "language": "possessive_english"
        },
        "english_synonym": {
          "type": "synonym",
          "synonyms_path": "synonyms.txt"
        }
      },
      "analyzer": {
        "default": {
          "type": "simple"
        },
        "camelcase": {
          "type": "pattern",
          "pattern": "([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"
        },
        "rebuilt_english": {
          "tokenizer": "standard",
          "filter": [
            "english_synonym",
            "english_possessive_stemmer",
            "lowercase",
            "asciifolding",
            "english_stop",
            "english_keywords",
            "english_stemmer"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "category": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 2000
          }
        }
      },
      "summary": {
        "type": "text",
        "analyzer": "rebuilt_english",
        "fields": {
          "summary-english": {
            "type": "text",
            "analyzer": "rebuilt_english",
            "search_analyzer": "rebuilt_english"
          },
          "summary-standard": {
            "type": "text",
            "analyzer": "standard",
            "search_analyzer": "standard"
          }
        }
      },
      "content": {
        "type": "text",
        "fields": {
          "content-standard": {
            "type": "text",
            "analyzer": "standard"
          },
          "content-english": {
            "type": "text",
            "analyzer": "rebuilt_english",
            "search_analyzer": "rebuilt_english"
          },
          "content-camelcase": {
            "type": "text",
            "analyzer": "camelcase",
            "search_analyzer": "standard"
          }
        }
      },
      "file": {
        "type": "text"
      },
      "filename": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "id": {
        "type": "text"
      },
      "uuid": {
        "type": "text"
      },
      "keywords": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 1000
          }
        }
      },
      "lastUpdated": {
        "type": "date"
      },
      "managedBy": {
        "type": "text"
      },
      "license": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "publish": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "authors": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "execution": {
        "type": "text"
      },
      "runWith": {
        "type": "text"
      }
    }
  }
}
# INJECT DATA (USE TIME BASED INDEX NAMES)
# EXECUTE inserted-documents-timed/feed.sh
# Check the applied mapping (automatically taken from the defined mapping template)
# --------------------- EX5-2
GET _cat/indices

# --------------------- EX5-3
# Count over all the cem-* indices
GET cem-*/_count

# --------------------- EX5-4
# Operators and grouping with the query string syntax
GET cem-*/_search
{
  "query": {
    "query_string": {
      "query": "zio AND (resource OR environment)"
    }
  }
}

# --------------------- EX5-5
# Operators and grouping with the simple query string syntax
GET cem-*/_search
{
  "query": {
    "simple_query_string": {
      "query": "zio+(resource|environment)"
    }
  }
}

# --------------------- EX5-6
GET cem-*/_search
{
  "_source": ["summary"],
  "query": {
    "match": {
      "summary": {
        "query": "get the current user",
        "operator": "and"
      }
    }
  }
}
# "the" is now ignored

# --------------------- EX5-7
GET cem-*/_search
{
  "query": {
    "match_phrase": {
      "content": "cheat sheet"
    }
  }
}

# --------------------- EX5-8
# bool query combining must, filter and must_not clauses
GET cem-*/_search?size=5
{
  "_source": ["summary", "keywords"],
  "query": {
    "bool": {
      "must": {
        "match": { "publish": "gist" }
      },
      "filter": {
        "term": { "keywords": "testable" }
      },
      "must_not": [
        { "match": { "summary": { "query": "hello world" } } },
        { "match": { "keywords": "async" } }
      ]
    }
  }
}
# filters focus on a data subset, allow caching and are ignored when computing the scores
# --------------------- EX5-9
# Search for documents where a field is missing
POST /cem-*/_search
{
  "_source": ["filename", "summary"],
  "size": 20,
  "query": {
    "bool": {
      "must_not": [
        { "exists": { "field": "execution" } }
      ]
    }
  }
}

# --------------------- EX5-10
# Search within date ranges
# NOTE(review): created_on is not declared in cem_mapping_template (EX5-1),
# it presumably comes from dynamic mapping - confirm the documents carry it.
POST cem-*/_search
{
  "query": {
    "bool": {
      "must": {
        "query_string": { "query": "vertx" }
      },
      "filter": {
        "range": {
          "created_on": {
            "gte": "2020-06-01",
            "lte": "2020-12-31"
          }
        }
      }
    }
  }
}

# --------------------- EX5-11
# Count those created within the last year (now-1y)
POST /cem-*/_count
{
  "query": {
    "bool": {
      "must": {
        "query_string": { "query": "elasticsearch" }
      },
      "filter": {
        "range": {
          "created_on": {
            "gte": "now-1y"
          }
        }
      }
    }
  }
}
# TODO check with last_updated no results returned !!! (with zio)
# NOTE(review): cem_mapping_template (EX5-1) declares this field as lastUpdated,
# not last_updated - the field name is a likely cause, to be confirmed.
# --------------------- EX5-12
# Natural language query attempt
GET cem-*/_search
{
  "_source": ["summary"],
  "query": {
    "query_string": {
      "fields": ["summary"],
      "query": "getting the users"
    }
  }
}
# Remember, to see the tokens produced for the summary field :
POST cem-2021-5/_analyze
{
  "field": "summary",
  "text": "getting the users"
}

# --------------------- EX5-13
# Same query, all the words being now required
GET cem-*/_search
{
  "_source": ["summary"],
  "query": {
    "query_string": {
      "fields": ["summary"],
      "query": "getting the users",
      "default_operator": "AND"
    }
  }
}
# users => user, getting => get, "the" is ignored

# --------------------- EX5-14
# fetch & get have been made synonyms (config/synonyms.txt)
GET cem-*/_search
{
  "_source": ["summary"],
  "query": {
    "query_string": {
      "fields": ["summary", "content"],
      "query": "fetch the users",
      "default_operator": "AND"
    }
  }
}
GET cem-*/_search
{
  "query": {
    "query_string": {
      "query": "synonyms"
    }
  }
}
# in that case all the synonyms have been added, check config/synonyms.txt
POST cem-2021-5/_analyze
{
  "field": "summary",
  "text": "fetch the users"
}

# --------------------- EX5-15
GET cem-*/_search
{
  "_source": ["summary"],
  "query": {
    "query_string": {
      "fields": ["summary"],
      "query": "sheetcheat",
      "default_operator": "AND"
    }
  }
}
# here all the synonyms have been mapped into a single token, check config/synonyms.txt
POST cem-2021-5/_analyze
{
  "field": "summary",
  "text": "sheetcheat"
}
# --------------------- EX5-16A
GET cem-*/_search
{
  "_source": ["summary"],
  "query": {
    "query_string": {
      "fields": ["summary"],
      "query": "arango database",
      "default_operator": "AND"
    }
  }
}

# --------------------- EX5-16B ?? => FIXED
# Something looked wrong here with : issues, issue, unassigned
# After checking, the expected document did not contain the "issue" word !!
# TAKE care with multi-fields when specifying the search fields
GET cem-*/_search
{
  "_source": ["summary"],
  "query": {
    "query_string": {
      "fields": ["content"],
      "query": "elastic issues unassigned",
      "default_operator": "AND"
    }
  }
}
# NO RESULTS !!!
GET cem-*/_search
{
  "_source": ["summary", "content"],
  "query": {
    "query_string": {
      "fields": ["content.content-english"],
      "query": "elastic issues unassigned",
      "default_operator": "AND"
    }
  }
}
GET cem-*/_search
{
  "_source": ["summary", "content"],
  "query": {
    "query_string": {
      "fields": ["content.content-english"],
      "query": "elastic issue unassign",
      "default_operator": "AND"
    }
  }
}
POST cem-2021-5/_analyze
{
  "field": "content.content-english",
  "text": "elastic issues unassigned"
}

# --------------------- EX5-17
# here all the synonyms have been mapped into two tokens, check config/synonyms.txt
POST cem-2021-1/_analyze
{
  "field": "summary.summary-english",
  "text": "arangodb"
}
# PLACE THE SYNONYMS FILTER IN FIRST POSITION !
# --------------------- EX5-18
# Dealing with spelling mistakes... or typing errors
GET cem-*/_search
{
  "_source": ["summary"],
  "query": {
    "query_string": {
      "query": "interation"
    }
  }
}
# fuzzy search to deal with 1 misspelled word
GET cem-*/_search
{
  "_source": ["summary"],
  "query": {
    "fuzzy": {
      "summary.summary-standard": {
        "value": "interation"
      }
    }
  }
}
# interation => interactions !

# --------------------- EX5-19
# fuzzy search to deal with several misspelled words
GET cem-*/_search
{
  "_source": ["summary"],
  "size": 3,
  "query": {
    "match": {
      "summary": {
        "query": "got the uuseer",
        "fuzziness": "AUTO"
      }
    }
  }
}
# ===============================================================
# Advanced queries

# --------------------- EX6-1
# How many distinct example keywords ?
# The result is returned in aggregations.the-count-for-me.value
POST /cem-*/_search?size=0
{
  "aggs": {
    "the-count-for-me": {
      "cardinality": {
        "field": "keywords.keyword"
      }
    }
  }
}

# --------------------- EX6-2
# Example keywords occurrences count / How many examples for each keyword ?
# Top 10 of the most used keywords
POST /cem-*/_search
{
  "size": 0,
  "aggs": {
    "my-results": {
      "terms": {
        "field": "keywords.keyword",
        "size": 10
      }
    }
  }
}

# --------------------- EX6-3
# How many examples in each category ? (limited to 50 categories)
POST /cem-*/_search
{
  "size": 0,
  "aggs": {
    "results": {
      "terms": {
        "field": "category.keyword",
        "size": 50
      }
    }
  }
}
# --------------------- EX6-4
# Significant terms - categories with at least 5 documents (min_doc_count)
# among the documents matching the async or http keywords
POST /cem-*/_search?size=0
{
  "query": {
    "terms": {
      "keywords": ["async", "http"]
    }
  },
  "aggregations": {
    "significant_categories": {
      "significant_terms": {
        "field": "category.keyword",
        "min_doc_count": 5
      }
    }
  }
}

# --------------------- EX6-5
# MONTHLY COUNT AGGREGATION FOR A GIVEN TIME RANGE
# Months without any document are left out (min_doc_count = 1)
GET /cem-*/_search
{
  "aggs": {
    "my_results": {
      "date_histogram": {
        "field": "created_on",
        "calendar_interval": "1M",
        "time_zone": "Europe/Paris",
        "min_doc_count": 1
      }
    }
  },
  "size": 0,
  "query": {
    "bool": {
      "must": [
        {
          "range": {
            "created_on": {
              "format": "strict_date_optional_time",
              "gte": "2018-01-01T00:00:00.000Z",
              "lte": "2021-12-31T23:59:59.999Z"
            }
          }
        }
      ]
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment