dacr/cem-elk-99-dev-console.elk

## cem-elk-99-dev-console.elk
## summary : example based elasticsearch training lab content for use within kibana developer console
## keywords : cem, code-examples-manager, elasticsearch, configuration, lab, index-mapping, query
## publish : gist
## authors : David Crosson
## license : Apache NON-AI License Version 2.0 (https://raw.githubusercontent.com/non-ai-licenses/non-ai-licenses/main/NON-AI-APACHE2)
## id : bf08e514-f207-46df-9db9-c77dc60a350c
## created-on : 2021-04-29T16:52:54Z
## managed-by : https://github.com/dacr/code-examples-manager

# ===============================================================
# DO NOT EXECUTE IF USING THE COMMON ELASTICSEARCH
DELETE _template/cem_mapping_template
DELETE cem-*
DELETE tmp-cem-*
DELETE tmp-exec-*


# INJECT DATA
# EXECUTE inserted-documents-default/feed.sh

# => CREATE A KIBANA INDEX PATTERN FOR : cem-*

# ===============================================================
# USE the basic index naming "cem-default"

# --------------------- EX1-1
GET _cat/indices

# --------------------- EX1-2
GET cem-default/_count

# --------------------- EX1-3
GET cem-default/_search

# --------------------- EX1-4
GET cem-default/_search?q='fractal mandelbrot'
#Lucene query string syntax : a AND b, a OR b, ...

# --------------------- EX1-5
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "fractal OR mandelbrot"
    }
  }
}
# same results & score as previous
# uses query string "mini-language" : a AND b, a OR b, field:value, (), _exists_:field, "that", th*, th?t, ...


# --------------------- EX1-6
GET cem-default/_search
{
  "query": {
    "simple_query_string": {
      "query": "fractal|mandelbrot"
    }
  }
}
# same results & score as previous
# uses simple query string syntax : a+b, a|b, -b, "that", th*, ()


# --------------------- EX1-7
GET cem-default/_search
{
  "query": {
    "match_all": {
    }
  }
}
# returns everything with full content


# --------------------- EX1-8
GET cem-default/_search
{
  "_source": ["filename", "summary"],
  "query": {
    "match_all": {
    }
  }
}
# returns everything with content limited to given fields


# --------------------- EX1-9
GET cem-default/_search
{
  "_source": ["summary"],
  "query": {
    "match": {
      "summary": "caching operations"
    }
  }
}
# by default an OR is applied between the terms of the given text once it has been analyzed
# no dedicated syntax,  no order
# but many parameters to change the match behavior : operator, analyzer, ...


# --------------------- EX1-10
GET cem-default/_search
{
  "_source": ["summary"],
  "query": {
    "match_phrase": {
      "summary": "get current user"
    }
  }
}
# match in sequence => must be the exact phrase
# we'll see how to improve that later


# --------------------- EX1-11
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "logging-tips"
    }
  }
}

# --------------------- EX1-12
GET cem-default/_search
{
  "query": {
    "query_string": {
      "fields": ["summary"],
      "query": "logging-tips"
    }
  }
}

# --------------------- EX1-13
GET cem-default/_search
{
  "_source": ["filename", "summary"],
  "query": {
    "query_string": {
      "query": "logg*"
    }
  }
}

# --------------------- EX1-14
GET cem-default/_search
{
  "_source": ["filename", "summary"],
  "query": {
    "simple_query_string": {
      "query": "crosson.david@gmail.com"
    }
  }
}

# --------------------- EX1-15
GET cem-default/_search
{
  "_source": ["filename", "summary"],
  "query": {
    "simple_query_string": {
      "query": "\"crosson.david@gmail.com\""
    }
  }
}

# --------------------- EX1-16
GET cem-default/_search
{
  "query": {
    "simple_query_string": {
      "query": "unsafeRun"
    }
  }
}
# returns NO results

# --------------------- EX1-17
GET cem-default/_search
{
  "query": {
    "simple_query_string": {
      "query": "Runtime.default.unsafeRun"
    }
  }
}
# returns many results with a high score
# because for the standard analyzer "Runtime.default.unsafeRun" is a unique word

# --------------------- EX1-18
GET cem-default/_search
{
  "query": {
    "simple_query_string": {
      "query": "2.6.13"
    }
  }
}


# ===============================================================
# Evaluating analyzers

# --------------------- EX2-1
POST _analyze
{
  "analyzer": "standard",
  "text": "I sold some stocks yesterday, people are sicks"
}

# --------------------- EX2-2
POST _analyze
{
  "analyzer": "english",
  "text": "I sold some stocks yesterday, people are sicks"
}

# --------------------- EX2-3
POST _analyze
{
  "analyzer": "english",
  "text": "I'm getting sick"
}

# --------------------- EX2-4
POST _analyze
{
  "analyzer": "standard",
  "text": "j'ai attrapé la grippe hier à cause de personnes contagieuses"
}

# --------------------- EX2-5
POST _analyze
{
  "analyzer": "french",
  "text": "j'ai attrapé la grippe hier à cause de personnes contagieuses"
}

# --------------------- EX2-6
POST _analyze
{
  "analyzer": "standard",
  "text": "truc.much there and then."
}

# --------------------- EX2-7
POST _analyze
{
  "analyzer": "simple",
  "text": "truc.much there and then."
}

# --------------------- EX2-8
POST _analyze
{
  "analyzer": "standard",
  "text": "crosson.david@gmail.com"
}

# --------------------- EX2-9
POST _analyze
{
  "analyzer": "simple",
  "text": "crosson.david@gmail.com"
}

# --------------------- EX2-10
POST _analyze
{
  "analyzer": "whitespace",
  "text": "crosson.david@gmail.com"
}


# ===============================================================
# Enhancing search and start more complex queries

# DO NOT EXECUTE IF USING THE COMMON ELASTICSEARCH
DELETE _template/cem_mapping_template
DELETE cem-*

# --------------------- EX3-1
# Using the simple analyzer instead of the standard one
PUT cem-default
{
  "settings":{
     "index":{"number_of_replicas":0},
     "analysis":{
       "analyzer":{"default":{"type" : "simple"}}
     }
  }
}


# INJECT DATA
# EXECUTE inserted-documents-default/feed.sh


# --------------------- EX3-2
GET _cat/indices
# now our index is green !

# --------------------- EX3-3
GET cem-default/_search
{
  "_source":["category","filename","summary"],
  "query": {
    "query_string": {
      "query": "unsafeRun"
    }
  }
}
# NOW RETURNS MANY RESULTS

# --------------------- EX3-4
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "2.6.13"
    }
  }
}
# NOW RETURNS NO RESULTS

# --------------------- EX3-5
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "backoff"
    }
  }
}
# RETURNS no results because camelcase is of course not taken into account

# --------------------- EX3-6
GET cem-default/_search
{
  "query": {
    "simple_query_string": {
      "query": "ExponentialBackoffRetry"
    }
  }
}

# --------------------- EX3-7
GET cem-default/_search
{
  "_source":["filename","summary"],
  "query": {
    "query_string": {
      "query": "scala"
    }
  }
}

# --------------------- EX3-8
GET cem-default/_search
{
  "_source":["filename","summary"],
  "query": {
    "query_string": {
      "query": "scala zio"
    }
  }
}
# implicit OR

# --------------------- EX3-9
GET cem-default/_search
{
  "_source":["filename","summary"],
  "query": {
    "query_string": {
      "query": "scala OR zio"
    }
  }
}

# --------------------- EX3-10
GET cem-default/_search
{
  "_source":["filename","summary"],
  "query": {
    "query_string": {
      "query": "scala AND zio"
    }
  }
}

# --------------------- EX3-11
GET cem-default/_search
{
  "_source":["filename","summary"],
  "query": {
    "bool": {
      "should": [
        {"match": {"keywords":"scala zio"}}
      ]
    }
  }
}

# --------------------- EX3-12
GET cem-default/_search
{
  "_source":["filename","summary"],
  "query": {
    "bool": {
      "must": [
        {"match": {"keywords":"scala"}},
        {"match": {"keywords":"zio"}}
      ]
    }
  }
}

# --------------------- EX3-13
# The good way to give more importance to specific fields
GET cem-default/_search
{
  "query": {
    "multi_match": {
      "query": "get user",
      "operator": "and",
      "type": "most_fields",
      "fields": [
        "keywords^1",
        "summary^1",
        "content^5"
      ]
    }
  }
}

# ===============================================================
# Fixing again the search capabilities

# Check the default generated mapping
GET cem-default/_mapping

# DO NOT EXECUTE IF USING THE COMMON ELASTICSEARCH
DELETE _template/cem_mapping_template
DELETE cem-*

# --------------------- EX4-1
# Using the simple analyzer instead of the standard one
# and adding support for camelcase
PUT cem-default
{
  "settings": {
    "index": {
      "number_of_replicas": 0,
      "number_of_shards": 2
    },
    "analysis": {
      "analyzer": {
        "default": {
          "type": "simple"
        },
        "camelcase": {
           "type" : "pattern",
           "pattern" :
                  "([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "summary": {
        "type": "text",
        "fields": {
          "summary-english": {
            "type": "text",
            "analyzer": "english"
          }
        }
      },
      "content": {
        "type": "text",
        "fields": {
          "content-standard": {
            "type": "text",
            "analyzer": "standard"
          },
          "content-english": {
            "type": "text",
            "analyzer": "english"
          },
          "content-camelcase": {
            "type": "text",
            "analyzer": "camelcase",
            "search_analyzer": "standard"
          }
        }
      }
    }
  }
}


# INJECT DATA
# EXECUTE inserted-documents-default/feed.sh


# --------------------- EX4-2
POST cem-default/_analyze
{
  "field": "content",
  "text": "coolRaoul42, trucBidule, joe_doe"
}

# --------------------- EX4-3
POST cem-default/_analyze
{
  "field": "content.content-camelcase",
  "text": "coolRaoul42, trucBidule, joe_doe"
}

# --------------------- EX4-4
POST cem-default/_analyze
{
  "field": "content.content-camelcase",
  "text" : "import org.apache.curator.retry.{ExponentialBackoffRetry, RetryNTimes}"
}

# --------------------- EX4-5
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "unsafeRun"
    }
  }
}
# NOW RETURNS THE RIGHT NUMBER OF RESULTS
# ALSO BECAUSE search_analyzer has been set to "standard" on content-camelcase

# --------------------- EX4-6
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "unsafe"
    }
  }
}
# And of course we can now use just a part of camelcase word

# --------------------- EX4-7
GET cem-default/_search
{
  "query": {
    "simple_query_string": {
      "query": "2.6.13"
    }
  }
}
# NOW RETURNS THE RIGHT NUMBER OF RESULTS
# ALSO BECAUSE search_analyzer has been set to "standard" on content-camelcase

# --------------------- EX4-8
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "backoff"
    }
  }
}

# --------------------- EX4-9
GET cem-default/_search
{
  "query": {
    "query_string": {
      "query": "exponential AND backoff"
    }
  }
}

# --------------------- EX4-10
GET cem-default/_search
{
  "_source": ["content"],
  "query": {
    "query_string": {
      "query": "\"async.AsyncCuratorFramework\""
    }
  }
}

# Phrase-prefix search: the terms must appear in sequence and the last one
# ("framework") is matched as a prefix.
# The key directly under match_phrase_prefix must be the FIELD NAME; a bare
# "query" key is read as a search on a field literally named "query", which
# silently returns no hits.
# The camelcase sub-field is targeted because its analyzer splits
# "AsyncCuratorFramework" into separate tokens.
GET cem-default/_search
{
  "_source": ["content"],
  "query": {
    "match_phrase_prefix": {
      "content.content-camelcase": {
        "query": "async curator framework"
      }
    }
  }
}


# --------------------- EX4-11
# IT IS POSSIBLE TO HIGHLIGHT THE MATCH !
GET cem-default/_search
{
  "_source":["summary"],
  "query": {
    "match": {
      "summary": "snippets user"
    }
  },
  "highlight" : {
    "fields" : {
      "summary": {"force_source" : true}
    }
  }
}


# ===============================================================
# Optimizing mapping and playing with tokens

# DO NOT EXECUTE IF USING THE COMMON ELASTICSEARCH
DELETE _template/cem_mapping_template
DELETE cem-*

# --------------------- EX5-1
# EXECUTE PRIOR TO DATA INJECTION
PUT _template/cem_mapping_template
{
  "index_patterns": ["cem-*"],
  "settings": {
    "index": {
      "number_of_replicas": 0
    },
    "analysis": {
      "filter":{
        "english_stop": {
          "type":       "stop",
          "stopwords":  "_english_"
        },
        "english_keywords": {
          "type":       "keyword_marker",
          "keywords":   ["example"]
        },
        "english_stemmer": {
          "type":       "stemmer",
          "language":   "english"
        },
        "english_possessive_stemmer": {
          "type":       "stemmer",
          "language":   "possessive_english"
        },
        "english_synonym" : {
          "type" : "synonym",
          "synonyms_path" : "synonyms.txt"
        }
      },
      "analyzer": {
        "default": {
          "type": "simple"
        },
        "camelcase": {
           "type" : "pattern",
           "pattern" :
              "([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"
        },
        "rebuilt_english": {
          "tokenizer":  "standard",
          "filter": [
            "english_synonym",
            "english_possessive_stemmer",
            "lowercase",
            "asciifolding",
            "english_stop",
            "english_keywords",
            "english_stemmer"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "category": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 2000
          }
        }
      },
      "summary": {
        "type": "text",
        "analyzer": "rebuilt_english",
        "fields": {
          "summary-english": {
            "type": "text",
            "analyzer": "rebuilt_english",
            "search_analyzer" : "rebuilt_english"
          },
          "summary-standard": {
            "type": "text",
            "analyzer": "standard",
            "search_analyzer" : "standard"
          }
        }
      },
      "content": {
        "type": "text",
        "fields": {
          "content-standard": {
            "type": "text",
            "analyzer": "standard"
          },
          "content-english": {
            "type": "text",
            "analyzer": "rebuilt_english",
            "search_analyzer" : "rebuilt_english"
          },
          "content-camelcase": {
            "type": "text",
            "analyzer": "camelcase",
            "search_analyzer": "standard"
          }
        }
      },
      "file": {
        "type": "text"
      },
      "filename": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "id": {
        "type": "text"
      },
      "uuid": {
        "type": "text"
      },
      "keywords": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 1000
          }
        }
      },
      "lastUpdated": {
        "type": "date"
      },
      "managedBy": {
        "type": "text"
      },
      "license": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "publish": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "authors": {
        "type": "text",
        "fields": {
          "keyword": {
            "type": "keyword",
            "ignore_above": 256
          }
        }
      },
      "execution": {
        "type": "text"
      },
      "runWith": {
        "type": "text"
      }
    }
  }
}

# INJECT DATA (USE TIMED BASE INDEX NAME)
# EXECUTE inserted-documents-timed/feed.sh

# To check the applied mapping (automatically taken from the defined mapping template)
GET cem-*/_mapping


# --------------------- EX5-2
GET _cat/indices

# --------------------- EX5-3
GET cem-*/_count

# --------------------- EX5-4
GET cem-*/_search
{
  "query": {
    "query_string": {
      "query": "zio AND (resource OR environment)"
    }
  }
}

# --------------------- EX5-5
GET cem-*/_search
{
  "query": {
    "simple_query_string": {
      "query": "zio+(resource|environment)"
    }
  }
}

# --------------------- EX5-6
GET cem-*/_search
{
  "_source": ["summary"],
  "query": {
    "match": {
      "summary": {
        "query":"get the current user",
        "operator": "and"
      }
    }
  }
}
# "the" is ignored now

# --------------------- EX5-7

GET cem-*/_search
{
  "query": {
    "match_phrase": {
      "content": "cheat sheet"
    }
  }
}


# --------------------- EX5-8

GET cem-*/_search?size=5
{
  "_source": ["summary", "keywords"],
  "query": {
    "bool": {
      "must": {
        "match":{ "publish":"gist"}
      },
      "filter": {
        "term": { "keywords": "testable"}
      },
      "must_not": [
        {"match": {"summary":{"query":"hello world"}}},
        {"match": {"keywords":"async"}}
      ]
    }
  }
}
# filters focus on a subset of the data, allow caching and are ignored in scoring


# --------------------- EX5-9
# search for missing fields
POST /cem-*/_search
{
  "_source": ["filename","summary"],
  "size": 20,
  "query": {
    "bool": {
      "must_not": [
        {"exists": {"field": "execution"}}
      ]
    }
  }
}

# --------------------- EX5-10
# Search within date ranges

POST cem-*/_search
{
  "query": {
    "bool": {
      "must":{"query_string": { "query": "vertx"} },
      "filter": {"range": {
        "created_on": {
          "gte": "2020-06-01",
          "lte": "2020-12-31"
        }
      }}
    }
  }
}

# --------------------- EX5-11
# Count the documents matching "elasticsearch" created during the last year
# (the range filter below starts at now-1y, i.e. one year before now)
# NOTE(review): the EX5-1 mapping template declares "lastUpdated" as a date
# but no "created_on" field - confirm the field name used by the injected documents

POST /cem-*/_count
{
  "query": {
    "bool": {
      "must":{"query_string": { "query": "elasticsearch"} },
      "filter": {"range": {
        "created_on": {
          "gte": "now-1y"
        }
      }}
    }
  }
}
# TODO : check why filtering on last_updated returns no results (tried with "zio")
# NOTE(review): the template names that field "lastUpdated" - possibly the cause, to confirm


# --------------------- EX5-12
# Natural language query attempt

GET cem-*/_search
{
  "_source":["summary"],
  "query": {
    "query_string": {
      "fields": ["summary"],
      "query": "getting the users"
    }
  }
}

# Remember :
POST cem-2021-5/_analyze
{
  "field":"summary",
  "text": "getting the users"
}


# --------------------- EX5-13
GET cem-*/_search
{
  "_source":["summary"],
  "query": {
    "query_string": {
      "fields": ["summary"],
      "query": "getting the users",
      "default_operator": "AND"
    }
  }
}
# users => user, getting => get, "the" is ignored


# --------------------- EX5-14
# fetch & get have been made synonyms (config/synonyms.txt)
GET cem-*/_search
{
  "_source":["summary"],
  "query": {
    "query_string": {
      "fields": ["summary", "content"],
      "query": "fetch the users",
      "default_operator": "AND"
    }
  }
}


GET cem-*/_search
{
  "query": {
    "query_string": {
      "query": "synonyms"
    }
  }
}

# all synonyms have been added in that case check (config/synonyms.txt)
POST cem-2021-5/_analyze
{
  "field":"summary",
  "text": "fetch the users"
}


# --------------------- EX5-15
GET cem-*/_search
{
  "_source":["summary"],
  "query": {
    "query_string": {
      "fields": ["summary"],
      "query": "sheetcheat",
      "default_operator": "AND"
    }
  }
}

# all synonyms have been mapped into a single token check (config/synonyms.txt)
POST cem-2021-5/_analyze
{
  "field":"summary",
  "text": "sheetcheat"
}

# --------------------- EX5-16A
GET cem-*/_search
{
  "_source":["summary"],
  "query": {
    "query_string": {
      "fields": ["summary"],
      "query": "arango database",
      "default_operator": "AND"
    }
  }
}


# --------------------- EX5-16B ?? => FIXED
# something going wrong here : issues, issue, unassigned
# After check the expected document wasn't containing the "issue" word !!
# TAKE care with multifields when specifying search fields

GET cem-*/_search
{
  "_source":["summary"],
  "query": {
    "query_string": {
      "fields":["content"],
      "query": "elastic issues unassigned",
      "default_operator": "AND"
    }
  }
}
# NO RESULTS !!!

GET cem-*/_search
{
  "_source":["summary", "content"],
  "query": {
    "query_string": {
      "fields":["content.content-english"],
      "query": "elastic issues unassigned",
      "default_operator": "AND"
    }
  }
}


GET cem-*/_search
{
  "_source":["summary", "content"],
  "query": {
    "query_string": {
      "fields":["content.content-english"],
      "query": "elastic issue unassign",
      "default_operator": "AND"
    }
  }
}


POST cem-2021-5/_analyze
{
  "field":"content.content-english",
  "text": "elastic issues unassigned"
}

# --------------------- EX5-17
# all synonyms have been mapped into two tokens, check (config/synonyms.txt)
POST cem-2021-1/_analyze
{
  "field": "summary.summary-english",
  "text": "arangodb"
}
# PLACE THE SYNONYMS FILTER IN FIRST POSITION !


# --------------------- EX5-18
# handling spelling mistakes... or typing errors

GET cem-*/_search
{
  "_source": ["summary"],
  "query": {
    "query_string": {
      "query":"interation"
    }
  }
}

# fuzzy search to deal with 1 misspelled word


GET cem-*/_search
{
  "_source": ["summary"],
  "query": {
    "fuzzy": {
      "summary.summary-standard": {
        "value":"interation"
      }
    }
  }
}
# interation => interactions !


# --------------------- EX5-19
# fuzzy search to deal with misspelled words
GET cem-*/_search
{
  "_source": ["summary"],
  "size": 3,
  "query": {
    "match": {
      "summary": {
        "query": "got the uuseer",
        "fuzziness": "AUTO"
      }
    }
  }
}


# ===============================================================
# Advanced queries

# --------------------- EX6-1
# How many distinct example keywords ?
# The cardinality aggregation is named "the-count-for-me", so the result is
# found in aggregations.the-count-for-me.value (size=0 : no hits are returned)
POST /cem-*/_search?size=0
{
    "aggs" : {
        "the-count-for-me" : {
            "cardinality" : {
                "field" : "keywords.keyword"
            }
        }
    }
}

# --------------------- EX6-2
# Examples keywords occurrences count / How many examples for each keyword ?
# Top10 of the most used keywords
POST /cem-*/_search
{
    "size":0,
    "aggs" : {
        "my-results" : {
            "terms" : {
              "field" : "keywords.keyword",
              "size": 10
            }
        }
    }
}

# --------------------- EX6-3
# How many examples in each category
POST /cem-*/_search
{
    "size":0,
    "aggs" : {
        "results" : {
            "terms" : {
              "field" : "category.keyword",
              "size": 50
            }
        }
    }
}


# --------------------- EX6-4
# Significant terms - categories with at least 5 documents having the async or http keyword
POST /cem-*/_search?size=0
{
  "query": {
    "terms": {
      "keywords": [
        "async", "http"
      ]
    }
  },
  "aggregations": {
    "significant_categories": {
      "significant_terms": {
        "field": "category.keyword",
        "min_doc_count": 5
      }
    }
  }
}


# --------------------- EX6-5
# COUNT AGGREGATE MONTHLY FOR A GIVEN TIME RANGE
GET /cem-*/_search
{
  "aggs": {
    "my_results": {
      "date_histogram": {
        "field": "created_on",
        "calendar_interval": "1M",
        "time_zone": "Europe/Paris",
        "min_doc_count": 1
      }
    }
  },
  "size": 0,
  "query": {
    "bool": {
      "must": [
        {
          "range": {
            "created_on": {
              "format": "strict_date_optional_time",
              "gte": "2018-01-01T00:00:00.000Z",
              "lte": "2021-12-31T23:59:59.999Z"
            }
          }
        }
      ]
    }
  }
}
	## summary : example based elasticsearch training lab content for use within kibana developer console
	## keywords : cem, code-examples-manager, elasticsearch, configuration, lab, index-mapping, query
	## publish : gist
	## authors : David Crosson
	## license : Apache NON-AI License Version 2.0 (https://raw.githubusercontent.com/non-ai-licenses/non-ai-licenses/main/NON-AI-APACHE2)
	## id : bf08e514-f207-46df-9db9-c77dc60a350c
	## created-on : 2021-04-29T16:52:54Z
	## managed-by : https://github.com/dacr/code-examples-manager

	# ===============================================================
	# DO NOT EXECUTE IF USING THE COMMON ELASTICSEARCH
	DELETE _template/cem_mapping_template
	DELETE cem-*
	DELETE tmp-cem-*
	DELETE tmp-exec-*


	# INJECT DATA
	# EXECUTE inserted-documents-default/feed.sh

	# => CREATE A KIBANA INDEX PATTERN FOR : cem-*

	# ===============================================================
	# USE the basic index naming "cem-default"

	# --------------------- EX1-1
	GET _cat/indices

	# --------------------- EX1-2
	GET cem-default/_count

	# --------------------- EX1-3
	GET cem-default/_search

	# --------------------- EX1-4
	GET cem-default/_search?q='fractal mandelbrot'
	#Lucene query string syntax : a AND b, a OR b, ...

	# --------------------- EX1-5
	GET cem-default/_search
	{
	"query": {
	"query_string": {
	"query": "fractal OR mandelbrot"
	}
	}
	}
	# same results & score as previous
	# uses query string "mini-language" : a AND b, a OR b, field:value, (), _exists_:field, "that", th*, th?t, ...


	# --------------------- EX1-6
	# "|" is the OR operator of the simple query string syntax; it must be written
	# as-is because \| is not a valid JSON escape sequence and would make the
	# request body unparsable.
	GET cem-default/_search
	{
	  "query": {
	    "simple_query_string": {
	      "query": "fractal|mandelbrot"
	    }
	  }
	}
	# same results & score as previous
	# uses simple query string syntax : a+b, a|b, -b, "that", th*, ()


	# --------------------- EX1-7
	GET cem-default/_search
	{
	"query": {
	"match_all": {
	}
	}
	}
	# returns everything with full content


	# --------------------- EX1-8
	GET cem-default/_search
	{
	"_source": ["filename", "summary"],
	"query": {
	"match_all": {
	}
	}
	}
	# returns everything with content limited to given fields


	# --------------------- EX1-9
	GET cem-default/_search
	{
	"_source": ["summary"],
	"query": {
	"match": {
	"summary": "caching operations"
	}
	}
	}
	# by default a OR is done within the given text after it has been analyzed
	# no dedicated syntax, no order
	# but many parameters to change the match behavior : operator, analyzer, ...


	# --------------------- EX1-10
	GET cem-default/_search
	{
	"_source": ["summary"],
	"query": {
	"match_phrase": {
	"summary": "get current user"
	}
	}
	}
	# match in sequence => must be the exact phrase
	# we'll see how to improve that later


	# --------------------- EX1-11
	GET cem-default/_search
	{
	"query": {
	"query_string": {
	"query": "logging-tips"
	}
	}
	}

	# --------------------- EX1-12
	GET cem-default/_search
	{
	"query": {
	"query_string": {
	"fields": ["summary"],
	"query": "logging-tips"
	}
	}
	}

	# --------------------- EX1-13
	GET cem-default/_search
	{
	"_source": ["filename", "summary"],
	"query": {
	"query_string": {
	"query": "logg*"
	}
	}
	}

	# --------------------- EX1-14
	GET cem-default/_search
	{
	"_source": ["filename", "summary"],
	"query": {
	"simple_query_string": {
	"query": "crosson.david@gmail.com"
	}
	}
	}

	# --------------------- EX1-15
	GET cem-default/_search
	{
	"_source": ["filename", "summary"],
	"query": {
	"simple_query_string": {
	"query": "\"crosson.david@gmail.com\""
	}
	}
	}

	# --------------------- EX1-16
	GET cem-default/_search
	{
	"query": {
	"simple_query_string": {
	"query": "unsafeRun"
	}
	}
	}
	# returns NO results

	# --------------------- EX1-17
	GET cem-default/_search
	{
	"query": {
	"simple_query_string": {
	"query": "Runtime.default.unsafeRun"
	}
	}
	}
	# returns many results with a high score
	# because for the standard analyzer "Runtime.default.unsafeRun" is a unique word

	# --------------------- EX1-18
	GET cem-default/_search
	{
	"query": {
	"simple_query_string": {
	"query": "2.6.13"
	}
	}
	}







	# ===============================================================
	# Evaluating analyzers

	# --------------------- EX2-1
	POST _analyze
	{
	"analyzer": "standard",
	"text": "I sold some stocks yesterday, people are sicks"
	}

	# --------------------- EX2-2
	POST _analyze
	{
	"analyzer": "english",
	"text": "I sold some stocks yesterday, people are sicks"
	}

	# --------------------- EX2-3
	POST _analyze
	{
	"analyzer": "english",
	"text": "I'm getting sick"
	}

	# --------------------- EX2-4
	POST _analyze
	{
	"analyzer": "standard",
	"text": "j'ai attrapé la grippe hier à cause de personnes contagieuses"
	}

	# --------------------- EX2-5
	POST _analyze
	{
	"analyzer": "french",
	"text": "j'ai attrapé la grippe hier à cause de personnes contagieuses"
	}

	# --------------------- EX2-6
	POST _analyze
	{
	"analyzer": "standard",
	"text": "truc.much there and then."
	}

	# --------------------- EX2-7
	POST _analyze
	{
	"analyzer": "simple",
	"text": "truc.much there and then."
	}

	# --------------------- EX2-8
	POST _analyze
	{
	"analyzer": "standard",
	"text": "crosson.david@gmail.com"
	}

	# --------------------- EX2-9
	POST _analyze
	{
	"analyzer": "simple",
	"text": "crosson.david@gmail.com"
	}

	# --------------------- EX2-10
	POST _analyze
	{
	"analyzer": "whitespace",
	"text": "crosson.david@gmail.com"
	}



	# ===============================================================
	# Enhancing search and start more complex queries

	# DO NOT EXECUTE IF USING THE COMMON ELASTICSEARCH
	DELETE _template/cem_mapping_template
	DELETE cem-*

	# --------------------- EX3-1
	# Using the simple analyzer instead of the standard one
	PUT cem-default
	{
	"settings":{
	"index":{"number_of_replicas":0},
	"analysis":{
	"analyzer":{"default":{"type" : "simple"}}
	}
	}
	}


	# INJECT DATA
	# EXECUTE inserted-documents-default/feed.sh


	# --------------------- EX3-2
	GET _cat/indices
	# now our index is green !

	# --------------------- EX3-3
	GET cem-default/_search
	{
	"_source":["category","filename","summary"],
	"query": {
	"query_string": {
	"query": "unsafeRun"
	}
	}
	}
	# NOW RETURNS MANY RESULTS

	# --------------------- EX3-4
	GET cem-default/_search
	{
	"query": {
	"query_string": {
	"query": "2.6.13"
	}
	}
	}
	# NOW RETURNS NO RESULTS

	# --------------------- EX3-5
	GET cem-default/_search
	{
	"query": {
	"query_string": {
	"query": "backoff"
	}
	}
	}
	# RETURNS no results because camelcase is of course not taken into account

	# --------------------- EX3-6
	GET cem-default/_search
	{
	"query": {
	"simple_query_string": {
	"query": "ExponentialBackoffRetry"
	}
	}
	}

	# --------------------- EX3-7
	GET cem-default/_search
	{
	"_source":["filename","summary"],
	"query": {
	"query_string": {
	"query": "scala"
	}
	}
	}

	# --------------------- EX3-8
	GET cem-default/_search
	{
	"_source":["filename","summary"],
	"query": {
	"query_string": {
	"query": "scala zio"
	}
	}
	}
	# implicit OR

	# --------------------- EX3-9
	GET cem-default/_search
	{
	"_source":["filename","summary"],
	"query": {
	"query_string": {
	"query": "scala OR zio"
	}
	}
	}

	# --------------------- EX3-10
	GET cem-default/_search
	{
	"_source":["filename","summary"],
	"query": {
	"query_string": {
	"query": "scala AND zio"
	}
	}
	}

	# --------------------- EX3-11
	GET cem-default/_search
	{
	"_source":["filename","summary"],
	"query": {
	"bool": {
	"should": [
	{"match": {"keywords":"scala zio"}}
	]
	}
	}
	}

	# --------------------- EX3-12
	GET cem-default/_search
	{
	"_source":["filename","summary"],
	"query": {
	"bool": {
	"must": [
	{"match": {"keywords":"scala"}},
	{"match": {"keywords":"zio"}}
	]
	}
	}
	}

	# --------------------- EX3-13
	# The good way to give more importance to specific fields
	# multi_match over three fields; the ^N suffix is the per-field boost (content weighs 5x)
	GET cem-default/_search
	{
	  "query": {
	    "multi_match": {
	      "query": "get user",
	      "operator": "and",
	      "type": "most_fields",
	      "fields": ["keywords^1", "summary^1", "content^5"]
	    }
	  }
	}

	# ===============================================================
	# Fixing again the search capabilities

	# Check the mapping that was generated by default for the cem-default index
	GET cem-default/_mapping

	# DO NOT EXECUTE IF USING THE COMMON ELASTICSEARCH
	# Removes the cem mapping template and every index matching cem-* before recreating cem-default
	DELETE _template/cem_mapping_template
	DELETE cem-*

	# --------------------- EX4-1
	# Using the simple analyzer instead of the standard one
	# and adding camelCase support through a dedicated pattern analyzer
	# Create cem-default with "simple" as the default analyzer and a custom "camelcase"
	# pattern analyzer, exposed through the content.content-camelcase sub-field.
	# The alternatives of the pattern are separated by a plain | : writing \| is not a
	# valid JSON escape sequence and makes the request body unparsable.
	PUT cem-default
	{
	  "settings": {
	    "index": {
	      "number_of_replicas": 0,
	      "number_of_shards": 2
	    },
	    "analysis": {
	      "analyzer": {
	        "default": {
	          "type": "simple"
	        },
	        "camelcase": {
	          "type": "pattern",
	          "pattern": "([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"
	        }
	      }
	    }
	  },
	  "mappings": {
	    "properties": {
	      "summary": {
	        "type": "text",
	        "fields": {
	          "summary-english": {
	            "type": "text",
	            "analyzer": "english"
	          }
	        }
	      },
	      "content": {
	        "type": "text",
	        "fields": {
	          "content-standard": {
	            "type": "text",
	            "analyzer": "standard"
	          },
	          "content-english": {
	            "type": "text",
	            "analyzer": "english"
	          },
	          "content-camelcase": {
	            "type": "text",
	            "analyzer": "camelcase",
	            "search_analyzer": "standard"
	          }
	        }
	      }
	    }
	  }
	}


	# INJECT DATA
	# EXECUTE inserted-documents-default/feed.sh



	# --------------------- EX4-2
	# Show the tokens produced for the sample text by the analyzer of the content field
	POST cem-default/_analyze
	{
	  "field": "content",
	  "text": "coolRaoul42, trucBidule, joe_doe"
	}

	# --------------------- EX4-3
	# Same sample text, analyzed as the content.content-camelcase sub-field
	POST cem-default/_analyze
	{
	  "field": "content.content-camelcase",
	  "text": "coolRaoul42, trucBidule, joe_doe"
	}

	# --------------------- EX4-4
	# Analyze a real import line with the camelcase sub-field
	POST cem-default/_analyze
	{
	  "field": "content.content-camelcase",
	  "text": "import org.apache.curator.retry.{ExponentialBackoffRetry, RetryNTimes}"
	}

	# --------------------- EX4-5
	# Search again for the complete camelCase identifier
	GET cem-default/_search
	{
	  "query": {
	    "query_string": { "query": "unsafeRun" }
	  }
	}
	# NOW RETURNS THE RIGHT NUMBER OF RESULTS
	# ALSO BECAUSE search_analyzer has been set to "standard" on content-camelcase

	# --------------------- EX4-6
	# Search for only the first part of the camelCase identifier
	GET cem-default/_search
	{
	  "query": {
	    "query_string": { "query": "unsafe" }
	  }
	}
	# And of course we can now use just a part of camelcase word

	# --------------------- EX4-7
	# Search again for the dotted version number, this time with simple_query_string
	GET cem-default/_search
	{
	  "query": {
	    "simple_query_string": { "query": "2.6.13" }
	  }
	}
	# NOW RETURNS THE RIGHT NUMBER OF RESULTS
	# ALSO BECAUSE search_analyzer has been set to "standard" on content-camelcase

	# --------------------- EX4-8
	# Search again for a word that only appears inside camelCase identifiers
	GET cem-default/_search
	{
	  "query": {
	    "query_string": { "query": "backoff" }
	  }
	}

	# --------------------- EX4-9
	# Require two parts of a camelCase identifier at once
	GET cem-default/_search
	{
	  "query": {
	    "query_string": { "query": "exponential AND backoff" }
	  }
	}

	# --------------------- EX4-10
	# Quoted (phrase) query-string search, returning only the content field
	GET cem-default/_search
	{
	  "_source": ["content"],
	  "query": {
	    "query_string": { "query": "\"async.AsyncCuratorFramework\"" }
	  }
	}

	# Phrase-prefix search on the camelCase sub-field.
	# match_phrase_prefix is keyed by the name of the field to search; the previous body used
	# "query" as that key, so Elasticsearch looked into a field literally named "query"
	# instead of the document content. The text to match goes in the nested "query" parameter.
	GET cem-default/_search
	{
	  "_source": ["content"],
	  "query": {
	    "match_phrase_prefix": {
	      "content.content-camelcase": {
	        "query": "async curator framework"
	      }
	    }
	  }
	}


	# --------------------- EX4-11
	# IT IS POSSIBLE TO HIGHLIGHT THE MATCH !
	# match query on summary with highlighting of the matched fragments
	GET cem-default/_search
	{
	  "_source": ["summary"],
	  "query": {
	    "match": { "summary": "snippets user" }
	  },
	  "highlight": {
	    "fields": {
	      "summary": { "force_source": true }
	    }
	  }
	}


	# ===============================================================
	# Optimizing mapping and playing with tokens

	# DO NOT EXECUTE IF USING THE COMMON ELASTICSEARCH
	# Removes the cem mapping template and every index matching cem-* before installing the new template
	DELETE _template/cem_mapping_template
	DELETE cem-*

	# --------------------- EX5-1
	# EXECUTE PRIOR TO DATA INJECTION
	# Index template applied to every index whose name matches cem-*.
	# It declares custom token filters (stop words, keyword marker, stemmers, synonyms file),
	# three analyzers (default "simple", "camelcase" pattern, "rebuilt_english") and the
	# explicit field mappings.
	# The alternatives of the camelcase pattern are separated by a plain | : writing \| is
	# not a valid JSON escape sequence and makes the request body unparsable.
	PUT _template/cem_mapping_template
	{
	  "index_patterns": ["cem-*"],
	  "settings": {
	    "index": {
	      "number_of_replicas": 0
	    },
	    "analysis": {
	      "filter": {
	        "english_stop": {
	          "type": "stop",
	          "stopwords": "_english_"
	        },
	        "english_keywords": {
	          "type": "keyword_marker",
	          "keywords": ["example"]
	        },
	        "english_stemmer": {
	          "type": "stemmer",
	          "language": "english"
	        },
	        "english_possessive_stemmer": {
	          "type": "stemmer",
	          "language": "possessive_english"
	        },
	        "english_synonym": {
	          "type": "synonym",
	          "synonyms_path": "synonyms.txt"
	        }
	      },
	      "analyzer": {
	        "default": {
	          "type": "simple"
	        },
	        "camelcase": {
	          "type": "pattern",
	          "pattern": "([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"
	        },
	        "rebuilt_english": {
	          "tokenizer": "standard",
	          "filter": [
	            "english_synonym",
	            "english_possessive_stemmer",
	            "lowercase",
	            "asciifolding",
	            "english_stop",
	            "english_keywords",
	            "english_stemmer"
	          ]
	        }
	      }
	    }
	  },
	  "mappings": {
	    "properties": {
	      "category": {
	        "type": "text",
	        "fields": {
	          "keyword": {
	            "type": "keyword",
	            "ignore_above": 2000
	          }
	        }
	      },
	      "summary": {
	        "type": "text",
	        "analyzer": "rebuilt_english",
	        "fields": {
	          "summary-english": {
	            "type": "text",
	            "analyzer": "rebuilt_english",
	            "search_analyzer": "rebuilt_english"
	          },
	          "summary-standard": {
	            "type": "text",
	            "analyzer": "standard",
	            "search_analyzer": "standard"
	          }
	        }
	      },
	      "content": {
	        "type": "text",
	        "fields": {
	          "content-standard": {
	            "type": "text",
	            "analyzer": "standard"
	          },
	          "content-english": {
	            "type": "text",
	            "analyzer": "rebuilt_english",
	            "search_analyzer": "rebuilt_english"
	          },
	          "content-camelcase": {
	            "type": "text",
	            "analyzer": "camelcase",
	            "search_analyzer": "standard"
	          }
	        }
	      },
	      "file": {
	        "type": "text"
	      },
	      "filename": {
	        "type": "text",
	        "fields": {
	          "keyword": {
	            "type": "keyword",
	            "ignore_above": 256
	          }
	        }
	      },
	      "id": {
	        "type": "text"
	      },
	      "uuid": {
	        "type": "text"
	      },
	      "keywords": {
	        "type": "text",
	        "fields": {
	          "keyword": {
	            "type": "keyword",
	            "ignore_above": 1000
	          }
	        }
	      },
	      "lastUpdated": {
	        "type": "date"
	      },
	      "managedBy": {
	        "type": "text"
	      },
	      "license": {
	        "type": "text",
	        "fields": {
	          "keyword": {
	            "type": "keyword",
	            "ignore_above": 256
	          }
	        }
	      },
	      "publish": {
	        "type": "text",
	        "fields": {
	          "keyword": {
	            "type": "keyword",
	            "ignore_above": 256
	          }
	        }
	      },
	      "authors": {
	        "type": "text",
	        "fields": {
	          "keyword": {
	            "type": "keyword",
	            "ignore_above": 256
	          }
	        }
	      },
	      "execution": {
	        "type": "text"
	      },
	      "runWith": {
	        "type": "text"
	      }
	    }
	  }
	}

	# INJECT DATA (USE TIMED BASE INDEX NAME)
	# EXECUTE inserted-documents-timed/feed.sh

	# To check the applied mapping (automatically taken from the defined mapping template)



	# --------------------- EX5-2
	# List the indices now present in the cluster
	GET _cat/indices

	# --------------------- EX5-3
	# Count the documents across all indices matching cem-*
	GET cem-*/_count

	# --------------------- EX5-4
	# Boolean expression written with the query_string syntax (AND / OR / parentheses)
	GET cem-*/_search
	{
	  "query": {
	    "query_string": { "query": "zio AND (resource OR environment)" }
	  }
	}

	# --------------------- EX5-5
	# Same boolean expression written with the simple_query_string syntax ( + is AND, | is OR ).
	# The | operator is written as-is : \| is not a valid JSON escape sequence and makes
	# the request body unparsable.
	GET cem-*/_search
	{
	  "query": {
	    "simple_query_string": {
	      "query": "zio+(resource|environment)"
	    }
	  }
	}

	# --------------------- EX5-6
	# match query on summary where every analyzed term is required (operator "and")
	GET cem-*/_search
	{
	  "_source": ["summary"],
	  "query": {
	    "match": {
	      "summary": {
	        "query": "get the current user",
	        "operator": "and"
	      }
	    }
	  }
	}
	# the stop word "the" is now ignored

	# --------------------- EX5-7

	# Phrase search : the two words must appear next to each other, in this order
	GET cem-*/_search
	{
	  "query": {
	    "match_phrase": { "content": "cheat sheet" }
	  }
	}


	# --------------------- EX5-8

	# bool query mixing must / filter / must_not clauses, limited to 5 hits
	GET cem-*/_search?size=5
	{
	  "_source": ["summary", "keywords"],
	  "query": {
	    "bool": {
	      "must": {
	        "match": { "publish": "gist" }
	      },
	      "filter": {
	        "term": { "keywords": "testable" }
	      },
	      "must_not": [
	        { "match": { "summary": { "query": "hello world" } } },
	        { "match": { "keywords": "async" } }
	      ]
	    }
	  }
	}
	# filter clauses only narrow down the data set : they can be cached and are ignored in the scoring


	# --------------------- EX5-9
	# search for missing fields
	# Documents that have no "execution" field, 20 hits at most
	POST /cem-*/_search
	{
	  "_source": ["filename", "summary"],
	  "size": 20,
	  "query": {
	    "bool": {
	      "must_not": [
	        { "exists": { "field": "execution" } }
	      ]
	    }
	  }
	}

	# --------------------- EX5-10
	# Search within date ranges

	# Full-text match on "vertx" restricted by a range filter on created_on
	POST cem-*/_search
	{
	  "query": {
	    "bool": {
	      "must": {
	        "query_string": { "query": "vertx" }
	      },
	      "filter": {
	        "range": {
	          "created_on": {
	            "gte": "2020-06-01",
	            "lte": "2020-12-31"
	          }
	        }
	      }
	    }
	  }
	}

	# --------------------- EX5-11
	# Count the examples created during the last year (the range starts at now-1y)

	# Count the documents matching "elasticsearch" whose created_on is not older than now-1y
	POST /cem-*/_count
	{
	  "query": {
	    "bool": {
	      "must": {
	        "query_string": { "query": "elasticsearch" }
	      },
	      "filter": {
	        "range": {
	          "created_on": { "gte": "now-1y" }
	        }
	      }
	    }
	  }
	}
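	# TODO check with last_updated : no results returned !!! (with zio)
	# NOTE(review): the EX5-1 template maps this date field as "lastUpdated", not "last_updated" - verify the field name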


	# --------------------- EX5-12
	# Natural language query attempt

	# Free-text sentence searched in the summary field only (default operator)
	GET cem-*/_search
	{
	  "_source": ["summary"],
	  "query": {
	    "query_string": {
	      "fields": ["summary"],
	      "query": "getting the users"
	    }
	  }
	}

	# Remember : _analyze shows the tokens the summary field produces for that sentence
	POST cem-2021-5/_analyze
	{
	  "field": "summary",
	  "text": "getting the users"
	}


	# --------------------- EX5-13
	# Same sentence, but every remaining term is now required
	GET cem-*/_search
	{
	  "_source": ["summary"],
	  "query": {
	    "query_string": {
	      "fields": ["summary"],
	      "query": "getting the users",
	      "default_operator": "AND"
	    }
	  }
	}
	# users => user, getting => get, "the" is ignored


	# --------------------- EX5-14
	# fetch & get have been declared as synonyms (config/synonyms.txt)
	# Search with "fetch" in both summary and content, all terms required
	GET cem-*/_search
	{
	  "_source": ["summary"],
	  "query": {
	    "query_string": {
	      "fields": ["summary", "content"],
	      "query": "fetch the users",
	      "default_operator": "AND"
	    }
	  }
	}


	# Plain search for the word "synonyms"
	GET cem-*/_search
	{
	  "query": {
	    "query_string": { "query": "synonyms" }
	  }
	}

	# all synonyms have been added in that case, check (config/synonyms.txt)
	POST cem-2021-5/_analyze
	{
	  "field": "summary",
	  "text": "fetch the users"
	}


	# --------------------- EX5-15
	# Search the summary field for the single word "sheetcheat"
	GET cem-*/_search
	{
	  "_source": ["summary"],
	  "query": {
	    "query_string": {
	      "fields": ["summary"],
	      "query": "sheetcheat",
	      "default_operator": "AND"
	    }
	  }
	}

	# all synonyms have been mapped into a single token, check (config/synonyms.txt)
	POST cem-2021-5/_analyze
	{
	  "field": "summary",
	  "text": "sheetcheat"
	}

	# --------------------- EX5-16A
	# Two required words searched in the summary field
	GET cem-*/_search
	{
	  "_source": ["summary"],
	  "query": {
	    "query_string": {
	      "fields": ["summary"],
	      "query": "arango database",
	      "default_operator": "AND"
	    }
	  }
	}


	# --------------------- EX5-16B ?? => FIXED
	# something going wrong here : issues, issue, unassigned
	# After check the expected document wasn't containing the "issue" word !!
	# TAKE care with multifields when specifying search fields

	# Step 1 : three required words searched in the main content field
	GET cem-*/_search
	{
	  "_source": ["summary"],
	  "query": {
	    "query_string": {
	      "fields": ["content"],
	      "query": "elastic issues unassigned",
	      "default_operator": "AND"
	    }
	  }
	}
	# NO RESULTS !!!

	# Step 2 : same words searched in the content.content-english sub-field
	GET cem-*/_search
	{
	  "_source": ["summary", "content"],
	  "query": {
	    "query_string": {
	      "fields": ["content.content-english"],
	      "query": "elastic issues unassigned",
	      "default_operator": "AND"
	    }
	  }
	}


	# Step 3 : same sub-field, words given in a shortened form
	GET cem-*/_search
	{
	  "_source": ["summary", "content"],
	  "query": {
	    "query_string": {
	      "fields": ["content.content-english"],
	      "query": "elastic issue unassign",
	      "default_operator": "AND"
	    }
	  }
	}


	# Step 4 : show the tokens content.content-english produces for the original words
	POST cem-2021-5/_analyze
	{
	  "field": "content.content-english",
	  "text": "elastic issues unassigned"
	}

	# --------------------- EX5-17
	# all synonyms have been mapped into two tokens, check (config/synonyms.txt)
	# Show the tokens summary.summary-english produces for "arangodb"
	POST cem-2021-1/_analyze
	{
	  "field": "summary.summary-english",
	  "text": "arangodb"
	}
	# PLACE THE SYNONYMS FILTER IN FIRST POSITION !







	# --------------------- EX5-18
	# dealing with spelling mistakes... or typing errors

	# Exact search with a misspelled word
	GET cem-*/_search
	{
	  "_source": ["summary"],
	  "query": {
	    "query_string": { "query": "interation" }
	  }
	}

	# fuzzy search to deal with 1 misspelled word


	GET cem-*/_search
	{
	  "_source": ["summary"],
	  "query": {
	    "fuzzy": {
	      "summary.summary-standard": { "value": "interation" }
	    }
	  }
	}
	# interation => interactions !


	# --------------------- EX5-19
	# fuzzy search to deal with misspelled words
	# match query with automatic fuzziness applied to each term, 3 hits at most
	GET cem-*/_search
	{
	  "_source": ["summary"],
	  "size": 3,
	  "query": {
	    "match": {
	      "summary": {
	        "query": "got the uuseer",
	        "fuzziness": "AUTO"
	      }
	    }
	  }
	}




	# ===============================================================
	# Advanced queries

	# --------------------- EX6-1
	# How many distinct example keywords ? result stored in aggregations.the-count-for-me.value
	# cardinality aggregation on the keyword sub-field, no hits returned (size=0)
	POST /cem-*/_search?size=0
	{
	  "aggs": {
	    "the-count-for-me": {
	      "cardinality": { "field": "keywords.keyword" }
	    }
	  }
	}

	# --------------------- EX6-2
	# Example keywords occurrences count / How many examples for each keyword ?
	# Top10 of the most used keywords
	# terms aggregation on keywords.keyword, limited to 10 buckets, no hits returned
	POST /cem-*/_search
	{
	  "size": 0,
	  "aggs": {
	    "my-results": {
	      "terms": {
	        "field": "keywords.keyword",
	        "size": 10
	      }
	    }
	  }
	}

	# --------------------- EX6-3
	# How many examples in each category
	# terms aggregation on category.keyword, up to 50 buckets, no hits returned
	POST /cem-*/_search
	{
	  "size": 0,
	  "aggs": {
	    "results": {
	      "terms": {
	        "field": "category.keyword",
	        "size": 50
	      }
	    }
	  }
	}


	# --------------------- EX6-4
	# Significant terms - categories holding at least 5 documents tagged with the async or http keywords
	# significant_terms aggregation on category.keyword, computed over the documents
	# selected by the terms query (min_doc_count set to 5)
	POST /cem-*/_search?size=0
	{
	  "query": {
	    "terms": {
	      "keywords": ["async", "http"]
	    }
	  },
	  "aggregations": {
	    "significant_categories": {
	      "significant_terms": {
	        "field": "category.keyword",
	        "min_doc_count": 5
	      }
	    }
	  }
	}


	# --------------------- EX6-5
	# COUNT AGGREGATE MONTHLY FOR A GIVEN TIME RANGE
	# Monthly date_histogram on created_on (Europe/Paris time zone, empty months skipped),
	# computed over the documents selected by the range query; no hits returned
	GET /cem-*/_search
	{
	  "aggs": {
	    "my_results": {
	      "date_histogram": {
	        "field": "created_on",
	        "calendar_interval": "1M",
	        "time_zone": "Europe/Paris",
	        "min_doc_count": 1
	      }
	    }
	  },
	  "size": 0,
	  "query": {
	    "bool": {
	      "must": [
	        {
	          "range": {
	            "created_on": {
	              "format": "strict_date_optional_time",
	              "gte": "2018-01-01T00:00:00.000Z",
	              "lte": "2021-12-31T23:59:59.999Z"
	            }
	          }
	        }
	      ]
	    }
	  }
	}