Elasticsearch - Hands-on Introduction (2020-06, DevClash)
###################################
## Introduction to Elasticsearch ##
###################################
#########################
## If you do not want to run Elasticsearch yourself,
## try the free 14-day trial at
## https://www.elastic.co/cloud/
#########################
##########################
## Indexing & Searching ##
##########################
# Single document indexing
PUT books/_doc/1
{ "title" : "The Play Framework Cookbook", "category" : "Computer & IT", "price" : 34.99 }
# Bulk indexing
PUT books/_bulk
{ "index" : { "_id" : "1" } }
{ "title" : "The Play Framework Cookbook", "category" : "Computer & IT", "price" : 34.99 }
{ "index" : { "_id" : "2" } }
{ "title" : "Database Internals: A Deep Dive Into How Distributed Data Systems Work", "category":"Computer & IT", "price" : 44.99 }
{ "index" : { "_id" : "3" } }
{ "title" : "Ready Player One", "category": "Science Fiction", "price" : 10.99 }
GET books/_doc/1
GET books/_source/1
DELETE books/_doc/1
PUT books/_doc/1
{
  "title": "The Play Framework Cookbook",
  "category": "Computer & IT",
  "price": 34.99
}
POST books/_update/1
{
  "doc": {
    "release_date": "2011-08-11",
    "author": "Alexander Reelsen"
  }
}
GET books/_doc/1
POST books/_update/1
{
  "script": {
    "source": "ctx._source.price -= 1.55",
    "lang": "painless"
  }
}
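# a side note, not part of the original walkthrough: with doc_as_upsert
# an update creates the document if it does not exist yet
# (document id and field values here are made up for illustration)
POST books/_update/5
{
  "doc": {
    "title": "The Pragmatic Programmer",
    "category": "Computer & IT",
    "price": 39.99
  },
  "doc_as_upsert": true
}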
GET books/_doc/1
# simple search
GET books/_search?q=play
# search with dsl
GET books/_search
{
  "query": {
    "match": {
      "title": "play"
    }
  }
}
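# the delete above left a deleted document in the index, which still skews
# the index statistics used for scoring; force merging down to one segment
# expunges deleted documents, making the scores in the next query easier to read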
POST books/_forcemerge?max_num_segments=1
# search with explain
GET books/_search
{
  "explain": true,
  "query": {
    "match": {
      "title": "play"
    }
  }
}
PUT books/_doc/4
{
  "title": "The Play Framework Cookbook (2nd edition)",
  "category": "Programming",
  "price": 38.99
}
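# combining queries with bool: "must" clauses have to match and contribute
# to the score, "should" clauses only boost the score, and "filter" clauses
# have to match but do not influence scoring (and can be cached)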
GET books/_search
{
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "category.keyword": "Computer & IT"
          }
        }
      ],
      "must": [
        {
          "match": {
            "title": "play"
          }
        }
      ],
      "filter": [
        {
          "range": {
            "price": {
              "gte": 30
            }
          }
        }
      ]
    }
  }
}
##########################
## Aggregations ##########
##########################
# group by category
GET books/_search
{
  "size": 0,
  "aggs": {
    "by_category": {
      "terms": {
        "field": "category.keyword",
        "size": 10
      }
    }
  }
}
# group by category, get the avg price
# danger: floating point inaccuracies!
GET books/_search
{
  "size": 0,
  "aggs": {
    "by_category": {
      "terms": {
        "field": "category.keyword",
        "size": 10
      },
      "aggs": {
        "avg_price": {
          "avg": {
            "field": "price"
          }
        }
      }
    }
  }
}
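# a minimal sketch of how to sidestep binary floating point issues for
# prices: map the field as scaled_float, which stores values as scaled longs
# (the index name here is hypothetical, not part of the original walkthrough)
PUT books_with_exact_prices
{
  "mappings": {
    "properties": {
      "price": {
        "type": "scaled_float",
        "scaling_factor": 100
      }
    }
  }
}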
# add some monitoring data with autogenerated ids
PUT monitoring_data/_bulk
{ "index" : {} }
{ "duration_in_ms" : 123, "host" : "db01", "statement": "SELECT * from cars" }
{ "index" : {} }
{ "duration_in_ms" : 145, "host" : "db01", "statement": "SELECT * from cars" }
{ "index" : {} }
{ "duration_in_ms" : 185, "host" : "db01", "statement": "SELECT * from cars" }
{ "index" : {} }
{ "duration_in_ms" : 220, "host" : "db01", "statement": "SELECT * from cars" }
{ "index" : {} }
{ "duration_in_ms" : 450, "host" : "db02", "statement": "SELECT * from cars" }
{ "index" : {} }
{ "duration_in_ms" : 360, "host" : "db02", "statement": "SELECT * from cars" }
# Let's count distinct elements
GET monitoring_data/_search
{
  "size": 0,
  "aggs": {
    "number_of_hosts": {
      "cardinality": {
        "field": "host.keyword"
      }
    }
  }
}
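# note: cardinality is an approximate count (backed by HyperLogLog++);
# the precision_threshold parameter trades memory for exactness up to
# roughly that many distinct values
GET monitoring_data/_search
{
  "size": 0,
  "aggs": {
    "number_of_hosts": {
      "cardinality": {
        "field": "host.keyword",
        "precision_threshold": 100
      }
    }
  }
}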
# Let's calculate some percentiles
GET monitoring_data/_search
{
  "size": 0,
  "aggs": {
    "duration_percentiles": {
      "percentiles": {
        "field": "duration_in_ms"
      }
    }
  }
}
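# the reported percentiles can be configured via the percents parameter
GET monitoring_data/_search
{
  "size": 0,
  "aggs": {
    "duration_percentiles": {
      "percentiles": {
        "field": "duration_in_ms",
        "percents": [ 50, 95, 99 ]
      }
    }
  }
}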
# let's group by host first, which clearly shows
# a performance issue with one of the
# database hosts
GET monitoring_data/_search
{
  "size": 0,
  "aggs": {
    "by_host": {
      "terms": {
        "field": "host.keyword",
        "size": 10
      },
      "aggs": {
        "duration_percentiles": {
          "percentiles": {
            "field": "duration_in_ms"
          }
        }
      }
    }
  }
}
#########################
## Analysis ############
#########################
### Tokenizer
GET _analyze
{
  "text": "quick brown fox",
  "tokenizer": "whitespace"
}
GET _analyze
{
  "text": "the lazy, white dog.",
  "tokenizer": "whitespace"
}
# see the commas magically being removed by the standard tokenizer,
# which implements the segmentation rules from UAX #29;
# it is also the default, so you can omit it
GET _analyze
{
  "text": "the lazy, white dog.",
  "tokenizer": "standard"
}
# but the URL gets broken up into several tokens
GET _analyze
{
  "text": "this is an url https://www.jade-hs.de"
}
# let's fix the above issue by using the uax_url_email tokenizer
GET _analyze
{
  "text": "this is an url https://www.jade-hs.de",
  "tokenizer": "uax_url_email"
}
### Token Filter
# standard tokenization, no filtering
GET _analyze
{
  "text": "The Quick brown fox",
  "tokenizer": "standard"
}
# lowercase all the terms
GET _analyze
{
  "text": "The Quick brown fox",
  "tokenizer": "standard",
  "filter": [ "lowercase" ]
}
# remove stop words
GET _analyze
{
  "text": "The Quick brown fox",
  "tokenizer": "standard",
  "filter": [ "lowercase", "stop" ]
}
# let's add a synonym
GET _analyze
{
  "text": "The Quick brown fox",
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "stop",
    {
      "type": "synonym",
      "synonyms": [
        "quick => fast"
      ]
    }
  ]
}
# let's make sure that quick is kept as well
GET _analyze
{
  "text": "The Quick brown fox",
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "stop",
    {
      "type": "synonym",
      "synonyms": [
        "quick => fast, quick"
      ]
    }
  ]
}
# stemming
# the plural is removed, but look at the term experience,
# which is now indistinguishable from experiment:
# stemming can be overly aggressive
GET _analyze
{
  "text": "Waxolutionists - The smart blip experience",
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "stop",
    {
      "type": "stemmer",
      "name": "english"
    }
  ]
}
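# a sketch of one way to counter overly aggressive stemming: a
# stemmer_override filter placed before the stemmer protects the
# listed terms from being stemmed
GET _analyze
{
  "text": "Waxolutionists - The smart blip experience",
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "stop",
    {
      "type": "stemmer_override",
      "rules": [
        "experience => experience"
      ]
    },
    {
      "type": "stemmer",
      "name": "english"
    }
  ]
}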
# decompounding
# splits sub terms out of compound terms,
# which is important for lots of German words
GET _analyze
{
  "text": "Blumentopf",
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "stop",
    {
      "type": "dictionary_decompounder",
      "word_list": [
        "topf"
      ]
    }
  ]
}
# over-decompounding happens as well: Verstopfung has nothing to do with a Topf
# a workaround might be the hyphenation_decompounder, which uses hyphenation patterns
GET _analyze
{
  "text": "Verstopfung",
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "stop",
    {
      "type": "dictionary_decompounder",
      "word_list": [
        "topf"
      ]
    }
  ]
}
# asciifolding
GET _analyze
{
  "text": [
    "München, Köln, Parkstraße",
    "Muenchen, Koeln, Parkstrasse"
  ],
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "stop",
    "asciifolding"
  ]
}
# or normalization?
GET _analyze
{
  "text": [
    "München, Köln, Parkstraße",
    "Muenchen, Koeln, Parkstrasse"
  ],
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "stop",
    "german_normalization"
  ]
}
# phonetic analysis requires you to
# install the analysis-phonetic plugin
# left as an exercise to the reader
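# for reference, plugin installation happens on the command line
# (a node restart is required afterwards):
#   bin/elasticsearch-plugin install analysis-phonetic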
# Creating an index with a certain analyzer configured
# for a certain field
PUT my_data
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "stop",
            "my_synonym_filter"
          ]
        }
      },
      "filter": {
        "my_synonym_filter": {
          "type": "synonym",
          "synonyms": [
            "quick => fast, quick"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "my_field": {
        "type": "text",
        "analyzer": "my_custom_analyzer"
      }
    }
  }
}
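# the refresh parameter makes the document searchable immediately,
# instead of waiting for the next periodic refresh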
PUT my_data/_doc/1?refresh
{
  "my_field": "quick brown fox"
}
# search for the synonym
GET my_data/_search?q=fast
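# this matches because "fast" was indexed as a synonym of "quick"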
GET my_data/_analyze
{
  "text": "quick brown fox",
  "field": "my_field"
}
##########################
## System overview #######
##########################
# check out static node configuration
GET _nodes
# check out dynamic node status
GET _nodes/stats
# check out dynamic node status, but human readable
GET _nodes/stats?human
# tab separated index overview
GET _cat/indices
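# add ?v to include column headers in the output
GET _cat/indices?v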