Created
June 4, 2020 08:19
-
-
Save spinscale/22aba88231aeea3e22cdbf204646d3ac to your computer and use it in GitHub Desktop.
Elasticsearch - Hands-on Introduction (2020-06, DevClash)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###################################
## Introduction to Elasticsearch ##
###################################

#########################
## If you do not want to run Elasticsearch yourself,
## test a free 14-day trial at
## https://www.elastic.co/cloud/
#########################

##########################
## Indexing & Searching ##
##########################

# Index a single document; the "books" index is created on the fly
PUT books/_doc/1
{ "title" : "The Play Framework Cookbook", "category" : "Computer & IT", "price" : 34.99 }

# Bulk indexing: newline-delimited JSON, one action line followed by
# one document source line per operation
PUT books/_bulk
{ "index" : { "_id" : "1" } }
{ "title" : "The Play Framework Cookbook", "category" : "Computer & IT", "price" : 34.99 }
{ "index" : { "_id" : "2" } }
{ "title" : "Database Internals: A Deep Dive Into How Distributed Data Systems Work", "category":"Computer & IT", "price" : 44.99 }
{ "index" : { "_id" : "3" } }
{ "title" : "Ready Player One", "category": "Science Fiction", "price" : 10.99 }

# Fetch a document by id: full response vs. only the stored _source
GET books/_doc/1
GET books/_source/1

# Remove a document by id
DELETE books/_doc/1
# Re-create the document that was just deleted
PUT books/_doc/1
{
  "title": "The Play Framework Cookbook",
  "category": "Computer & IT",
  "price": 34.99
}

# Partial update: merge new fields into the existing document
POST books/_update/1
{
  "doc" : {
    "release_date" : "2011-08-11",
    "author" : "Alexander Reelsen"
  }
}

GET books/_doc/1

# Scripted update: modify an existing field with a Painless script
POST books/_update/1
{
  "script": {
    "source": "ctx._source.price -= 1.55",
    "lang": "painless"
  }
}

GET books/_doc/1
# Simple search via the URI query parameter
GET books/_search?q=play

# Same search expressed with the query DSL
GET books/_search
{
  "query": {
    "match": {
      "title": "play"
    }
  }
}

# Merge down to a single segment so that scoring/explain output
# is easier to read (index statistics are fully merged)
POST books/_forcemerge?max_num_segments=1

# Search with scoring explanation for each hit
GET books/_search
{
  "explain": true,
  "query": {
    "match": {
      "title": "play"
    }
  }
}
# Add one more book in a different category for the bool query below
PUT books/_doc/4
{
  "title": "The Play Framework Cookbook (2nd edition)",
  "category": "Programming",
  "price": 38.99
}

# Bool query:
#   must   - required, contributes to the score
#   should - optional, boosts the score when it matches
#   filter - required, does NOT contribute to the score (cacheable)
GET books/_search
{
  "query": {
    "bool": {
      "should": [
        {
          "match": {
            "category.keyword": "Computer & IT"
          }
        }
      ],
      "must": [
        {
          "match": {
            "title": "play"
          }
        }
      ],
      "filter": [
        {
          "range": {
            "price": {
              "gte": 30
            }
          }
        }
      ]
    }
  }
}
##########################
## Aggregations ##########
##########################

# Group by category ("size": 0 suppresses the hits, only aggs are returned)
GET books/_search
{
  "size": 0,
  "aggs": {
    "by_category": {
      "terms": {
        "field": "category.keyword",
        "size": 10
      }
    }
  }
}

# Group by category, then compute the average price per bucket.
# Beware of floating point inaccuracies in metric aggregations!
GET books/_search
{
  "size": 0,
  "aggs": {
    "by_category": {
      "terms": {
        "field": "category.keyword",
        "size": 10
      },
      "aggs": {
        "avg_price": {
          "avg": {
            "field": "price"
          }
        }
      }
    }
  }
}
# Add some monitoring data with auto-generated ids
# (empty "index" action line = let Elasticsearch pick the _id)
PUT monitoring_data/_bulk
{ "index" : {} }
{ "duration_in_ms" : 123, "host" : "db01", "statement": "SELECT * from cars" }
{ "index" : {} }
{ "duration_in_ms" : 145, "host" : "db01", "statement": "SELECT * from cars" }
{ "index" : {} }
{ "duration_in_ms" : 185, "host" : "db01", "statement": "SELECT * from cars" }
{ "index" : {} }
{ "duration_in_ms" : 220, "host" : "db01", "statement": "SELECT * from cars" }
{ "index" : {} }
{ "duration_in_ms" : 450, "host" : "db02", "statement": "SELECT * from cars" }
{ "index" : {} }
{ "duration_in_ms" : 360, "host" : "db02", "statement": "SELECT * from cars" }

# Count distinct hosts (cardinality is an approximate count)
GET monitoring_data/_search
{
  "size": 0,
  "aggs": {
    "number_of_hosts": {
      "cardinality": {
        "field": "host.keyword"
      }
    }
  }
}

# Percentiles of query duration across all hosts
GET monitoring_data/_search
{
  "size": 0,
  "aggs": {
    "duration_percentiles": {
      "percentiles": {
        "field": "duration_in_ms"
      }
    }
  }
}

# Group per host first: the per-bucket percentiles clearly show
# a performance issue with one of the database hosts
GET monitoring_data/_search
{
  "size": 0,
  "aggs": {
    "by_host": {
      "terms": {
        "field": "host.keyword",
        "size": 10
      },
      "aggs": {
        "duration_percentiles": {
          "percentiles": {
            "field": "duration_in_ms"
          }
        }
      }
    }
  }
}
#########################
## Analysis ############
#########################

### Tokenizer

# Whitespace tokenizer: splits on whitespace only
GET _analyze
{
  "text": "quick brown fox",
  "tokenizer": "whitespace"
}

# Note: punctuation stays attached to the terms ("lazy," / "dog.")
GET _analyze
{
  "text": "the lazy, white dog.",
  "tokenizer": "whitespace"
}

# See the commas magically being removed thanks to the
# Unicode Text Segmentation rules (UAX #29) in the standard tokenizer.
# It is also the default, so you can omit it.
GET _analyze
{
  "text": "the lazy, white dog.",
  "tokenizer": "standard"
}

# But the URL is broken into pieces by the standard tokenizer
GET _analyze
{
  "text": "this is an url https://www.jade-hs.de"
}

# Fix the above issue by using the uax_url_email tokenizer,
# which keeps URLs and email addresses as single tokens
GET _analyze
{
  "text": "this is an url https://www.jade-hs.de",
  "tokenizer": "uax_url_email"
}
### Token Filter

# Standard tokenization, no filtering: case is preserved
GET _analyze
{
  "text": "The Quick brown fox",
  "tokenizer": "standard"
}

# Lowercase all the terms
GET _analyze
{
  "text": "The Quick brown fox",
  "tokenizer": "standard",
  "filter": [ "lowercase" ]
}

# Remove stop words ("the" disappears)
GET _analyze
{
  "text": "The Quick brown fox",
  "tokenizer": "standard",
  "filter": [ "lowercase", "stop" ]
}

# Add a synonym: "quick => fast" REPLACES quick with fast
GET _analyze
{
  "text": "The Quick brown fox",
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "stop",
    {
      "type": "synonym",
      "synonyms": [
        "quick => fast"
      ]
    }
  ]
}

# Keep the original term as well: "quick => fast, quick" emits both
GET _analyze
{
  "text": "The Quick brown fox",
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "stop",
    {
      "type": "synonym",
      "synonyms": [
        "quick => fast, quick"
      ]
    }
  ]
}
# Stemming:
# plurals are removed, but see the term "experience",
# which is now indistinguishable from "experiment" —
# stemming can be overly aggressive
GET _analyze
{
  "text": "Waxolutionists - The smart blip experience",
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "stop",
    {
      "type": "stemmer",
      "language": "english"
    }
  ]
}

# Compounding:
# splits sub-terms out of compound terms,
# important for lots of German words
GET _analyze
{
  "text": "Blumentopf",
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "stop",
    {
      "type": "dictionary_decompounder",
      "word_list": [
        "topf"
      ]
    }
  ]
}

# Overcompounding happens: "Verstopfung" wrongly yields "topf".
# A workaround might be the use of hyphenation patterns
# (hyphenation_decompounder).
GET _analyze
{
  "text": "Verstopfung",
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "stop",
    {
      "type": "dictionary_decompounder",
      "word_list": [
        "topf"
      ]
    }
  ]
}
# asciifolding: strips diacritics (ü -> u), so "München" and
# "Muenchen" still do NOT produce the same token
GET _analyze
{
  "text": [
    "München, Köln, Parkstraße",
    "Muenchen, Koeln, Parkstrasse"
  ],
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "stop",
    "asciifolding"
  ]
}

# Or German-specific normalization? german_normalization applies
# German spelling rules (ü -> ue, ß -> ss) instead of plain folding
GET _analyze
{
  "text": [
    "München, Köln, Parkstraße",
    "Muenchen, Koeln, Parkstrasse"
  ],
  "tokenizer": "standard",
  "filter": [
    "lowercase",
    "stop",
    "german_normalization"
  ]
}

# Phonetic analysis requires you to install the
# analysis-phonetic plugin — left as an exercise to the reader
# Creating an index with a custom analyzer configured for a certain field.
# The analyzer wires together a tokenizer plus a chain of token filters.
PUT my_data
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "stop",
            "my_synonym_filter"
          ]
        }
      },
      "filter": {
        "my_synonym_filter": {
          "type": "synonym",
          "synonyms": [
            "quick => fast, quick"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "my_field": {
        "type": "text",
        "analyzer": "my_custom_analyzer"
      }
    }
  }
}

# Index a document; ?refresh makes it immediately searchable
PUT my_data/_doc/1?refresh
{
  "my_field" : "quick brown fox"
}

# Search for the synonym: "fast" matches thanks to the synonym filter
GET my_data/_search?q=fast

# Inspect how a concrete field analyzes a given text
GET my_data/_analyze
{
  "text": "quick brown fox",
  "field": "my_field"
}
##########################
## System overview #######
##########################

# Check out static node configuration
GET _nodes

# Check out dynamic node statistics
GET _nodes/stats

# Same statistics, but human readable (e.g. bytes as "1gb")
GET _nodes/stats?human

# Tab-separated index overview
GET _cat/indices
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment