Skip to content

Instantly share code, notes, and snippets.

@abronner
Created April 23, 2015 01:03
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save abronner/c603ee6e12137cefec37 to your computer and use it in GitHub Desktop.
Save abronner/c603ee6e12137cefec37 to your computer and use it in GitHub Desktop.
Elasticsearch-Toronto Meetup: Zero Downtime (April 21, 2015)
# ----------------------------------------------------------------------------------------------------------------
# Elasticsearch-Toronto Meetup: Zero Downtime (part 1 of 3)
# ----------------------------------------------------------------------------------------------------------------
# My presentation at the second elasticsearch meetup in Toronto
# April 21, 2015
# http://www.meetup.com/ElasticSearch-toronto
# http://www.meetup.com/Elasticsearch-Toronto/events/220384588/
# ----------------------------------------------------------------------------------------------------------------
# Demo with elasticsearch 1.5.1 and Marvel/Sense
# Installation: http://www.elasticsearch.org/guide/en/elasticsearch/guide/current/_installing_elasticsearch.html
# ----------------------------------------------------------------------------------------------------------------
# ---------------------------------------------
# Chapter 1: WHEN SCHEMA-LESS MET AGGREGATIONS
# ... IN PRODUCTION
# ---------------------------------------------
# STORY: As a meetup organizer, I want to FIND, SORT and AGGREGATE data about my group members in order to gain deeper insight into my group.
# 'Naive' schema-less INSERT:
POST elasticsearch-toronto_v1/members/182513481
{
"username": "Amit",
"location": "Toronto, ON",
"member_since": "November 25, 2014",
"introduction": "You know for search",
"meetup_groups": [
"(UXD / UX) User Experience Design Toronto",
"AngularJS Toronto",
"Big Data Developers in Toronto",
"DevOps Toronto",
"Full Stack Toronto Meetup",
"Meteor Toronto",
"PhoneGap Toronto (#PhoneGapTO)"
],
"number_of_groups": 7,
"organizer": true,
"link": "http://www.meetup.com/Elasticsearch-Toronto/members/182513481"
}
# Check: is it INDEXED?
GET elasticsearch-toronto_v1/members/182513481
# Can we SEARCH it?
# match all
GET elasticsearch-toronto_v1/_search
{
"query": {
"match_all": {}
}
}
# match by username
GET elasticsearch-toronto_v1/_search
{
"query": {
"match": {
"username": "Amit"
}
}
}
# match by location
# note: CASE INSENSITIVE
GET elasticsearch-toronto_v1/_search
{
"query": {
"match": {
"location": "toronto"
}
}
}
# match by meetup groups
# notes: BOOL query, PHRASE query
GET elasticsearch-toronto_v1/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"meetup_groups": "angularjs"
}
},
{
"match_phrase": {
"meetup_groups": "FULL STACK"
}
},
{
"match_phrase_prefix": {
"meetup_groups": "Meteor TO"
}
}
]
}
}
}
# So far so good !
# But what about FACETS and AGGREGATIONS?
# aggregate by meetup groups
GET elasticsearch-toronto_v1/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"groups": {
"terms": {
"field": "meetup_groups",
"size": 20
}
}
}
}
# 'SCHEMA-LESS' EQUALS 'DEFAULT MAPPING' !
# Here is our DEFAULT MAPPING
GET elasticsearch-toronto_v1/_mapping
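# And here is how a DEFAULT STRING is ANALYZED
# (note: in ES 1.x the _analyze request body is treated as the raw text to analyze; the braces/quotes below appear to be there only so that Sense accepts the body)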
GET elasticsearch-toronto_v1/_analyze
{"(UXD / UX) User Experience Design Toronto"}
# No problem, MAP AGGREGATION FIELDS AS NOT_ANALYZED
# update mapping:
PUT elasticsearch-toronto_v1/members/_mapping
{
"members": {
"properties": {
"meetup_groups": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
# And... ERROR
# YOU CANNOT UPDATE THE MAPPING OF A MAPPED FIELD !
# Ok, alternative: USE MULTI FIELDS
PUT elasticsearch-toronto_v1/members/_mapping
{
"members": {
"properties": {
"meetup_groups": {
"type": "string",
"fields": {
"not_analyzed": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
# Great, NO Error
# aggregate by meetup groups
GET elasticsearch-toronto_v1/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"groups": {
"terms": {
"field": "meetup_groups.not_analyzed",
"size": 20
}
}
}
}
# And... NO RESULTS
# CHANGING MAPPING DOES NOT REINDEX DOCUMENTS !
# In other words, we need to reindex
POST elasticsearch-toronto_v1/members/182513481
{
"username": "Amit",
"location": "Toronto, ON",
"member_since": "November 25, 2014",
"introduction": "You know for search",
"meetup_groups": [
"(UXD / UX) User Experience Design Toronto",
"AngularJS Toronto",
"Big Data Developers in Toronto",
"DevOps Toronto",
"Full Stack Toronto Meetup",
"Meteor Toronto",
"PhoneGap Toronto (#PhoneGapTO)"
],
"number_of_groups": 7,
"organizer": true,
"link": "http://www.meetup.com/Elasticsearch-Toronto/members/182513481"
}
# try again
GET elasticsearch-toronto_v1/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"groups": {
"terms": {
"field": "meetup_groups.not_analyzed",
"size": 20
}
}
}
}
# IT WORKS
# but NOT ANALYZED is CASE SENSITIVE..
# insert another member:
POST elasticsearch-toronto_v1/members/13589068
{
"username": "Roman B.",
"location": "East York, ON",
"member_since": "November 29, 2014",
"introduction": "Full stack Dev",
"meetup_groups": [
"Angularjs Toronto",
"Business Connection Exchange Toronto",
"Devops Toronto",
"Full stack Toronto Meetup",
"Docker Online Meetup"
],
"number_of_groups": 5,
"organizer": false,
"link": "http://www.meetup.com/Elasticsearch-Toronto/members/13589068"
}
# check aggregation:
GET elasticsearch-toronto_v1/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"groups": {
"terms": {
"field": "meetup_groups.not_analyzed",
"size": 20
}
}
}
}
# We want to build a CUSTOM ANALYZER:
# keyword tokenizer + lowercase filter
PUT elasticsearch-toronto_v1/_settings
{
"analysis" : {
"analyzer":{
"keyword_lowercase":{
"type": "custom",
"tokenizer": "keyword",
"filter": ["lowercase"]
}
}
}
}
# And... ERROR
# INDEX MUST BE CLOSED TO UPDATE SETTINGS
# There could be many other changes.
# For example: want a date histogram to see how members join over time? Oops, 'member_since' is mapped as a string, not a date field...
GET elasticsearch-toronto_v1/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs" : {
"members_over_time" : {
"date_histogram" : {
"field" : "member_since",
"interval" : "day"
}
}
}
}
# SOONER OR LATER YOU WILL NEED TO MAKE CHANGES WHEN YOUR INDEX IS ALREADY IN PRODUCTION
# -----------------
# END of Chapter 1
# -----------------
# Lessons:
# - YOU CANNOT UPDATE THE MAPPING OF A MAPPED FIELD
# - YOU CANNOT UPDATE THE SETTINGS OF AN OPEN INDEX
# - CHANGING MAPPING/SETTINGS DOES NOT REINDEX DOCUMENTS
# - PREPARE TO MAKE CHANGES TO YOUR INDEX IN PRODUCTION
# ----------------------------------------------------------------------------------------------------------------
# Elasticsearch-Toronto Meetup: Zero Downtime (part 2 of 3)
# ----------------------------------------------------------------------------------------------------------------
# My presentation at the second elasticsearch meetup in Toronto
# April 21, 2015
# http://www.meetup.com/ElasticSearch-toronto
# http://www.meetup.com/Elasticsearch-Toronto/events/220384588/
# ----------------------------------------------------------------------------------------------------------------
# Demo with elasticsearch 1.5.1 and Marvel/Sense
# Installation: http://www.elasticsearch.org/guide/en/elasticsearch/guide/current/_installing_elasticsearch.html
# ----------------------------------------------------------------------------------------------------------------
# ---------------------------------------------
# Chapter 2: MAKING INDEX CHANGES IN PRODUCTION
# ---------------------------------------------
# Step 1: PREPARE NEW VERSION OF INDEX
POST elasticsearch-toronto_v2
{
"settings": {
"analysis" : {
"analyzer":{
"keyword_lowercase":{
"type": "custom",
"tokenizer": "keyword",
"filter": ["lowercase"]
}
}
}
},
"mappings": {
"members": {
"properties": {
"meetup_groups": {
"type": "string",
"fields": {
"not_analyzed": {
"type": "string",
"index": "not_analyzed"
},
"keyword_lowercase": {
"type": "string",
"index": "analyzed",
"analyzer": "keyword_lowercase"
}
}
},
"member_since": {
"type": "date",
"format": "MMM d, y"
}
}
}
}
}
# check the mapping
GET elasticsearch-toronto_v2/_mapping
# Step 2: REINDEX DOCUMENTS
POST elasticsearch-toronto_v2/members/182513481
{
"username": "Amit",
"location": "Toronto, ON",
"member_since": "November 25, 2014",
"introduction": "You know for search",
"meetup_groups": [
"(UXD / UX) User Experience Design Toronto",
"AngularJS Toronto",
"Big Data Developers in Toronto",
"DevOps Toronto",
"Full Stack Toronto Meetup",
"Meteor Toronto",
"PhoneGap Toronto (#PhoneGapTO)"
],
"number_of_groups": 7,
"organizer": true,
"link": "http://www.meetup.com/Elasticsearch-Toronto/members/182513481"
}
POST elasticsearch-toronto_v2/members/13589068
{
"username": "Roman B.",
"location": "East York, ON",
"member_since": "November 29, 2014",
"introduction": "Full stack Dev",
"meetup_groups": [
"Angularjs Toronto",
"Business Connection Exchange Toronto",
"Devops Toronto",
"Full stack Toronto Meetup",
"Docker Online Meetup"
],
"number_of_groups": 5,
"organizer": false,
"link": "http://www.meetup.com/Elasticsearch-Toronto/members/13589068"
}
# check changes: aggregation
GET elasticsearch-toronto_v2/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"groups": {
"terms": {
"field": "meetup_groups.keyword_lowercase",
"size": 20
}
}
}
}
# check changes: date histogram
GET elasticsearch-toronto_v2/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs" : {
"members_over_time" : {
"date_histogram" : {
"field" : "member_since",
"interval" : "day"
}
}
}
}
# Step 3: SWITCH INDICES
# USE ALIASES INSTEAD OF INDEX NAMES !
POST _aliases
{
"actions": [
{ "add": { "index": "elasticsearch-toronto_v1", "alias": "elasticsearch-toronto" }}
]
}
# check aggregation (using alias)
GET elasticsearch-toronto/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"groups": {
"terms": {
"field": "meetup_groups.keyword_lowercase",
"size": 20
}
}
}
}
# check date histogram (using alias)
GET elasticsearch-toronto/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs" : {
"members_over_time" : {
"date_histogram" : {
"field" : "member_since",
"interval" : "day"
}
}
}
}
# SWITCH WITH ZERO DOWNTIME !
POST _aliases
{
"actions": [
{ "remove": { "index": "elasticsearch-toronto_v1", "alias": "elasticsearch-toronto" }},
{ "add": { "index": "elasticsearch-toronto_v2", "alias": "elasticsearch-toronto" }}
]
}
# check aggregation (using alias)
GET elasticsearch-toronto/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"groups": {
"terms": {
"field": "meetup_groups.keyword_lowercase",
"size": 20
}
}
}
}
# check date histogram (using alias)
GET elasticsearch-toronto/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs" : {
"members_over_time" : {
"date_histogram" : {
"field" : "member_since",
"interval" : "day"
}
}
}
}
# Be careful not to mess it up:
# ALIAS can point to multiple indices
POST _aliases
{
"actions": [
{ "add": { "index": "elasticsearch-toronto_v1", "alias": "elasticsearch-toronto" }},
{ "add": { "index": "elasticsearch-toronto_v2", "alias": "elasticsearch-toronto" }}
]
}
# check alias
GET _alias/elasticsearch-toronto
# ALIAS can point to zero indices
POST _aliases
{
"actions": [
{ "remove": { "index": "elasticsearch-toronto_v1", "alias": "elasticsearch-toronto" }},
{ "remove": { "index": "elasticsearch-toronto_v2", "alias": "elasticsearch-toronto" }}
]
}
# check alias
GET _alias/elasticsearch-toronto
# If necessary, you can always revert
POST _aliases
{
"actions": [
{ "add": { "index": "elasticsearch-toronto_v1", "alias": "elasticsearch-toronto" }},
{ "remove": { "index": "elasticsearch-toronto_v2", "alias": "elasticsearch-toronto" }}
]
}
# check alias
GET _alias/elasticsearch-toronto
# before we continue
DELETE elasticsearch-toronto_v2/_query
{
"query": {
"match_all": {}
}
}
# ** REINDEXING METHODS: **
# (1) From external source (e.g. database)
# (2) From current index version ('_source' field)
# REINDEX: SCAN & SCROLL + BULK API
# SCAN
# call ONCE
GET elasticsearch-toronto_v1/_search?search_type=scan&scroll=1m
{
"query": {
"match_all": {}
},
"size": 1
}
# call until you get zero hits
GET _search/scroll?scroll=1m&scroll_id=SCROLL_ID
# Notes:
# scroll expiry time (1m)
# scroll size: with SCAN, 'size' applies per shard (each batch returns up to size x number of shards hits)
# SCAN DOES NO SORTING !
# BULK
# use the '_source' from SCAN results
POST _bulk
{ "index" : { "_index" : "elasticsearch-toronto_v2", "_type" : "members", "_id" : "182513481" }}
{ "username": "Amit", "location": "Toronto, ON", "member_since": "November 25, 2014", "introduction": "You know for search", "meetup_groups": [ "(UXD / UX) User Experience Design Toronto", "AngularJS Toronto", "Big Data Developers in Toronto", "DevOps Toronto", "Full Stack Toronto Meetup", "Meteor Toronto", "PhoneGap Toronto (#PhoneGapTO)" ], "number_of_groups": 7, "organizer": true, "link": "http://www.meetup.com/Elasticsearch-Toronto/members/182513481" }
{ "index" : { "_index" : "elasticsearch-toronto_v2", "_type" : "members", "_id" : "13589068" }}
{ "username": "Roman B.", "location": "East York, ON", "member_since": "November 29, 2014", "introduction": "Full stack Dev", "meetup_groups": [ "Angularjs Toronto", "Business Connection Exchange Toronto", "Devops Toronto", "Full stack Toronto Meetup", "Docker Online Meetup" ], "number_of_groups": 5, "organizer": false, "link": "http://www.meetup.com/Elasticsearch-Toronto/members/13589068" }
# check results
GET elasticsearch-toronto_v2/_search
{
"query": {
"match_all": {}
}
}
# before we continue
DELETE elasticsearch-toronto_v2/_query
{
"query": {
"match_all": {}
}
}
# EASY REINDEXING WITH PYTHON CLIENT
# $ pip install elasticsearch
# $ python
# >>> from elasticsearch import Elasticsearch
# >>> from elasticsearch import helpers
# >>> es = Elasticsearch()
# >>> helpers.reindex(es, "elasticsearch-toronto_v1", "elasticsearch-toronto_v2", {"query": {"match_all": {}}})
# check results
GET elasticsearch-toronto_v2/_search
{
"query": {
"match_all": {}
}
}
# source code on github: https://github.com/elastic/elasticsearch-py/blob/master/elasticsearch/helpers/__init__.py
# Elegant implementation (using GENERATORS)
# EASY TO MODIFY DOCUMENTS DURING REINDEXING
# example: https://gist.githubusercontent.com/abronner/2c0e0dba0e998eb3a4b1/raw/ce17e56eb22069cacb305e0a7e642daeaa80c5ed/gistfile1.txt
# >>> reindex(es, "elasticsearch-toronto_v1", "elasticsearch-toronto_v2", {"query": {"match_all": {}}})
# check results
GET elasticsearch-toronto_v2/_search
{
"query": {
"match_all": {}
}
}
# -----------------
# END of Chapter 2
# -----------------
# Lessons:
# - USE ALIASES INSTEAD OF INDEX NAMES
# - REINDEX WITH SCAN/SCROLL & BULK API
# - SCAN DOES NOT SORT THE RESULTS
# ----------------------------------------------------------------------------------------------------------------
# Elasticsearch-Toronto Meetup: Zero Downtime (part 3 of 3)
# ----------------------------------------------------------------------------------------------------------------
# My presentation at the second elasticsearch meetup in Toronto
# April 21, 2015
# http://www.meetup.com/ElasticSearch-toronto
# http://www.meetup.com/Elasticsearch-Toronto/events/220384588/
# ----------------------------------------------------------------------------------------------------------------
# Demo with elasticsearch 1.5.1 and Marvel/Sense
# Installation: http://www.elasticsearch.org/guide/en/elasticsearch/guide/current/_installing_elasticsearch.html
# ----------------------------------------------------------------------------------------------------------------
# ---------------------------------------------
# Chapter 3: REINDEXING CHALLENGES
# ---------------------------------------------
# before we continue
DELETE elasticsearch-toronto_v2/_query
{
"query": {
"match_all": {}
}
}
# INCOMING DOCUMENTS
# you need to change your production index
# you use aliases
# you create a new version of the index
# and you reindex using scan/scroll & bulk api
# but...
# your system continues to index new documents
# CHALLENGE: SCAN TAKES A SNAPSHOT IN TIME
# call ONCE
GET elasticsearch-toronto_v1/_search?search_type=scan&scroll=1m
{
"query": {
"match_all": {}
},
"size": 1
}
# 1st document
GET _search/scroll?scroll=1m&scroll_id=SCROLL_ID
# index a new document
# This document is indexed AFTER the scan above was opened, so the open scroll will not return it.
# The "introduction" URL is plain ASCII (no invisible soft hyphen after "rangle.io"), so it is indexed as typed.
POST elasticsearch-toronto_v1/members/8968154
{
  "username": "Nick Van Weerdenburg",
  "location": "Toronto, ON",
  "member_since": "December 29, 2014",
  "introduction": "Founder of http://rangle.io, a next-generation web and UX development firm specializing in AngularJS, Node, modern JS, and Lean UX.",
  "meetup_groups": [
    "Agile Experience Design Toronto",
    "AngularJS Toronto",
    "PhoneGap Toronto (#PhoneGapTO)",
    "(UXD / UX) User Experience Design Toronto",
    "#DevTO"
  ],
  "number_of_groups": 5,
  "link": "http://www.meetup.com/Elasticsearch-Toronto/members/8968154"
}
# 2nd document
GET _search/scroll?scroll=1m&scroll_id=SCROLL_ID
# 3rd document ?
GET _search/scroll?scroll=1m&scroll_id=SCROLL_ID
# NO... END OF SCROLL
# SOLUTION: SCROLL BY TIMESTAMPS
PUT elasticsearch-toronto_v1/members/_mapping
{
"members" : {
"_timestamp": { "enabled" : true }
}
}
# reindex (to set timestamp)
POST elasticsearch-toronto_v1/members/182513481
{
"username": "Amit",
"location": "Toronto, ON",
"member_since": "November 25, 2014",
"introduction": "You know for search",
"meetup_groups": [
"(UXD / UX) User Experience Design Toronto",
"AngularJS Toronto",
"Big Data Developers in Toronto",
"DevOps Toronto",
"Full Stack Toronto Meetup",
"Meteor Toronto",
"PhoneGap Toronto (#PhoneGapTO)"
],
"number_of_groups": 7,
"organizer": true,
"link": "http://www.meetup.com/Elasticsearch-Toronto/members/182513481"
}
# reindex (to set timestamp)
POST elasticsearch-toronto_v1/members/13589068
{
"username": "Roman B.",
"location": "East York, ON",
"member_since": "November 29, 2014",
"introduction": "Full stack Dev",
"meetup_groups": [
"Angularjs Toronto",
"Business Connection Exchange Toronto",
"Devops Toronto",
"Full stack Toronto Meetup",
"Docker Online Meetup"
],
"number_of_groups": 5,
"organizer": false,
"link": "http://www.meetup.com/Elasticsearch-Toronto/members/13589068"
}
# Get last timestamp before scanning
GET elasticsearch-toronto_v1/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"last_timestamp": {
"max": {
"field": "_timestamp"
}
}
}
}
# LAST_TIMESTAMP = ___
# SCAN until timestamp
GET elasticsearch-toronto_v1/_search?search_type=scan&scroll=1m
{
"query": {
"range": {
"_timestamp": {
"gt": 0,
"lte": LAST_TIMESTAMP
}
}
},
"size": 1
}
# 1st document
GET _search/scroll?scroll=1m&scroll_id=SCROLL_ID
# index a new document
# This document is indexed AFTER the timestamp-bounded scan above was opened, so that scroll will not return it;
# the next scan interval (PREVIOUS_TIMESTAMP < _timestamp <= LAST_TIMESTAMP) picks it up.
# The "introduction" URL is plain ASCII (no invisible soft hyphen after "rangle.io"), so it is indexed as typed.
POST elasticsearch-toronto_v1/members/8968154
{
  "username": "Nick Van Weerdenburg",
  "location": "Toronto, ON",
  "member_since": "December 29, 2014",
  "introduction": "Founder of http://rangle.io, a next-generation web and UX development firm specializing in AngularJS, Node, modern JS, and Lean UX.",
  "meetup_groups": [
    "Agile Experience Design Toronto",
    "AngularJS Toronto",
    "PhoneGap Toronto (#PhoneGapTO)",
    "(UXD / UX) User Experience Design Toronto",
    "#DevTO"
  ],
  "number_of_groups": 5,
  "link": "http://www.meetup.com/Elasticsearch-Toronto/members/8968154"
}
# 2nd document
GET _search/scroll?scroll=1m&scroll_id=SCROLL_ID
# End of scroll
# SCAN from previous timestamp
GET elasticsearch-toronto_v1/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"last_timestamp": {
"max": {
"field": "_timestamp"
}
}
}
}
GET elasticsearch-toronto_v1/_search?search_type=scan&scroll=1m
{
"query": {
"range": {
"_timestamp": {
"gt": PREVIOUS_TIMESTAMP,
"lte": LAST_TIMESTAMP
}
}
},
"size": 1
}
# 3rd document
GET _search/scroll?scroll=1m&scroll_id=SCROLL_ID
# Final Notes:
# You will still need to stop incoming documents before the alias switch, otherwise there is always a chance of missing some documents.
# If stopping incoming documents is not possible, you might need to do a final update (reindex) after the alias switch. The problem is that a document from the previous index might already have been updated on the new index and should not be overwritten. Checking document versions is one possible solution.
# Scan does NO sorting so you can't make any assumptions about reindexing failures. Either log all failures and fix them individually, or repeat the scan interval.
# Things can get more complicated when a new version of your software is released with a new version of your index. It's better to split the tasks if possible.
# -----------------
# END of Chapter 3
# -----------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment