Skip to content

Instantly share code, notes, and snippets.

@markharwood
Last active August 29, 2015 14:01
Show Gist options
  • Save markharwood/e6c137c7673186217b74 to your computer and use it in GitHub Desktop.
Save markharwood/e6c137c7673186217b74 to your computer and use it in GitHub Desktop.
Scripts for analyzing Elasticsearch log files
{
"title": "eslogs",
"services": {
"query": {
"list": {
"0": {
"query": "",
"alias": "",
"color": "#7EB26D",
"id": 0,
"pin": false,
"type": "lucene",
"enable": true
}
},
"ids": [
0
]
},
"filter": {
"list": {
"0": {
"type": "terms",
"field": "tags",
"value": "Timeout",
"mandate": "mustNot",
"active": false,
"alias": "",
"id": 0
},
"1": {
"type": "time",
"from": "2014-05-30T00:10:38.000Z",
"to": "2014-05-30T18:58:40.000Z",
"field": "date",
"mandate": "must",
"active": true,
"alias": "",
"id": 1
},
"2": {
"type": "time",
"from": "2014-05-30T18:22:00.335Z",
"to": "2014-05-30T18:58:40.000Z",
"field": "date",
"mandate": "must",
"active": false,
"alias": "",
"id": 2
},
"3": {
"type": "time",
"from": "2014-05-30T18:42:15.649Z",
"to": "2014-05-30T18:50:19.576Z",
"field": "date",
"mandate": "must",
"active": false,
"alias": "",
"id": 3
},
"4": {
"type": "terms",
"field": "node",
"value": "1a",
"mandate": "must",
"active": true,
"alias": "",
"id": 4
},
"5": {
"type": "terms",
"field": "tags",
"value": "ClusterMembership",
"mandate": "must",
"active": false,
"alias": "",
"id": 5
},
"6": {
"type": "terms",
"field": "tags",
"value": "ClusterMembership",
"mandate": "must",
"active": true,
"alias": "",
"id": 6
},
"7": {
"type": "terms",
"field": "source",
"value": "action.admin.indices.stats",
"mandate": "mustNot",
"active": false,
"alias": "",
"id": 7
},
"9": {
"type": "terms",
"field": "disconnectedNode",
"value": "listening-elastic-prod3-1e-data2-1",
"mandate": "must",
"active": false,
"alias": "",
"id": 9
},
"13": {
"type": "terms",
"field": "tags",
"value": "NodeNotConnected",
"mandate": "must",
"active": false,
"alias": "",
"id": 13
},
"15": {
"type": "terms",
"field": "node",
"value": "1e",
"mandate": "must",
"active": false,
"alias": "",
"id": 15
},
"17": {
"type": "terms",
"field": "tags",
"value": "MasterChange",
"mandate": "must",
"active": false,
"alias": "",
"id": 17
},
"18": {
"type": "time",
"from": "2014-05-30T18:28:10.756Z",
"to": "2014-05-30T18:58:40.000Z",
"field": "date",
"mandate": "must",
"active": false,
"alias": "",
"id": 18
},
"19": {
"type": "time",
"from": "2014-05-30T18:44:09.993Z",
"to": "2014-05-30T18:51:09.381Z",
"field": "date",
"mandate": "must",
"active": false,
"alias": "",
"id": 19
}
},
"ids": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
14
]
}
},
"rows": [
{
"title": "Level",
"height": "150px",
"editable": true,
"collapse": false,
"collapsable": true,
"panels": [
{
"span": 4,
"editable": true,
"type": "histogram",
"loadingEditor": false,
"mode": "count",
"time_field": "date",
"value_field": null,
"x-axis": true,
"y-axis": true,
"scale": 1,
"y_format": "none",
"grid": {
"max": null,
"min": 0
},
"queries": {
"mode": "all",
"ids": [
0
]
},
"annotate": {
"enable": true,
"query": "tags:OOM OR tags:EngineFailure OR tags:MasterConnectivity OR tags:MasterChange OR tags:ClusterMembership",
"size": 50000,
"field": "tags",
"sort": [
"date",
"desc"
]
},
"auto_int": false,
"resolution": 100,
"interval": "1m",
"intervals": [
"auto",
"1s",
"1m",
"5m",
"10m",
"30m",
"1h",
"3h",
"12h",
"1d",
"1w",
"1y"
],
"lines": false,
"fill": 0,
"linewidth": 3,
"points": false,
"pointradius": 5,
"bars": true,
"stack": true,
"spyable": true,
"zoomlinks": true,
"options": true,
"legend": true,
"show_query": true,
"interactive": true,
"legend_counts": true,
"timezone": "utc",
"percentage": false,
"zerofill": true,
"derivative": false,
"tooltip": {
"value_type": "cumulative",
"query_as_alias": true
}
},
{
"error": false,
"span": 2,
"editable": true,
"type": "terms",
"loadingEditor": false,
"field": "tags",
"exclude": [],
"missing": false,
"other": false,
"size": 12,
"order": "count",
"style": {
"font-size": "10pt"
},
"donut": false,
"tilt": false,
"labels": true,
"arrangement": "horizontal",
"chart": "bar",
"counter_pos": "above",
"spyable": true,
"queries": {
"mode": "all",
"ids": [
0
]
},
"tmode": "terms",
"tstat": "total",
"valuefield": "",
"title": "Key events"
},
{
"error": false,
"span": 2,
"editable": true,
"type": "terms",
"loadingEditor": false,
"field": "level",
"exclude": [],
"missing": false,
"other": false,
"size": 10,
"order": "count",
"style": {
"font-size": "10pt"
},
"donut": false,
"tilt": false,
"labels": true,
"arrangement": "horizontal",
"chart": "bar",
"counter_pos": "above",
"spyable": true,
"queries": {
"mode": "all",
"ids": [
0
]
},
"tmode": "terms",
"tstat": "total",
"valuefield": "",
"title": "Level"
},
{
"error": false,
"span": 4,
"editable": true,
"type": "terms",
"loadingEditor": false,
"field": "source",
"exclude": [],
"missing": false,
"other": false,
"size": 22,
"order": "count",
"style": {
"font-size": "10pt"
},
"donut": false,
"tilt": false,
"labels": true,
"arrangement": "horizontal",
"chart": "bar",
"counter_pos": "above",
"spyable": true,
"queries": {
"mode": "all",
"ids": [
0
]
},
"tmode": "terms",
"tstat": "total",
"valuefield": "",
"title": "Source"
}
],
"notice": false
},
{
"title": "",
"height": "250px",
"editable": true,
"collapse": false,
"collapsable": true,
"panels": [
{
"error": false,
"span": 12,
"editable": true,
"type": "table",
"loadingEditor": false,
"size": 1000,
"pages": 5,
"offset": 0,
"sort": [
"date",
"asc"
],
"overflow": "height",
"fields": [
"date",
"msg",
"tags",
"source",
"level",
"lateResponseTimeMillis"
],
"highlight": [],
"sortable": true,
"header": true,
"paging": true,
"field_list": false,
"all_fields": false,
"trimFactor": 300,
"localTime": false,
"timeField": "@timestamp",
"spyable": true,
"queries": {
"mode": "all",
"ids": [
0
]
},
"style": {
"font-size": "9pt"
},
"normTimes": true
}
],
"notice": false
},
{
"title": "",
"height": "150px",
"editable": true,
"collapse": false,
"collapsable": true,
"panels": [
{
"error": false,
"span": 4,
"editable": true,
"type": "terms",
"loadingEditor": false,
"field": "node",
"exclude": [],
"missing": false,
"other": false,
"size": 15,
"order": "term",
"style": {
"font-size": "10pt"
},
"donut": false,
"tilt": false,
"labels": true,
"arrangement": "horizontal",
"chart": "bar",
"counter_pos": "none",
"spyable": true,
"queries": {
"mode": "all",
"ids": [
0
]
},
"tmode": "terms",
"tstat": "total",
"valuefield": "",
"title": "Reporting Node"
},
{
"error": false,
"span": 4,
"editable": true,
"type": "terms",
"loadingEditor": false,
"field": "timedoutNode",
"exclude": [],
"missing": false,
"other": false,
"size": 10,
"order": "count",
"style": {
"font-size": "10pt"
},
"donut": false,
"tilt": false,
"labels": true,
"arrangement": "horizontal",
"chart": "bar",
"counter_pos": "above",
"spyable": true,
"queries": {
"mode": "all",
"ids": [
0
]
},
"tmode": "terms",
"tstat": "total",
"valuefield": "",
"title": "Nodes timing out on responses"
},
{
"error": false,
"span": 4,
"editable": true,
"type": "terms",
"loadingEditor": false,
"field": "disconnectedNode",
"exclude": [],
"missing": false,
"other": false,
"size": 10,
"order": "count",
"style": {
"font-size": "10pt"
},
"donut": false,
"tilt": false,
"labels": true,
"arrangement": "horizontal",
"chart": "bar",
"counter_pos": "above",
"spyable": true,
"queries": {
"mode": "all",
"ids": [
0
]
},
"tmode": "terms",
"tstat": "total",
"valuefield": "",
"title": "Nodes reported disconnected"
}
],
"notice": false
},
{
"title": "",
"height": "150px",
"editable": true,
"collapse": false,
"collapsable": true,
"panels": [
{
"error": false,
"span": 4,
"editable": true,
"type": "terms",
"loadingEditor": false,
"field": "membershipChangedNode",
"exclude": [],
"missing": false,
"other": false,
"size": 10,
"order": "count",
"style": {
"font-size": "10pt"
},
"donut": false,
"tilt": false,
"labels": true,
"arrangement": "horizontal",
"chart": "bar",
"counter_pos": "above",
"spyable": true,
"queries": {
"mode": "all",
"ids": [
0
]
},
"tmode": "terms",
"tstat": "total",
"valuefield": "",
"title": "Nodes changing cluster membership"
},
{
"error": false,
"span": 4,
"editable": true,
"type": "terms",
"loadingEditor": false,
"field": "lateRespondingNode",
"exclude": [],
"missing": false,
"other": false,
"size": 10,
"order": "count",
"style": {
"font-size": "10pt"
},
"donut": false,
"tilt": false,
"labels": true,
"arrangement": "horizontal",
"chart": "bar",
"counter_pos": "above",
"spyable": true,
"queries": {
"mode": "all",
"ids": [
0
]
},
"tmode": "terms",
"tstat": "total",
"valuefield": "",
"title": "Late responding nodes"
},
{
"error": false,
"span": 4,
"editable": true,
"type": "stats",
"loadingEditor": false,
"queries": {
"mode": "all",
"ids": [
0
]
},
"style": {
"font-size": "24pt"
},
"format": "number",
"mode": "max",
"display_breakdown": "no",
"sort_field": "label",
"sort_reverse": true,
"label_name": "lateRespondingNode",
"value_name": "lateResponseTimeMillis",
"spyable": true,
"field": "lateResponseTimeMillis",
"title": "Max overdue response time ms"
}
],
"notice": false
}
],
"editable": true,
"failover": false,
"index": {
"interval": "none",
"pattern": "[logstash-]YYYY.MM.DD",
"default": "eslogs",
"warm_fields": true
},
"style": "dark",
"panel_hints": true,
"pulldowns": [
{
"type": "query",
"collapse": false,
"notice": false,
"enable": true,
"query": "*",
"pinned": true,
"history": [
"",
"2878416",
"\"Received response for a request\"",
"2702083",
"ping*",
"time*",
"time",
"timed",
"timeout",
"\"listening-elastic-prod3-1e-data2-1\""
],
"remember": 10
},
{
"type": "filtering",
"collapse": false,
"notice": true,
"enable": true
}
],
"nav": [
{
"type": "timepicker",
"collapse": false,
"notice": false,
"enable": true,
"status": "Stable",
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
],
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"timefield": "@timestamp",
"now": false
}
],
"loader": {
"save_gist": false,
"save_elasticsearch": true,
"save_local": true,
"save_default": true,
"save_temp": true,
"save_temp_ttl_enable": true,
"save_temp_ttl": "30d",
"load_gist": false,
"load_elasticsearch": true,
"load_elasticsearch_size": 20,
"load_local": false,
"hide": false
},
"refresh": false
}
# ================ Enrich log files and convert to Json
echo "Converting log files to JSON with enriched tags and node names"
python parseLog.py "node1.log.2014-05-12" "node1.json" "node1"
python parseLog.py "node2.log.2014-05-12" "node2.json" "node2"
#...
# ================ Create a fresh index
echo "Indexing..."
es_host="http://localhost:9200"
curl -X DELETE "$es_host/eslogs"
curl -XPOST "$es_host/eslogs" -d '
{
"settings" : {
"number_of_shards" : 1,
"number_of_replicas" : 0
},
"mappings":{
"eslog": {
"properties": {
"logfile": {
"type": "string", "index":"not_analyzed"
},
"level": {
"type": "string", "index":"not_analyzed"
},
"tags": {
"type": "string", "index":"not_analyzed"
},
"source": {
"type": "string", "index":"not_analyzed"
},
"msg": {
"type": "string", "index":"analyzed"
},
"disconnectedNode": {
"type": "string", "index":"not_analyzed"
},
"membershipChangedNode": {
"type": "string", "index":"not_analyzed"
},
"lateResponseTimeMillis": {
"type": "long"
},
"lateRespondingNode": {
"type": "string", "index":"not_analyzed"
},
"timedoutNode": {
"type": "string", "index":"not_analyzed"
},
"date" : {
"type" : "date",
"format" : "yyyy-MM-dd HH:mm:ss,SSS"
}
}
}
}
}
'
curl -XPOST "$es_host/eslogs/_refresh"
stream2es stdin --target $es_host/eslogs/eslog < node1.json
stream2es stdin --target $es_host/eslogs/eslog < node2.json
#...
curl -XPOST "$es_host/eslogs/_refresh"
import os
import re
import StringIO
import json
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("inputLogFileName", help="inputLogFileName")
parser.add_argument("outputJsonFileName", help="Name of the output json file for use with Stream2ES")
parser.add_argument("node", help="The name or number of the node that produced the log file")
args = parser.parse_args()
print "Processing ",args.inputLogFileName
rows=open(args.inputLogFileName, 'rb')
outfile=open(args.outputJsonFileName, 'w')
lc=0
doc={}
msg=""
date=""
level=""
tags=[]
source=""
#node=""
timedoutTargetNodePattern=re.compile(r'.*ReceiveTimeoutTransportException: \[([^\]]*)\].*', re.MULTILINE | re.DOTALL )
disconnectedTargetNodePattern=re.compile(r'.*NodeDisconnectedException: \[([^\]]*)\].*', re.MULTILINE | re.DOTALL )
notConnectedTargetNodePattern=re.compile(r'.*NodeNotConnectedException: \[([^\]]*)\].*', re.MULTILINE | re.DOTALL )
clusterAddedNodePattern=re.compile(r'.*added {\[([^\]]*)\].*', re.MULTILINE | re.DOTALL )
clusterRemovedNodePattern=re.compile(r'.*removed {\[([^\]]*)\].*', re.MULTILINE | re.DOTALL )
lateRespondingNodePattern=re.compile(r'.*Received response for a request that has timed out, sent \[([0-9]*)ms\] ago.*node \[\[([^\]]*)\].*', re.MULTILINE | re.DOTALL )
for row in rows:
if row[0]=='[':
if lc>1:
doc={}
doc["date"]=date
doc["level"]=level
doc["source"]=source
doc["node"]=args.node
doc["msg"]=msg
if "timed out" in msg:
tags.append("Timeout")
m=timedoutTargetNodePattern.match(msg)
if m:
doc["timedoutNode"]=m.group(1)
else:
m=lateRespondingNodePattern.match(msg)
if m:
doc["lateResponseTimeMillis"]=int(m.group(1))
doc["lateRespondingNode"]=m.group(2)
if "should not happen" in msg:
tags.append("ShouldNotHappen")
if "Failed to send release search context" in msg:
tags.append("ReleaseFailure")
if "OutOfMemoryError" in msg:
tags.append("OOM")
if "Maximum number of bytes allocated exceeded" in msg:
tags.append("CircuitBreaker")
if "failed engine" in msg:
tags.append("EngineFailure")
if "failed to merge" in msg:
tags.append("MergeFailure")
if "failed to flush" in msg:
tags.append("FlushFailure")
if "SearchParseException" in msg:
tags.append("InvalidQuery")
if "Recovery failed" in msg:
tags.append("RecoveryFailure")
if "NodeNotConnectedException" in msg:
tags.append("NodeNotConnected")
m=notConnectedTargetNodePattern.match(msg)
if m:
doc["disconnectedNode"]=m.group(1)
if "NodeDisconnectedException" in msg:
tags.append("NodeNotConnected")
m=disconnectedTargetNodePattern.match(msg)
if m:
doc["disconnectedNode"]=m.group(1)
if source.startswith("node"):
tags.append("NodeLifeCycle")
if "cluster.service" in source:
if "master {new" in msg:
tags.append("MasterChange")
if "discovery.zen" in source:
# pretty much any zen discovery message looks interesting
# if "master_left" in msg:
if not "received cluster state" in msg:
tags.append("MasterChange")
if "cluster.service" in source:
if "added" in msg:
tags.append("ClusterMembership")
m=clusterAddedNodePattern.match(msg)
if m:
doc["membershipChangedNode"]=m.group(1)
if "cluster.service" in source:
if "removed" in msg:
tags.append("ClusterMembership")
m=clusterRemovedNodePattern.match(msg)
if m:
doc["membershipChangedNode"]=m.group(1)
if "index.shard.service" in source:
if "suspect illegal state" in msg:
tags.append("SuspectChange")
if "MasterFaultDetection" in msg:
tags.append("MasterConnectivity")
doc["tags"]=tags
json.dump(doc, outfile)
outfile.write(os.linesep)
tags=[]
lc=lc+1
pos=0
pos2=row.index("]",pos)
date=row[pos+1:pos2]
pos=row.index("[",pos2)
pos2=row.index("]",pos)
level=row[pos+1:pos2]
#Not all log entries have source and node e.g. [2014-05-12 16:44:54,918][WARN ][netty.channel.socket.nio.AbstractNioSelector] Unexpected exception in the selector loop.
pos=row.index("[",pos2)
pos2=row.index("]",pos)
source=row[pos+1:pos2].strip()
msg=row[pos2+1:]
else:
msg=msg+"\n"+row
rows.close
print lc, "rows processed"
outfile.close()
{
"aggs" : {
"nodes":{
"terms":{"field":"node", "size":20},
aggs:{
"UncommonlyCommonErrors":{"significant_terms":{"field":"tags"},
}
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment