Skip to content

Instantly share code, notes, and snippets.

@ruanbekker
Created December 4, 2020 08:50
Show Gist options
  • Save ruanbekker/cb989521fbd55da3fc9165d9e94e24e5 to your computer and use it in GitHub Desktop.
Save ruanbekker/cb989521fbd55da3fc9165d9e94e24e5 to your computer and use it in GitHub Desktop.
Loki absent_over_time workaround using AWS Lambda

Loki absent over time workaround

This is a workaround that works for my use case: detecting when expected log content is not present within a minute, using AWS Lambda and AWS CloudWatch Events.

Setup:

  • AWS CloudWatch event triggers the Lambda every minute
  • Lambda uses the Loki API to search for logs; if nothing is found it retries, and if the retry also finds nothing, it alerts
  • The alert is sent to Slack.

Absent over time issue has been opened on Grafana:

import requests
import os
# URL that alert_to_slack POSTs the alert payload to. Read from the
# environment at import time, so a missing variable raises KeyError on import.
SLACK_WEBHOOK_URL = os.environ['SLACK_WEBHOOK_URL']
# Value placed in the top-level "channel" field of the Slack payload.
SLACK_CHANNEL = "#system-events"
# Per-status presentation of an alert. Each section is keyed by the alert
# status ("up" or "down") and looked up when the Slack attachment is built.
alert_map = {
    # Emoji prefixed to the attachment text.
    "emoji": {
        "up": ":white_check_mark:",
        "down": ":fire:",
    },
    # State label rendered in bold inside square brackets.
    "text": {
        "up": "RESOLVED",
        "down": "FIRING",
    },
    # Human-readable description placed on the second line of the text.
    "message": {
        "up": "xx events visible again",
        "down": "No xx or xx events seen in the last minute",
    },
    # Hex colour used for the attachment.
    "color": {
        "up": "#32a852",
        "down": "#ad1721",
    },
}
def alert_to_slack(status, log_url, metric_url, timeout=10):
    """Post an alert message to Slack via ``SLACK_WEBHOOK_URL``.

    Args:
        status: Alert state, ``"up"`` or ``"down"``; used as the key into
            every section of ``alert_map``.
        log_url: URL attached to the "Logs" button.
        metric_url: URL attached to the "Metrics" button.
        timeout: Seconds to wait on the Slack request before giving up.
            Without a timeout ``requests`` can block indefinitely, which
            would stall the caller until it is killed externally.

    Returns:
        The HTTP status code of the webhook response.

    Raises:
        KeyError: If ``status`` is not a key of the ``alert_map`` sections.
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    attachment_text = "{emoji} [*{state}*] xx Log Events\n {message}".format(
        emoji=alert_map["emoji"][status],
        state=alert_map["text"][status],
        message=alert_map["message"][status]
    )
    data = {
        "text": "AlertManager",
        "username": "Notifications",
        "channel": SLACK_CHANNEL,
        "attachments": [
            {
                "text": attachment_text,
                # NOTE(review): this attachment-level channel differs from
                # SLACK_CHANNEL used at the top level -- confirm which one
                # is intended. Kept as-is to leave the payload unchanged.
                "channel": "#tech-alerts-support",
                "color": alert_map["color"][status],
                "attachment_type": "default",
                "actions": [
                    {
                        "name": "Logs",
                        "text": "Logs",
                        "type": "button",
                        "style": "primary",
                        "url": log_url
                    },
                    {
                        "name": "Metrics",
                        "text": "Metrics",
                        "type": "button",
                        "style": "primary",
                        "url": metric_url
                    }
                ]
            }
        ]
    }
    r = requests.post(SLACK_WEBHOOK_URL, json=data, timeout=timeout)
    return r.status_code
import json
import os
import requests
import time
from log_checks import processing_xx_or_xx_populated_event
from alert_handler import alert_to_slack
# Grafana links passed to alert_to_slack by lambda_handler when an alert fires.
log_url = "https://grafana.domain/goto/4Lql-J2Gk"
metric_url = "https://grafana.domain/goto/SCYwaJhGz"
def lambda_handler(event, context):
    """Check Loki for recent log events and alert Slack when none are found.

    The Loki query is run once. If its result set is empty, the handler
    waits a few seconds and runs the query again; a Slack "down" alert is
    sent only when the second attempt is empty as well.

    Args:
        event: Lambda invocation event (not used).
        context: Lambda context object (not used).

    Returns:
        A dict with ``statusCode`` 200 and a JSON-encoded ``body`` holding
        a short description of the outcome.
    """
    retry_delay_seconds = 5

    log_response = processing_xx_or_xx_populated_event()
    try:
        results = log_response['data']['result']
        if len(results) > 0:
            number_of_log_events = results[0]['value'][1]
            response = "Number of log events in the last minute: {}".format(number_of_log_events)
            print(response)
        else:
            print("attempt 1: no logs")
            # Retry once before alerting, so a single empty result does not
            # immediately page anyone.
            time.sleep(retry_delay_seconds)
            # The retry must call the same check as the first attempt. The
            # previous code called an undefined name here, and the resulting
            # NameError was swallowed below, so the alert could never fire.
            retried_log_response = processing_xx_or_xx_populated_event()
            if len(retried_log_response['data']['result']) == 0:
                response = "attempt 2: after {} seconds, theres still no logs, alerting".format(
                    retry_delay_seconds
                )
                print(response)
                alert_to_slack("down", log_url, metric_url)
            else:
                response = "attempt 2: retry worked"
                print(response)
    except Exception as exc:
        # Best-effort: still report a result to the caller, but log the
        # cause so failures are diagnosable instead of silently hidden.
        response = "something went wrong"
        print(response)
        print("error detail: {!r}".format(exc))
    return {
        'statusCode': 200,
        'body': json.dumps(response)
    }
import requests
import os
import json
# Loki connection settings, read from the environment at import time; a
# missing variable raises KeyError on import.
# URL the query is sent to with requests.get.
LOKI_API_URL = os.environ['LOKI_API_URL']
# Credentials passed together as the `auth` tuple of the request.
LOKI_USERNAME = os.environ['LOKI_USERNAME']
LOKI_PASSWORD = os.environ['LOKI_PASSWORD']
def processing_xx_or_xx_populated_event(timeout=10):
    """Run the Loki query that counts matching "xx" log lines.

    The LogQL expression sums, per ``service``, the ``count_over_time`` of
    prod "xx" log lines matching either the "Processing xx Event" or the
    "Queue populated" pattern.

    NOTE(review): the range selector is ``[5m]`` while the alert messages
    elsewhere speak of "the last minute" -- confirm the intended window.

    Args:
        timeout: Seconds to wait on the Loki request before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The decoded JSON body of the Loki response. Presumably this is the
        instant-query shape with ``data.result`` entries -- verify against
        the endpoint configured in ``LOKI_API_URL``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
        ValueError: If the response body is not valid JSON.
    """
    query_params = {'query': 'sum by (service) (count_over_time({environment="prod", service="xx"}[5m] |~ "(Processing xx Event for xx (.+)|Queue populated with (.+) xx)"))'}
    r = requests.get(
        LOKI_API_URL,
        params=query_params,
        auth=(LOKI_USERNAME, LOKI_PASSWORD),
        timeout=timeout,
    )
    return r.json()


# Backward-compatible alias: the function was originally defined under this
# spelling, while the Lambda handler imports the name defined above.
processing_xx_or_xxpopulated_event = processing_xx_or_xx_populated_event
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment