Last active
October 15, 2024 02:17
-
-
Save dimo414/10d6f162fb6d72f517a041d28d92314f to your computer and use it in GitHub Desktop.
Basic Server Heartbeat Script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Source: https://gist.github.com/dimo414/10d6f162fb6d72f517a041d28d92314f | |
# | |
# This is a basic heartbeat monitoring script suitable for adding to
# a machine's crontab to receive alerts (via https://healthchecks.io) | |
# when the machine is unhealthy. | |
# | |
# I use it to keep an eye on my headless Raspberry Pi, but it should | |
# work for most Linux machines. | |
# | |
# Usage: | |
# First, create a check on healthchecks.io for your machine; | |
# you'll provide the ID (the bit after 'https://hc-ping.com/') | |
# to the script either as an argument or by setting HEALTHCHECK_ID. | |
# Set the check's schedule to match how often you want this | |
# script to run. | |
# | |
# Copy this script onto your machine: | |
# wget https://gist.githubusercontent.com/dimo414/10d6f162fb6d72f517a041d28d92314f/raw/heartbeat.sh | |
# you can drop it into /etc/cron.hourly to let cron run it each | |
# hour, or anywhere else if you'll configure the cronjob yourself. | |
# | |
# Be sure to make the script executable (chmod +x heartbeat.sh)
# and update HEALTHCHECK_ID if using /etc/cron.hourly | |
# | |
# If you want to configure the cronjob yourself, use a line like: | |
# 15,45 * * * * bash ~/heartbeat.sh {YOUR_HEALTHCHECK_ID} | |
# | |
# Checks: | |
# * Machine is up and online - the heartbeat won't be sent otherwise | |
# * Machine has not been up for too long - if this fails you'll want | |
# to pull in updates and reboot, e.g.: | |
# sudo apt-get update && sudo apt-get upgrade && sudo reboot | |
# * CPU load-average is acceptable | |
# * System has enough "available" memory | |
# * Disks have enough free space | |
# | |
# The first check is required, but the remaining checks can be tuned | |
# via the variables below, or turned off by commenting out the entry | |
# in the COMMANDS array. | |
# | |
# The output of these checks is included in the payload sent to | |
# healthchecks.io for easier monitoring. | |
# | |
# Note: this script is not a substitute for a proper monitoring setup | |
# In particular, if you need metrics or ongoing log monitoring you'll | |
# need a more powerful solution. | |
# Configuration | |
# Healthchecks.io check ID (the bit after 'https://hc-ping.com/').
# Taken from the first argument when given, otherwise from an existing
# HEALTHCHECK_ID environment variable; the script aborts if neither is set.
# Hard-code the ID here instead if dropping the script into /etc/cron.hourly.
HEALTHCHECK_ID=${1:-${HEALTHCHECK_ID:?healthcheck id}}
# Longest acceptable time since boot, in a form `date -d "<value> ago"` accepts.
UP_SINCE_CAP='3 months'
# Highest acceptable 15-minute load average, per processing unit.
LOAD_AVG_CAP=.7
# Highest acceptable fraction of memory in use (computed from MemAvailable).
MEMORY_UTILIZATION_CAP=.9
# Highest acceptable fraction of disk space in use.
DISK_UTILIZATION_CAP=.9
# Note that healthchecks.io only preserves the first 10kb of output
# All these commands should have minimal output | |
# TODO incorporate some additional check used by watchdog: | |
# https://manpages.debian.org/testing/watchdog/watchdog.8.en.html | |
# Checks to run, in order. Each entry is the name of a check function;
# comment out a line to turn that check off.
COMMANDS=()
COMMANDS+=(up_since)
COMMANDS+=(load_avg)
COMMANDS+=(avail_memory)
COMMANDS+=(free_disk)
COMMANDS+=(voltage)
# Prints the uptime summary, then succeeds only if the machine was booted
# more recently than UP_SINCE_CAP ago (i.e. it has not been up for too long).
up_since() {
  uptime

  local booted_at cutoff
  booted_at=$(date -d "$(uptime -s)" +%s)
  cutoff=$(date -d "${UP_SINCE_CAP} ago" +%s)

  # Operands are expanded textually on purpose: if either date call produced
  # nothing, the arithmetic is invalid and the check fails instead of passing.
  # shellcheck disable=SC2004
  (( ${booted_at} > ${cutoff} ))
}
# Fails when the 15-minute load average, divided by the number of processing
# units reported by nproc, is above LOAD_AVG_CAP. Produces no output.
load_avg() {
  local units
  units=$(nproc)

  # The third field of /proc/loadavg is the 15-minute average.
  awk -v "units=${units}" -v "limit=${LOAD_AVG_CAP}" '
    { if ($3 / units > limit) exit 1 }
  ' /proc/loadavg
}
# Prints the Mem* and Swap* lines of /proc/meminfo, then fails when the share
# of memory in use, (MemTotal - MemAvailable) / MemTotal, is above
# MEMORY_UTILIZATION_CAP.
# Adapted from https://stackoverflow.com/q/40395894/113632#comment68104694_40398433
avail_memory() {
  awk -v "limit=${MEMORY_UTILIZATION_CAP}" '
    /^Mem/ || /^Swap/ { print }
    /MemAvailable/    { free_kb = $2 }
    /MemTotal/        { total_kb = $2 }
    END {
      used_fraction = (total_kb - free_kb) / total_kb
      if (used_fraction > limit) exit 1
    }
  ' /proc/meminfo
}
# Prints the full df table for the report, then fails when the usage
# percentage of / (or of any extra path passed as an argument) is above
# DISK_UTILIZATION_CAP.
# Arguments: optional additional paths to check alongside /
free_disk() {
  df
  # df prints everything, then we check the usage percent of the given paths to alert
  df --output=pcent / "$@" \
    | awk -v "cap=$DISK_UTILIZATION_CAP" '
        NR > 1 {
          sub("%", "", $1)
          # After sub() the field may be treated as a plain string; adding 0
          # forces a numeric comparison so "100" is not ordered below "90".
          if ($1 + 0 > cap * 100) exit 1
        }'
}
# https://www.raspberrypi.org/documentation/raspbian/applications/vcgencmd.md | |
# Reports the vcgencmd throttling state and rail voltages. Fails when any of
# the low four bits of the throttled bitmask (the "currently happening"
# conditions) is set; the "has occurred" bits are reported but do not fail.
# On machines without vcgencmd the check passes without output.
voltage() {
  # A bare `return` would propagate the failed lookup's status and mark the
  # check as failed, so return success explicitly.
  command -v vcgencmd >/dev/null || return 0 # no-op on non-Raspberry-Pi

  local -i throttled
  local block
  # Output looks like "throttled=0x50000"; keep only the value after '='.
  # The integer attribute converts the hex string on assignment.
  throttled=$(vcgencmd get_throttled | sed 's/.*=//')
  printf 'Throttled Bitmask: %X\n' "$throttled"

  # Conditions that are active right now
  (( throttled & 0x1 )) && echo 'Under-voltage detected!'
  (( throttled & 0x2 )) && echo 'Arm frequency capped!'
  (( throttled & 0x4 )) && echo 'Currently throttled!'
  (( throttled & 0x8 )) && echo 'Soft temperature limit active!'
  # Conditions that were recorded at some earlier point
  (( throttled & 0x10000 )) && echo 'Under-voltage has occurred'
  (( throttled & 0x20000 )) && echo 'Arm frequency capping has occurred'
  (( throttled & 0x40000 )) && echo 'Throttling has occurred'
  (( throttled & 0x80000 )) && echo 'Soft temperature limit has occurred'

  for block in core sdram_c sdram_i sdram_p; do
    printf '%s:\t%s\n' "${block}" "$(vcgencmd measure_volts "$block")"
  done

  # Only the currently-active bits decide the result.
  (( ( throttled & 0xF ) == 0 ))
}
# Runs the given command between a "Checking:" header line and an exit-code
# footer line, and returns the command's own exit status.
# Arguments: the command to run followed by its arguments
run() {
  local rc=0
  printf 'Checking: %s\n' "$*"
  "$@" || rc=$?
  printf -- '----- Exit Code: %s -----\n' "${rc}"
  return "${rc}"
}
# Runs every check listed in COMMANDS through run(), continuing past failures
# so the report always covers all checks.
# Returns: 0 if every check returned 0, non-zero otherwise
run_all() {
  # The loop variable is local so it does not leak into the caller's scope.
  local check ret all_ret=0
  for check in "${COMMANDS[@]}"; do
    run "$check"
    ret=$?
    # Exit codes are summed, so any single failure makes the total non-zero
    (( all_ret += ret ))
  done
  (( all_ret == 0 ))
}
### MAIN ### | |
# Run every check, capturing stdout and stderr together as the ping payload.
# Start from an empty suffix so a value inherited from the environment cannot
# end up in the URL; a failing check switches the ping to the /fail endpoint.
suffix=
if ! output=$(run_all 2>&1); then
  suffix=/fail
fi
# -f fails on HTTP errors, -sS stays quiet except for error messages, and
# --max-time bounds each attempt so a hung connection cannot stall the cron job.
curl -fsS --max-time 10 --retry 3 -X POST --data-raw "$output" \
  "https://hc-ping.com/${HEALTHCHECK_ID}${suffix}" > /dev/null
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment