Skip to content

Instantly share code, notes, and snippets.

@dimo414
Last active October 15, 2024 02:17
Show Gist options
  • Save dimo414/10d6f162fb6d72f517a041d28d92314f to your computer and use it in GitHub Desktop.
Save dimo414/10d6f162fb6d72f517a041d28d92314f to your computer and use it in GitHub Desktop.
Basic Server Heartbeat Script
#!/bin/bash
# Source: https://gist.github.com/dimo414/10d6f162fb6d72f517a041d28d92314f
#
# This is a basic heartbeat monitoring script suitable for adding to
# a machine's crontab to receive alerts (via https://healthchecks.io)
# when the machine is unhealthy.
#
# I use it to keep an eye on my headless Raspberry Pi, but it should
# work for most Linux machines.
#
# Usage:
# First, create a check on healthchecks.io for your machine;
# you'll provide the ID (the bit after 'https://hc-ping.com/')
# to the script either as an argument or by setting HEALTHCHECK_ID.
# Set the check's schedule to match how often you want this
# script to run.
#
# Copy this script onto your machine:
# wget https://gist.githubusercontent.com/dimo414/10d6f162fb6d72f517a041d28d92314f/raw/heartbeat.sh
# you can drop it into /etc/cron.hourly to let cron run it each
# hour, or anywhere else if you'll configure the cronjob yourself.
#
# Be sure to make the script executable (chmod +x heartbeat.sh)
# and update HEALTHCHECK_ID if using /etc/cron.hourly
#
# If you want to configure the cronjob yourself, use a line like:
# 15,45 * * * * bash ~/heartbeat.sh {YOUR_HEALTHCHECK_ID}
#
# Checks:
# * Machine is up and online - the heartbeat won't be sent otherwise
# * Machine has not been up for too long - if this fails you'll want
# to pull in updates and reboot, e.g.:
# sudo apt-get update && sudo apt-get upgrade && sudo reboot
# * CPU load-average is acceptable
# * System has enough "available" memory
# * Disks have enough free space
#
# The first check is required, but the remaining checks can be tuned
# via the variables below, or turned off by commenting out the entry
# in the COMMANDS array.
#
# The output of these checks is included in the payload sent to
# healthchecks.io for easier monitoring.
#
# Note: this script is not a substitute for a proper monitoring setup
# In particular, if you need metrics or ongoing log monitoring you'll
# need a more powerful solution.
# Configuration
# ID of the healthchecks.io check to ping. The first argument wins; otherwise
# a HEALTHCHECK_ID already present in the environment is used. The script
# aborts if neither is provided. Hard-code the ID here if dropping the script
# into /etc/cron.hourly.
HEALTHCHECK_ID=${1:-${HEALTHCHECK_ID:?healthcheck id}}
# Fail once the machine has been up for longer than this span
# (interpreted by `date -d "<span> ago"`).
UP_SINCE_CAP='3 months'
# Fail when the 15-minute load average divided by the CPU count exceeds this.
LOAD_AVG_CAP=.7
# Fail when the fraction of memory in use (total - available) exceeds this.
MEMORY_UTILIZATION_CAP=.9
# Fail when the used fraction of any checked filesystem exceeds this.
DISK_UTILIZATION_CAP=.9
# Note that healthchecks.io only preserves the first 10kb of output
# All these commands should have minimal output
# TODO incorporate some additional check used by watchdog:
# https://manpages.debian.org/testing/watchdog/watchdog.8.en.html
# Names of the check functions to execute, in order. Each entry is invoked
# with no arguments; a non-zero exit status from any of them marks the
# heartbeat as failed. Comment out an entry to disable that check.
COMMANDS=(
# Uptime report; fails when the machine has been up longer than UP_SINCE_CAP
up_since
# Fails when the per-CPU 15-minute load average exceeds LOAD_AVG_CAP
load_avg
# Memory report; fails when utilization exceeds MEMORY_UTILIZATION_CAP
avail_memory
# Disk report; fails when utilization exceeds DISK_UTILIZATION_CAP
free_disk
# Raspberry Pi throttling/voltage report (uses vcgencmd)
voltage
)
# Prints the uptime summary, then succeeds only if the machine booted more
# recently than UP_SINCE_CAP ago.
# Globals: UP_SINCE_CAP (read)
# Outputs: the `uptime` line on stdout
# Returns: 0 if booted within the cap, non-zero otherwise
up_since() {
  uptime

  local booted_at oldest_allowed
  # Boot time and the cut-off, both as seconds since the epoch
  booted_at=$(date -d "$(uptime -s)" +%s)
  oldest_allowed=$(date -d "${UP_SINCE_CAP} ago" +%s)

  # Expanded with `$` on purpose: an empty value (failed date call) makes the
  # arithmetic expression invalid, which counts as a failed check.
  (( ${booted_at} > ${oldest_allowed} ))
}
# Succeeds unless the 15-minute load average (third field of /proc/loadavg),
# divided by the number of processors, is above LOAD_AVG_CAP.
# Globals: LOAD_AVG_CAP (read)
# Returns: 0 if load is acceptable, non-zero otherwise
load_avg() {
  local cpu_count
  cpu_count=$(nproc)
  awk -v "nproc=${cpu_count}" -v "cap=${LOAD_AVG_CAP}" '
    {
      per_cpu = $3 / nproc
      if (per_cpu > cap) {
        exit 1
      }
    }
  ' /proc/loadavg
}
# Prints the Mem*/Swap* lines of /proc/meminfo, then succeeds unless the
# fraction of memory in use, (MemTotal - MemAvailable) / MemTotal, is above
# MEMORY_UTILIZATION_CAP.
# Adapted from https://stackoverflow.com/q/40395894/113632#comment68104694_40398433
# Globals: MEMORY_UTILIZATION_CAP (read)
# Outputs: matching /proc/meminfo lines on stdout
# Returns: 0 if utilization is acceptable, non-zero otherwise
avail_memory() {
  grep -e '^Mem' -e '^Swap' /proc/meminfo
  awk -v "cap=${MEMORY_UTILIZATION_CAP}" '
    /MemTotal/     { total = $2 }
    /MemAvailable/ { avail = $2 }
    END {
      used_fraction = (total - avail) / total
      if (used_fraction > cap) {
        exit 1
      }
    }
  ' /proc/meminfo
}
# Prints the full `df` table, then succeeds unless the root filesystem (or any
# additional path passed as an argument) is fuller than DISK_UTILIZATION_CAP.
# Globals:   DISK_UTILIZATION_CAP (read)
# Arguments: optional extra paths whose filesystems should also be checked
# Outputs:   `df` output on stdout
# Returns:   0 if all checked filesystems are under the cap, non-zero otherwise
free_disk() {
  df
  # df prints everything, then we check the usage percent of the given paths to alert
  df --output=pcent / "$@" \
    | awk -v "cap=${DISK_UTILIZATION_CAP}" '
        NR > 1 {
          pct = $1
          sub(/%/, "", pct)
          # sub() leaves a plain string; adding 0 forces a numeric comparison.
          # Compared as strings, "100" sorts below "90" and a full disk
          # would never trigger the alert.
          if (pct + 0 > cap * 100) {
            exit 1
          }
        }
      '
}
# https://www.raspberrypi.org/documentation/raspbian/applications/vcgencmd.md
# Reports Raspberry Pi throttling state and rail voltages using vcgencmd.
# Outputs: the throttled bitmask, a line per set flag, and the voltage of each
#          measured block, all on stdout
# Returns: 0 when vcgencmd is unavailable (nothing to check) or when none of
#          the "currently active" flags (bits 0-3) are set; non-zero otherwise
voltage() {
  # A bare `return` would propagate the failing status of `command -v` and
  # mark every non-Raspberry-Pi machine as unhealthy, so succeed explicitly.
  command -v vcgencmd >/dev/null || return 0 # no-op on non-Raspberry-Pi
  local -i throttled
  local block
  # Output looks like "throttled=0x50000"; keep only the value after '='.
  # The integer attribute makes bash evaluate the hex literal on assignment.
  throttled=$(vcgencmd get_throttled | sed 's/.*=//')
  printf 'Throttled Bitmask: %X\n' "$throttled"
  # Bits 0-3: conditions that are active right now
  (( throttled & 0x1 )) && echo 'Under-voltage detected!'
  (( throttled & 0x2 )) && echo 'Arm frequency capped!'
  (( throttled & 0x4 )) && echo 'Currently throttled!'
  (( throttled & 0x8 )) && echo 'Soft temperature limit active!'
  # Bits 16-19: the same conditions, reported as having occurred previously
  (( throttled & 0x10000 )) && echo 'Under-voltage has occurred'
  (( throttled & 0x20000 )) && echo 'Arm frequency capping has occurred'
  (( throttled & 0x40000 )) && echo 'Throttling has occurred'
  (( throttled & 0x80000 )) && echo 'Soft temperature limit has occurred'
  for block in core sdram_c sdram_i sdram_p; do
    printf '%s:\t%s\n' "${block}" "$(vcgencmd measure_volts "$block")"
  done
  # Only currently-active conditions fail the check; historical ones are
  # informational.
  (( ( throttled & 0xF ) == 0 ))
}
# Executes one check, framing its output with a header naming the command and
# a footer showing its exit code.
# Arguments: the command to run and its arguments
# Outputs:   header, the command's own output, footer
# Returns:   the exit status of the command
run() {
  local status=0
  printf 'Checking: %s\n' "$*"
  "$@" || status=$?
  printf -- '----- Exit Code: %s -----\n' "${status}"
  return "${status}"
}
# Runs every check named in COMMANDS through `run`, continuing past failures
# so that all checks report their output.
# Globals: COMMANDS (read)
# Returns: 0 only if every check exited 0, non-zero otherwise
run_all() {
  # `check` is local so the loop variable doesn't leak into the global scope
  # (the original used a global named `command`, same as the builtin).
  local check ret all_ret=0
  for check in "${COMMANDS[@]}"; do
    run "$check"
    ret=$?
    # Accumulate exit codes; any non-zero status leaves the sum non-zero
    (( all_ret += ret ))
  done
  (( all_ret == 0 ))
}
### MAIN ###
# Run all checks, capturing stdout and stderr as the ping payload. A failing
# run switches the ping to the check's /fail endpoint.
# Start from an empty suffix so a `suffix` variable inherited from the
# environment can't corrupt the success URL.
suffix=''
if ! output=$(run_all 2>&1); then
  suffix=/fail
fi
# --max-time bounds each attempt so a stalled connection can't hang the cron job.
curl -fsS --max-time 10 --retry 3 -X POST --data-raw "$output" \
  "https://hc-ping.com/${HEALTHCHECK_ID}${suffix}" > /dev/null
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment