Skip to content

Instantly share code, notes, and snippets.

@ShMaunder
Last active July 26, 2022 10:53
Show Gist options
  • Save ShMaunder/25c6483a8cf29312d5ca409f99466090 to your computer and use it in GitHub Desktop.
Save ShMaunder/25c6483a8cf29312d5ca409f99466090 to your computer and use it in GitHub Desktop.
DRBD 9 Sync Corruption
#!/bin/bash
#
# DRBD 9.x corruption reproducer
#
# Requirements:
# - zfs
# - drbd
# - jq
# - sha256sum

# Abort on unhandled errors (errexit), let functions and subshells inherit the
# ERR trap (errtrace), reject unset variables (nounset) and make a pipeline
# fail when any of its stages fails (pipefail).
set -o errexit -o errtrace -o nounset -o pipefail

# Primary node hostname
PRIMARY_HOST="node-1"

# ZFS Options
ZFS_POOL="ztank"

# List of volumes - the last volume in the array is the verification volume.
# DRBD needs to resync the volumes in this same order.
VOL_NAMES=("volume-1" "volume-2")
# ERR trap handler: reports the line number of the command that failed.
# Arguments: $1 - line number on which the non-zero return occurred
# Outputs:   one diagnostic line on stderr, like every other status message in
#            this script, so it never mixes into stdout (checksum output) or
#            into the text captured by a command substitution.
function on_error() {
  local _line="${1}"
  echo "!! Unhandled non-zero return occurred on line: ${_line} !!" 1>&2
}
# Tells whether the script is running on the primary node.
# Globals: PRIMARY_HOST (read)
# Returns: 0 when the content of /etc/hostname equals PRIMARY_HOST, 1 otherwise
function is_primary_node() {
  # PRIMARY_HOST should match the hostname of one of the nodes
  if [[ "${PRIMARY_HOST}" == "$(</etc/hostname)" ]]; then
    return 0
  fi
  return 1
}
# Blocks until `drbdadm cstate` reports 'Connected' for a resource, polling
# once per second.
# Arguments: $1 - DRBD resource name
# Outputs:   a progress line on stderr for every unsuccessful poll
function wait_drbd_conn() {
  local _resource="${1}"
  until [[ "$(drbdadm cstate -- "${_resource}")" == 'Connected' ]]; do
    echo "Waiting for DRBD connection ${_resource}..." >&2
    sleep 1
  done
}
# Report the line number of any command that trips the ERR trap. The single
# quotes defer the expansion of LINENO until the trap actually fires.
trap 'on_error ${LINENO}' ERR
##
## Step 1
## Down DRBD resources
##
echo '>> Step 1' >&2
for VOL_NAME in "${VOL_NAMES[@]}"; do
  if ! is_primary_node; then
    # A non-primary node holds back until its connection state is no longer
    # 'Connected', i.e. the first node has disconnected first.
    until [[ "$(drbdadm cstate -- "${VOL_NAME}")" != 'Connected' ]]; do
      echo "Waiting for DRBD disconnection ${VOL_NAME}..." >&2
      sleep 1
    done
  fi

  echo "DRBD down: ${VOL_NAME}" >&2
  drbdadm down -- "${VOL_NAME}"
done
##
## Step 2
## Create/Clone ZFS volumes and set primary
##
echo '>> Step 2' >&2
for VOL_NAME in "${VOL_NAMES[@]}"; do
  ZFS_VOLUME="${ZFS_POOL}/${VOL_NAME}"
  ZFS_META_VOLUME="${ZFS_VOLUME}-meta"

  # Backing volume: drop any previous incarnation (best effort, it may not
  # exist yet) and create it again as a sparse 1G zvol.
  zfs destroy -R -- "${ZFS_VOLUME}" || true
  echo "Creating ZFS: ${ZFS_VOLUME}" >&2
  zfs create -V 1G -s -o snapdev=visible -p "${ZFS_VOLUME}"
  zfs set refreservation=auto "${ZFS_VOLUME}"

  # Meta disk: same destroy-then-create cycle.
  zfs destroy -R -- "${ZFS_META_VOLUME}" || true
  zfs create -V 1G -p "${ZFS_META_VOLUME}"

  # Let udev finish processing before DRBD touches the new devices.
  udevadm settle
  sleep 3

  drbdadm create-md --force -- "${VOL_NAME}"
  echo "DRBD up: ${VOL_NAME}" >&2
  drbdadm up -- "${VOL_NAME}"
  wait_drbd_conn "${VOL_NAME}"

  if is_primary_node; then
    echo "DRBD primary: ${VOL_NAME}" >&2
    # https://linbit.com/drbd-user-guide/drbd-guide-9_0-en/#s-skip-initial-resync
    # we used an identical image to seed the zvol on both sides so skip the
    # initial sync
    drbdadm new-current-uuid --clear-bitmap -- "${VOL_NAME}"
    drbdadm primary --force -- "${VOL_NAME}"
  fi
done
sleep 5 # Let things settle
##
## Step 3
## Disconnect DRBD resource & start random data injection
##
echo '>> Step 3' 1>&2
# PIDs of the background dd writers. Each one is reaped individually after the
# loop so that a failed write aborts the run; a bare `wait` always returns 0
# and would silently hide such a failure.
DD_PIDS=()
for VOL_NAME in "${VOL_NAMES[@]}"; do
  DRBD_MINOR="$(drbdsetup status --json -- "${VOL_NAME}" | jq '.[0].devices[0].minor')"
  if is_primary_node; then
    # Refuse to continue without a numeric minor: jq prints "null" when the
    # field is missing and dd would then write to a regular file in /dev.
    if [[ ! "${DRBD_MINOR}" =~ ^[0-9]+$ ]]; then
      echo "ERROR - could not determine DRBD minor of ${VOL_NAME}: '${DRBD_MINOR}'" 1>&2
      exit 1
    fi
    echo "DRBD disconnect: ${VOL_NAME}" 1>&2
    drbdadm disconnect -- "${VOL_NAME}"
    # dd random stuff in to /dev/drbdxxxx, not immediately synced because
    # the resource is disconnected.
    # Inject roughly 250MB of data (500000 sectors) if not the last DRBD
    # device; we do this to increase the sync time of the first resource.
    # The last device only gets a single sector.
    if [[ "${VOL_NAME}" == "${VOL_NAMES[-1]}" ]]; then
      DD_COUNT=1
    else
      DD_COUNT=500000
    fi
    echo "Starting urandom dd (${DD_COUNT} sectors): drbd${DRBD_MINOR}..." 1>&2
    dd if=/dev/urandom count="${DD_COUNT}" of="/dev/drbd${DRBD_MINOR}" &
    DD_PIDS+=("$!")
  fi
done
# wait until every dd is finished and fail loudly if one of them errored
# (the ${arr[@]+...} form keeps `set -u` happy when no dd was started)
for DD_PID in ${DD_PIDS[@]+"${DD_PIDS[@]}"}; do
  if ! wait "${DD_PID}"; then
    echo "ERROR - background dd (pid ${DD_PID}) failed" 1>&2
    exit 1
  fi
done
sleep 3 # Let things settle
##
## Step 4
## Connect DRBD resource and expect random data to be synced.
## Explicitly pause-sync of the last volume.
##
echo '>> Step 4' >&2
for VOL_NAME in "${VOL_NAMES[@]}"; do
  # Only the primary node drives the reconnect.
  is_primary_node || continue

  echo "DRBD connect: ${VOL_NAME}" >&2
  drbdadm connect -- "${VOL_NAME}"
  # Give drbd time to fully reconnect the resource.
  # If drbd does not fully reconnect the resources in the same order
  # then the resync-after directive might be ignored for a short time.
  sleep 1

  # Every volume except the last one is left alone from here on.
  [[ "${VOL_NAME}" == "${VOL_NAMES[-1]}" ]] || continue

  # Try to prevent the last resource from syncing
  echo "DRBD pause: ${VOL_NAME}" >&2
  drbdadm pause-sync -- "${VOL_NAME}"
  if [[ "$(drbdadm dstate -- "${VOL_NAME}")" != 'UpToDate/Outdated' ]]; then
    echo "ERROR - ${VOL_NAME} is not in the expected UpToDate/Outdated state!!" >&2
  fi
done
##
## Step 5
## Random data injection into the last volume.
## The data won't be sync'd to the peer node after resuming
## the sync, showing the DRBD bug
##
echo '>> Step 5' >&2
# Only the last (verification) volume is touched in this step.
VOL_NAME="${VOL_NAMES[-1]}"
DRBD_MINOR="$(drbdsetup status --json -- "${VOL_NAME}" | jq '.[0].devices[0].minor')"
if is_primary_node; then
  echo "Running urandom dd (1 sector): drbd${DRBD_MINOR}..." >&2
  # One random block written at block offset 5050 of the DRBD device.
  dd if=/dev/urandom count=1 of="/dev/drbd${DRBD_MINOR}" seek=5050
fi
sleep 3 # Let things settle
##
## Step 6
## Secondary node needs to wait for reconnection
##
echo '>> Step 6' >&2
for VOL_NAME in "${VOL_NAMES[@]}"; do
  # The primary skips the wait; every other node blocks until the resource
  # reports 'Connected'.
  is_primary_node || wait_drbd_conn "${VOL_NAME}"
done
##
## Step 7
## Unpause sync on last resource and detect when sync is finished
##
echo '>> Step 7' >&2
for VOL_NAME in "${VOL_NAMES[@]}"; do
  DRBD_MINOR="$(drbdsetup status --json -- "${VOL_NAME}" | jq '.[0].devices[0].minor')"

  # The primary resumes the sync that was paused on the last volume.
  if is_primary_node; then
    if [[ "${VOL_NAME}" == "${VOL_NAMES[-1]}" ]]; then
      echo "DRBD resume-sync: ${VOL_NAME}" >&2
      drbdadm resume-sync -- "${VOL_NAME}"
    fi
  fi

  # Both nodes poll until the disk state reads UpToDate on both sides.
  until [[ "$(drbdsetup dstate -- "${DRBD_MINOR}")" == 'UpToDate/UpToDate' ]]; do
    echo "Waiting for DRBD sync to finish ${VOL_NAME}..." >&2
    sleep 1
  done
done

if is_primary_node; then
  # check if the resync total expectation warning message is printed;
  # finding nothing is not an error, hence the `|| true`
  journalctl --since -15 \
    | grep -P 'expected n_oos:[0-9]+ to be equal to rs_failed:[0-9]+' || true
fi
##
## Step 8
## Verification of backing volume
##
echo '>> Step 8' 1>&2
# Only the last (verification) volume is checked.
for VOL_NAME in "${VOL_NAMES[-1]}"; do
  ZFS_VOLUME="${ZFS_POOL}/${VOL_NAME}"

  echo "ZFS snapshot: ${ZFS_VOLUME}@post-replay" 1>&2
  zfs snapshot -- "${ZFS_VOLUME}@post-replay"
  udevadm settle
  sleep 1

  # if the resources are in sync we expect the snapshot to have identical content
  # (the label goes to stderr, the checksum itself to stdout)
  echo -n "Snapshot checksum: " 1>&2
  dd if="/dev/zvol/${ZFS_VOLUME}@post-replay" bs=1M status=none | sha256sum

  # even if some delay meant backing volume not in sync drbdadm verify should be consistent
  if is_primary_node; then
    echo "DRBD verify: ${VOL_NAME}" 1>&2
    drbdadm verify -- "${VOL_NAME}"
  else
    echo "Waiting and checking for out-of-sync: ${VOL_NAME}" 1>&2
    sleep 10 # allow the verify to process things for several seconds

    # Capture the status first so that a failing drbdsetup still aborts the
    # script, then add up every out-of-sync counter found in it. Summing keeps
    # the comparison below numeric even if more than one counter is reported.
    STATUS_TEXT="$(drbdsetup status --statistics --verbose -- "${VOL_NAME}")"
    OOS=0
    OOS_SEEN=0
    while IFS= read -r OOS_VALUE; do
      OOS=$(( OOS + 10#${OOS_VALUE} ))
      OOS_SEEN=1
    # `|| :` keeps a grep no-match from firing the ERR trap in the subshell
    done < <(grep -Po 'out-of-sync:\K[0-9]+' <<<"${STATUS_TEXT}" || :)

    # A missing counter means the result cannot be judged; say so explicitly
    # instead of dying on an anonymous pipeline failure.
    if (( OOS_SEEN == 0 )); then
      echo "ERROR - no out-of-sync counter reported for ${VOL_NAME}" 1>&2
      exit 1
    fi
    if (( OOS != 0 )); then
      echo "Corruption detected: ${OOS}" 1>&2
    fi
  fi
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment