-
-
Save ShMaunder/25c6483a8cf29312d5ca409f99466090 to your computer and use it in GitHub Desktop.
DRBD 9 Sync Corruption
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Strict mode: abort on errors (-e), inherit the ERR trap in functions and
# subshells (-E), fail on unset variables (-u) and on any failing pipeline
# stage (pipefail).
set -eEuo pipefail

# DRBD 9.x corruption reproducer
#
# Requirements:
# - zfs
# - drbd
# - jq
# - sha256sum

# Primary node hostname (compared against the content of /etc/hostname)
PRIMARY_HOST='node-1'

# ZFS Options
ZFS_POOL='ztank'

# List of volumes - the last volume in the array is the verification volume.
# The DRBD resync order needs to follow the same order.
VOL_NAMES=('volume-1' 'volume-2')
#######################################
# ERR trap handler: report the line on which an unhandled non-zero
# return occurred.
# Arguments: $1 - line number of the failing command (optional)
# Outputs:   writes a diagnostic line to stderr
#######################################
function on_error() {
  # Default keeps the handler usable under `set -u` when called without args
  local _line="${1:-unknown}"

  # Diagnostics go to stderr, like every other message in this script
  echo "!! Unhandled non-zero return occurred on line: ${_line} !!" 1>&2
}
#######################################
# Tell whether this machine is the designated primary node.
# Globals:   PRIMARY_HOST (read)
# Returns:   0 when the content of /etc/hostname equals PRIMARY_HOST,
#            non-zero otherwise
#######################################
function is_primary_node() {
  # PRIMARY_HOST should match the hostname of exactly one of the nodes
  [[ "$(</etc/hostname)" == "${PRIMARY_HOST}" ]] || return $?
}
#######################################
# Block until `drbdadm cstate` reports 'Connected' for a resource,
# polling once per second.
# Arguments: $1 - DRBD resource name
# Outputs:   writes a progress line to stderr on every poll that is not
#            yet 'Connected'
#######################################
function wait_drbd_conn() {
  local _drbd_res="${1}"

  while [[ "$(drbdadm cstate -- "${_drbd_res}")" != 'Connected' ]]; do
    echo "Waiting for DRBD connection ${_drbd_res}..." 1>&2
    sleep 1
  done
}
# Report the failing line number through on_error whenever a command
# returns non-zero outside of a tested context.
trap 'on_error ${LINENO}' ERR
##
## Step 1
## Down DRBD resources
##
echo '>> Step 1' 1>&2

for VOL_NAME in "${VOL_NAMES[@]}"; do
  if ! is_primary_node; then
    # The non-primary node holds off until the connection is no longer
    # reported as 'Connected', i.e. the primary node went down first.
    while [[ "$(drbdadm cstate -- "${VOL_NAME}")" == 'Connected' ]]; do
      echo "Waiting for DRBD disconnection ${VOL_NAME}..." 1>&2
      sleep 1
    done
  fi

  echo "DRBD down: ${VOL_NAME}" 1>&2
  drbdadm down -- "${VOL_NAME}"
done
##
## Step 2
## Create/Clone ZFS volumes and set primary
##
echo '>> Step 2' 1>&2

for VOL_NAME in "${VOL_NAMES[@]}"; do
  ZFS_VOLUME="${ZFS_POOL}/${VOL_NAME}"

  # (Re)create the backing volume; a failing destroy (e.g. nothing to
  # destroy yet) is deliberately ignored.
  zfs destroy -R -- "${ZFS_VOLUME}" || :
  echo "Creating ZFS: ${ZFS_VOLUME}" 1>&2
  zfs create -V 1G -s -o snapdev=visible -p "${ZFS_VOLUME}"
  zfs set refreservation=auto "${ZFS_VOLUME}"

  # (Re)create the meta disk; a failing destroy is ignored here as well.
  zfs destroy -R -- "${ZFS_VOLUME}-meta" || :
  zfs create -V 1G -p "${ZFS_VOLUME}-meta"

  # Give udev time to process the new zvols before DRBD touches them
  udevadm settle
  sleep 3

  drbdadm create-md --force -- "${VOL_NAME}"

  echo "DRBD up: ${VOL_NAME}" 1>&2
  drbdadm up -- "${VOL_NAME}"
  wait_drbd_conn "${VOL_NAME}"

  if is_primary_node; then
    echo "DRBD primary: ${VOL_NAME}" 1>&2
    # https://linbit.com/drbd-user-guide/drbd-guide-9_0-en/#s-skip-initial-resync
    # we used an identical image to seed the zvol on both sides so skip the
    # initial sync
    drbdadm new-current-uuid --clear-bitmap -- "${VOL_NAME}"
    drbdadm primary --force -- "${VOL_NAME}"
  fi
done

sleep 5 # Let things settle
##
## Step 3
## Disconnect DRBD resource & start random data injection
##
echo '>> Step 3' 1>&2

# PIDs of the background dd jobs started below
DD_PIDS=()

for VOL_NAME in "${VOL_NAMES[@]}"; do
  ZFS_VOLUME="${ZFS_POOL}/${VOL_NAME}"
  DRBD_MINOR="$(drbdsetup status --json -- "${VOL_NAME}" | jq '.[0].devices[0].minor')"

  if is_primary_node; then
    echo "DRBD disconnect: ${VOL_NAME}" 1>&2
    drbdadm disconnect -- "${VOL_NAME}"

    # dd random stuff in to /dev/drbdxxxx, not immediately synced because
    # the resource is disconnected.
    # inject 250MB of data if not the last DRBD device
    # we do this to increase the sync time of the first resource
    DD_COUNT=500000
    if [[ "${VOL_NAME}" == "${VOL_NAMES[-1]}" ]]; then
      DD_COUNT=1
    fi

    echo "Starting urandom dd (${DD_COUNT} sectors): drbd${DRBD_MINOR}..." 1>&2
    dd if=/dev/urandom count="${DD_COUNT}" of="/dev/drbd${DRBD_MINOR}" &
    DD_PIDS+=("$!")
  fi
done

# Wait until every dd is finished. Waiting per PID propagates a failing
# dd's exit status; a bare `wait` always returns 0 and would hide it.
# The ${arr[@]+...} form keeps an empty array safe under `set -u`.
for DD_PID in ${DD_PIDS[@]+"${DD_PIDS[@]}"}; do
  wait "${DD_PID}"
done

sleep 3 # Let things settle
##
## Step 4
## Connect DRBD resource and expect random data to be synced.
## Explicitly pause-sync of the last volume.
##
echo '>> Step 4' 1>&2

for VOL_NAME in "${VOL_NAMES[@]}"; do
  if is_primary_node; then
    echo "DRBD connect: ${VOL_NAME}" 1>&2
    drbdadm connect -- "${VOL_NAME}"

    # Give drbd time to fully reconnect the resource.
    # If drbd does not fully reconnect the resources in the same order,
    # the resync-after directive might be ignored for a short time.
    sleep 1

    if [[ "${VOL_NAME}" == "${VOL_NAMES[-1]}" ]]; then
      # Try to prevent the last resource from syncing
      echo "DRBD pause: ${VOL_NAME}" 1>&2
      drbdadm pause-sync -- "${VOL_NAME}"

      # Only reported, not fatal: the run continues even if the disk state
      # is not the expected one.
      if [[ "$(drbdadm dstate -- "${VOL_NAME}")" != 'UpToDate/Outdated' ]]; then
        echo "ERROR - ${VOL_NAME} is not in the expected UpToDate/Outdated state!!" 1>&2
      fi
    fi
  fi
done
##
## Step 5
## Random data injection into the last volume.
## The data won't be sync'd to the peer node after resuming
## the sync, showing the DRBD bug
##
echo '>> Step 5' 1>&2

# Only the last (verification) volume is touched in this step
VOL_NAME="${VOL_NAMES[-1]}"
DRBD_MINOR="$(drbdsetup status --json -- "${VOL_NAME}" | jq '.[0].devices[0].minor')"

if is_primary_node; then
  echo "Running urandom dd (1 sector): drbd${DRBD_MINOR}..." 1>&2
  # Write a single sector at sector offset 5050 of the DRBD device
  dd if=/dev/urandom count=1 of="/dev/drbd${DRBD_MINOR}" seek=5050
fi

sleep 3 # Let things settle
##
## Step 6
## Secondary node needs to wait for reconnection
##
echo '>> Step 6' 1>&2

for VOL_NAME in "${VOL_NAMES[@]}"; do
  # Only the non-primary node waits here; the primary proceeds immediately
  if ! is_primary_node; then
    wait_drbd_conn "${VOL_NAME}"
  fi
done
##
## Step 7
## Unpause sync on last resource and detect when sync is finished
##
echo '>> Step 7' 1>&2

for VOL_NAME in "${VOL_NAMES[@]}"; do
  DRBD_MINOR="$(drbdsetup status --json -- "${VOL_NAME}" | jq '.[0].devices[0].minor')"

  # Only the last volume was paused (on the primary), so only it is resumed
  if is_primary_node && [[ "${VOL_NAME}" == "${VOL_NAMES[-1]}" ]]; then
    echo "DRBD resume-sync: ${VOL_NAME}" 1>&2
    drbdadm resume-sync -- "${VOL_NAME}"
  fi

  # Both nodes poll until the disk states are reported as fully up to date
  while [[ "$(drbdsetup dstate -- "${DRBD_MINOR}")" != 'UpToDate/UpToDate' ]]; do
    echo "Waiting for DRBD sync to finish ${VOL_NAME}..." 1>&2
    sleep 1
  done
done

if is_primary_node; then
  # check if the resync total expectation warning message is printed;
  # no match is not an error, hence the `|| :`
  journalctl --since -15 | grep -P 'expected n_oos:[0-9]+ to be equal to rs_failed:[0-9]+' || :
fi
##
## Step 8
## Verification of backing volume
##
echo '>> Step 8' 1>&2

# Only the last (verification) volume is checked
VOL_NAME="${VOL_NAMES[-1]}"
ZFS_VOLUME="${ZFS_POOL}/${VOL_NAME}"
DRBD_MINOR="$(drbdsetup status --json -- "${VOL_NAME}" | jq '.[0].devices[0].minor')"

echo "ZFS snapshot: ${ZFS_VOLUME}@post-replay" 1>&2
zfs snapshot -- "${ZFS_VOLUME}@post-replay"
udevadm settle
sleep 1

# if the resources are in sync we expect the snapshot to have identical content
# (the label goes to stderr, the checksum itself to stdout)
printf '%s' "Snapshot checksum: " 1>&2
dd if="/dev/zvol/${ZFS_VOLUME}@post-replay" bs=1M status=none | sha256sum

# even if some delay meant backing volume not in sync drbdadm verify should be consistent
if is_primary_node; then
  echo "DRBD verify: ${VOL_NAME}" 1>&2
  drbdadm verify -- "${VOL_NAME}"
else
  echo "Waiting and checking for out-of-sync: ${VOL_NAME}" 1>&2
  sleep 10 # allow the verify to process things for several seconds

  # Extract the numeric value of the out-of-sync counter from the status output
  OOS="$(drbdsetup status --statistics --verbose -- "${VOL_NAME}" | grep -Po 'out-of-sync:[0-9]+' | cut -d':' -f2)"
  if [[ "${OOS}" -ne 0 ]]; then
    echo "Corruption detected: ${OOS}" 1>&2
  fi
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment