Skip to content

Instantly share code, notes, and snippets.

@Grissess
Last active June 23, 2022 16:52
Show Gist options
  • Save Grissess/1d8d2cd915b79c018ac7ac5d07a64d3b to your computer and use it in GitHub Desktop.
Save Grissess/1d8d2cd915b79c018ac7ac5d07a64d3b to your computer and use it in GitHub Desktop.
A little dialog wrapper for smartctl
#!/bin/bash
# check_disk: a dialog(1) front-end around smartctl for inspecting, testing and
# zeroing disks.
#
# Environment overrides:
#   SMARTCTL, DIALOG, JQ, BLOCKDEV - commands used instead of the defaults.

# External tools. They are expanded unquoted at their call sites.
smartctl="${SMARTCTL:-smartctl}"
dialog="${DIALOG:-dialog}"
jq="${JQ:-jq}"
# Debian workaround: blockdev lives in /sbin, which may not be on PATH.
# `command -v` is a shell builtin, so the lookup does not depend on `which`.
blockdev="${BLOCKDEV:-$(PATH="$PATH:/sbin:/usr/sbin" command -v blockdev)}"

# Bit masks tested against the smartctl exit status.
sce_parse_bit=1      # used together with sce_open_bit for "could not open device"
sce_open_bit=2
sce_cmd_bit=4        # a command to the device failed
sce_fail_bit=8       # device reports it is failing now
sce_badattr_bit=16   # an attribute is below threshold
sce_histfail_bit=32  # an attribute was below threshold in the past
sce_errlog_bit=64    # the error log is not empty
sce_stfail_bit=128   # a self-test failed

poll_interval=10 #sec

# Scratch file holding the text shown by --textbox. It is created with mktemp
# because this script normally runs as root, where a fixed name in /tmp could
# be pre-created or symlinked by another user.
if ! temp_file="$(mktemp "${TMPDIR:-/tmp}/smartout.XXXXXX")"; then
  echo "check_disk: cannot create a temporary file" >&2
  exit 1
fi
trap 'rm -f -- "$temp_file"' EXIT

# Exit status compared against dialog results to detect the "extra" button.
dle_extra=3
# Run dialog with the options every check_disk screen shares.
# The command's stdout and stderr are swapped (using fd 3 as scratch, which is
# closed again afterwards), so whatever dialog writes to stderr can be captured
# by callers with $(...), while its stdout ends up on the caller's stderr.
# Arguments: forwarded verbatim to dialog.
# Returns:   dialog's exit status.
ui() {
  {
    $dialog --backtitle check_disk --colors "$@"
  } 3>&1 1>&2 2>&3 3>&-
}
# Show a dialog over a red screen background (used for errors).
# A one-line rc file is handed to dialog through DIALOGRC via process
# substitution; every argument is forwarded to ui.
errui() {
  local theme="screen_color=(RED,RED,ON)"
  DIALOGRC=<(printf '%s\n' "$theme") ui "$@"
}
# Show a dialog over a yellow screen background (used for warnings).
# A one-line rc file is handed to dialog through DIALOGRC via process
# substitution; every argument is forwarded to ui.
warnui() {
  local theme="screen_color=(YELLOW,YELLOW,ON)"
  DIALOGRC=<(printf '%s\n' "$theme") ui "$@"
}
# Show a dialog over a magenta screen background (used for dangerous actions).
# A one-line rc file is handed to dialog through DIALOGRC via process
# substitution; every argument is forwarded to ui.
dangui() {
  local theme="screen_color=(MAGENTA,MAGENTA,ON)"
  DIALOGRC=<(printf '%s\n' "$theme") ui "$@"
}
# Show a dialog over a green screen background (used for test-related screens).
# A one-line rc file is handed to dialog through DIALOGRC via process
# substitution; every argument is forwarded to ui.
testui() {
  local theme="screen_color=(GREEN,GREEN,ON)"
  DIALOGRC=<(printf '%s\n' "$theme") ui "$@"
}
# Show a dialog over a cyan screen background (used for progress displays).
# A one-line rc file is handed to dialog through DIALOGRC via process
# substitution; every argument is forwarded to ui.
progrui() {
  local theme="screen_color=(CYAN,CYAN,ON)"
  DIALOGRC=<(printf '%s\n' "$theme") ui "$@"
}
# Redraw the same dialog several times while alternating the screen colour
# between red (odd passes) and magenta (even passes) to grab attention.
# Arguments:
#   $1 - number of passes
#   $2 - delay between passes, passed to sleep
#   $3... - dialog arguments, forwarded to ui on every pass
flash() {
  local times="$1"
  local wait="$2"
  shift 2
  # Both loop variables are local so the caller's variables are not clobbered.
  local i color
  for (( i = 1; i <= times; i++ )); do
    color="RED"
    if (( i % 2 == 0 )); then color="MAGENTA"; fi
    DIALOGRC=<(printf '%s\n' "screen_color=($color,$color,ON)") ui "$@"
    sleep "$wait"
  done
}
# Start one SMART self-test on a device and follow it until it finishes.
# Arguments:
#   $1 - device node
#   $2 - test name given to `smartctl -t`
#   $3 - estimated duration in minutes (display only)
# Returns:
#   0 - the self-test passed
#   1 - the self-test failed
#   2 - the test could not be started
#   3 - the user cancelled (or dismissed) the "possibly ongoing test" menu
# Exits the whole script with status 1 if the reported status has an
# unexpected `.passed` value.
exec_test() {
  local dev="$1"
  local test="$2"
  local esttime="$3"
  # Declared apart from their assignments so that $? is the command's status.
  local out ret choice prompt

  # Phase 1: submit the test, offering a way out if the command is rejected.
  while true; do
    out="$($smartctl "$dev" -t "$test" 2>&1)"
    ret="$?"
    if (( ret & sce_cmd_bit )); then
      # The menu text must reach dialog as ONE argument.
      prompt="Failed to submit the command to begin the test (code $ret).\n\n"
      prompt+="Output: $out\n\n"
      prompt+="This could be due to an ongoing test. Would you like to:"
      if ! choice="$(warnui --title "$dev, $test test: Possibly Ongoing Test" --no-cancel --menu \
        "$prompt" \
        0 0 10 \
        abort "Abort the current test and try again?" \
        status "Monitor the status of the ongoing test?" \
        cancel "Cancel this test and return to the menu?")"; then
        choice="cancel"
      fi
      case "$choice" in
        abort)
          if ! $smartctl "$dev" -X; then
            warnui --title "$dev, $test test: Abort Failed" --msgbox "Abort failed! (The test may not be cancellable.)" 0 0
          fi
          continue
          ;;
        status)
          break # from the while; go monitor whatever test is running
          ;;
        cancel)
          return 3
          ;;
      esac
    elif (( ret )); then
      errui --title "$dev, $test test: Failed to Start" --msgbox "Failed to start test of $dev (exit code $ret)" 0 0
      return 2
    fi
    break
  done

  # Phase 2: poll the self-test status until a verdict is reported.
  local start now stat passed rempct string
  start="$(date +"%s")"
  while true; do
    stat="$($jq -c '.ata_smart_data.self_test.status' <<< "$($smartctl "$dev" -j -c)")"
    passed="$($jq '.passed' <<< "$stat")"
    case "$passed" in
      true)
        testui --title "$dev, $test test: Passed" --timeout 5 --msgbox "$test test of $dev passed!" 0 0
        return 0
        ;;
      false)
        errui --title "$dev, $test test: Failed" --timeout 5 --msgbox "$test test of $dev failed!" 0 0
        return 1
        ;;
      null) ;; # no verdict yet
      *)
        echo "unexpected stat: $stat" >&2
        exit 1
        ;;
    esac
    rempct="$($jq '.remaining_percent' <<< "$stat")"
    # -r: emit the raw text, not a JSON string literal wrapped in quotes.
    string="$($jq -r '.string' <<< "$stat")"
    now="$(date +"%s")"
    string="$string\n$(( (now - start) / 60 )) min $(( (now - start) % 60 )) sec elapsed / $esttime min estimated"
    if [ "$rempct" = "null" ]; then
      warnui --title "$dev, $test test: Progress Unknown" --infobox "$string" 10 -1
    else
      progrui --title "$dev, $test test: Ongoing" --gauge "$string" 10 -1 $(( 100 - rempct )) < /dev/null
    fi
    sleep "$poll_interval"
  done
}
# Display a block of text in a scrollable dialog text box.
# Arguments:
#   $1    - the text to show; it is written to $temp_file first
#   $2... - extra dialog options, placed before the --textbox arguments
# Returns: the exit status of ui.
show_info() {
  local info="$1"
  shift
  printf '%s\n' "$info" > "$temp_file"
  ui --hline "HOME | END | PGUP | PGDN | Arrow keys | / to search" "$@" \
    --exit-label Continue --textbox "$temp_file" -1 -1
}
# Interpret a smartctl exit status and tell the user about any problems.
# Arguments:
#   $1 - smartctl exit status (bit field, decoded with the sce_*_bit masks)
#   $2 - captured smartctl output, shown on request ("Details")
#   $3 - non-empty: ask whether to abort tests (yes/no) instead of a plain
#        message box
#   $4 - non-empty: the device was read successfully before; adds a hardware
#        hint to the "open failed" message
# Globals: reads $dev from the caller's scope for dialog titles.
# Returns: 1 if the device could not be opened, or if the dialog ended with a
#          non-zero status other than the "Details" button; 0 otherwise.
classify_smartctl_error() {
  local stat="$1"
  local out="$2"
  local continue="$3"
  local before="$4"
  # Loop variables are local so the caller's i/msg are left alone.
  local i msg

  if (( stat & (sce_parse_bit | sce_open_bit) )); then
    local oncemsg=""
    if [ -n "$before" ]; then
      oncemsg=$'\n\n'"This device worked before! Check your hardware (e.g. dock),"$'\n'"and perhaps reset or reconnect the device."
    fi
    msg="$(printf "\\Z1smartctl could not open device (exit code %d)%s\\Zn\n\n%s" "$stat" "$oncemsg" "$out")"
    errui --title "$dev: Open Failed" --msgbox "$msg" 25 75
    return 1
  fi

  # One severity level (1 = caution, 2 = warning) and one message per set bit.
  local failmsg=() faillvl=()
  if (( stat & sce_fail_bit )); then faillvl+=( 2 ); failmsg+=( "- Bit 3: Device reports it is failing NOW" ); fi
  if (( stat & sce_badattr_bit )); then faillvl+=( 2 ); failmsg+=( "- Bit 4: Device has an attribute below threshold (indicating imminent failure)" ); fi
  if (( stat & sce_histfail_bit )); then faillvl+=( 1 ); failmsg+=( "- Bit 5: Device has detected an attribute previously below threshold (indicating failure is likely)" ); fi
  if (( stat & sce_errlog_bit )); then faillvl+=( 2 ); failmsg+=( "- Bit 6: The error log is not empty" ); fi
  if (( stat & sce_stfail_bit )); then faillvl+=( 2 ); failmsg+=( "- Bit 7: Device failed to self test" ); fi

  local failmax=0
  for i in "${faillvl[@]}"; do
    if (( i > failmax )); then failmax=$i; fi
  done

  local failcmd="" failcolor=0 failtitle="" failbutton=""
  case "$failmax" in
    0) ;;
    1) failcmd="warnui"; failcolor=3; failtitle="Caution"; failbutton="ok" ;;
    *) failcmd="errui"; failcolor=1; failtitle="Warning"; failbutton="cancel" ;;
  esac

  if [ -n "$failcmd" ]; then
    local msgs=""
    for msg in "${failmsg[@]}"; do msgs="${msgs}$msg\n\n"; done
    while true; do
      # Bug with labels when using extra with yes/no?
      # The labels are swapped on purpose: "Yes" sits on the cancel button, so
      # answering "Yes, abort" yields a non-zero status below.
      if [ -n "$continue" ]; then
        $failcmd --title "$dev: $failtitle" --extra-button --extra-label "Details" \
          --cancel-label "Yes" --ok-label "No" --default-button "$failbutton" \
          --yesno "\Z${failcolor}This device reports failures:\n\n${msgs}\ZnDo you want to abort tests?" 25 75
        stat="$?"
      else
        $failcmd --title "$dev: $failtitle" --extra-button --extra-label "Details" --default-button extra \
          --msgbox "This device reports failures:\n\n${msgs}" 25 75
        stat="$?"
      fi
      if (( stat == dle_extra )); then
        # "Details": show the raw smartctl output, then ask again.
        show_info "$out"
        continue
      elif (( stat )); then
        return 1
      fi
      break
    done
  fi
}
# Interactive self-test workflow for one device.
# Arguments: $1 - device node handed to smartctl.
# Each pass of the outer loop:
#   1. reads `smartctl -a` and lets classify_smartctl_error vet the exit status
#      (returning 1 from here if it reports a problem the user did not dismiss);
#   2. reads the device's own per-test time estimates;
#   3. shows a checklist of self-tests ("Back" leaves the function, "Details"
#      shows the captured smartctl output and starts the pass over);
#   4. runs the selected tests in order through exec_test, stopping at the
#      first one that does not return 0.
run_tests() {
local dev="$1"
# Set after the first successful read, so a later open failure can mention
# that the device worked before.
local once=""
local out stat # Predeclared, otherwise local eats the $? from the substitution
while true; do
out="$($smartctl $dev -a 2>&1)"
stat="$?"
if ! classify_smartctl_error "$stat" "$out" "1" "$once"; then
return 1
fi
once="1"
local tests
# Maps the test name passed to `smartctl -t` to its estimated minutes.
declare -A testtimes=()
local timedata="$($jq -c '.ata_smart_data.self_test.polling_minutes' <<< "$($smartctl $dev -j -c)")"
for testkind in short extended conveyance; do
local tname="$testkind"
# The JSON key is "extended" but the checklist tag / test name is "long".
if [ "$tname" = "extended" ]; then tname="long"; fi
testtimes["$tname"]="$($jq -r ".$testkind" <<< "$timedata")"
done
# The selected tags are word-split into the tests array.
# NOTE(review): the $? read right below is presumably testui's exit status
# carried through the array assignment - confirm.
tests=( $(testui --title "$dev" --extra-button --extra-label "Details" --cancel-label "Back" \
--checklist "Select tests to perform. (Times given are estimates by the target device only.)" 0 0 0 \
short "Short (${testtimes[short]} min)" on \
long "Long / Extended (${testtimes[long]} min)" off \
conveyance "Conveyance (${testtimes[conveyance]} min)" off \
) )
stat="$?"
if (( stat == dle_extra )); then
# "Details": show the smartctl output captured at the top of this pass.
cat > "$temp_file" <<< "$out"
ui --title "$dev" --exit-label Continue --textbox "$temp_file" -1 -1
continue
elif (( stat )); then
# Any other non-zero status (e.g. "Back") ends the workflow.
return
fi
local i=0
local aborted=""
while (( i < ${#tests[@]} )); do
local res
exec_test "$dev" "${tests[$i]}" "${testtimes[${tests[$i]}]}"
res="$?"
if (( res )); then
case "$res" in
1)
errui --title "$dev Self-Test Failed" --msgbox "$dev failed a self-test; no further tests will be attempted." 0 0
;;
*)
warnui --title "$dev Self-Test Cancelled" --msgbox "A test for $dev was cancelled or failed for an internal error (code $res)." 0 0
;;
esac
# Skip the remaining tests and the "all tests run" message.
aborted="1"
break
fi
i=$(( i + 1 ))
done
if [ -z "$aborted" ]; then
testui --title "$dev: All Tests Run" --msgbox "All scheduled tests for $dev were successfully run." 0 0
fi
done
}
# Warning text shown by every confirmation step before a disk is zeroed.
# It contains dialog \Z attribute sequences and literal "\n" pairs, and is
# assembled piece by piece into one single-line string.
zero_warning='\Zb\ZuEVEN IF CANCELED,\Zn zeroing a disk is an essentially \Zb\ZuIRRECOVERABLE OPERATION.\Zn\n\n'
zero_warning+='The partition table and filesystem administrative structure, usually at the beginning of the disk, will be the first to be destroyed. Recovering even an aborted zero usually requires forensic methods.\n\n'
zero_warning+='\Z1Do \Zb\ZuNOT\Zn\Z1 proceed unless you are sure you want to erase \Zb\ZuALL\Zn\Z1 data on this device. This cannot be undone.'
# Ask, with a flashing warning and two confirmation prompts, whether a device
# should really be zeroed; only after both are accepted is really_zero_disk run.
# Arguments: $1 - device node. If omitted, the caller's $dev is used, which is
#            what earlier versions relied on implicitly.
try_zero_disk() {
  # "$dev" on the right-hand side is expanded before the local is created,
  # so it still refers to the caller's variable.
  local dev="${1:-$dev}"
  flash 5 0.2 --title "ZERO $dev" --infobox "$zero_warning" 25 75
  if dangui --title "ZERO $dev" --yes-label "Erase Disk" --no-label "Cancel" --default-button no \
    --yesno "$zero_warning" 25 75; then
    if dangui --title "ZERO $dev: Confirm" --yes-label "Yes, Erase Disk" --no-label "Cancel" --default-button no \
      --yesno "$zero_warning\n\n\Z5\Zb\ZuAre you really sure?" 25 75; then
      really_zero_disk "$dev"
    fi
  fi
}
# Print the second whitespace-separated field of every /proc/meminfo line that
# matches the grep pattern in $1 (one value per line).
mem_info() {
  local nm val rest
  while read nm val rest; do
    echo "$val"
  done < <(grep "$1" /proc/meminfo)
}
# Overwrite a whole device with zeros using dd, showing progress in dialog.
# No confirmation is asked here; callers are expected to have done that.
# Arguments: $1 - device node to overwrite.
# Stages: (1) dd runs as a coprocess and is sent SIGUSR1 periodically so it
# prints statistics, which drive the gauge; (2) while dd is still alive after
# its output ended, a "waiting for buffers" gauge is shown; (3) `sync` runs
# with an animated info box; (4) a final message box is shown.
really_zero_disk() {
  local dev="$1"
  local total totalhr=""
  total="$($blockdev --getsize64 "$dev")"
  if [ -n "$total" ]; then totalhr="$(numfmt --to=iec --format='%.1f' <<< "$total")"; fi

  coproc dd if=/dev/zero of="$dev" bs=4M 2>&1
  # Keep our own copies: bash unsets COPROC and COPROC_PID once dd has exited.
  local dd_pid="$COPROC_PID"
  local dd_out="${COPROC[0]}"

  # Ask dd for statistics four times a second. Sleeping BEFORE the first
  # signal gives dd time to install its handler (an unhandled SIGUSR1 would
  # terminate it); the loop stops by itself once dd is gone.
  local progmon
  while sleep 0.25; do
    kill -USR1 "$dd_pid" 2> /dev/null || break
  done &
  progmon="$!"

  local bytes rest hr pc
  while read -r bytes rest; do
    if [ -z "$bytes" ]; then break; fi
    # Only the "<bytes> bytes ... copied" line starts with a plain number;
    # skip "N+M records in/out" and any diagnostic lines.
    if ! [[ $bytes =~ ^[0-9]+$ ]]; then continue; fi
    hr="$(numfmt --to=iec --format='%.1f' <<< "$bytes")"
    if [ -z "$total" ]; then
      dangui --title "$dev: Zeroing" --infobox "$bytes bytes ($hr) written, progress unknown" 10 -1
    else
      pc="$(bc <<< "100 * $bytes / $total")"
      dangui --title "$dev: Zeroing" --gauge "$bytes/$total bytes ($hr/$totalhr) written" 10 -1 "$pc" < /dev/null
    fi
  done <&"$dd_out"
  kill "$progmon" 2> /dev/null

  local memtot membuf
  memtot="$(mem_info MemTotal)"
  local dots=1
  local suffix=""
  while kill -0 "$dd_pid" 2> /dev/null; do
    case "$dots" in
      1) suffix="."; dots=2 ;;
      2) suffix=".."; dots=3 ;;
      3) suffix="..."; dots=1 ;;
      *) dots=1 ;;
    esac
    membuf="$(mem_info Buffers)"
    # stdin is /dev/null so the gauge has no input to wait for.
    dangui --title "$dev: Waiting for Buffers" --gauge "Waiting for buffers to clear (this may take a while)$suffix\n\n$membuf/$memtot buffers/total memory" 10 -1 $(( 100 - 100 * membuf / memtot )) < /dev/null
    sleep 1
  done

  # Animate an info box in the background while sync blocks.
  dots=1
  while true; do
    suffix=""
    case "$dots" in
      1) suffix="."; dots=2 ;;
      2) suffix=".."; dots=3 ;;
      3) suffix="..."; dots=1 ;;
      *) dots=1 ;;
    esac
    dangui --title "$dev: Syncing" --infobox "Syncing (this may take a moment)$suffix" 10 -1
    sleep 1
  done &
  progmon="$!"
  sync
  kill "$progmon"
  dangui --title "$dev: Zeroed" --msgbox "$dev has been zeroed." 0 0
}
# Whole seconds of uptime at the previous check_dmesg call; empty before the
# first call.
last_dmesg_check=""

# Print the sorted, de-duplicated names of devices for which the kernel log
# contains "Buffer I/O error on dev <name>," lines.
# The first call scans the whole log. Later calls only consider lines whose
# "[seconds.micros]" timestamp is at or after the previous call's uptime;
# lines without such a timestamp are then skipped.
# Globals: last_dmesg_check (read and updated).
# Outputs: one device name per line on stdout; nothing if none were found.
check_dmesg() {
  local -a buffer=() problems=()
  local line secs uptime
  local ts_re='^\[ *([0-9]+)\.'
  local err_re='Buffer I/O error on dev ([^,]*),'

  while read -r line; do
    if [ -n "$last_dmesg_check" ]; then
      [[ $line =~ $ts_re ]] || continue
      secs="${BASH_REMATCH[1]}"
      # 10# forces decimal so a leading zero is never read as octal.
      (( 10#$secs >= 10#$last_dmesg_check )) || continue
    fi
    buffer+=( "$line" )
  done <<< "$(dmesg)"

  # /proc/uptime starts with "<seconds>.<fraction> "; keep the integer part.
  uptime="$(cat /proc/uptime)"
  last_dmesg_check="${uptime%%.*}"

  for line in "${buffer[@]}"; do
    if [[ $line =~ $err_re ]]; then
      problems+=( "${BASH_REMATCH[1]}" )
    fi
  done
  if (( ${#problems[@]} )); then
    printf '%s\n' "${problems[@]}" | sort -u
  fi
}
# Run fdisk on a device with the single command "p" on its stdin, merging
# fdisk's stderr into stdout.
# Arguments: $1 - device node.
# Returns:   fdisk's exit status.
partitions() {
  local dev="$1"
  fdisk "$dev" 2>&1 <<< "p"
}
# Per-device action menu.
# Arguments: $1 - device node.
# First probes the device with partitions; if that fails the user is warned
# about I/O errors and may inspect dmesg or back out. Then the action menu is
# shown repeatedly until the user chooses "Back".
device_menu() {
  local dev="$1"
  local choice stat out

  # Only the exit status of the probe matters here; its output is discarded
  # so it does not get printed over the dialog screen.
  if ! partitions "$dev" > /dev/null; then
    while true; do
      warnui --extra-button --extra-label "Dmesg" \
        --ok-label "Yes" --cancel-label "No" --default-button no \
        --title "$dev: Input/Output Errors" \
        --yesno "The device $dev appears to be experiencing I/O errors, which may make any further tests unreliable. More information may be available in the kernel debug message ('dmesg') logs.\n\nDo you want to continue?" \
        25 75
      stat="$?"
      if (( stat == dle_extra )); then
        show_info "$(dmesg --time-format iso)" --title "Dmesg"
        continue
      elif (( stat )); then
        return
      fi
      break
    done
  fi

  while true; do
    if ! choice="$(testui --title "$dev" --cancel-label "Back" \
      --menu "Choose an action for $dev:" 0 0 10 \
      "test" 'Test the disk' \
      "stat" 'Read the SMART status and details' \
      "part" 'Show the partition table' \
      "info" 'Show detailed drive information' \
      "zero" '\Z1\ZbZero the disk\Zn' \
      "erase" '\Z1\ZbSecure erase or sanitize the disk\Zn')"; then
      return
    fi
    case "$choice" in
      test)
        run_tests "$dev"
        ;;
      stat)
        # stderr is captured too (as run_tests does) instead of being
        # written straight to the terminal.
        out="$($smartctl "$dev" -a 2>&1)"
        stat="$?"
        if (( stat )); then
          classify_smartctl_error "$stat" "$out"
        else
          show_info "$out"
        fi
        ;;
      part)
        show_info "$(partitions "$dev")" --title "$dev: Partitions"
        ;;
      info)
        show_info "$(hdparm -I "$dev")" --title "$dev: Info"
        ;;
      zero)
        try_zero_disk "$dev"
        ;;
      erase)
        errui --title "TODO" --timeout 5 --msgbox "Not yet implemented" 0 0
        ;;
    esac
  done
}
# Enumerate block devices reported by lsblk and describe each one.
# Outputs: a `declare -A devices=(...)` statement on stdout, mapping each
#          /dev/<name> to a dialog-coloured summary (size or NOMEDIA, model,
#          serial number, "(mounted)" / "(not a disk)" flags). The caller is
#          expected to eval it.
enum_devices() {
  declare -A devices
  local line name devnode sz extra info
  # -r keeps backslash escapes inside the JSON intact.
  while read -r line; do
    name="$($jq -r .name <<< "$line")"
    devnode="/dev/$name"
    if ! sz="$($blockdev --getsize64 "$devnode")"; then
      extra='\Z1NOMEDIA\Zn'
    else
      extra='\Z5'"$(numfmt --to=iec --format='%.1f' <<< "$sz")"'\Zn'
    fi
    # Superfluous echo to trim ws. Devices without a model file simply get an
    # empty model, without an error message.
    extra="$extra "'\Z4'"$(echo $(cat "/sys/class/block/$name/device/model" 2> /dev/null))"'\Zn'
    if info="$($smartctl -i "$devnode")"; then
      extra="$extra "'\Z6'"SN:$(grep Serial <<< "$info" | tr -d ' ' | cut -d: -f2)"'\Zn'
    else
      extra="$extra "'\Z6'"SN:"'\Z1\Zb'"???"'\Zn'
    fi
    # Flag the device if it or any descendant is mounted. Both the scalar
    # "mountpoint" key and the "mountpoints" array are checked, since lsblk
    # versions differ in which one their default JSON output carries.
    if [ "$($jq '[.. | objects | (.mountpoint?, (.mountpoints? // [])[]?) | . != null] | any' <<< "$line")" = "true" ]; then
      extra="$extra "'\Z3\Zb(mounted)\Zn'
    fi
    if ! [ "$($jq -r '.type' <<< "$line")" = "disk" ]; then
      extra="$extra "'\Z1\Zb(not a disk)\Zn'
    fi
    devices["$devnode"]="$extra"
  done <<< "$(lsblk -J | $jq -c '.blockdevices[]')"
  declare -p devices
}
# Top-level loop: check privileges, then repeatedly scan for devices and let
# the user pick one to work on until "exit" is chosen or the menu is dismissed.
# Globals: CD_NO_ROOT - if non-empty, the root check is skipped.
# Exits with status 1 when not running as root (and CD_NO_ROOT is empty).
main() {
  if [ -z "$CD_NO_ROOT" ] && (( EUID )); then
    # Built in pieces; dialog must receive the text as a single argument.
    local notroot="This script is not running as root!\n\n"
    notroot+="Try using 'sudo' or equivalent.\n\n"
    notroot+="If you know what you're doing and can assure this script has sufficient privileges, you may set CD_NO_ROOT to a non-empty value."
    errui --title "check_disk: Not Root" --msgbox "$notroot" 0 0
    exit 1
  fi

  # Loop variable kept local so a global $dev is never overwritten.
  local dev device
  local -a args sorted
  while true; do
    ui --title "check_disk" --infobox "Scanning devices, please wait..." 10 75
    # Start from an empty map so a failed scan cannot leave stale entries.
    local -A devices=()
    # enum_devices prints a `declare -A devices=(...)` statement.
    eval "$(enum_devices)"

    sorted=()
    if (( ${#devices[@]} )); then
      mapfile -t sorted < <(printf '%s\n' "${!devices[@]}" | sort)
    fi
    args=()
    for dev in "${sorted[@]}"; do
      args+=( "$dev" "${devices[$dev]}" )
    done

    if ! device=$(ui --title "check_disk" --no-cancel --menu \
      "Choose the device to analyze." \
      0 0 10 \
      "${args[@]}" \
      "refresh" '\Z2Refresh this list' \
      "exit" '\Z2Exit'); then
      break
    fi
    if [ "$device" = "refresh" ]; then continue; fi
    if [ "$device" = "exit" ]; then break; fi
    device_menu "$device"
  done
}
# Script entry point: run the interactive device loop defined by main.
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment