Skip to content

Instantly share code, notes, and snippets.

@jperkin
Created October 29, 2023 13:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jperkin/93c5512de45769d41bd22523e97b5d1a to your computer and use it in GitHub Desktop.
Save jperkin/93c5512de45769d41bd22523e97b5d1a to your computer and use it in GitHub Desktop.
#!/bin/sh
#
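# Look for any known-to-hang processes that have been running for longer than
# 2 hours.  These are not caught by the ulimit -t set by pbulk because they
# are not using any CPU time.
#
# Run from cron and redirect the output to a log.  Pass -n as the first
# argument for a dry run that reports what would be done without signalling
# any process.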
#
# Dry-run switch: "-n" as the first argument sets dry_run to "true", anything
# else leaves it "false".  The value is later executed as a command name, so
# it must stay one of those two words.
dry_run=false
case "$1" in
-n)
	dry_run=true
	;;
esac
#
# Per-OS settings, keyed on the kernel name reported by uname -s:
#
#   ps_args_pbulk    ps arguments used to list the pbulk user's processes as
#                    "etime pid args" records without a header line
#   ps_args_log      ps arguments used when a single process is logged
#   process_restart  "true" to stop/continue a process rather than kill it
#
# NetBSD gets its own argument sets and is the only OS where processes are
# restarted; every other OS uses the generic settings.
#
if [ "$(uname -s)" = "NetBSD" ]; then
	ps_args_pbulk="-o etime= -o pid= -o args= -U pbulk -x"
	ps_args_log="-ww -o user,pid,lstart,etime,args"
	process_restart=true
else
	ps_args_pbulk="-o etime= -o pid= -o args= -U pbulk"
	ps_args_log="-fo user,pid,etime,args"
	process_restart=false
fi
#
# kill_or_restart PID
#
# Act on a single hung process.  Reads two globals set earlier in the
# script, each holding the command name "true" or "false":
#
#   dry_run          only print what would be done, signal nothing
#   process_restart  stop and continue the process instead of killing it
#
# Returns non-zero, without signalling anything, when PID is missing or is
# not a plain decimal number.
#
kill_or_restart()
{
	# No "shift" here: called without arguments it fails, and a failing
	# special builtin makes a non-interactive POSIX sh exit.
	pid=${1:-}

	# Refuse anything that is not a plain PID.  An empty value would only
	# produce a kill usage error, and a negative value would address a
	# whole process group (or, for -1, every process we may signal).
	case "${pid}" in
	''|*[!0-9]*)
		echo "kill_or_restart: invalid PID '${pid}'" >&2
		return 1
		;;
	esac

	if ${dry_run}; then
		if ${process_restart}; then
			echo "Would stop/start PID ${pid}"
		else
			echo "Would kill PID ${pid}"
		fi
		return
	fi

	#
	# On some OS it's enough to stop and restart processes to get them
	# running again (notably NetBSD with its broken libpthread).
	#
	if ${process_restart}; then
		kill -STOP "${pid}"
		sleep 1
		kill -CONT "${pid}"
	else
		kill -9 "${pid}"
	fi
}
#
# log_and_kill PID
#
# Write a timestamp and the ps listing for PID to stdout (for the log),
# then pass PID to kill_or_restart.  The ps arguments come from the global
# ps_args_log.
#
# Returns non-zero, without logging or signalling anything, when PID is
# missing or is not a plain decimal number.
#
log_and_kill()
{
	# No "shift" here: called without arguments it fails, and a failing
	# special builtin makes a non-interactive POSIX sh exit.
	pid=${1:-}

	# Only a plain decimal PID is acceptable to "ps -p" and to kill.
	case "${pid}" in
	''|*[!0-9]*)
		echo "log_and_kill: invalid PID '${pid}'" >&2
		return 1
		;;
	esac

	# Output date and running command for the log.
	date '+%Y-%m-%d-%H:%M:%S'
	# ps_args_log is deliberately unquoted: it holds several
	# space-separated ps arguments that must be split into words.
	ps ${ps_args_log} -p "${pid}"
	kill_or_restart "${pid}"
}
#
# reap_hung_processes
#
# Read the "etime pid args" records printed by "ps ${ps_args_pbulk}" and
# pass the PID of every process judged to be hung to log_and_kill.
#
reap_hung_processes()
{
	# ps_args_pbulk is deliberately unquoted: it holds several
	# space-separated ps arguments that must be split into words.
	#
	# read -r keeps backslashes in the command line literal.  Without it
	# a record ending in "\" is joined with the following line, hiding
	# that process and attributing its command text to the wrong PID.
	ps ${ps_args_pbulk} | while read -r etime pid cmd; do
		#
		# Match when the ETIME field is 2 hours or longer.  Account
		# for OS differences, most have leading 0, NetBSD does not.
		#
		# If anything has been running for over a day just kill it,
		# it's highly unlikely to be making forward progress.
		#
		case "${etime}" in
		*-*:*:*)
			log_and_kill "${pid}"
			;;
		0[2-9]:*:*|\
		[2-9]:*:*|\
		[1-9][0-9]:*:*)
			#
			# Only match either known fail processes or anything
			# running from within the work directory, skipping
			# known false positives such as Rust.
			#
			case "${cmd}" in
			*lang/rust*|*ghc94*)
				# Do nothing, likely legitimate long-running
				# process.
				;;
			/Users/pbulk/*|\
			/home/pbulk/*|\
			*Configure*|\
			*cmake_autogen*|\
			mplayer|\
			*py-scipy*|\
			*" ./configure "*|\
			./*|../*)
				log_and_kill "${pid}"
				;;
			esac
			;;
		esac
	done
}

reap_hung_processes
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment