feat: add CUPS watchdog timers
This commit is contained in:
162
watchdog/print-watchdog.sh
Normal file
162
watchdog/print-watchdog.sh
Normal file
@@ -0,0 +1,162 @@
|
||||
#!/bin/bash
#
# Print service watchdog for the CUPS print server.
# It checks systemd service state plus quick CUPS command/HTTP probes.
# When checks fail repeatedly, it restarts CUPS and related services.
#
# Usage: print-watchdog.sh [check|restart]
#
# The defaults assigned below can be overridden by the shell fragment named
# by CUPS_PRINT_WATCHDOG_CONFIG (default:
# /etc/cups-watchdog/print-watchdog.conf); it is sourced after the defaults
# have been assigned, so anything it sets wins.
#

# Expanding an unset variable is an error.
set -u

CONFIG_FILE="${CUPS_PRINT_WATCHDOG_CONFIG:-/etc/cups-watchdog/print-watchdog.conf}"
# Directory for the failure counter and the last-restart timestamp.
STATE_DIR="/run/cups-watchdog"

# Space-separated systemd service names, without the ".service" suffix.
SERVICES="cups avahi-daemon cups-driver-manager"
# URL fetched by the HTTP probe.
CUPS_URL="http://127.0.0.1:631/"
# Set to 1 to enable the HTTP probe / the "lpstat -r" probe.
CHECK_CUPS_HTTP=1
CHECK_LPSTAT=1
# Upper bound, in seconds, for each probe command.
COMMAND_TIMEOUT=8
# Number of consecutive failing runs before a restart is attempted.
FAIL_THRESHOLD=2
# Minimum number of seconds between two restarts.
RESTART_COOLDOWN=60
LOG_FILE="/var/log/cups-watchdog/print.log"

if [ -f "$CONFIG_FILE" ]; then
    # shellcheck disable=SC1090
    . "$CONFIG_FILE"
fi

# Without the state directory the failure counter cannot persist between
# runs, so the threshold would never be reached. Say so instead of failing
# silently; the run itself continues.
if ! mkdir -p -- "$STATE_DIR"; then
    echo "print-watchdog: cannot create state directory $STATE_DIR; failure count and restart cooldown will not persist" >&2
fi
# The log file is optional, so failing to create its directory is tolerated.
mkdir -p -- "$(dirname -- "$LOG_FILE")" 2>/dev/null || true

# Emit one timestamped watchdog message.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   the formatted line on stdout; the same line is appended to
#            LOG_FILE when that file can be written
# Returns:   0, also when the log file is unavailable
log_msg() {
    local msg="${1:-}"
    local line

    line="$(date '+%Y-%m-%d %H:%M:%S') [print-watchdog] $msg"
    printf '%s\n' "$line"
    # File logging is best effort. stderr is redirected on the group so that
    # a failure to open LOG_FILE is silenced as well: redirections are
    # processed left to right, so a trailing 2>/dev/null on the same command
    # would not cover an error raised by ">>".
    { printf '%s\n' "$line" >> "$LOG_FILE"; } 2>/dev/null || true
}

# Report whether systemd lists a service unit.
# Arguments: $1 - service name without the ".service" suffix
# Returns:   0 when "<name>.service" appears in the first column of
#            "systemctl list-unit-files" or, failing that, of
#            "systemctl list-units --all"; non-zero otherwise
unit_exists() {
    local unit="$1.service"

    # grep -F compares the name literally; as a regular expression the "."
    # (or a backslash escape in a unit name) would match other text.
    if systemctl list-unit-files --no-legend "$unit" 2>/dev/null \
        | awk '{print $1}' | grep -Fqx -- "$unit"; then
        return 0
    fi

    # Fall back to the list of units systemd currently has loaded.
    systemctl list-units --all --no-legend "$unit" 2>/dev/null \
        | awk '{print $1}' | grep -Fqx -- "$unit"
}

# Decide whether a service takes part in monitoring.
# Arguments: $1 - service name without the ".service" suffix
# Returns:   0 when the unit exists and is either enabled or currently
#            active; 1 otherwise
service_should_check() {
    local service="$1"

    unit_exists "$service" || return 1

    if systemctl is-enabled --quiet "$service.service" 2>/dev/null \
        || systemctl is-active --quiet "$service.service" 2>/dev/null; then
        return 0
    fi

    return 1
}

# Verify that every monitored service is active.
# Globals:   SERVICES (read) - space-separated service names
# Outputs:   one log message per service that is not active
# Returns:   0 when all checked services are active, 1 otherwise
check_systemd_services() {
    local service
    local failed=0

    # SERVICES is a space-separated list; word splitting is intentional.
    # shellcheck disable=SC2086
    for service in $SERVICES; do
        # Services that do not pass service_should_check are skipped.
        service_should_check "$service" || continue
        if ! systemctl is-active --quiet "$service.service" 2>/dev/null; then
            log_msg "service not active: $service"
            failed=1
        fi
    done

    return "$failed"
}

# Probe the scheduler with "lpstat -r", bounded by COMMAND_TIMEOUT.
# Globals:   CHECK_LPSTAT (read), COMMAND_TIMEOUT (read)
# Returns:   0 when the probe succeeds or is skipped (disabled, or a needed
#            tool is not installed); non-zero when lpstat fails or is
#            terminated by the timeout
check_lpstat() {
    [ "$CHECK_LPSTAT" = "1" ] || return 0
    command -v lpstat >/dev/null 2>&1 || return 0
    # Treat a missing "timeout" like a missing "lpstat": skip the probe
    # instead of reporting a failure on every run.
    command -v timeout >/dev/null 2>&1 || return 0

    timeout "$COMMAND_TIMEOUT" lpstat -r >/dev/null 2>&1
}

# Probe the CUPS web interface with a single HTTP request.
# Globals:   CHECK_CUPS_HTTP (read), COMMAND_TIMEOUT (read), CUPS_URL (read)
# Returns:   0 when the request succeeds or the probe is skipped (disabled,
#            or curl is not installed); curl's non-zero status otherwise
check_http() {
    [ "$CHECK_CUPS_HTTP" = "1" ] || return 0
    command -v curl >/dev/null 2>&1 || return 0

    # -f makes HTTP error responses count as failures; --max-time bounds the
    # whole transfer.
    curl -fsS --max-time "$COMMAND_TIMEOUT" "$CUPS_URL" >/dev/null 2>&1
}

# Restart CUPS and its companion services, at most once per cooldown period.
# Globals:   STATE_DIR (read), RESTART_COOLDOWN (read)
# Outputs:   log messages; writes the restart time (seconds since the epoch)
#            to $STATE_DIR/print.last_restart
# Returns:   0 when the restart is skipped because of the cooldown or when
#            cups.service restarted and the timestamp was stored; non-zero
#            when restarting cups.service or storing the timestamp failed
restart_print_stack() {
    local now
    local last_file="$STATE_DIR/print.last_restart"
    local last=0
    local elapsed
    local service
    local status=0

    now="$(date +%s)"
    if [ -f "$last_file" ]; then
        last="$(cat "$last_file" 2>/dev/null || echo 0)"
    fi
    # An empty or corrupt state file must not break the arithmetic below
    # (under "set -u" a non-numeric value would abort the script), so
    # anything that is not a plain decimal number counts as "never".
    case "$last" in
        '' | *[!0-9]*) last=0 ;;
    esac

    # 10# forces base 10 so that leading zeros are not read as octal.
    elapsed=$((now - 10#$last))
    # A negative value means the stored time lies in the future (the clock
    # was set back); that must not hold off restarts.
    if [ "$elapsed" -ge 0 ] && [ "$elapsed" -lt "$RESTART_COOLDOWN" ]; then
        log_msg "restart skipped: cooldown active"
        return 0
    fi

    log_msg "restarting print stack"
    systemctl reset-failed cups.service >/dev/null 2>&1 || true
    systemctl restart cups.service || status=$?
    if [ "$status" -ne 0 ]; then
        log_msg "restart of cups.service failed (exit status $status)"
    fi

    # Companion services are restarted on a best-effort basis.
    for service in avahi-daemon cups-driver-manager; do
        service_should_check "$service" || continue
        systemctl reset-failed "$service.service" >/dev/null 2>&1 || true
        systemctl restart "$service.service" >/dev/null 2>&1 || true
    done

    # The attempt is recorded even when it failed, so a broken CUPS is not
    # restarted more often than the cooldown allows.
    printf '%s\n' "$now" > "$last_file" || status=1

    return "$status"
}

# Run all health checks and restart the print stack after repeated failures.
# Globals:   STATE_DIR (read), FAIL_THRESHOLD (read)
# Outputs:   log messages; keeps the number of consecutive failing runs in
#            $STATE_DIR/print.fail_count
# Returns:   0 when all checks pass; on failure, the status of the last
#            command run (0 unless resetting the counter after a restart
#            attempt fails)
check_print_stack() {
    local count_file="$STATE_DIR/print.fail_count"
    local fail_count=0
    local failed=0

    # All probes run on every invocation so that each failure is logged.
    check_systemd_services || failed=1
    if ! check_lpstat; then
        log_msg "lpstat check failed or timed out"
        failed=1
    fi
    if ! check_http; then
        log_msg "CUPS HTTP check failed or timed out"
        failed=1
    fi

    if [ "$failed" = "0" ]; then
        echo 0 > "$count_file"
        log_msg "print stack ok"
        return 0
    fi

    if [ -f "$count_file" ]; then
        fail_count="$(cat "$count_file" 2>/dev/null || echo 0)"
    fi
    # A corrupt counter must not break the arithmetic below (under "set -u"
    # a non-numeric value would abort the script); start over from zero.
    case "$fail_count" in
        '' | *[!0-9]*) fail_count=0 ;;
    esac
    # 10# forces base 10 so that leading zeros are not read as octal.
    fail_count=$((10#$fail_count + 1))
    echo "$fail_count" > "$count_file"

    log_msg "print stack failed ($fail_count/$FAIL_THRESHOLD)"
    if [ "$fail_count" -ge "$FAIL_THRESHOLD" ]; then
        restart_print_stack
        # Counting starts again after every restart attempt.
        echo 0 > "$count_file"
    fi
}

# Command dispatch. "check" is the default when no argument is given and
# runs the health checks; "restart" requests a restart directly (the
# cooldown in restart_print_stack still applies). Anything else prints the
# usage line on stderr and exits with status 2.
case "${1:-check}" in
    check)
        check_print_stack
        ;;
    restart)
        restart_print_stack
        ;;
    *)
        echo "Usage: $0 [check|restart]" >&2
        exit 2
        ;;
esac
Reference in New Issue
Block a user