Files
turingpi/manifests/disk-usage-alert/cronjob.yaml
T
gilgamezh 98c1e7b63d feat(disk-usage-alert): Telegram alert when turing3 SSD >=90%
The SSD on turing3 (/mnt/ssd) is shared by Postgres, Plex's SQLite DB
and the media library. When it fills, Postgres crashes (cannot write
postmaster.pid) and Plex's library DB corrupts. Add a CronJob that
checks df via the plex-data mount every 15m and warns Telegram at >=90%
used (3h cooldown). Bot token/chatId live in the out-of-band
`telegram-disk-alert` Secret, not in git.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-04 22:50:01 +02:00

82 lines
3.5 KiB
YAML

# SSD disk-usage alert for turing3 /mnt/ssd (shared by Postgres + Plex + media).
# A full SSD crashes Postgres and corrupts Plex's SQLite DB, so we warn early.
#
# NOTE: the Telegram bot token + chat id live in the `telegram-disk-alert` Secret,
# created out-of-band (NOT in git) so the token stays out of history:
#   kubectl -n default create secret generic telegram-disk-alert \
#     --from-literal=botToken='<token>' --from-literal=chatId='<chatId>'
#
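# Mounts the existing RWX plex-data PVC to read `df` of the underlying SSD.
# The job also writes a small cooldown marker (`.disk-alert-last`) at the root
# of that mount, so the mount must stay writable for the cooldown to work.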
apiVersion: batch/v1
kind: CronJob
metadata:
  name: disk-usage-alert
  namespace: default
spec:
  schedule: "*/15 * * * *"
  # Forbid: a new run is skipped while the previous one is still active.
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      # No in-job retries; a failed check is picked up again by the next tick.
      backoffLimit: 0
      activeDeadlineSeconds: 120
      template:
        spec:
          restartPolicy: Never
          containers:
            - name: check
              image: curlimages/curl:8.11.1
              imagePullPolicy: IfNotPresent
              env:
                - name: THRESHOLD  # alert at/above this % used
                  value: "90"
                - name: COOLDOWN_SEC  # min seconds between alerts (3h)
                  value: "10800"
                - name: MOUNT
                  value: "/data"
                - name: BOT_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: telegram-disk-alert
                      key: botToken
                - name: CHAT_ID
                  valueFrom:
                    secretKeyRef:
                      name: telegram-disk-alert
                      key: chatId
              command: ["/bin/sh", "-c"]
              args:
                - |
                  # Checks how full $MOUNT is and sends a Telegram warning when
                  # usage is at or above $THRESHOLD percent, at most once per
                  # $COOLDOWN_SEC. Set TEST=1 to send a test message instead.
                  # Exits non-zero when df cannot be read/parsed or when a
                  # message could not be delivered, so the Job shows as failed.
                  set -u

                  # Run df once and fail loudly if it does not work; otherwise an
                  # empty percentage would fall through to the "ok" branch below.
                  if ! usage=$(df -Ph "$MOUNT"); then
                    echo "ERROR: df failed for ${MOUNT}" >&2
                    exit 1
                  fi
                  pct=$(printf '%s\n' "$usage" | awk 'END{gsub("%","",$5); print $5}')
                  avail=$(printf '%s\n' "$usage" | awk 'END{print $4}')
                  case "$pct" in
                    ''|*[!0-9]*)
                      echo "ERROR: could not parse usage of ${MOUNT} (got '${pct}')" >&2
                      exit 1
                      ;;
                  esac

                  now=$(date +%s)
                  marker="$MOUNT/.disk-alert-last"

                  # send TEXT: post TEXT to the Telegram chat. Returns non-zero
                  # on network errors and on HTTP >= 400 (-f), so callers can
                  # tell whether the message was actually accepted.
                  send() {
                    curl -fsS -m 15 "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \
                      --data-urlencode "chat_id=${CHAT_ID}" \
                      --data-urlencode "text=$1" -d "parse_mode=HTML" >/dev/null
                  }

                  if [ "${TEST:-0}" = "1" ]; then
                    if send "✅ <b>turingpi disk-alert test</b> — SSD at ${pct}% (free ${avail}). Alerting works."; then
                      echo "TEST sent (${pct}% used)"; exit 0
                    fi
                    echo "ERROR: TEST message could not be delivered" >&2
                    exit 1
                  fi

                  if [ "$pct" -ge "$THRESHOLD" ]; then
                    last=0; [ -f "$marker" ] && last=$(cat "$marker" 2>/dev/null || echo 0)
                    # An empty or corrupt marker (e.g. a write that failed on a
                    # full disk) counts as "never alerted".
                    case "$last" in
                      ''|*[!0-9]*) last=0 ;;
                    esac
                    if [ $((now - last)) -ge "$COOLDOWN_SEC" ]; then
                      # Start the cooldown only after a confirmed delivery, so a
                      # failed send is retried on the next run instead of being
                      # silenced for the whole cooldown window.
                      if send "⚠️ <b>turingpi SSD ${pct}% full</b> — only ${avail} free on /mnt/ssd. Postgres + Plex crash at 100%. Prune content / check maintainerr."; then
                        echo "$now" > "$marker" \
                          || echo "WARN: could not write ${marker}; cooldown not recorded" >&2
                        echo "ALERT sent (${pct}% >= ${THRESHOLD}%)"
                      else
                        echo "ERROR: alert could not be delivered (${pct}% >= ${THRESHOLD}%)" >&2
                        exit 1
                      fi
                    else
                      echo "over threshold (${pct}%) but within cooldown; skipping"
                    fi
                  else
                    echo "ok: ${pct}% used, ${avail} free (threshold ${THRESHOLD}%)"
                  fi
              resources:
                requests:
                  cpu: 10m
                  memory: 16Mi
                limits:
                  memory: 64Mi
              volumeMounts:
                # Must be writable: the cooldown marker lives at the mount root.
                - name: data
                  mountPath: /data
          volumes:
            - name: data
              persistentVolumeClaim:
                claimName: plex-data