feat(disk-usage-alert): Telegram alert when turing3 SSD >=90%
The SSD on turing3 (/mnt/ssd) is shared by Postgres, Plex's SQLite DB and the media library. When it fills, Postgres crashes (cannot write postmaster.pid) and Plex's library DB corrupts. Add a CronJob that checks df via the plex-data mount every 15m and warns Telegram at >=90% used (3h cooldown). Bot token/chatId live in the out-of-band `telegram-disk-alert` Secret, not in git. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,23 @@
|
|||||||
|
# Argo CD Application: keeps the manifests under manifests/disk-usage-alert
# (in-cluster Gitea repo) synced into the `default` namespace.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: disk-usage-alert
  namespace: argocd
spec:
  project: default
  source:
    repoURL: http://gitea-http.gitea.svc.cluster.local:3000/admin/turingpi.git
    targetRevision: HEAD
    path: manifests/disk-usage-alert
    # Only the manifests directly inside `path` are applied; no sub-directories.
    directory:
      recurse: false
  destination:
    server: https://kubernetes.default.svc
    namespace: default
  syncPolicy:
    # Sync automatically: delete resources removed from git (prune) and
    # revert manual changes made to the live objects (selfHeal).
    automated:
      prune: true
      selfHeal: true
    syncOptions:
      - CreateNamespace=true
      - ServerSideApply=true
|
||||||
@@ -0,0 +1,81 @@
|
|||||||
|
# SSD disk-usage alert for turing3 /mnt/ssd (shared by Postgres + Plex + media).
|
||||||
|
# A full SSD crashes Postgres and corrupts Plex's SQLite DB, so we warn early.
|
||||||
|
#
|
||||||
|
# NOTE: the Telegram bot token + chat id live in the `telegram-disk-alert` Secret,
|
||||||
|
# created out-of-band (NOT in git) so the token stays out of history:
|
||||||
|
# kubectl -n default create secret generic telegram-disk-alert \
|
||||||
|
# --from-literal=botToken='<token>' --from-literal=chatId='<chatId>'
|
||||||
|
#
|
||||||
|
# Mounts the existing RWX plex-data PVC to read `df` of the underlying SSD; the alert-cooldown marker file (.disk-alert-last) is also written to that mount.
|
||||||
|
# CronJob: every 15 minutes, read the fill level of the filesystem behind the
# plex-data PVC and send a Telegram warning when it is at/above THRESHOLD.
# A run exits non-zero (Job shows as Failed) when the usage cannot be read or
# the Telegram message cannot be delivered, so broken alerting is visible.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: disk-usage-alert
  namespace: default
spec:
  schedule: "*/15 * * * *"
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      # No in-Job retries: the next scheduled run (15m later) is the retry.
      backoffLimit: 0
      activeDeadlineSeconds: 120
      template:
        spec:
          restartPolicy: Never
          containers:
            - name: check
              image: curlimages/curl:8.11.1
              imagePullPolicy: IfNotPresent
              env:
                - name: THRESHOLD # alert at/above this % used
                  value: "90"
                - name: COOLDOWN_SEC # min seconds between alerts (3h)
                  value: "10800"
                - name: MOUNT
                  value: "/data"
                - name: BOT_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: telegram-disk-alert
                      key: botToken
                - name: CHAT_ID
                  valueFrom:
                    secretKeyRef:
                      name: telegram-disk-alert
                      key: chatId
              # Optional: set env TEST=1 on a one-off Job to send a test
              # message instead of evaluating the threshold.
              command: ["/bin/sh", "-c"]
              args:
                - |
                  set -u
                  pct=$(df -P "$MOUNT" | awk 'END{gsub("%","",$5); print $5}')
                  avail=$(df -Ph "$MOUNT" | awk 'END{print $4}')
                  now=$(date +%s)
                  # Cooldown state lives on the monitored volume itself.
                  # NOTE(review): confirm the container user can write here.
                  marker="$MOUNT/.disk-alert-last"

                  # A failed df must not be reported as "ok": without a
                  # numeric percentage the comparison below is meaningless.
                  case "$pct" in
                    ''|*[!0-9]*)
                      echo "ERROR: could not read usage of ${MOUNT} (got '${pct}')" >&2
                      exit 1
                      ;;
                  esac

                  # send TEXT: post TEXT to the Telegram chat.
                  # Returns non-zero on transport errors and on HTTP >= 400
                  # (-f), printing curl's error message to stderr (-S).
                  send() {
                    curl -fsS -m 15 "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \
                      --data-urlencode "chat_id=${CHAT_ID}" \
                      --data-urlencode "text=$1" -d "parse_mode=HTML" >/dev/null
                  }

                  if [ "${TEST:-0}" = "1" ]; then
                    if send "✅ <b>turingpi disk-alert test</b> — SSD at ${pct}% (free ${avail}). Alerting works."; then
                      echo "TEST sent (${pct}% used)"; exit 0
                    fi
                    echo "ERROR: Telegram test message failed" >&2
                    exit 1
                  fi

                  if [ "$pct" -ge "$THRESHOLD" ]; then
                    last=0; [ -f "$marker" ] && last=$(cat "$marker" 2>/dev/null || echo 0)
                    # An empty or garbled marker (e.g. a write truncated on a
                    # full disk) would make the arithmetic below a fatal
                    # syntax error; treat it as "never alerted".
                    case "$last" in
                      ''|*[!0-9]*) last=0 ;;
                    esac
                    if [ $((now - last)) -ge "$COOLDOWN_SEC" ]; then
                      if send "⚠️ <b>turingpi SSD ${pct}% full</b> — only ${avail} free on /mnt/ssd. Postgres + Plex crash at 100%. Prune content / check maintainerr."; then
                        # Start the cooldown only after a delivered alert.
                        echo "$now" > "$marker" || echo "WARN: could not write ${marker}; cooldown not recorded" >&2
                        echo "ALERT sent (${pct}% >= ${THRESHOLD}%)"
                      else
                        echo "ERROR: Telegram alert failed at ${pct}%; cooldown not started, next run retries" >&2
                        exit 1
                      fi
                    else
                      echo "over threshold (${pct}%) but within cooldown; skipping"
                    fi
                  else
                    echo "ok: ${pct}% used, ${avail} free (threshold ${THRESHOLD}%)"
                  fi
              resources:
                requests:
                  cpu: 10m
                  memory: 16Mi
                limits:
                  memory: 64Mi
              volumeMounts:
                - name: data
                  mountPath: /data
          volumes:
            - name: data
              persistentVolumeClaim:
                claimName: plex-data
|
||||||
Reference in New Issue
Block a user