From 98c1e7b63d585b56fb9e0f31b80bc2c9cb38fe1a Mon Sep 17 00:00:00 2001
From: gilgamezh
Date: Thu, 4 Jun 2026 22:50:01 +0200
Subject: [PATCH] feat(disk-usage-alert): Telegram alert when turing3 SSD >=90%

The SSD on turing3 (/mnt/ssd) is shared by Postgres, Plex's SQLite DB and
the media library. When it fills, Postgres crashes (cannot write
postmaster.pid) and Plex's library DB corrupts. Add a CronJob that checks
df via the plex-data mount every 15m and warns Telegram at >=90% used
(3h cooldown). Bot token/chatId live in the out-of-band
`telegram-disk-alert` Secret, not in git.

The cooldown only starts once Telegram accepts the message; the Job fails
if df yields no number or the send fails, so a broken alert path shows up
in the failed-Job history instead of passing silently.

Co-Authored-By: Claude Opus 4.8
---
 applications/disk-usage-alert.yaml      | 23 ++++++
 manifests/disk-usage-alert/cronjob.yaml | 96 +++++++++++++++++++++++++
 2 files changed, 119 insertions(+)
 create mode 100644 applications/disk-usage-alert.yaml
 create mode 100644 manifests/disk-usage-alert/cronjob.yaml

diff --git a/applications/disk-usage-alert.yaml b/applications/disk-usage-alert.yaml
new file mode 100644
index 0000000..e54720c
--- /dev/null
+++ b/applications/disk-usage-alert.yaml
@@ -0,0 +1,23 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: disk-usage-alert
+  namespace: argocd
+spec:
+  project: default
+  source:
+    repoURL: http://gitea-http.gitea.svc.cluster.local:3000/admin/turingpi.git
+    targetRevision: HEAD
+    path: manifests/disk-usage-alert
+    directory:
+      recurse: false
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: default
+  syncPolicy:
+    automated:
+      prune: true
+      selfHeal: true
+    syncOptions:
+      - CreateNamespace=true
+      - ServerSideApply=true
diff --git a/manifests/disk-usage-alert/cronjob.yaml b/manifests/disk-usage-alert/cronjob.yaml
new file mode 100644
index 0000000..f2e7def
--- /dev/null
+++ b/manifests/disk-usage-alert/cronjob.yaml
@@ -0,0 +1,96 @@
+# SSD disk-usage alert for turing3 /mnt/ssd (shared by Postgres + Plex + media).
+# A full SSD crashes Postgres and corrupts Plex's SQLite DB, so we warn early.
+#
+# NOTE: the Telegram bot token + chat id live in the `telegram-disk-alert` Secret,
+# created out-of-band (NOT in git) so the token stays out of history:
+#   kubectl -n default create secret generic telegram-disk-alert \
+#     --from-literal=botToken='YOUR_BOT_TOKEN' --from-literal=chatId='YOUR_CHAT_ID'
+#
+# Mounts the existing RWX plex-data PVC to read `df` of the underlying SSD; the
+# only file written there is the alert-cooldown marker `.disk-alert-last`.
+#
+# Setting env TEST=1 (e.g. on a manually created Job) sends a test message and exits.
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: disk-usage-alert
+  namespace: default
+spec:
+  schedule: "*/15 * * * *"
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 1
+  failedJobsHistoryLimit: 3
+  jobTemplate:
+    spec:
+      backoffLimit: 0
+      activeDeadlineSeconds: 120
+      template:
+        spec:
+          restartPolicy: Never
+          containers:
+            - name: check
+              image: curlimages/curl:8.11.1
+              imagePullPolicy: IfNotPresent
+              env:
+                - name: THRESHOLD  # alert at/above this % used
+                  value: "90"
+                - name: COOLDOWN_SEC  # min seconds between alerts (3h)
+                  value: "10800"
+                - name: MOUNT
+                  value: "/data"
+                - name: BOT_TOKEN
+                  valueFrom:
+                    secretKeyRef: { name: telegram-disk-alert, key: botToken }
+                - name: CHAT_ID
+                  valueFrom:
+                    secretKeyRef: { name: telegram-disk-alert, key: chatId }
+              command: ["/bin/sh", "-c"]
+              args:
+                - |
+                  set -u
+                  # Last line of `df -P`: column 5 is use% (strip the %), column 4 is free space.
+                  pct=$(df -P "$MOUNT" | awk 'END{gsub("%","",$5); print $5}')
+                  avail=$(df -Ph "$MOUNT" | awk 'END{print $4}')
+                  now=$(date +%s)
+                  marker="$MOUNT/.disk-alert-last"
+                  # If df produced no number, fail the Job instead of reporting a bogus "ok".
+                  case "$pct" in
+                    ''|*[!0-9]*) echo "ERROR: cannot read usage of $MOUNT (got '${pct}')" >&2; exit 1 ;;
+                  esac
+                  # -f makes Telegram HTTP errors (e.g. bad token or chat id) return non-zero.
+                  send() {
+                    curl -fsS -m 15 "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \
+                      --data-urlencode "chat_id=${CHAT_ID}" \
+                      --data-urlencode "text=$1" -d "parse_mode=HTML" >/dev/null
+                  }
+                  if [ "${TEST:-0}" = "1" ]; then
+                    send "✅ turingpi disk-alert test — SSD at ${pct}% (free ${avail}). Alerting works." \
+                      || { echo "ERROR: Telegram test send failed" >&2; exit 1; }
+                    echo "TEST sent (${pct}% used)"; exit 0
+                  fi
+                  if [ "$pct" -ge "$THRESHOLD" ]; then
+                    last=0; [ -f "$marker" ] && last=$(cat "$marker" 2>/dev/null || echo 0)
+                    # An empty or corrupt marker counts as "never alerted".
+                    case "$last" in ''|*[!0-9]*) last=0 ;; esac
+                    if [ $((now - last)) -ge "$COOLDOWN_SEC" ]; then
+                      # Start the cooldown only once Telegram has accepted the message.
+                      send "⚠️ turingpi SSD ${pct}% full — only ${avail} free on /mnt/ssd. Postgres + Plex crash at 100%. Prune content / check maintainerr." \
+                        || { echo "ERROR: Telegram alert send failed (${pct}% used)" >&2; exit 1; }
+                      # Best effort: on a full disk this write can fail, and we then keep alerting.
+                      echo "$now" > "$marker" || echo "WARN: could not update $marker" >&2
+                      echo "ALERT sent (${pct}% >= ${THRESHOLD}%)"
+                    else
+                      echo "over threshold (${pct}%) but within cooldown; skipping"
+                    fi
+                  else
+                    echo "ok: ${pct}% used, ${avail} free (threshold ${THRESHOLD}%)"
+                  fi
+              resources:
+                requests: { cpu: 10m, memory: 16Mi }
+                limits: { memory: 64Mi }
+              volumeMounts:
+                - { name: data, mountPath: /data }
+          volumes:
+            - name: data
+              persistentVolumeClaim:
+                claimName: plex-data