diff --git a/applications/disk-usage-alert.yaml b/applications/disk-usage-alert.yaml
new file mode 100644
index 0000000..e54720c
--- /dev/null
+++ b/applications/disk-usage-alert.yaml
@@ -0,0 +1,25 @@
+# Argo CD Application: syncs manifests/disk-usage-alert from the in-cluster
+# Gitea repo into the `default` namespace, with automated prune + self-heal.
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: disk-usage-alert
+  namespace: argocd
+spec:
+  project: default
+  source:
+    repoURL: http://gitea-http.gitea.svc.cluster.local:3000/admin/turingpi.git
+    targetRevision: HEAD
+    path: manifests/disk-usage-alert
+    directory:
+      recurse: false
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: default
+  syncPolicy:
+    automated:
+      prune: true
+      selfHeal: true
+    syncOptions:
+      - CreateNamespace=true
+      - ServerSideApply=true
diff --git a/manifests/disk-usage-alert/cronjob.yaml b/manifests/disk-usage-alert/cronjob.yaml
new file mode 100644
index 0000000..f2e7def
--- /dev/null
+++ b/manifests/disk-usage-alert/cronjob.yaml
@@ -0,0 +1,100 @@
+# SSD disk-usage alert for turing3 /mnt/ssd (shared by Postgres + Plex + media).
+# A full SSD crashes Postgres and corrupts Plex's SQLite DB, so we warn early.
+#
+# NOTE: the Telegram bot token + chat id live in the `telegram-disk-alert` Secret,
+# created out-of-band (NOT in git) so the token stays out of history:
+#   kubectl -n default create secret generic telegram-disk-alert \
+#     --from-literal=botToken='<BOT_TOKEN>' --from-literal=chatId='<CHAT_ID>'
+#
+# Mounts the existing RWX plex-data PVC to read `df` of the underlying SSD. The
+# only thing written to it is the cooldown marker `.disk-alert-last` at its root.
+#
+# The Job fails (non-zero exit) when `df` output is unusable or the Telegram call
+# fails, so a broken alert path shows up as a failed Job instead of a silent "ok".
+# Set TEST=1 on a one-off Job to send a test message regardless of usage.
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: disk-usage-alert
+  namespace: default
+spec:
+  schedule: "*/15 * * * *"
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 1
+  failedJobsHistoryLimit: 3
+  jobTemplate:
+    spec:
+      backoffLimit: 0
+      activeDeadlineSeconds: 120
+      template:
+        spec:
+          restartPolicy: Never
+          containers:
+            - name: check
+              image: curlimages/curl:8.11.1
+              imagePullPolicy: IfNotPresent
+              env:
+                - name: THRESHOLD  # alert at/above this % used
+                  value: "90"
+                - name: COOLDOWN_SEC  # min seconds between alerts (3h)
+                  value: "10800"
+                - name: MOUNT
+                  value: "/data"
+                - name: BOT_TOKEN
+                  valueFrom:
+                    secretKeyRef: { name: telegram-disk-alert, key: botToken }
+                - name: CHAT_ID
+                  valueFrom:
+                    secretKeyRef: { name: telegram-disk-alert, key: chatId }
+              command: ["/bin/sh", "-c"]
+              args:
+                - |
+                  set -u
+                  # Last df row is $MOUNT: field 5 = "NN%" used, field 4 = space available.
+                  pct=$(df -P "$MOUNT" | awk 'END{gsub("%","",$5); print $5}')
+                  avail=$(df -Ph "$MOUNT" | awk 'END{print $4}')
+                  # A non-numeric pct would make the -ge test error out and fall into the
+                  # "ok" branch, so fail the Job instead of reporting a healthy disk.
+                  case "$pct" in
+                    ''|*[!0-9]*) echo "ERROR: cannot read usage of $MOUNT (got '$pct')" >&2; exit 1 ;;
+                  esac
+                  now=$(date +%s)
+                  marker="$MOUNT/.disk-alert-last"
+                  send() {
+                    # -f turns an HTTP error from Telegram into a non-zero exit; -S still prints why.
+                    curl -fsS -m 15 "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \
+                      --data-urlencode "chat_id=${CHAT_ID}" \
+                      --data-urlencode "text=$1" -d "parse_mode=HTML" >/dev/null
+                  }
+                  if [ "${TEST:-0}" = "1" ]; then
+                    send "✅ turingpi disk-alert test — SSD at ${pct}% (free ${avail}). Alerting works." || { echo "ERROR: Telegram send failed" >&2; exit 1; }
+                    echo "TEST sent (${pct}% used)"; exit 0
+                  fi
+                  if [ "$pct" -ge "$THRESHOLD" ]; then
+                    last=0; [ -f "$marker" ] && last=$(cat "$marker" 2>/dev/null || echo 0)
+                    # An empty or corrupt marker counts as "never alerted".
+                    case "$last" in ''|*[!0-9]*) last=0 ;; esac
+                    if [ $((now - last)) -ge "$COOLDOWN_SEC" ]; then
+                      if send "⚠️ turingpi SSD ${pct}% full — only ${avail} free on /mnt/ssd. Postgres + Plex crash at 100%. Prune content / check maintainerr."; then
+                        # Start the cooldown only after a confirmed send.
+                        echo "$now" > "$marker" || echo "WARN: cannot write $marker; cooldown disabled" >&2
+                        echo "ALERT sent (${pct}% >= ${THRESHOLD}%)"
+                      else
+                        echo "ERROR: Telegram send failed; marker untouched so the next run retries" >&2
+                        exit 1
+                      fi
+                    else
+                      echo "over threshold (${pct}%) but within cooldown; skipping"
+                    fi
+                  else
+                    echo "ok: ${pct}% used, ${avail} free (threshold ${THRESHOLD}%)"
+                  fi
+              resources:
+                requests: { cpu: 10m, memory: 16Mi }
+                limits: { memory: 64Mi }
+              volumeMounts:
+                - { name: data, mountPath: /data }
+          volumes:
+            - name: data
+              persistentVolumeClaim:
+                claimName: plex-data