feat(disk-usage-alert): Telegram alert when turing3 SSD >=90%
The SSD on turing3 (/mnt/ssd) is shared by Postgres, Plex's SQLite DB and the media library. When it fills, Postgres crashes (cannot write postmaster.pid) and Plex's library DB corrupts. Add a CronJob that checks df via the plex-data mount every 15m and warns Telegram at >=90% used (3h cooldown). Bot token/chatId live in the out-of-band `telegram-disk-alert` Secret, not in git. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,23 @@
|
||||
# Argo CD Application for the SSD disk-usage alert.
# Syncs the manifests in manifests/disk-usage-alert of the turingpi repo
# (in-cluster Gitea) into the `default` namespace of the local cluster.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: disk-usage-alert
  namespace: argocd
spec:
  project: default

  # Where the rendered manifests are applied.
  destination:
    server: https://kubernetes.default.svc
    namespace: default

  # Where the manifests come from.
  source:
    repoURL: http://gitea-http.gitea.svc.cluster.local:3000/admin/turingpi.git
    path: manifests/disk-usage-alert
    targetRevision: HEAD
    directory:
      # Only files directly inside `path`; subdirectories are not scanned.
      recurse: false

  syncPolicy:
    syncOptions:
      - CreateNamespace=true
      - ServerSideApply=true
    # Sync automatically on git changes, remove resources deleted from git
    # (prune) and revert manual drift in the cluster (selfHeal).
    automated:
      selfHeal: true
      prune: true
|
||||
@@ -0,0 +1,81 @@
|
||||
# SSD disk-usage alert for turing3 /mnt/ssd (shared by Postgres + Plex + media).
|
||||
# A full SSD crashes Postgres and corrupts Plex's SQLite DB, so we warn early.
|
||||
#
|
||||
# NOTE: the Telegram bot token + chat id live in the `telegram-disk-alert` Secret,
|
||||
# created out-of-band (NOT in git) so the token stays out of history:
|
||||
# kubectl -n default create secret generic telegram-disk-alert \
|
||||
# --from-literal=botToken='<token>' --from-literal=chatId='<chatId>'
|
||||
#
|
||||
# Mounts the existing RWX plex-data PVC to read `df` of the underlying SSD; the job also keeps a small cooldown marker file (`.disk-alert-last`) at the PVC root.
|
||||
# CronJob that runs every 15 minutes, reads the usage of the filesystem mounted
# at $MOUNT via `df`, and sends a Telegram message when usage is at or above
# $THRESHOLD percent. A marker file on the same volume rate-limits alerts to
# one per $COOLDOWN_SEC. The Job fails (non-zero exit) when usage cannot be
# read or the Telegram request is rejected, so broken alerting is visible in
# the failed-job history instead of being silently reported as "ok".
apiVersion: batch/v1
kind: CronJob
metadata:
  name: disk-usage-alert
  namespace: default
spec:
  schedule: "*/15 * * * *"
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      # No in-Job retries: the next scheduled run (15m later) is the retry.
      backoffLimit: 0
      activeDeadlineSeconds: 120
      template:
        spec:
          restartPolicy: Never
          containers:
            - name: check
              image: curlimages/curl:8.11.1
              imagePullPolicy: IfNotPresent
              env:
                - name: THRESHOLD  # alert at/above this % used
                  value: "90"
                - name: COOLDOWN_SEC  # min seconds between alerts (3h)
                  value: "10800"
                - name: MOUNT
                  value: "/data"
                - name: BOT_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: telegram-disk-alert
                      key: botToken
                - name: CHAT_ID
                  valueFrom:
                    secretKeyRef:
                      name: telegram-disk-alert
                      key: chatId
              command: ["/bin/sh", "-c"]
              args:
                - |
                  set -u

                  # The last line of `df -P` is the filesystem row; column 5 is
                  # "Use%" and column 4 (with -h) is the human-readable free space.
                  pct=$(df -P "$MOUNT" | awk 'END{gsub("%","",$5); print $5}')
                  avail=$(df -Ph "$MOUNT" | awk 'END{print $4}')
                  now=$(date +%s)
                  marker="$MOUNT/.disk-alert-last"

                  # If df failed, pct is empty or non-numeric. Fail the Job rather
                  # than letting the numeric test below error out and fall through
                  # to the "ok" branch.
                  case "$pct" in
                    ''|*[!0-9]*)
                      echo "ERROR: could not read usage of $MOUNT (got '${pct}')" >&2
                      exit 1
                      ;;
                  esac

                  # send TEXT: post TEXT to the Telegram chat. -f makes curl return
                  # non-zero on HTTP >= 400 (bad token / chat id), so callers can
                  # tell whether the message was actually accepted.
                  send() {
                    curl -fs -m 15 "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \
                      --data-urlencode "chat_id=${CHAT_ID}" \
                      --data-urlencode "text=$1" -d "parse_mode=HTML" >/dev/null
                  }

                  # Manual end-to-end test: run with TEST=1 to send a message
                  # regardless of usage and without touching the cooldown marker.
                  if [ "${TEST:-0}" = "1" ]; then
                    if ! send "✅ <b>turingpi disk-alert test</b> — SSD at ${pct}% (free ${avail}). Alerting works."; then
                      echo "ERROR: Telegram test message was not delivered" >&2
                      exit 1
                    fi
                    echo "TEST sent (${pct}% used)"; exit 0
                  fi

                  if [ "$pct" -ge "$THRESHOLD" ]; then
                    last=0; [ -f "$marker" ] && last=$(cat "$marker" 2>/dev/null || echo 0)
                    # The marker lives on the monitored disk: when the disk is full
                    # it can end up empty or truncated. Treat anything that is not a
                    # plain number as "never alerted" so the arithmetic below cannot
                    # abort the script exactly when the alert matters most.
                    case "$last" in
                      ''|*[!0-9]*) last=0 ;;
                    esac
                    if [ $((now - last)) -ge "$COOLDOWN_SEC" ]; then
                      if send "⚠️ <b>turingpi SSD ${pct}% full</b> — only ${avail} free on /mnt/ssd. Postgres + Plex crash at 100%. Prune content / check maintainerr."; then
                        # Start the cooldown only after a delivered alert.
                        # NOTE(review): requires the container user to have write
                        # access to $MOUNT — confirm; on failure we just re-alert
                        # on the next run.
                        echo "$now" > "$marker" || echo "WARN: could not write $marker; cooldown not recorded" >&2
                        echo "ALERT sent (${pct}% >= ${THRESHOLD}%)"
                      else
                        echo "ERROR: over threshold (${pct}%) but Telegram alert was not delivered" >&2
                        exit 1
                      fi
                    else
                      echo "over threshold (${pct}%) but within cooldown; skipping"
                    fi
                  else
                    echo "ok: ${pct}% used, ${avail} free (threshold ${THRESHOLD}%)"
                  fi
              resources:
                requests:
                  cpu: 10m
                  memory: 16Mi
                limits:
                  memory: 64Mi
              volumeMounts:
                - name: data
                  mountPath: /data
          volumes:
            - name: data
              persistentVolumeClaim:
                claimName: plex-data
|
||||
Reference in New Issue
Block a user