diff --git a/manifests/base/influxdb/monitoring_alerting.yaml b/manifests/base/influxdb/monitoring_alerting.yaml
index dbc77d20..f52e9322 100644
--- a/manifests/base/influxdb/monitoring_alerting.yaml
+++ b/manifests/base/influxdb/monitoring_alerting.yaml
@@ -20,7 +20,7 @@ spec:
   groups:
     - name: InfluxDB2PersistentVolumes
       rules:
-        - alert: influxdb2
+        - alert: InfluxDB2PersistentVolumeFull
           annotations:
             description: InfluxDB2 data volume is full.
             summary: InfluxDB2 data volume is full.
diff --git a/manifests/env/nostromo-stage/postgresql/alerting.yaml b/manifests/env/nostromo-stage/postgresql/alerting.yaml
index a36509f6..242a158a 100644
--- a/manifests/env/nostromo-stage/postgresql/alerting.yaml
+++ b/manifests/env/nostromo-stage/postgresql/alerting.yaml
@@ -6,14 +6,14 @@ spec:
   groups:
     - name: BackupAlertRules
       rules:
-        - alert: DiffBackupTimeout
+        - alert: PostgresqlDiffBackupTimeout
           expr: time() - max( kube_job_status_start_time * ON(job_name) GROUP_RIGHT() kube_job_labels{job_name=~"db-repo1-diff.*"} * ON(job_name) GROUP_RIGHT() label_replace(kube_pod_labels, "job_name", "$1", "label_job_name", "(.*)") * ON(pod) GROUP_RIGHT() kube_pod_status_phase{job="kube-state-metrics",phase=~"Succeeded"}) BY (job_name,label_postgres_operator_crunchydata_com_pgbackrest_cronjob) > 86400
           labels:
             severity: critical
           for: 10m
           annotations:
             message: "Backup job {{ $labels.job_name }} has NOT been running for more than 24 hours."
-        - alert: FailedBackupPods
+        - alert: PostgresqlFailedBackupPods
           expr: kube_pod_status_phase{namespace="b4mad-racing-stage",phase="Failed"} * ON(pod) GROUP_RIGHT() kube_pod_labels{label_postgres_operator_crunchydata_com_pgbackrest_cronjob="diff",namespace="b4mad-racing-stage"} > 0
           labels:
             severity: critical