diff --git a/apis/installer/v1alpha1/ace_ace_types.go b/apis/installer/v1alpha1/ace_ace_types.go
index b9f6a7e7e..5ec8b7c5b 100644
--- a/apis/installer/v1alpha1/ace_ace_types.go
+++ b/apis/installer/v1alpha1/ace_ace_types.go
@@ -52,21 +52,23 @@ type Ace struct {
// AceSpec is the schema for Ace Operator values file
type AceSpec struct {
- PlatformUi AcePlatformUi `json:"platform-ui"`
- ClusterUi AceClusterUi `json:"cluster-ui"`
- Grafana AceGrafana `json:"grafana"`
- KubedbUi AceKubedbUi `json:"kubedb-ui"`
- PlatformApi AcePlatformApi `json:"platform-api"`
- IngressNginx AceIngressNginx `json:"ingress-nginx"`
- Gateway AceGateway `json:"gateway"`
- IngressDns AceIngressDns `json:"ingress-dns"`
- Nats AceNats `json:"nats"`
- NatsDns AceNatsDns `json:"nats-dns"`
- Trickster AceTrickster `json:"trickster"`
- Openfga AceOpenfga `json:"openfga"`
- S3proxy AceS3proxy `json:"s3proxy"`
- PgOutbox AcePgOutbox `json:"pgoutbox"`
- OutboxSyncer AceOutboxSyncer `json:"outbox-syncer"`
+ PlatformUi AcePlatformUi `json:"platform-ui"`
+ ClusterUi AceClusterUi `json:"cluster-ui"`
+ Grafana AceGrafana `json:"grafana"`
+ KubedbUi AceKubedbUi `json:"kubedb-ui"`
+ PlatformApi AcePlatformApi `json:"platform-api"`
+ IngressNginx AceIngressNginx `json:"ingress-nginx"`
+ Gateway AceGateway `json:"gateway"`
+ IngressDns AceIngressDns `json:"ingress-dns"`
+ Nats AceNats `json:"nats"`
+ NatsDns AceNatsDns `json:"nats-dns"`
+ Trickster AceTrickster `json:"trickster"`
+ Openfga AceOpenfga `json:"openfga"`
+ S3proxy AceS3proxy `json:"s3proxy"`
+ PgOutbox AcePgOutbox `json:"pgoutbox"`
+ OutboxSyncer AceOutboxSyncer `json:"outbox-syncer"`
+ PostgresAlerts AcePostgresAlerts `json:"postgres-alerts"`
+ RedisAlerts AceRedisAlerts `json:"redis-alerts"`
// KubeBindServer AceKubeBindServer `json:"kube-bind-server"`
Global AceGlobalValues `json:"global"`
Settings Settings `json:"settings"`
@@ -179,6 +181,16 @@ type AceS3proxy struct {
*S3proxySpec `json:",inline,omitempty"`
}
+// AcePostgresAlerts configures the postgres-alerts chart dependency; it is
+// exposed in AceSpec under the "postgres-alerts" key.
+type AcePostgresAlerts struct {
+ // Enabled backs the `postgres-alerts.enabled` condition declared for the
+ // dependency in charts/ace/Chart.yaml.
+ Enabled bool `json:"enabled"`
+ // Form carries the alert settings serialized under the "form" key.
+ // NOTE(review): encoding/json ignores omitempty on non-pointer struct
+ // fields, so "form" is always emitted - confirm this is intended.
+ Form AceAlertForm `json:"form,omitempty"`
+}
+
+// AceRedisAlerts configures the redis-alerts chart dependency; it is exposed
+// in AceSpec under the "redis-alerts" key.
+type AceRedisAlerts struct {
+ // Enabled backs the `redis-alerts.enabled` condition declared for the
+ // dependency in charts/ace/Chart.yaml.
+ Enabled bool `json:"enabled"`
+ // Form carries the alert settings serialized under the "form" key.
+ // NOTE(review): encoding/json ignores omitempty on non-pointer struct
+ // fields, so "form" is always emitted - confirm this is intended.
+ Form AceAlertForm `json:"form,omitempty"`
+}
+
type AceGlobalValues struct {
NameOverride string `json:"nameOverride"`
FullnameOverride string `json:"fullnameOverride"`
diff --git a/apis/installer/v1alpha1/ace_shared_types.go b/apis/installer/v1alpha1/ace_shared_types.go
index 3781efb68..cfca44544 100644
--- a/apis/installer/v1alpha1/ace_shared_types.go
+++ b/apis/installer/v1alpha1/ace_shared_types.go
@@ -142,3 +142,20 @@ type AceHook struct {
HookWeight string `json:"hookWeight"`
HookDeletePolicy string `json:"hookDeletePolicy"`
}
+
+// AceAlertForm is the "form" section shared by AcePostgresAlerts and
+// AceRedisAlerts; it wraps a single "alert" object.
+type AceAlertForm struct {
+ // Alert holds the alert configuration serialized under the "alert" key.
+ // NOTE(review): omitempty has no effect on a non-pointer struct field
+ // with encoding/json - confirm whether a pointer was intended.
+ Alert AceAlertConfig `json:"alert,omitempty"`
+}
+
+// AceAlertConfig is the "alert" object nested inside AceAlertForm.
+type AceAlertConfig struct {
+ // AppSuffix is serialized as "appSuffix" and omitted when empty. The chart
+ // README lists "-db" (postgres-alerts) and "-cache" (redis-alerts) as the
+ // defaults.
+ // NOTE(review): presumably appended to the release fullname to form the
+ // monitored app name, as the removed in-chart alert templates did with
+ // their hard-coded suffixes - verify against the alert sub-charts.
+ AppSuffix string `json:"appSuffix,omitempty"`
+ // Groups selects per-group alert settings under the "groups" key.
+ // NOTE(review): omitempty has no effect on a non-pointer struct field
+ // with encoding/json.
+ Groups AceAlertGroups `json:"groups,omitempty"`
+}
+
+// AceAlertGroups is the "groups" object nested inside AceAlertConfig. Only
+// the "stash" group is modeled here.
+type AceAlertGroups struct {
+ // Stash holds the settings serialized under the "stash" key.
+ // NOTE(review): omitempty has no effect on a non-pointer struct field
+ // with encoding/json.
+ Stash AceStashGroup `json:"stash,omitempty"`
+}
+
+// AceStashGroup is the "stash" entry nested inside AceAlertGroups.
+type AceStashGroup struct {
+ // Enabled is a string, not a bool; the chart README lists "" as its
+ // default for both postgres-alerts and redis-alerts.
+ // NOTE(review): the accepted values are not visible here - confirm them
+ // against the alert sub-chart values schema.
+ Enabled string `json:"enabled"`
+}
diff --git a/apis/installer/v1alpha1/zz_generated.deepcopy.go b/apis/installer/v1alpha1/zz_generated.deepcopy.go
index 5fbbe2d40..1ea605026 100644
--- a/apis/installer/v1alpha1/zz_generated.deepcopy.go
+++ b/apis/installer/v1alpha1/zz_generated.deepcopy.go
@@ -423,6 +423,54 @@ func (in *AceAccountsUi) DeepCopy() *AceAccountsUi {
return out
}
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *AceAlertConfig) DeepCopyInto(out *AceAlertConfig) {
+ *out = *in
+ out.Groups = in.Groups
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AceAlertConfig.
+func (in *AceAlertConfig) DeepCopy() *AceAlertConfig {
+ if in == nil {
+ return nil
+ }
+ out := new(AceAlertConfig)
+ in.DeepCopyInto(out)
+ return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *AceAlertForm) DeepCopyInto(out *AceAlertForm) {
+ *out = *in
+ out.Alert = in.Alert
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AceAlertForm.
+func (in *AceAlertForm) DeepCopy() *AceAlertForm {
+ if in == nil {
+ return nil
+ }
+ out := new(AceAlertForm)
+ in.DeepCopyInto(out)
+ return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *AceAlertGroups) DeepCopyInto(out *AceAlertGroups) {
+ *out = *in
+ out.Stash = in.Stash
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AceAlertGroups.
+func (in *AceAlertGroups) DeepCopy() *AceAlertGroups {
+ if in == nil {
+ return nil
+ }
+ out := new(AceAlertGroups)
+ in.DeepCopyInto(out)
+ return out
+}
+
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *AceBilling) DeepCopyInto(out *AceBilling) {
*out = *in
@@ -1675,6 +1723,38 @@ func (in *AcePlatformUi) DeepCopy() *AcePlatformUi {
return out
}
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *AcePostgresAlerts) DeepCopyInto(out *AcePostgresAlerts) {
+ *out = *in
+ out.Form = in.Form
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AcePostgresAlerts.
+func (in *AcePostgresAlerts) DeepCopy() *AcePostgresAlerts {
+ if in == nil {
+ return nil
+ }
+ out := new(AcePostgresAlerts)
+ in.DeepCopyInto(out)
+ return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *AceRedisAlerts) DeepCopyInto(out *AceRedisAlerts) {
+ *out = *in
+ out.Form = in.Form
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AceRedisAlerts.
+func (in *AceRedisAlerts) DeepCopy() *AceRedisAlerts {
+ if in == nil {
+ return nil
+ }
+ out := new(AceRedisAlerts)
+ in.DeepCopyInto(out)
+ return out
+}
+
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *AceReloader) DeepCopyInto(out *AceReloader) {
*out = *in
@@ -1823,6 +1903,8 @@ func (in *AceSpec) DeepCopyInto(out *AceSpec) {
in.S3proxy.DeepCopyInto(&out.S3proxy)
in.PgOutbox.DeepCopyInto(&out.PgOutbox)
in.OutboxSyncer.DeepCopyInto(&out.OutboxSyncer)
+ out.PostgresAlerts = in.PostgresAlerts
+ out.RedisAlerts = in.RedisAlerts
in.Global.DeepCopyInto(&out.Global)
in.Settings.DeepCopyInto(&out.Settings)
out.Image = in.Image
@@ -1909,6 +1991,21 @@ func (in *AceSpec) DeepCopy() *AceSpec {
return out
}
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *AceStashGroup) DeepCopyInto(out *AceStashGroup) {
+ *out = *in
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AceStashGroup.
+func (in *AceStashGroup) DeepCopy() *AceStashGroup {
+ if in == nil {
+ return nil
+ }
+ out := new(AceStashGroup)
+ in.DeepCopyInto(out)
+ return out
+}
+
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *AceTrickster) DeepCopyInto(out *AceTrickster) {
*out = *in
diff --git a/charts/ace/Chart.lock b/charts/ace/Chart.lock
index 384223d10..975299c5a 100644
--- a/charts/ace/Chart.lock
+++ b/charts/ace/Chart.lock
@@ -38,5 +38,11 @@ dependencies:
- name: platform-opscenter
repository: file://../platform-opscenter
version: v2026.6.12
-digest: sha256:c4a10722c3ba6c17b32f51eefffbfe55092332e6ad8aee172bfe71084be15b7f
-generated: "2026-06-03T06:42:27.865267503Z"
+- name: postgres-alerts
+ repository: oci://ghcr.io/appscode-charts
+ version: v2026.2.24
+- name: redis-alerts
+ repository: oci://ghcr.io/appscode-charts
+ version: v2026.2.24
+digest: sha256:8960a14236fa95195d54db9f35eb381fb5f620775ace0499e2d137f909cfaa2c
+generated: "2026-06-17T14:16:36.261832093+06:00"
diff --git a/charts/ace/Chart.yaml b/charts/ace/Chart.yaml
index 5f71490de..3be149cee 100644
--- a/charts/ace/Chart.yaml
+++ b/charts/ace/Chart.yaml
@@ -64,3 +64,11 @@ dependencies:
repository: file://../platform-opscenter
condition: platform-opscenter.enabled
version: v2026.6.12
+- name: postgres-alerts
+ repository: oci://ghcr.io/appscode-charts
+ condition: postgres-alerts.enabled
+ version: v2026.2.24
+- name: redis-alerts
+ repository: oci://ghcr.io/appscode-charts
+ condition: redis-alerts.enabled
+ version: v2026.2.24
diff --git a/charts/ace/README.md b/charts/ace/README.md
index 8c105a335..871d0bdf9 100644
--- a/charts/ace/README.md
+++ b/charts/ace/README.md
@@ -66,6 +66,12 @@ The following table lists the configurable parameters of the `ace` chart and the
| openfga.datastoreURI | | "" |
| pgoutbox.enabled | | false |
| outbox-syncer.enabled | | false |
+| postgres-alerts.enabled | | true |
+| postgres-alerts.form.alert.appSuffix | | "-db" |
+| postgres-alerts.form.alert.groups.stash.enabled | | "" |
+| redis-alerts.enabled | | true |
+| redis-alerts.form.alert.appSuffix | | "-cache" |
+| redis-alerts.form.alert.groups.stash.enabled | | "" |
| global.nameOverride | | "ace" |
| global.fullnameOverride | | "" |
| global.platform.host | | appscode.ninja |
diff --git a/charts/ace/templates/cache/alert.yaml b/charts/ace/templates/cache/alert.yaml
deleted file mode 100644
index b0da58826..000000000
--- a/charts/ace/templates/cache/alert.yaml
+++ /dev/null
@@ -1,216 +0,0 @@
-{{ $app := printf "%s-cache" (include "ace.fullname" .) }}
-
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
- name: {{ $app }}
- namespace: {{ $.Release.Namespace }}
- labels:
- release: kube-prometheus-stack
- app.kubernetes.io/managed-by: {{ .Release.Service }}
- {{- include "ace.selectorLabels" . | nindent 4 }}
-spec:
- groups:
- - name: redis.database.{{ $.Release.Namespace }}.{{ $app }}.rules
- rules:
- - alert: RedisDown
- expr: redis_up{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"} == 0
- for: 0m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Redis instance down (instance {{`{{`}} $labels.pod {{`}}`}})
- description: "Redis instance is down on {{`{{`}} $labels.pod {{`}}`}}\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: RedisMissingMaster
- expr: (count(redis_instance_info{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"}) or vector(0)) < 1
- for: 0m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Redis missing master (instance {{`{{`}} $labels.pod {{`}}`}})
- description: "Redis cluster has less than expected amount of node marked as master\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: RedisTooManyConnections
- expr: redis_connected_clients{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"} > 100
- for: 2m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Redis too many connections (instance {{`{{`}} $labels.pod {{`}}`}})
- description: "Redis instance has too many connections. More than 100 of Redis connections are in use on {{`{{`}} $labels.pod {{`}}`}}\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: RedisTooManyMasters
- expr: (count(redis_instance_info{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"})) > 1
- for: 0m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Redis too many master nodes (instance {{`{{`}} $labels.pod {{`}}`}})
- description: "Redis cluster has too many nodes marked as master. \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: RedisDisconnectedSlaves
- expr: count without (instance) (redis_connected_slaves{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"}) - sum without (instance) (redis_connected_slaves{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"}) - 1 > 0
- for: 0m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Redis disconnected slaves (instance {{`{{`}} $labels.pod {{`}}`}})
- description: "Redis not replicating for all slaves. Consider reviewing the redis replication status.\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: DiskUsageHigh
- expr: (kubelet_volume_stats_used_bytes{service="kubernetes"} / on(persistentvolumeclaim) group_left(pod) ((kubelet_volume_stats_used_bytes{service="kubernetes"} + on(persistentvolumeclaim) group_left(pod) kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ $.Release.Name }}-.+$",namespace=~"{{ $.Release.Namespace }}"}) ) )
- * 100 > 80
- for: 1m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Persistent Volume Usages in instance {{`{{`}} $labels.instance {{`}}`}}
- description: "Persistent Volume Usages\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: DiskAlmostFull
- expr: (kubelet_volume_stats_used_bytes{service="kubernetes"} / on(persistentvolumeclaim) group_left(pod) ((kubelet_volume_stats_used_bytes{service="kubernetes"} + on(persistentvolumeclaim) group_left(pod) kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ $.Release.Name }}-.+$",namespace=~"{{ $.Release.Namespace }}"}) ) )
- * 100 > 95
- for: 1m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Persistent Volume Usages in instance {{`{{`}} $labels.instance {{`}}`}}
- description: "Persistent Volume Usages\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
-
- - name: redis.provisioner.{{ $.Release.Namespace }}.{{ $app }}.rules
- rules:
- - alert: KubeDBRedisPhaseNotReady
- expr: kubedb_com_redis_status_phase{phase="NotReady",app="{{ $app }}",namespace="{{ $.Release.Namespace }}"} == 1
- for: 1m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: KubeDB Redis Phase NotReady (redis {{`{{`}} $labels.redis {{`}}`}})
- description: "KubeDB Redis Phase not ready on {{`{{`}} $labels.redis {{`}}`}}\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: KubeDBRedisPhaseCritical
- expr: kubedb_com_redis_status_phase{phase="Critical",app="{{ $app }}",namespace="{{ $.Release.Namespace }}"} == 1
- for: 15m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: KubeDB Redis Phase Critical (redis {{`{{`}} $labels.redis {{`}}`}})
- description: "KubeDB Redis Phase Critical {{`{{`}} $labels.redis {{`}}`}}\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
-
- - name: redis.opsManager.{{ $.Release.Namespace }}.{{ $app }}.rules
- rules:
- - alert: KubeDBRedisOpsRequestOnProgress
- expr: ops_kubedb_com_redisopsrequest_status_phase{phase="Progressing",app="{{ $app }}",namespace="{{ $.Release.Namespace }}"} == 1
- for: 0m
- labels:
- severity: info
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: RedisOpsRequest on progress (redisopsrequest {{`{{`}} $labels.redisopsrequest {{`}}`}})
- description: "RedisOpsRequest {{`{{`}} $labels.redisopsrequest {{`}}`}} is in progressressing status\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: KubeDBRedisOpsRequestStatusProgressingToLong
- expr: ops_kubedb_com_redisopsrequest_status_phase{phase="Progressing",app="{{ $app }}",namespace="{{ $.Release.Namespace }}"} == 1
- for: 30m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: RedisOpsRequest is in progressing status for too long (redisopsrequest {{`{{`}} $labels.redisopsrequest {{`}}`}})
- description: "RedisOpsRequest {{`{{`}} $labels.redisopsrequest {{`}}`}} is in progressing status for too long\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: KubeDBRedisOpsRequestFailed
- expr: ops_kubedb_com_redisopsrequest_status_phase{phase="Failed",app="{{ $app }}",namespace="{{ $.Release.Namespace }}"} == 1
- for: 0m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: RedisOpsRequest failed (redisopsrequest {{`{{`}} $labels.redisopsrequest {{`}}`}})
- description: "RedisOpsRequest {{`{{`}} $labels.redisopsrequest {{`}}`}} failed \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
-
- - name: redis.kubeStash.{{ $.Release.Namespace }}.{{ $app }}.rules
- rules:
- - alert: RedisKubeStashBackupSessionFailed
- expr: core_kubestash_com_backupsession_phase{phase="Failed"} * on(backup_invoker_kind, backup_invoker_name, namespace) group_left(target_kind, target_name, target_namespace) core_kubestash_com_backupconfiguration_info{target_kind="Redis", target_name="{{ $app }}", target_namespace="{{ $.Release.Namespace }}"} == 1
- for: 0m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Redis KubeStash backup session failed (invoker_name {{`{{`}} $labels.invoker_name {{`}}`}})
- description: "Redis KubeStash backupsession failed {{`{{`}} $labels.invoker_name {{`}}`}} \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: RedisKubeStashRestoreSessionFailed
- expr: core_kubestash_com_restoresession_phase{target_kind="Redis", target_name="{{ $app }}", target_namespace="{{ $.Release.Namespace }}", phase="Failed"} == 1
- for: 0m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Redis KubeStash restore session failed (invoker_name {{`{{`}} $labels.invoker_name {{`}}`}})
- description: "Redis KubeStash restore session failed {{`{{`}} $labels.invoker_name {{`}}`}} \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: RedisKubeStashNoBackupSessionForTooLong
- expr: time() - max(core_kubestash_com_backupsession_created * on(backup_invoker_kind, backup_invoker_name, namespace) group_left(target_kind, target_name, target_namespace) core_kubestash_com_backupconfiguration_info{target_kind="Redis", target_name="{{ $app }}", target_namespace="{{ $.Release.Namespace }}"}) > 18000
- for: 0m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Redis KubeStash no backup for last 18000 second (invoker_name {{`{{`}} $labels.invoker_name {{`}}`}})
- description: "Redis KubeStash no backup for too long {{`{{`}} $labels.invoker_name {{`}}`}} \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: RedisKubeStashRepositoryCorrupted
- expr: storage_kubestash_com_repository_info{target_kind="Redis", target_name="{{ $app }}", target_namespace="{{ $.Release.Namespace }}", integrity="false"} == 1
- for: 5m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Redis KubeStash repository corrupted (invoker_name {{`{{`}} $labels.invoker_name {{`}}`}})
- description: "Redis KubeStash repository corrupted {{`{{`}} $labels.invoker_name {{`}}`}} \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: RedisKubeStashRepositoryStorageRunningLow
- expr: storage_kubestash_com_repository_size_bytes{target_kind="Redis", target_name="{{ $app }}", target_namespace="{{ $.Release.Namespace }}"} > 10737418240
- for: 5m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Redis KubeStash Repository storage more than 10737418240 byte. (invoker_name {{`{{`}} $labels.invoker_name {{`}}`}})
- description: "Redis KubeStash Repository storage running low {{`{{`}} $labels.invoker_name {{`}}`}} \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: RedisKubeStashBackupSessionPeriodTooLong
- expr: core_kubestash_com_backupsession_duration_seconds * on(backup_invoker_kind, backup_invoker_name, namespace) group_left(target_kind, target_name, target_namespace) core_kubestash_com_backupconfiguration_info{target_kind="Redis", target_name="{{ $app }}", target_namespace="{{ $.Release.Namespace }}"} > 1800
- for: 0m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Redis KubeStash backup session took more than 1800 second to complete. (invoker_name {{`{{`}} $labels.invoker_name {{`}}`}})
- description: "Redis KubeStash backup session taking to long to complete {{`{{`}} $labels.invoker_name {{`}}`}} \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: RedisKubeStashRestoreSessionPeriodTooLong
- expr: core_kubestash_com_restoresession_duration_seconds{target_kind="Redis", target_name="{{ $app }}", target_namespace="{{ $.Release.Namespace }}"} > 1800
- for: 0m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Redis KubeStash restore session took more than 1800 second to complete. (invoker_name {{`{{`}} $labels.invoker_name {{`}}`}})
- description: "Redis KubeStash restore session taking to long to complete {{`{{`}} $labels.invoker_name {{`}}`}} \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
diff --git a/charts/ace/templates/db/alert.yaml b/charts/ace/templates/db/alert.yaml
deleted file mode 100644
index 2f554fb27..000000000
--- a/charts/ace/templates/db/alert.yaml
+++ /dev/null
@@ -1,256 +0,0 @@
-{{ $app := printf "%s-db" (include "ace.fullname" .) }}
-
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
- name: {{ $app }}
- namespace: {{ $.Release.Namespace }}
- labels:
- release: kube-prometheus-stack
- app.kubernetes.io/managed-by: {{ .Release.Service }}
- {{- include "ace.selectorLabels" . | nindent 4 }}
-spec:
- groups:
- - name: postgres.database.{{ $.Release.Namespace }}.{{ $app }}.rules
- rules:
- - alert: PostgresqlDown
- expr: pg_up{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"} == 0
- for: 0m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Postgres instance down (instance {{`{{`}} $labels.pod {{`}}`}})
- description: "Postgres instance is down on {{`{{`}} $labels.pod {{`}}`}}\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: PostgresqlSplitBrain
- expr: count(pg_replication_is_replica{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"} == 0) != 1
- for: 0m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Postgresql split brain (instance {{`{{`}} $labels.pod {{`}}`}})
- description: "Split Brain, too many primary Postgresql databases in read-write mode\n {{`{{`}} $labels.pod {{`}}`}}\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: PostgresqlTooManyLocksAcquired
- expr: ((sum by (pod) (pg_locks_count{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"}) ) / ( sum by (pod) (pg_settings_max_locks_per_transaction{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"}) * sum by (pod) (pg_settings_max_connections{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"}))) > 0.20
- for: 2m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Postgresql too many locks acquired (instance {{`{{`}} $labels.pod {{`}}`}})
- description: "Too many locks acquired on the database. \n If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n {{`{{`}} $labels.pod {{`}}`}}\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: PostgresReplicationSlotLagHigh
- expr: max_over_time(pg_replication_slots_pg_wal_lsn_diff[1m]) > 838860800
- for: 1m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: "High replication slot WAL lag detected on slot {{`{{`}} $labels.slot_name {{`}}`}}"
- description: "The replication lag for slot {{`{{`}} $labels.slot_name {{`}}`}} is {{`{{`}} $value {{`}}`}} bytes, which may cause disk space issues or replication errors. This indicates the replica or consumer might be falling behind."
- - alert: PostgresReplicationSlotLagCritical
- expr: max_over_time(pg_replication_slots_pg_wal_lsn_diff[1m]) > 1288490188
- for: 1m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: "Critical PostgreSQL replication lag detected on slot {{`{{`}} $labels.slot_name {{`}}`}}"
- description: "Replication slot {{`{{`}} $labels.slot_name {{`}}`}} is lagging by more than {{`{{`}} $value {{`}}`}} bytes. WAL file deletion is imminent. Immediate intervention required to prevent replication slot invalidation and potential data loss."
- - alert: PostgresqlRestarted
- expr: time() - pg_postmaster_start_time_seconds{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"} < 60
- for: 0m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Postgresql restarted (instance {{`{{`}} $labels.pod {{`}}`}})
- description: "Postgresql restarted\n {{`{{`}} $labels.pod {{`}}`}}\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: PostgresqlExporterError
- expr: pg_exporter_last_scrape_error{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"} > 0
- for: 5m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Postgresql exporter error (instance {{`{{`}} $labels.pod {{`}}`}})
- description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n {{`{{`}} $labels.pod {{`}}`}}\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: PostgresqlHighRollbackRate
- expr: rate(pg_stat_database_xact_rollback{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"}[3m]) / rate(pg_stat_database_xact_commit{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"}[3m]) > 0.02
- for: 0m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Postgresql high rollback rate (instance {{`{{`}} $labels.pod {{`}}`}})
- description: "Ratio of transactions being aborted compared to committed is hign. {{`{{`}} $labels.pod {{`}}`}}\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: PostgresTooManyConnections
- expr: sum by (pod) (pg_stat_activity_count{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"}) >= sum by (pod) (pg_settings_max_connections{job="{{- $app -}}-stats",namespace="{{ $.Release.Namespace }}"} * 80) / 100
- for: 2m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Postgresql too many connections (instance {{`{{`}} $labels.pod {{`}}`}})
- description: "PostgreSQL instance has too many connections. 80% of Postgres connections are in use on {{`{{`}} $labels.pod {{`}}`}}\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: DiskUsageHigh
- expr: (kubelet_volume_stats_used_bytes{service="kubernetes"} / on(persistentvolumeclaim) group_left(pod) ((kubelet_volume_stats_used_bytes{service="kubernetes"} + on(persistentvolumeclaim) group_left(pod) kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ $.Release.Name }}-.+$",namespace=~"{{ $.Release.Namespace }}"}) ) )
- * 100 > 80
- for: 1m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Persistent Volume Usages in instance {{`{{`}} $labels.instance {{`}}`}}
- description: "Persistent Volume Usages\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: DiskAlmostFull
- expr: (kubelet_volume_stats_used_bytes{service="kubernetes"} / on(persistentvolumeclaim) group_left(pod) ((kubelet_volume_stats_used_bytes{service="kubernetes"} + on(persistentvolumeclaim) group_left(pod) kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ $.Release.Name }}-.+$",namespace=~"{{ $.Release.Namespace }}"}) ) )
- * 100 > 95
- for: 1m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: Persistent Volume Usages in instance {{`{{`}} $labels.instance {{`}}`}}
- description: "Persistent Volume Usages\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
-
- - name: postgres.provisioner.{{ $.Release.Namespace }}.{{ $app }}.rules
- rules:
- - alert: KubeDBPostgreSQLPhaseNotReady
- expr: kubedb_com_postgres_status_phase{phase="NotReady",app="{{ $app }}",namespace="{{ $.Release.Namespace }}"} == 1
- for: 1m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: KubeDB PostgreSQL Phase NotReady (postgres {{`{{`}} $labels.postgres {{`}}`}})
- description: "KubeDB PostgreSQL Phase not ready on {{`{{`}} $labels.postgres {{`}}`}}\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: KubeDBPostgreSQLPhaseCritical
- expr: kubedb_com_postgres_status_phase{phase="Critical",app="{{ $app }}",namespace="{{ $.Release.Namespace }}"} == 1
- for: 15m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: KubeDB PostgreSQL Phase Critical (postgres {{`{{`}} $labels.postgres {{`}}`}})
- description: "KubeDB PostgreSQL Phase Critical {{`{{`}} $labels.postgres {{`}}`}}\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
-
- - name: postgres.opsManager.{{ $.Release.Namespace }}.{{ $app }}.rules
- rules:
- - alert: KubeDBPostgreSQLOpsRequestOnProgress
- expr: ops_kubedb_com_postgresopsrequest_status_phase{phase="Progressing",app="{{ $app }}",namespace="{{ $.Release.Namespace }}"} == 1
- for: 0m
- labels:
- severity: info
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: PostgreSQLOpsRequest on progress (postgresopsrequest {{`{{`}} $labels.postgresopsrequest {{`}}`}})
- description: "PostgresOpsRequest {{`{{`}} $labels.postgresopsrequest {{`}}`}} is in progressressing status\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: KubeDBPostgreSQLOpsRequestStatusProgressingToLong
- expr: ops_kubedb_com_postgresopsrequest_status_phase{phase="Progressing",app="{{ $app }}",namespace="{{ $.Release.Namespace }}"} == 1
- for: 30m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: PostgreSQLOpsRequest is in progressing status for too long (postgresopsrequest {{`{{`}} $labels.postgresopsrequest {{`}}`}})
- description: "PostgresOpsRequest {{`{{`}} $labels.postgresopsrequest {{`}}`}} is in progressing status for too long\n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: KubeDBPostgreSQLOpsRequestFailed
- expr: ops_kubedb_com_postgresopsrequest_status_phase{phase="Failed",app="{{ $app }}",namespace="{{ $.Release.Namespace }}"} == 1
- for: 0m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: PostgreSQLOpsRequest failed (postgresopsrequest {{`{{`}} $labels.postgresopsrequest {{`}}`}})
- description: "PostgresOpsRequest {{`{{`}} $labels.postgresopsrequest {{`}}`}} failed \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
-
- - name: postgres.kubeStash.{{ $.Release.Namespace }}.{{ $app }}.rules
- rules:
- - alert: PostgreSQLKubeStashBackupSessionFailed
- expr: core_kubestash_com_backupsession_phase{phase="Failed"} * on(backup_invoker_kind, backup_invoker_name, namespace) group_left(target_kind, target_name, target_namespace) core_kubestash_com_backupconfiguration_info{target_kind="PostgreSQL", target_name="{{ $app }}", target_namespace="{{ $.Release.Namespace }}"} == 1
- for: 0m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: PostgreSQL KubeStash backup session failed (invoker_name {{`{{`}} $labels.invoker_name {{`}}`}})
- description: "PostgreSQL KubeStash backupsession failed {{`{{`}} $labels.invoker_name {{`}}`}} \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: PostgreSQLKubeStashRestoreSessionFailed
- expr: core_kubestash_com_restoresession_phase{target_kind="PostgreSQL", target_name="{{ $app }}", target_namespace="{{ $.Release.Namespace }}", phase="Failed"} == 1
- for: 0m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: PostgreSQL KubeStash restore session failed (invoker_name {{`{{`}} $labels.invoker_name {{`}}`}})
- description: "PostgreSQL KubeStash restore session failed {{`{{`}} $labels.invoker_name {{`}}`}} \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: PostgreSQLKubeStashNoBackupSessionForTooLong
- expr: time() - max(core_kubestash_com_backupsession_created * on(backup_invoker_kind, backup_invoker_name, namespace) group_left(target_kind, target_name, target_namespace) core_kubestash_com_backupconfiguration_info{target_kind="PostgreSQL", target_name="{{ $app }}", target_namespace="{{ $.Release.Namespace }}"}) > 18000
- for: 0m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: PostgreSQL KubeStash no backup for last 18000 second (invoker_name {{`{{`}} $labels.invoker_name {{`}}`}})
- description: "PostgreSQL KubeStash no backup for too long {{`{{`}} $labels.invoker_name {{`}}`}} \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: PostgreSQLKubeStashRepositoryCorrupted
- expr: storage_kubestash_com_repository_info{target_kind="PostgreSQL", target_name="{{ $app }}", target_namespace="{{ $.Release.Namespace }}", integrity="false"} == 1
- for: 5m
- labels:
- severity: critical
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: PostgreSQL KubeStash repository corrupted (invoker_name {{`{{`}} $labels.invoker_name {{`}}`}})
- description: "PostgreSQL KubeStash repository corrupted {{`{{`}} $labels.invoker_name {{`}}`}} \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: PostgreSQLKubeStashRepositoryStorageRunningLow
- expr: storage_kubestash_com_repository_size_bytes{target_kind="PostgreSQL", target_name="{{ $app }}", target_namespace="{{ $.Release.Namespace }}"} > 10737418240
- for: 5m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: PostgreSQL KubeStash Repository storage more than 10737418240 byte. (invoker_name {{`{{`}} $labels.invoker_name {{`}}`}})
- description: "PostgreSQL KubeStash Repository storage running low {{`{{`}} $labels.invoker_name {{`}}`}} \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: PostgreSQLKubeStashBackupSessionPeriodTooLong
- expr: core_kubestash_com_backupsession_duration_seconds * on(backup_invoker_kind, backup_invoker_name, namespace) group_left(target_kind, target_name, target_namespace) core_kubestash_com_backupconfiguration_info{target_kind="PostgreSQL", target_name="{{ $app }}", target_namespace="{{ $.Release.Namespace }}"} > 1800
- for: 0m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: PostgreSQL KubeStash backup session took more than 1800 second to complete. (invoker_name {{`{{`}} $labels.invoker_name {{`}}`}})
- description: "PostgreSQL KubeStash backup session taking to long to complete {{`{{`}} $labels.invoker_name {{`}}`}} \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
- - alert: PostgreSQLKubeStashRestoreSessionPeriodTooLong
- expr: core_kubestash_com_restoresession_duration_seconds{target_kind="PostgreSQL", target_name="{{ $app }}", target_namespace="{{ $.Release.Namespace }}"} > 1800
- for: 0m
- labels:
- severity: warning
- app: {{ $app }}
- app_namespace: {{ $.Release.Namespace }}
- annotations:
- summary: PostgreSQL KubeStash restore session took more than 1800 second to complete. (invoker_name {{`{{`}} $labels.invoker_name {{`}}`}})
- description: "PostgreSQL KubeStash restore session taking to long to complete {{`{{`}} $labels.invoker_name {{`}}`}} \n VALUE = {{`{{`}} $value {{`}}`}}\n LABELS = {{`{{`}} $labels {{`}}`}}"
diff --git a/charts/ace/values.openapiv3_schema.yaml b/charts/ace/values.openapiv3_schema.yaml
index 0886c0f3b..1dcbe5bbe 100644
--- a/charts/ace/values.openapiv3_schema.yaml
+++ b/charts/ace/values.openapiv3_schema.yaml
@@ -23227,6 +23227,56 @@ properties:
type: string
type: object
type: object
+ postgres-alerts:
+ properties:
+ enabled:
+ type: boolean
+ form:
+ properties:
+ alert:
+ properties:
+ appSuffix:
+ type: string
+ groups:
+ properties:
+ stash:
+ properties:
+ enabled:
+ type: string
+ required:
+ - enabled
+ type: object
+ type: object
+ type: object
+ type: object
+ required:
+ - enabled
+ type: object
+ redis-alerts:
+ properties:
+ enabled:
+ type: boolean
+ form:
+ properties:
+ alert:
+ properties:
+ appSuffix:
+ type: string
+ groups:
+ properties:
+ stash:
+ properties:
+ enabled:
+ type: string
+ required:
+ - enabled
+ type: object
+ type: object
+ type: object
+ type: object
+ required:
+ - enabled
+ type: object
resources:
properties:
claims:
@@ -27411,6 +27461,8 @@ required:
- platform-ui
- podAnnotations
- podSecurityContext
+- postgres-alerts
+- redis-alerts
- resources
- s3proxy
- securityContext
diff --git a/charts/ace/values.yaml b/charts/ace/values.yaml
index c7d6b9ca4..c612052b0 100644
--- a/charts/ace/values.yaml
+++ b/charts/ace/values.yaml
@@ -96,6 +96,24 @@ pgoutbox:
outbox-syncer:
enabled: false
+postgres-alerts:
+ enabled: true
+ form:
+ alert:
+ appSuffix: "-db"
+ groups:
+ stash:
+ enabled: ""
+
+redis-alerts:
+ enabled: true
+ form:
+ alert:
+ appSuffix: "-cache"
+ groups:
+ stash:
+ enabled: ""
+
# -------------
global: