From fc7765b5d4d37209c194a9752fe610129d43a755 Mon Sep 17 00:00:00 2001
From: Adrien Reslinger <adrien@reslinger.net>
Date: Wed, 26 Jun 2024 23:46:26 +0200
Subject: [PATCH] Add prometheus rules for longhorn

---
 files/longhorn/longhorn_prometheusRules.yml | 90 +++++++++++++++++++++
 tasks/longhorn.yml                          | 10 +++
 2 files changed, 100 insertions(+)
 create mode 100644 files/longhorn/longhorn_prometheusRules.yml

diff --git a/files/longhorn/longhorn_prometheusRules.yml b/files/longhorn/longhorn_prometheusRules.yml
new file mode 100644
index 0000000..7f4d14c
--- /dev/null
+++ b/files/longhorn/longhorn_prometheusRules.yml
@@ -0,0 +1,90 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule  # Alerting rules evaluated against Longhorn metrics.
+metadata:
+  labels:
+    prometheus: longhorn  # presumably matched by a Prometheus ruleSelector; confirm
+    role: alert-rules
+  name: prometheus-longhorn-rules
+spec:
+  groups:
+  - name: longhorn.rules
+    rules:
+    - alert: LonghornVolumeActualSpaceUsedWarning  # volume actual size above 90% of its capacity
+      annotations:
+        description: The actual space used by Longhorn volume {{$labels.volume}} on {{$labels.node}} is at {{$value}}% capacity for
+          more than 5 minutes.
+        summary: The actual used space of Longhorn volume is over 90% of the capacity.
+      expr: (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes) * 100 > 90
+      for: 5m
+      labels:
+        issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
+        severity: warning
+    - alert: LonghornVolumeStatusCritical  # robustness value 3 is reported as "Fault"
+      annotations:
+        description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
+          more than 5 minutes.
+        summary: Longhorn volume {{$labels.volume}} is Fault
+      expr: longhorn_volume_robustness == 3
+      for: 5m  # keep in sync with the duration quoted in the description
+      labels:
+        issue: Longhorn volume {{$labels.volume}} is Fault.
+        severity: critical
+    - alert: LonghornVolumeStatusWarning  # robustness value 2 is reported as "Degraded"
+      annotations:
+        description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
+          more than 5 minutes.
+        summary: Longhorn volume {{$labels.volume}} is Degraded
+      expr: longhorn_volume_robustness == 2
+      for: 5m
+      labels:
+        issue: Longhorn volume {{$labels.volume}} is Degraded.
+        severity: warning
+    - alert: LonghornNodeStorageWarning  # node storage usage above 70% of its capacity
+      annotations:
+        description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
+          more than 5 minutes.
+        summary: The used storage of node is over 70% of the capacity.
+      expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
+      for: 5m
+      labels:
+        issue: The used storage of node {{$labels.node}} is high.
+        severity: warning
+    - alert: LonghornDiskStorageWarning  # disk usage above 70% of its capacity
+      annotations:
+        description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
+          more than 5 minutes.
+        summary: The used storage of disk is over 70% of the capacity.
+      expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
+      for: 5m
+      labels:
+        issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
+        severity: warning
+    - alert: LonghornNodeDown  # expected node count minus nodes reporting ready == 1 is positive
+      annotations:
+        description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
+        summary: Longhorn nodes are offline
+      expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
+      for: 5m
+      labels:  # no {{$value}} here: a changing label value changes the alert identity and restarts "for"
+        issue: One or more Longhorn nodes are offline.
+        severity: critical
+    - alert: LonghornInstanceManagerCPUUsageWarning  # CPU usage above 300% of the CPU request
+      annotations:
+        description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
+          more than 5 minutes.
+        summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
+      expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
+      for: 5m
+      labels:
+        issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
+        severity: warning
+    - alert: LonghornNodeCPUUsageWarning  # node CPU usage above 90% of its CPU capacity
+      annotations:
+        description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
+          more than 5 minutes.
+        summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
+      expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
+      for: 5m
+      labels:
+        issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
+        severity: warning
diff --git a/tasks/longhorn.yml b/tasks/longhorn.yml
index c00c6cb..17a73a0 100644
--- a/tasks/longhorn.yml
+++ b/tasks/longhorn.yml
@@ -72,6 +72,16 @@
             csi.storage.k8s.io/fsType: ext4
             recurringJobSelector: '[{"name":"snapshot","isGroup":true}, {"name":"backup-daily","isGroup":true}]'
 
+    - name: Install Prometheus rules
+      kubernetes.core.k8s:  # applies every manifest listed in with_items
+        state: "present"
+        context: "{{ my_context }}"
+        namespace: "{{ storage_longhorn_namespace }}"
+        apply: true
+        resource_definition: "{{ lookup('file', 'longhorn/' + item) | from_yaml_all | list }}"
+      with_items:  # manifest file names, resolved by the file lookup under longhorn/
+        - longhorn_prometheusRules.yml
+
   when:
     - storage_longhorn_enabled
   tags: