From fc7765b5d4d37209c194a9752fe610129d43a755 Mon Sep 17 00:00:00 2001
From: Adrien Reslinger
Date: Wed, 26 Jun 2024 23:46:26 +0200
Subject: [PATCH] Add prometheus rules for longhorn

---
Review notes (this section is ignored by `git am`):

What the patch does
- Adds a PrometheusRule named prometheus-longhorn-rules with one group,
  longhorn.rules, holding eight alerts: volume actual space used (> 90%),
  volume robustness Fault (== 3) and Degraded (== 2), node storage (> 70%),
  disk storage (> 70%), node down, instance-manager CPU usage vs request
  (> 300%) and node CPU usage vs capacity (> 90%). Every rule uses `for: 5m`.
- Adds an Ansible task that applies that file with kubernetes.core.k8s in
  the namespace given by storage_longhorn_namespace.

Changes made during review (all one-for-one line replacements, so the hunk
line counts and the diffstat below are unchanged)
- LonghornVolumeStatusCritical: the description said "2 minutes" while the
  rule waits `for: 5m`; the text now says 5 minutes.
- LonghornNodeDown: removed {{$value}} from the `issue` label. Labels are
  part of an alert's identity, so a changing value would create a new alert
  and restart the `for` timer. The count is still in the description.
- LonghornNodeDown: grammar fixes in `summary` and `issue`.
- tasks/longhorn.yml: `apply: yes` -> `apply: true`.

NOTE(review): the whitespace of this patch was reconstructed. The
indentation of the context lines in the tasks/longhorn.yml hunk is an
assumption - confirm it against the target file (or apply with
`git apply --ignore-whitespace`). The `index` blob hash was not recomputed
after the edits above.

 files/longhorn/longhorn_prometheusRules.yml | 90 +++++++++++++++++++++
 tasks/longhorn.yml                          | 10 +++
 2 files changed, 100 insertions(+)
 create mode 100644 files/longhorn/longhorn_prometheusRules.yml

diff --git a/files/longhorn/longhorn_prometheusRules.yml b/files/longhorn/longhorn_prometheusRules.yml
new file mode 100644
index 0000000..7f4d14c
--- /dev/null
+++ b/files/longhorn/longhorn_prometheusRules.yml
@@ -0,0 +1,90 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    prometheus: longhorn
+    role: alert-rules
+  name: prometheus-longhorn-rules
+spec:
+  groups:
+  - name: longhorn.rules
+    rules:
+    - alert: LonghornVolumeActualSpaceUsedWarning
+      annotations:
+        description: The actual space used by Longhorn volume {{$labels.volume}} on {{$labels.node}} is at {{$value}}% capacity for
+          more than 5 minutes.
+        summary: The actual used space of Longhorn volume is over 90% of the capacity.
+      expr: (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes) * 100 > 90
+      for: 5m
+      labels:
+        issue: The actual used space of Longhorn volume {{$labels.volume}} on {{$labels.node}} is high.
+        severity: warning
+    - alert: LonghornVolumeStatusCritical
+      annotations:
+        description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Fault for
+          more than 5 minutes.
+        summary: Longhorn volume {{$labels.volume}} is Fault
+      expr: longhorn_volume_robustness == 3
+      for: 5m
+      labels:
+        issue: Longhorn volume {{$labels.volume}} is Fault.
+        severity: critical
+    - alert: LonghornVolumeStatusWarning
+      annotations:
+        description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is Degraded for
+          more than 5 minutes.
+        summary: Longhorn volume {{$labels.volume}} is Degraded
+      expr: longhorn_volume_robustness == 2
+      for: 5m
+      labels:
+        issue: Longhorn volume {{$labels.volume}} is Degraded.
+        severity: warning
+    - alert: LonghornNodeStorageWarning
+      annotations:
+        description: The used storage of node {{$labels.node}} is at {{$value}}% capacity for
+          more than 5 minutes.
+        summary: The used storage of node is over 70% of the capacity.
+      expr: (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes) * 100 > 70
+      for: 5m
+      labels:
+        issue: The used storage of node {{$labels.node}} is high.
+        severity: warning
+    - alert: LonghornDiskStorageWarning
+      annotations:
+        description: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is at {{$value}}% capacity for
+          more than 5 minutes.
+        summary: The used storage of disk is over 70% of the capacity.
+      expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 > 70
+      for: 5m
+      labels:
+        issue: The used storage of disk {{$labels.disk}} on node {{$labels.node}} is high.
+        severity: warning
+    - alert: LonghornNodeDown
+      annotations:
+        description: There are {{$value}} Longhorn nodes which have been offline for more than 5 minutes.
+        summary: Longhorn nodes are offline
+      expr: (avg(longhorn_node_count_total) or on() vector(0)) - (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
+      for: 5m
+      labels:
+        issue: One or more Longhorn nodes are offline.
+        severity: critical
+    - alert: LonghornInstanceManagerCPUUsageWarning
+      annotations:
+        description: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for
+          more than 5 minutes.
+        summary: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} has CPU Usage / CPU request is over 300%.
+      expr: (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu) * 100 > 300
+      for: 5m
+      labels:
+        issue: Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}} consumes 3 times the CPU request.
+        severity: warning
+    - alert: LonghornNodeCPUUsageWarning
+      annotations:
+        description: Longhorn node {{$labels.node}} has CPU Usage / CPU capacity is {{$value}}% for
+          more than 5 minutes.
+        summary: Longhorn node {{$labels.node}} experiences high CPU pressure for more than 5m.
+      expr: (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu) * 100 > 90
+      for: 5m
+      labels:
+        issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
+        severity: warning
diff --git a/tasks/longhorn.yml b/tasks/longhorn.yml
index c00c6cb..17a73a0 100644
--- a/tasks/longhorn.yml
+++ b/tasks/longhorn.yml
@@ -72,6 +72,16 @@
             csi.storage.k8s.io/fsType: ext4
             recurringJobSelector: '[{"name":"snapshot","isGroup":true}, {"name":"backup-daily","isGroup":true}]'
 
+    - name: Install Prometheus rules
+      kubernetes.core.k8s:
+        state: "present"
+        context: "{{ my_context }}"
+        namespace: "{{ storage_longhorn_namespace }}"
+        apply: true
+        resource_definition: "{{ lookup('file', 'longhorn/' + item) | from_yaml_all }}"
+      with_items:
+        - longhorn_prometheusRules.yml
+
   when:
     - storage_longhorn_enabled
   tags: