From af03d7e1c4769e2a9481027c274eb7f66e87cd43 Mon Sep 17 00:00:00 2001 From: Adrien Date: Tue, 28 Jan 2020 09:11:29 +0100 Subject: [PATCH] First commit --- defaults/main.yml | 3 + meta/main.yml | 6 + scripts/update-templates.sh | 8 + tasks/main.yml | 57 + templates/0-namespace.yaml | 6 + templates/config.yaml | 253 ++++ templates/deployment.yaml | 63 + .../discovery/kube-controller-manager.yaml | 18 + templates/discovery/kube-proxy.yaml | 19 + templates/discovery/kube-scheduler.yaml | 18 + .../cluster-role-binding.yaml | 12 + .../kube-state-metrics/cluster-role.yaml | 103 ++ .../kube-state-metrics/deployment.yaml | 42 + .../kube-state-metrics/service-account.yaml | 5 + .../exporters/kube-state-metrics/service.yaml | 19 + .../exporters/node-exporter/daemonset.yaml | 73 + .../node-exporter/service-account.yaml | 5 + .../exporters/node-exporter/service.yaml | 19 + templates/network-policy.yaml | 28 + templates/rbac/cluster-role-binding.yaml | 12 + templates/rbac/cluster-role.yaml | 15 + templates/rules.yaml | 1228 +++++++++++++++++ templates/service-account.yaml | 5 + templates/service.yaml | 18 + vars/main.yml | 0 25 files changed, 2035 insertions(+) create mode 100644 defaults/main.yml create mode 100644 meta/main.yml create mode 100644 scripts/update-templates.sh create mode 100644 tasks/main.yml create mode 100644 templates/0-namespace.yaml create mode 100644 templates/config.yaml create mode 100644 templates/deployment.yaml create mode 100644 templates/discovery/kube-controller-manager.yaml create mode 100644 templates/discovery/kube-proxy.yaml create mode 100644 templates/discovery/kube-scheduler.yaml create mode 100644 templates/exporters/kube-state-metrics/cluster-role-binding.yaml create mode 100644 templates/exporters/kube-state-metrics/cluster-role.yaml create mode 100644 templates/exporters/kube-state-metrics/deployment.yaml create mode 100644 templates/exporters/kube-state-metrics/service-account.yaml create mode 100644 templates/exporters/kube-state-metrics/service.yaml create mode 100644 templates/exporters/node-exporter/daemonset.yaml create mode 100644 templates/exporters/node-exporter/service-account.yaml create mode 100644 templates/exporters/node-exporter/service.yaml create mode 100644 templates/network-policy.yaml create mode 100644 templates/rbac/cluster-role-binding.yaml create mode 100644 templates/rbac/cluster-role.yaml create mode 100644 templates/rules.yaml create mode 100644 templates/service-account.yaml create mode 100644 templates/service.yaml create mode 100644 vars/main.yml diff --git a/defaults/main.yml b/defaults/main.yml new file mode 100644 index 0000000..8d9b868 --- /dev/null +++ b/defaults/main.yml @@ -0,0 +1,3 @@ +my_context: kubernetes +my_namespace: monitoring +prometheus_state: present \ No newline at end of file diff --git a/meta/main.yml b/meta/main.yml new file mode 100644 index 0000000..712cd5d --- /dev/null +++ b/meta/main.yml @@ -0,0 +1,6 @@ +galaxy_info: + author: Adrien Reslinger + description: Install Prometheus stack to a kubernetes cluster + company: Personnal + min_ansible_version: 2.6 + galaxy_tags: [] diff --git a/scripts/update-templates.sh b/scripts/update-templates.sh new file mode 100644 index 0000000..dea0234 --- /dev/null +++ b/scripts/update-templates.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +git clone https://github.com/poseidon/typhoon.git +typhoon/addons/prometheus + +for i in $(ls rbac/*.yaml); do echo " - $i"; done +for i in $(ls *.yaml); do echo " - $i"; done +for i in $(ls discovery/*.yaml); do echo " - $i"; done +for i in 
$(ls exporters/*/*.yaml); do echo " - $i"; done diff --git a/tasks/main.yml b/tasks/main.yml new file mode 100644 index 0000000..281d6b6 --- /dev/null +++ b/tasks/main.yml @@ -0,0 +1,57 @@ +#- debug: var=my_context +# tags: traefik + +- name: traefik setup + block: + - name: namespace + k8s: + state: "{{ prometheus_state }}" + context: "{{ my_context }}" + name: "{{ my_namespace }}" + api_version: v1 + kind: Namespace + +# - name: Create a Secret object for basic authentification +# k8s: +# state: "{{ prometheus_state }}" +# context: "{{ my_context }}" +# definition: +# apiVersion: v1 +# kind: Secret +# metadata: +# name: basic-auth +# namespace: "{{ my_namespace }}" +# type: Opaque +# data: +# basic_auth: "{{ basic_auth_data | b64encode }}" +# when: +# - basic_auth == true +# tags: prom + + - name: Prometheus files need to be {{ prometheus_state }} + k8s: + state: "{{ prometheus_state }}" + context: "{{ my_context }}" + resource_definition: "{{ lookup('template', item) | from_yaml }}" + with_items: +# - 0-namespace.yaml + - rbac/cluster-role-binding.yaml + - rbac/cluster-role.yaml + - config.yaml + - deployment.yaml + - network-policy.yaml + - rules.yaml + - service-account.yaml + - service.yaml + - discovery/kube-controller-manager.yaml + - discovery/kube-proxy.yaml + - discovery/kube-scheduler.yaml + - exporters/kube-state-metrics/cluster-role-binding.yaml + - exporters/kube-state-metrics/cluster-role.yaml + - exporters/kube-state-metrics/deployment.yaml + - exporters/kube-state-metrics/service-account.yaml + - exporters/kube-state-metrics/service.yaml + - exporters/node-exporter/daemonset.yaml + - exporters/node-exporter/service-account.yaml + - exporters/node-exporter/service.yaml + tags: traefik diff --git a/templates/0-namespace.yaml b/templates/0-namespace.yaml new file mode 100644 index 0000000..90d12ef --- /dev/null +++ b/templates/0-namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: monitoring + labels: + name: monitoring diff --git a/templates/config.yaml b/templates/config.yaml new file mode 100644 index 0000000..38b7472 --- /dev/null +++ b/templates/config.yaml @@ -0,0 +1,253 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: monitoring +data: + prometheus.yaml: |- + # Global config + global: + scrape_interval: 15s + + # AlertManager + alerting: + alertmanagers: + - static_configs: + - targets: + - alertmanager:9093 + + # Scrape configs for running Prometheus on a Kubernetes cluster. + # This uses separate scrape configs for cluster components (i.e. API server, node) + # and services to allow each to use different authentication configs. + # + # Kubernetes labels will be added as Prometheus labels on metrics via the + # `labelmap` relabeling action. + scrape_configs: + + # Scrape config for API servers. + # + # Kubernetes exposes API servers as endpoints to the default/kubernetes + # service so this uses `endpoints` role and uses relabelling to only keep + # the endpoints associated with the default/kubernetes service using the + # default named port `https`. This works for single API server deployments as + # well as HA API server deployments. + - job_name: 'kubernetes-apiservers' + kubernetes_sd_configs: + - role: endpoints + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # Using endpoints to discover kube-apiserver targets finds the pod IP + # (host IP since apiserver uses host network) which is not used in + # the server certificate. 
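Stepping back to tasks/main.yml above: every listed template is rendered with lookup('template', item), parsed with from_yaml, and applied through the k8s module with the requested prometheus_state. A minimal sketch of a playbook consuming this role follows; the role name "prometheus" and the play target are assumptions (they are not part of this patch), and the k8s module of this Ansible generation relies on the openshift Python client being available on the machine running the play.

```yaml
# Hypothetical usage sketch; the role name and play target are assumptions.
- hosts: localhost
  connection: local
  gather_facts: false
  roles:
    - role: prometheus
      vars:
        my_context: kubernetes     # kubeconfig context passed to the k8s module
        my_namespace: monitoring   # namespace created by the first task
        prometheus_state: present  # set to "absent" to remove the stack
```

Because all manifests flow through the same loop, flipping prometheus_state to absent reuses the exact same task list to tear the stack down.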
+ insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + - replacement: apiserver + action: replace + target_label: job + + metric_relabel_configs: + - source_labels: [__name__] + action: drop + regex: etcd_(debugging|disk|request|server).* + - source_labels: [__name__] + action: drop + regex: apiserver_admission_controller_admission_latencies_seconds_.* + - source_labels: [__name__] + action: drop + regex: apiserver_admission_step_admission_latencies_seconds_.* + - source_labels: [__name__, group] + regex: apiserver_request_duration_seconds_bucket;.+ + action: drop + + # Scrape config for node (i.e. kubelet) /metrics (e.g. 'kubelet_'). Explore + # metrics from a node by scraping kubelet (127.0.0.1:10250/metrics). + - job_name: 'kubelet' + kubernetes_sd_configs: + - role: node + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # Kubelet certs don't have any fixed IP SANs + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_name + + # Scrape config for Kubelet cAdvisor. Explore metrics from a node by + # scraping kubelet (127.0.0.1:10250/metrics/cadvisor). + - job_name: 'kubernetes-cadvisor' + kubernetes_sd_configs: + - role: node + + scheme: https + metrics_path: /metrics/cadvisor + tls_config: + # Kubelet certs don't have any fixed IP SANs + insecure_skip_verify: true + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_name + metric_relabel_configs: + - source_labels: [__name__, image] + action: drop + regex: container_([a-z_]+); + - source_labels: [__name__] + action: drop + regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) + + + # Scrap etcd metrics from controllers via listen-metrics-urls + - job_name: 'etcd' + kubernetes_sd_configs: + - role: node + scheme: http + relabel_configs: + - source_labels: [__meta_kubernetes_node_label_node_kubernetes_io_controller] + action: keep + regex: 'true' + - action: labelmap + regex: __meta_kubernetes_node_name + - source_labels: [__meta_kubernetes_node_address_InternalIP] + action: replace + target_label: __address__ + replacement: '${1}:2381' + + # Scrape config for service endpoints. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/scrape`: Only scrape services that have a value of `true` + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: If the metrics are exposed on a different port to the + # service then set this appropriately. 
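The annotation convention described above is what the 'kubernetes-service-endpoints' job that follows acts on. As an illustration only (the Service name and port are invented, not part of this patch), a workload opts in to scraping like this:

```yaml
# Illustrative only: a Service annotated so the kubernetes-service-endpoints
# job below will scrape it. Name and port are made up for the example.
apiVersion: v1
kind: Service
metadata:
  name: example-app
  namespace: default
  annotations:
    prometheus.io/scrape: 'true'    # opt this service's endpoints in
    prometheus.io/port: '8080'      # scrape this port instead of the service port
    prometheus.io/path: '/metrics'  # the default; shown for completeness
spec:
  selector:
    app: example-app
  ports:
    - name: metrics
      port: 8080
      targetPort: 8080
```

The relabel rule on __address__ and prometheus.io/port in the job below then rewrites each discovered endpoint to pod-ip:8080 before scraping, and the service name becomes the job label.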
+ - job_name: 'kubernetes-service-endpoints' + kubernetes_sd_configs: + - role: endpoints + + honor_labels: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: job + + metric_relabel_configs: + - source_labels: [__name__] + action: drop + regex: etcd_(debugging|disk|request|server).* + + # Example scrape config for probing services via the Blackbox Exporter. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/probe`: Only probe services that have a value of `true` + - job_name: 'kubernetes-services' + + metrics_path: /probe + params: + module: [http_2xx] + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] + action: keep + regex: true + - source_labels: [__address__] + target_label: __param_target + - target_label: __address__ + replacement: blackbox + - source_labels: [__param_target] + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + - source_labels: [__meta_kubernetes_service_name] + target_label: job + + # Example scrape config for pods + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the + # pod's declared ports (default is a port-free target if none are declared). 
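One note on the 'kubernetes-services' probe job above before the pod scrape job that follows: it rewrites every Service annotated with prometheus.io/probe into a /probe request against a host named blackbox, using the http_2xx module. The Blackbox Exporter itself is not part of this patch, so it has to be deployed separately and exposed through a Service named blackbox for those probes to succeed. Opting a Service into probing is then just an annotation; the example below is illustrative, with an invented name:

```yaml
# Illustrative only: a Service opted into HTTP probing by the
# 'kubernetes-services' job above. Assumes a separately deployed
# Blackbox Exporter reachable via a Service named "blackbox".
apiVersion: v1
kind: Service
metadata:
  name: example-web
  namespace: default
  annotations:
    prometheus.io/probe: 'true'   # probed with the http_2xx module
spec:
  selector:
    app: example-web
  ports:
    - port: 80
```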
+ - job_name: 'kubernetes-pods' + + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + + # Rule files + rule_files: + - "/etc/prometheus/rules/*.rules" + - "/etc/prometheus/rules/*.yaml" + - "/etc/prometheus/rules/*.yml" diff --git a/templates/deployment.yaml b/templates/deployment.yaml new file mode 100644 index 0000000..873aeaa --- /dev/null +++ b/templates/deployment.yaml @@ -0,0 +1,63 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + name: prometheus + phase: prod + template: + metadata: + labels: + name: prometheus + phase: prod + annotations: + seccomp.security.alpha.kubernetes.io/pod: 'docker/default' + spec: + serviceAccountName: prometheus + containers: + - name: prometheus + image: quay.io/prometheus/prometheus:v2.15.2 + args: + - --web.listen-address=0.0.0.0:9090 + - --config.file=/etc/prometheus/prometheus.yaml + - --storage.tsdb.path=/var/lib/prometheus + ports: + - name: web + containerPort: 9090 + resources: + requests: + cpu: 100m + memory: 200Mi + volumeMounts: + - name: config + mountPath: /etc/prometheus + - name: rules + mountPath: /etc/prometheus/rules + - name: data + mountPath: /var/lib/prometheus + livenessProbe: + httpGet: + path: /-/healthy + port: 9090 + initialDelaySeconds: 10 + timeoutSeconds: 10 + readinessProbe: + httpGet: + path: /-/ready + port: 9090 + initialDelaySeconds: 10 + timeoutSeconds: 10 + terminationGracePeriodSeconds: 30 + volumes: + - name: config + configMap: + name: prometheus-config + - name: rules + configMap: + name: prometheus-rules + - name: data + emptyDir: {} diff --git a/templates/discovery/kube-controller-manager.yaml b/templates/discovery/kube-controller-manager.yaml new file mode 100644 index 0000000..1dabf72 --- /dev/null +++ b/templates/discovery/kube-controller-manager.yaml @@ -0,0 +1,18 @@ +# Allow Prometheus to scrape service endpoints +apiVersion: v1 +kind: Service +metadata: + name: kube-controller-manager + namespace: kube-system + annotations: + prometheus.io/scrape: 'true' +spec: + type: ClusterIP + clusterIP: None + selector: + k8s-app: kube-controller-manager + ports: + - name: metrics + protocol: TCP + port: 10252 + targetPort: 10252 diff --git a/templates/discovery/kube-proxy.yaml b/templates/discovery/kube-proxy.yaml new file mode 100644 index 0000000..9c49bef --- /dev/null +++ b/templates/discovery/kube-proxy.yaml @@ -0,0 +1,19 @@ +# Allow Prometheus to scrape service endpoints +apiVersion: v1 +kind: Service +metadata: + name: kube-proxy + namespace: kube-system + annotations: + prometheus.io/scrape: 'true' + prometheus.io/port: '10249' +spec: + type: ClusterIP + clusterIP: None + selector: + k8s-app: kube-proxy + ports: + - name: metrics + protocol: TCP + port: 10249 + targetPort: 10249 diff --git 
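The Prometheus Deployment above keeps its TSDB under /var/lib/prometheus in an emptyDir volume, so all collected history is lost whenever the pod is rescheduled. If retention matters, a PersistentVolumeClaim can take its place; the sketch below is an assumption (claim name, size and storage class are not part of this patch).

```yaml
# Hypothetical sketch: persist /var/lib/prometheus instead of the emptyDir
# used in templates/deployment.yaml. Claim name and size are assumptions.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-data
  namespace: monitoring
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 50Gi
---
# In templates/deployment.yaml the "data" volume would then become:
# volumes:
#   - name: data
#     persistentVolumeClaim:
#       claimName: prometheus-data
```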
a/templates/discovery/kube-scheduler.yaml b/templates/discovery/kube-scheduler.yaml new file mode 100644 index 0000000..0032cf1 --- /dev/null +++ b/templates/discovery/kube-scheduler.yaml @@ -0,0 +1,18 @@ +# Allow Prometheus to scrape service endpoints +apiVersion: v1 +kind: Service +metadata: + name: kube-scheduler + namespace: kube-system + annotations: + prometheus.io/scrape: 'true' +spec: + type: ClusterIP + clusterIP: None + selector: + k8s-app: kube-scheduler + ports: + - name: metrics + protocol: TCP + port: 10251 + targetPort: 10251 diff --git a/templates/exporters/kube-state-metrics/cluster-role-binding.yaml b/templates/exporters/kube-state-metrics/cluster-role-binding.yaml new file mode 100644 index 0000000..9a8f311 --- /dev/null +++ b/templates/exporters/kube-state-metrics/cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: monitoring diff --git a/templates/exporters/kube-state-metrics/cluster-role.yaml b/templates/exporters/kube-state-metrics/cluster-role.yaml new file mode 100644 index 0000000..9346b9a --- /dev/null +++ b/templates/exporters/kube-state-metrics/cluster-role.yaml @@ -0,0 +1,103 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kube-state-metrics +rules: +- apiGroups: + - "" + resources: + - configmaps + - secrets + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: + - list + - watch +- apiGroups: + - extensions + resources: + - daemonsets + - deployments + - replicasets + - ingresses + verbs: + - list + - watch +- apiGroups: + - apps + resources: + - statefulsets + - daemonsets + - deployments + - replicasets + verbs: + - list + - watch +- apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - list + - watch +- apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - list + - watch +- apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - list + - watch +- apiGroups: + - certificates.k8s.io + resources: + - certificatesigningrequests + verbs: + - list + - watch +- apiGroups: + - storage.k8s.io + resources: + - storageclasses + - volumeattachments + verbs: + - list + - watch +- apiGroups: + - autoscaling.k8s.io + resources: + - verticalpodautoscalers + verbs: + - list + - watch +- apiGroups: + - admissionregistration.k8s.io + resources: + - mutatingwebhookconfigurations + - validatingwebhookconfigurations + verbs: + - list + - watch +- apiGroups: + - networking.k8s.io + resources: + - networkpolicies + verbs: + - list + - watch + diff --git a/templates/exporters/kube-state-metrics/deployment.yaml b/templates/exporters/kube-state-metrics/deployment.yaml new file mode 100644 index 0000000..ab7c623 --- /dev/null +++ b/templates/exporters/kube-state-metrics/deployment.yaml @@ -0,0 +1,42 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: monitoring +spec: + replicas: 1 + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + selector: + matchLabels: + name: kube-state-metrics + phase: prod + template: + metadata: + labels: + name: kube-state-metrics + phase: prod + annotations: + seccomp.security.alpha.kubernetes.io/pod: 
'docker/default' + spec: + serviceAccountName: kube-state-metrics + containers: + - name: kube-state-metrics + image: quay.io/coreos/kube-state-metrics:v1.9.3 + ports: + - name: metrics + containerPort: 8080 + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 + readinessProbe: + httpGet: + path: / + port: 8081 + initialDelaySeconds: 5 + timeoutSeconds: 5 diff --git a/templates/exporters/kube-state-metrics/service-account.yaml b/templates/exporters/kube-state-metrics/service-account.yaml new file mode 100644 index 0000000..fff1028 --- /dev/null +++ b/templates/exporters/kube-state-metrics/service-account.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics + namespace: monitoring diff --git a/templates/exporters/kube-state-metrics/service.yaml b/templates/exporters/kube-state-metrics/service.yaml new file mode 100644 index 0000000..fbdad78 --- /dev/null +++ b/templates/exporters/kube-state-metrics/service.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + name: kube-state-metrics + namespace: monitoring + annotations: + prometheus.io/scrape: 'true' +spec: + type: ClusterIP + # service is created to allow prometheus to scape endpoints + clusterIP: None + selector: + name: kube-state-metrics + phase: prod + ports: + - name: metrics + protocol: TCP + port: 8080 + targetPort: 8080 diff --git a/templates/exporters/node-exporter/daemonset.yaml b/templates/exporters/node-exporter/daemonset.yaml new file mode 100644 index 0000000..7ef8873 --- /dev/null +++ b/templates/exporters/node-exporter/daemonset.yaml @@ -0,0 +1,73 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-exporter + namespace: monitoring +spec: + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + selector: + matchLabels: + name: node-exporter + phase: prod + template: + metadata: + labels: + name: node-exporter + phase: prod + annotations: + seccomp.security.alpha.kubernetes.io/pod: 'docker/default' + spec: + serviceAccountName: node-exporter + securityContext: + runAsNonRoot: true + runAsUser: 65534 + hostNetwork: true + hostPID: true + containers: + - name: node-exporter + image: quay.io/prometheus/node-exporter:v0.18.1 + args: + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + - --path.rootfs=/host/root + - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/) + - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ + ports: + - name: metrics + containerPort: 9100 + hostPort: 9100 + resources: + requests: + cpu: 100m + memory: 50Mi + limits: + cpu: 200m + memory: 100Mi + volumeMounts: + - name: proc + mountPath: /host/proc + readOnly: true + - name: sys + mountPath: /host/sys + readOnly: true + - name: root + mountPath: /host/root + readOnly: true + tolerations: + - key: node-role.kubernetes.io/master + operator: Exists + - key: node.kubernetes.io/not-ready + operator: Exists + volumes: + - name: proc + hostPath: + path: /proc + - name: sys + hostPath: + path: /sys + - name: root + hostPath: + path: / diff --git a/templates/exporters/node-exporter/service-account.yaml b/templates/exporters/node-exporter/service-account.yaml new file mode 100644 index 0000000..8a03ac1 --- /dev/null +++ b/templates/exporters/node-exporter/service-account.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: 
node-exporter + namespace: monitoring diff --git a/templates/exporters/node-exporter/service.yaml b/templates/exporters/node-exporter/service.yaml new file mode 100644 index 0000000..62edcb3 --- /dev/null +++ b/templates/exporters/node-exporter/service.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + name: node-exporter + namespace: monitoring + annotations: + prometheus.io/scrape: 'true' +spec: + type: ClusterIP + # service is created to allow prometheus to scape endpoints + clusterIP: None + selector: + name: node-exporter + phase: prod + ports: + - name: metrics + protocol: TCP + port: 80 + targetPort: 9100 diff --git a/templates/network-policy.yaml b/templates/network-policy.yaml new file mode 100644 index 0000000..e64cdb5 --- /dev/null +++ b/templates/network-policy.yaml @@ -0,0 +1,28 @@ +# Allow Grafana access and in-cluster Prometheus scraping +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: prometheus + namespace: monitoring +spec: + podSelector: + matchLabels: + name: prometheus + ingress: + - ports: + - protocol: TCP + port: 9090 + from: + - namespaceSelector: + matchLabels: + name: monitoring + podSelector: + matchLabels: + name: grafana + - namespaceSelector: + matchLabels: + name: monitoring + podSelector: + matchLabels: + name: prometheus + diff --git a/templates/rbac/cluster-role-binding.yaml b/templates/rbac/cluster-role-binding.yaml new file mode 100644 index 0000000..127f83a --- /dev/null +++ b/templates/rbac/cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount + name: prometheus + namespace: monitoring diff --git a/templates/rbac/cluster-role.yaml b/templates/rbac/cluster-role.yaml new file mode 100644 index 0000000..6f6cee0 --- /dev/null +++ b/templates/rbac/cluster-role.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/metrics + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] diff --git a/templates/rules.yaml b/templates/rules.yaml new file mode 100644 index 0000000..78746f2 --- /dev/null +++ b/templates/rules.yaml @@ -0,0 +1,1228 @@ +apiVersion: v1 +data: + etcd.yaml: |- + { + "groups": [ + { + "name": "etcd", + "rules": [ + { + "alert": "etcdMembersDown", + "annotations": { + "message": "etcd cluster \"{{ $labels.job }}\": members are down ({{ $value }})." + }, + "expr": "max by (job) (\n sum by (job) (up{job=~\".*etcd.*\"} == bool 0)\nor\n count by (job,endpoint) (\n sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[3m])) > 0.01\n )\n)\n> 0\n", + "for": "3m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "etcdInsufficientMembers", + "annotations": { + "message": "etcd cluster \"{{ $labels.job }}\": insufficient members ({{ $value }})." + }, + "expr": "sum(up{job=~\".*etcd.*\"} == bool 1) by (job) < ((count(up{job=~\".*etcd.*\"}) by (job) + 1) / 2)\n", + "for": "3m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "etcdNoLeader", + "annotations": { + "message": "etcd cluster \"{{ $labels.job }}\": member {{ $labels.instance }} has no leader." 
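The rule files in this ConfigMap are embedded as JSON documents under keys ending in .yaml; Prometheus loads them through the *.yaml glob in rule_files because JSON is a subset of YAML. Note that tasks/main.yml renders this file through lookup('template', ...), so the Prometheus {{ $labels.* }} and {{ $value }} placeholders are seen by Jinja2 first; wrapping the rule documents in {% raw %}...{% endraw %} (or loading this one item with a file lookup) may be needed for the render to succeed. For readers more used to the conventional rule-file syntax, the etcdNoLeader alert defined here is equivalent to this plain YAML form:

```yaml
# The same etcdNoLeader alert as in the JSON document, rewritten in
# conventional Prometheus rule-file YAML. Content unchanged.
groups:
  - name: etcd
    rules:
      - alert: etcdNoLeader
        expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
```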
+ }, + "expr": "etcd_server_has_leader{job=~\".*etcd.*\"} == 0\n", + "for": "1m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "etcdHighNumberOfLeaderChanges", + "annotations": { + "message": "etcd cluster \"{{ $labels.job }}\": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last 30 minutes." + }, + "expr": "rate(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}[15m]) > 3\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "etcdGRPCRequestsSlow", + "annotations": { + "message": "etcd cluster \"{{ $labels.job }}\": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}." + }, + "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\", grpc_type=\"unary\"}[5m])) by (job, instance, grpc_service, grpc_method, le))\n> 0.15\n", + "for": "10m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "etcdMemberCommunicationSlow", + "annotations": { + "message": "etcd cluster \"{{ $labels.job }}\": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}." + }, + "expr": "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.15\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "etcdHighNumberOfFailedProposals", + "annotations": { + "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}." + }, + "expr": "rate(etcd_server_proposals_failed_total{job=~\".*etcd.*\"}[15m]) > 5\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "etcdHighFsyncDurations", + "annotations": { + "message": "etcd cluster \"{{ $labels.job }}\": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}." + }, + "expr": "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.5\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "etcdHighCommitDurations", + "annotations": { + "message": "etcd cluster \"{{ $labels.job }}\": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}." + }, + "expr": "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.25\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "etcdHighNumberOfFailedHTTPRequests", + "annotations": { + "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}" + }, + "expr": "sum(rate(etcd_http_failed_total{job=~\".*etcd.*\", code!=\"404\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m]))\nBY (method) > 0.01\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "etcdHighNumberOfFailedHTTPRequests", + "annotations": { + "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}." 
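Prometheus refuses to load malformed rule files, so it can be worth checking the documents embedded in this ConfigMap before they are applied. A minimal pre-flight sketch is below; it is not part of the role, the local paths are assumptions (copies of the etcd.yaml and kube.yaml data keys saved out of the template), and promtool is assumed to be installed on the machine running the play.

```yaml
# Hypothetical pre-flight task, not part of the role: validate rule files
# with promtool before the ConfigMap is applied. Paths are assumptions.
- name: Validate Prometheus rule files
  command: promtool check rules /tmp/prometheus-rules/etcd.yaml /tmp/prometheus-rules/kube.yaml
  delegate_to: localhost
  changed_when: false
```

The same tool can also check the main configuration with promtool check config against a rendered copy of prometheus.yaml.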
+ }, + "expr": "sum(rate(etcd_http_failed_total{job=~\".*etcd.*\", code!=\"404\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m]))\nBY (method) > 0.05\n", + "for": "10m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "etcdHTTPRequestsSlow", + "annotations": { + "message": "etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow." + }, + "expr": "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))\n> 0.15\n", + "for": "10m", + "labels": { + "severity": "warning" + } + } + ] + } + ] + } + kube.yaml: |- + { + "groups": [ + { + "name": "kube-apiserver.rules", + "rules": [ + { + "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.99" + }, + "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.9" + }, + "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.5" + }, + "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" + } + ] + }, + { + "name": "k8s.rules", + "rules": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubernetes-cadvisor\", image!=\"\", container!=\"POD\"}[5m])) by (namespace)\n", + "record": "namespace:container_cpu_usage_seconds_total:sum_rate" + }, + { + "expr": "sum by (namespace, pod, container) (\n rate(container_cpu_usage_seconds_total{job=\"kubernetes-cadvisor\", image!=\"\", container!=\"POD\"}[5m])\n) * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)\n", + "record": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate" + }, + { + "expr": "container_memory_working_set_bytes{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)\n", + "record": "node_namespace_pod_container:container_memory_working_set_bytes" + }, + { + "expr": "container_memory_rss{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)\n", + "record": "node_namespace_pod_container:container_memory_rss" + }, + { + "expr": "container_memory_cache{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)\n", + "record": "node_namespace_pod_container:container_memory_cache" + }, + { + "expr": "container_memory_swap{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info)\n", + "record": "node_namespace_pod_container:container_memory_swap" + }, + { + "expr": "sum(container_memory_usage_bytes{job=\"kubernetes-cadvisor\", image!=\"\", container!=\"POD\"}) by (namespace)\n", + "record": "namespace:container_memory_usage_bytes:sum" + }, + { + "expr": "sum by (namespace, label_name) (\n sum(kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)) by (namespace, 
pod)\n * on (namespace, pod)\n group_left(label_name) kube_pod_labels{job=\"kube-state-metrics\"}\n)\n", + "record": "namespace:kube_pod_container_resource_requests_memory_bytes:sum" + }, + { + "expr": "sum by (namespace, label_name) (\n sum(kube_pod_container_resource_requests_cpu_cores{job=\"kube-state-metrics\"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~\"Pending|Running\"} == 1)) by (namespace, pod)\n * on (namespace, pod)\n group_left(label_name) kube_pod_labels{job=\"kube-state-metrics\"}\n)\n", + "record": "namespace:kube_pod_container_resource_requests_cpu_cores:sum" + }, + { + "expr": "sum(\n label_replace(\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"ReplicaSet\"},\n \"replicaset\", \"$1\", \"owner_name\", \"(.*)\"\n ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job=\"kube-state-metrics\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n) by (namespace, workload, pod)\n", + "labels": { + "workload_type": "deployment" + }, + "record": "mixin_pod_workload" + }, + { + "expr": "sum(\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"DaemonSet\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n) by (namespace, workload, pod)\n", + "labels": { + "workload_type": "daemonset" + }, + "record": "mixin_pod_workload" + }, + { + "expr": "sum(\n label_replace(\n kube_pod_owner{job=\"kube-state-metrics\", owner_kind=\"StatefulSet\"},\n \"workload\", \"$1\", \"owner_name\", \"(.*)\"\n )\n) by (namespace, workload, pod)\n", + "labels": { + "workload_type": "statefulset" + }, + "record": "mixin_pod_workload" + } + ] + }, + { + "name": "kube-scheduler.rules", + "rules": [ + { + "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.99" + }, + "record": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.99" + }, + "record": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.99" + }, + "record": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.9" + }, + "record": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.9" + }, + "record": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.9" + }, + "record": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.5, 
sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.5" + }, + "record": "cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.5" + }, + "record": "cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile" + }, + { + "expr": "histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job=\"kube-scheduler\"}[5m])) without(instance, pod))\n", + "labels": { + "quantile": "0.5" + }, + "record": "cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile" + } + ] + }, + { + "name": "node.rules", + "rules": [ + { + "expr": "sum(min(kube_pod_info) by (node))", + "record": ":kube_pod_info_node_count:" + }, + { + "expr": "max(label_replace(kube_pod_info{job=\"kube-state-metrics\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")) by (node, namespace, pod)\n", + "record": "node_namespace_pod:kube_pod_info:" + }, + { + "expr": "count by (node) (sum by (node, cpu) (\n node_cpu_seconds_total{job=\"node-exporter\"}\n* on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n))\n", + "record": "node:node_num_cpu:sum" + }, + { + "expr": "sum(\n node_memory_MemAvailable_bytes{job=\"node-exporter\"} or\n (\n node_memory_Buffers_bytes{job=\"node-exporter\"} +\n node_memory_Cached_bytes{job=\"node-exporter\"} +\n node_memory_MemFree_bytes{job=\"node-exporter\"} +\n node_memory_Slab_bytes{job=\"node-exporter\"}\n )\n)\n", + "record": ":node_memory_MemAvailable_bytes:sum" + } + ] + }, + { + "name": "kubernetes-apps", + "rules": [ + { + "alert": "KubePodCrashLooping", + "annotations": { + "message": "Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf \"%.2f\" $value }} times / 5 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping" + }, + "expr": "rate(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\"}[15m]) * 60 * 5 > 0\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubePodNotReady", + "annotations": { + "message": "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready" + }, + "expr": "sum by (namespace, pod) (kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Failed|Pending|Unknown\"} * on(namespace, pod) group_left(owner_kind) kube_pod_owner{owner_kind!=\"Job\"}) > 0\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeDeploymentGenerationMismatch", + "annotations": { + "message": "Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch" + }, + "expr": "kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": 
"KubeDeploymentReplicasMismatch", + "annotations": { + "message": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch" + }, + "expr": "kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n !=\nkube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeStatefulSetReplicasMismatch", + "annotations": { + "message": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch" + }, + "expr": "kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"}\n !=\nkube_statefulset_status_replicas{job=\"kube-state-metrics\"}\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeStatefulSetGenerationMismatch", + "annotations": { + "message": "StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch" + }, + "expr": "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_statefulset_metadata_generation{job=\"kube-state-metrics\"}\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeStatefulSetUpdateNotRolledOut", + "annotations": { + "message": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout" + }, + "expr": "max without (revision) (\n kube_statefulset_status_current_revision{job=\"kube-state-metrics\"}\n unless\n kube_statefulset_status_update_revision{job=\"kube-state-metrics\"}\n)\n *\n(\n kube_statefulset_replicas{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}\n)\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeDaemonSetRolloutStuck", + "annotations": { + "message": "Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck" + }, + "expr": "kube_daemonset_status_number_ready{job=\"kube-state-metrics\"}\n /\nkube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"} < 1.00\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeContainerWaiting", + "annotations": { + "message": "Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting" + }, + "expr": "sum by (namespace, pod, container) 
(kube_pod_container_status_waiting_reason{job=\"kube-state-metrics\"}) > 0\n", + "for": "1h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeDaemonSetNotScheduled", + "annotations": { + "message": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled" + }, + "expr": "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n -\nkube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} > 0\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeDaemonSetMisScheduled", + "annotations": { + "message": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled" + }, + "expr": "kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"} > 0\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeCronJobRunning", + "annotations": { + "message": "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning" + }, + "expr": "time() - kube_cronjob_next_schedule_time{job=\"kube-state-metrics\"} > 3600\n", + "for": "1h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeJobCompletion", + "annotations": { + "message": "Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion" + }, + "expr": "kube_job_spec_completions{job=\"kube-state-metrics\"} - kube_job_status_succeeded{job=\"kube-state-metrics\"} > 0\n", + "for": "1h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeJobFailed", + "annotations": { + "message": "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed" + }, + "expr": "kube_job_failed{job=\"kube-state-metrics\"} > 0\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeHpaReplicasMismatch", + "annotations": { + "message": "HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch" + }, + "expr": "(kube_hpa_status_desired_replicas{job=\"kube-state-metrics\"}\n !=\nkube_hpa_status_current_replicas{job=\"kube-state-metrics\"})\n and\nchanges(kube_hpa_status_current_replicas[15m]) == 0\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeHpaMaxedOut", + "annotations": { + "message": "HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout" + }, + "expr": "kube_hpa_status_current_replicas{job=\"kube-state-metrics\"}\n 
==\nkube_hpa_spec_max_replicas{job=\"kube-state-metrics\"}\n", + "for": "15m", + "labels": { + "severity": "warning" + } + } + ] + }, + { + "name": "kubernetes-resources", + "rules": [ + { + "alert": "KubeCPUOvercommit", + "annotations": { + "message": "Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit" + }, + "expr": "sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)\n /\nsum(kube_node_status_allocatable_cpu_cores)\n >\n(count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores)\n", + "for": "5m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeMemOvercommit", + "annotations": { + "message": "Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit" + }, + "expr": "sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum)\n /\nsum(kube_node_status_allocatable_memory_bytes)\n >\n(count(kube_node_status_allocatable_memory_bytes)-1)\n /\ncount(kube_node_status_allocatable_memory_bytes)\n", + "for": "5m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeCPUOvercommit", + "annotations": { + "message": "Cluster has overcommitted CPU resource requests for Namespaces.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit" + }, + "expr": "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"cpu\"})\n /\nsum(kube_node_status_allocatable_cpu_cores)\n > 1.5\n", + "for": "5m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeMemOvercommit", + "annotations": { + "message": "Cluster has overcommitted memory resource requests for Namespaces.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit" + }, + "expr": "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"memory\"})\n /\nsum(kube_node_status_allocatable_memory_bytes{job=\"node-exporter\"})\n > 1.5\n", + "for": "5m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeQuotaExceeded", + "annotations": { + "message": "Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded" + }, + "expr": "kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 0.90\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "CPUThrottlingHigh", + "annotations": { + "message": "{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh" + }, + "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", }[5m])) by (container, pod, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)\n > ( 100 / 100 )\n", 
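The "( 100 / 100 )" threshold in the CPUThrottlingHigh expression above is a percentage written as a fraction: the alert only fires when effectively every CFS period in the 5m window was throttled. Deployments that care about partial throttling often lower it; the variant below is an illustrative tweak, not part of the patch, expressed in plain rule-file YAML with a 25% threshold.

```yaml
# Illustrative tweak: same CPUThrottlingHigh logic as above, but firing once
# 25% of CFS periods are throttled instead of 100%. Not part of the patch.
groups:
  - name: kubernetes-resources-extra
    rules:
      - alert: CPUThrottlingHigh
        expr: |
          sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace)
            /
          sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace)
            > ( 25 / 100 )
        for: 15m
        labels:
          severity: warning
```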
+ "for": "15m", + "labels": { + "severity": "warning" + } + } + ] + }, + { + "name": "kubernetes-storage", + "rules": [ + { + "alert": "KubePersistentVolumeUsageCritical", + "annotations": { + "message": "The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical" + }, + "expr": "kubelet_volume_stats_available_bytes{job=\"kubelet\"}\n /\nkubelet_volume_stats_capacity_bytes{job=\"kubelet\"}\n < 0.03\n", + "for": "1m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubePersistentVolumeFullInFourDays", + "annotations": { + "message": "Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays" + }, + "expr": "(\n kubelet_volume_stats_available_bytes{job=\"kubelet\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}\n) < 0.15\nand\npredict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\"}[6h], 4 * 24 * 3600) < 0\n", + "for": "5m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubePersistentVolumeErrors", + "annotations": { + "message": "The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors" + }, + "expr": "kube_persistentvolume_status_phase{phase=~\"Failed|Pending\",job=\"kube-state-metrics\"} > 0\n", + "for": "5m", + "labels": { + "severity": "critical" + } + } + ] + }, + { + "name": "kubernetes-system", + "rules": [ + { + "alert": "KubeVersionMismatch", + "annotations": { + "message": "There are {{ $value }} different semantic versions of Kubernetes components running.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch" + }, + "expr": "count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~\"kube-dns|coredns\"},\"gitVersion\",\"$1\",\"gitVersion\",\"(v[0-9]*.[0-9]*.[0-9]*).*\"))) > 1\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeClientErrors", + "annotations": { + "message": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors" + }, + "expr": "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (instance, job)\n /\nsum(rate(rest_client_requests_total[5m])) by (instance, job))\n> 0.01\n", + "for": "15m", + "labels": { + "severity": "warning" + } + } + ] + }, + { + "name": "kubernetes-system-apiserver", + "rules": [ + { + "alert": "KubeAPILatencyHigh", + "annotations": { + "message": "The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh" + }, + "expr": 
"cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"} > 1\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeAPILatencyHigh", + "annotations": { + "message": "The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh" + }, + "expr": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"} > 4\n", + "for": "10m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeAPIErrorsHigh", + "annotations": { + "message": "API server is returning errors for {{ $value | humanizePercentage }} of requests.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh" + }, + "expr": "sum(rate(apiserver_request_total{job=\"apiserver\",code=~\"5..\"}[5m]))\n /\nsum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) > 0.03\n", + "for": "10m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeAPIErrorsHigh", + "annotations": { + "message": "API server is returning errors for {{ $value | humanizePercentage }} of requests.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh" + }, + "expr": "sum(rate(apiserver_request_total{job=\"apiserver\",code=~\"5..\"}[5m]))\n /\nsum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) > 0.01\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeAPIErrorsHigh", + "annotations": { + "message": "API server is returning errors for {{ $value | humanizePercentage }} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh" + }, + "expr": "sum(rate(apiserver_request_total{job=\"apiserver\",code=~\"5..\"}[5m])) by (resource,subresource,verb)\n /\nsum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) by (resource,subresource,verb) > 0.10\n", + "for": "10m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeAPIErrorsHigh", + "annotations": { + "message": "API server is returning errors for {{ $value | humanizePercentage }} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh" + }, + "expr": "sum(rate(apiserver_request_total{job=\"apiserver\",code=~\"5..\"}[5m])) by (resource,subresource,verb)\n /\nsum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) by (resource,subresource,verb) > 0.05\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeClientCertificateExpiration", + "annotations": { + "message": "A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration" + }, + "expr": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and 
histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800\n", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeClientCertificateExpiration", + "annotations": { + "message": "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration" + }, + "expr": "apiserver_client_certificate_expiration_seconds_count{job=\"apiserver\"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 86400\n", + "labels": { + "severity": "critical" + } + }, + { + "alert": "KubeAPIDown", + "annotations": { + "message": "KubeAPI has disappeared from Prometheus target discovery.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown" + }, + "expr": "absent(up{job=\"apiserver\"} == 1)\n", + "for": "15m", + "labels": { + "severity": "critical" + } + } + ] + }, + { + "name": "kubernetes-system-kubelet", + "rules": [ + { + "alert": "KubeNodeNotReady", + "annotations": { + "message": "{{ $labels.node }} has been unready for more than 15 minutes.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready" + }, + "expr": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeNodeUnreachable", + "annotations": { + "message": "{{ $labels.node }} is unreachable and some workloads may be rescheduled.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable" + }, + "expr": "kube_node_spec_taint{job=\"kube-state-metrics\",key=\"node.kubernetes.io/unreachable\",effect=\"NoSchedule\"} == 1\n", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeletTooManyPods", + "annotations": { + "message": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods" + }, + "expr": "max(max(kubelet_running_pod_count{job=\"kubelet\"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\"}) by(node) / max(kube_node_status_capacity_pods{job=\"kube-state-metrics\"}) by(node) > 0.95\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "KubeletDown", + "annotations": { + "message": "Kubelet has disappeared from Prometheus target discovery.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown" + }, + "expr": "absent(up{job=\"kubelet\"} == 1)\n", + "for": "15m", + "labels": { + "severity": "critical" + } + } + ] + }, + { + "name": "kubernetes-system-scheduler", + "rules": [ + { + "alert": "KubeSchedulerDown", + "annotations": { + "message": "KubeScheduler has disappeared from Prometheus target discovery.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown" + }, + "expr": "absent(up{job=\"kube-scheduler\"} == 1)\n", + "for": "15m", + "labels": { + "severity": "critical" + } + } + ] + 
}, + { + "name": "kubernetes-system-controller-manager", + "rules": [ + { + "alert": "KubeControllerManagerDown", + "annotations": { + "message": "KubeControllerManager has disappeared from Prometheus target discovery.", + "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown" + }, + "expr": "absent(up{job=\"kube-controller-manager\"} == 1)\n", + "for": "15m", + "labels": { + "severity": "critical" + } + } + ] + } + ] + } + node-exporter.yaml: |- + { + "groups": [ + { + "name": "node-exporter", + "rules": [ + { + "alert": "NodeFilesystemSpaceFillingUp", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up.", + "summary": "Filesystem is predicted to run out of space within the next 24 hours." + }, + "expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} * 100 < 40\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} == 0\n)\n", + "for": "1h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeFilesystemSpaceFillingUp", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left and is filling up fast.", + "summary": "Filesystem is predicted to run out of space within the next 4 hours." + }, + "expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} == 0\n)\n", + "for": "1h", + "labels": { + "severity": "critical" + } + }, + { + "alert": "NodeFilesystemAlmostOutOfSpace", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left.", + "summary": "Filesystem has less than 5% space left." + }, + "expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} == 0\n)\n", + "for": "1h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeFilesystemAlmostOutOfSpace", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available space left.", + "summary": "Filesystem has less than 3% space left." 
+ }, + "expr": "(\n node_filesystem_avail_bytes{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} / node_filesystem_size_bytes{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} == 0\n)\n", + "for": "1h", + "labels": { + "severity": "critical" + } + }, + { + "alert": "NodeFilesystemFilesFillingUp", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up.", + "summary": "Filesystem is predicted to run out of inodes within the next 24 hours." + }, + "expr": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} / node_filesystem_files{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} == 0\n)\n", + "for": "1h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeFilesystemFilesFillingUp", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left and is filling up fast.", + "summary": "Filesystem is predicted to run out of inodes within the next 4 hours." + }, + "expr": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} / node_filesystem_files{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} == 0\n)\n", + "for": "1h", + "labels": { + "severity": "critical" + } + }, + { + "alert": "NodeFilesystemAlmostOutOfFiles", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left.", + "summary": "Filesystem has less than 5% inodes left." + }, + "expr": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} / node_filesystem_files{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} == 0\n)\n", + "for": "1h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeFilesystemAlmostOutOfFiles", + "annotations": { + "description": "Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf \"%.2f\" $value }}% available inodes left.", + "summary": "Filesystem has less than 3% inodes left." + }, + "expr": "(\n node_filesystem_files_free{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} / node_filesystem_files{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node-exporter\",fstype!~\"tmpfs|nsfs|vfat\"} == 0\n)\n", + "for": "1h", + "labels": { + "severity": "critical" + } + }, + { + "alert": "NodeNetworkReceiveErrs", + "annotations": { + "description": "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.", + "summary": "Network interface is reporting many receive errors." 
+ }, + "expr": "increase(node_network_receive_errs_total[2m]) > 10\n", + "for": "1h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "NodeNetworkTransmitErrs", + "annotations": { + "description": "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.", + "summary": "Network interface is reporting many transmit errors." + }, + "expr": "increase(node_network_transmit_errs_total[2m]) > 10\n", + "for": "1h", + "labels": { + "severity": "warning" + } + } + ] + } + ] + } + prom.yaml: |- + { + "groups": [ + { + "name": "prometheus", + "rules": [ + { + "alert": "PrometheusBadConfig", + "annotations": { + "description": "Prometheus {{$labels.instance}} has failed to reload its configuration.", + "summary": "Failed Prometheus configuration reload." + }, + "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\nmax_over_time(prometheus_config_last_reload_successful{job=\"prometheus\"}[5m]) == 0\n", + "for": "10m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "PrometheusNotificationQueueRunningFull", + "annotations": { + "description": "Alert notification queue of Prometheus {{$labels.instance}} is running full.", + "summary": "Prometheus alert notification queue predicted to run full in less than 30m." + }, + "expr": "# Without min_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n predict_linear(prometheus_notifications_queue_length{job=\"prometheus\"}[5m], 60 * 30)\n>\n min_over_time(prometheus_notifications_queue_capacity{job=\"prometheus\"}[5m])\n)\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusErrorSendingAlertsToSomeAlertmanagers", + "annotations": { + "description": "{{ printf \"%.1f\" $value }}% errors while sending alerts from Prometheus {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.", + "summary": "Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager." + }, + "expr": "(\n rate(prometheus_notifications_errors_total{job=\"prometheus\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus\"}[5m])\n)\n* 100\n> 1\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusErrorSendingAlertsToAnyAlertmanager", + "annotations": { + "description": "{{ printf \"%.1f\" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.", + "summary": "Prometheus encounters more than 3% errors sending alerts to any Alertmanager." + }, + "expr": "min without(alertmanager) (\n rate(prometheus_notifications_errors_total{job=\"prometheus\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus\"}[5m])\n)\n* 100\n> 3\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "PrometheusNotConnectedToAlertmanagers", + "annotations": { + "description": "Prometheus {{$labels.instance}} is not connected to any Alertmanagers.", + "summary": "Prometheus is not connected to any Alertmanagers." 
+ }, + "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\nmax_over_time(prometheus_notifications_alertmanagers_discovered{job=\"prometheus\"}[5m]) < 1\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusTSDBReloadsFailing", + "annotations": { + "description": "Prometheus {{$labels.instance}} has detected {{$value | humanize}} reload failures over the last 3h.", + "summary": "Prometheus has issues reloading blocks from disk." + }, + "expr": "increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus\"}[3h]) > 0\n", + "for": "4h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusTSDBCompactionsFailing", + "annotations": { + "description": "Prometheus {{$labels.instance}} has detected {{$value | humanize}} compaction failures over the last 3h.", + "summary": "Prometheus has issues compacting blocks." + }, + "expr": "increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus\"}[3h]) > 0\n", + "for": "4h", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusNotIngestingSamples", + "annotations": { + "description": "Prometheus {{$labels.instance}} is not ingesting samples.", + "summary": "Prometheus is not ingesting samples." + }, + "expr": "rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus\"}[5m]) <= 0\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusDuplicateTimestamps", + "annotations": { + "description": "Prometheus {{$labels.instance}} is dropping {{ printf \"%.4g\" $value }} samples/s with different values but duplicated timestamp.", + "summary": "Prometheus is dropping samples with duplicate timestamps." + }, + "expr": "rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus\"}[5m]) > 0\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusOutOfOrderTimestamps", + "annotations": { + "description": "Prometheus {{$labels.instance}} is dropping {{ printf \"%.4g\" $value }} samples/s with timestamps arriving out of order.", + "summary": "Prometheus drops samples with out-of-order timestamps." + }, + "expr": "rate(prometheus_target_scrapes_sample_out_of_order_total{job=\"prometheus\"}[5m]) > 0\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusRemoteStorageFailures", + "annotations": { + "description": "Prometheus {{$labels.instance}} failed to send {{ printf \"%.1f\" $value }}% of the samples to queue {{$labels.queue}}.", + "summary": "Prometheus fails to send samples to remote storage." + }, + "expr": "(\n rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus\"}[5m])\n/\n (\n rate(prometheus_remote_storage_failed_samples_total{job=\"prometheus\"}[5m])\n +\n rate(prometheus_remote_storage_succeeded_samples_total{job=\"prometheus\"}[5m])\n )\n)\n* 100\n> 1\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "PrometheusRemoteWriteBehind", + "annotations": { + "description": "Prometheus {{$labels.instance}} remote write is {{ printf \"%.1f\" $value }}s behind for queue {{$labels.queue}}.", + "summary": "Prometheus remote write is behind." 
+ }, + "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus\"}[5m])\n- on(job, instance) group_right\n max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus\"}[5m])\n)\n> 120\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "PrometheusRemoteWriteDesiredShards", + "annotations": { + "description": "Prometheus {{$labels.instance}} remote write desired shards calculation wants to run {{ printf $value }} shards, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance=\"%s\",job=\"prometheus\"}` $labels.instance | query | first | value }}.", + "summary": "Prometheus remote write desired shards calculation wants to run more than configured max shards." + }, + "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_shards_desired{job=\"prometheus\"}[5m])\n> on(job, instance) group_right\n max_over_time(prometheus_remote_storage_shards_max{job=\"prometheus\"}[5m])\n)\n", + "for": "15m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "PrometheusRuleFailures", + "annotations": { + "description": "Prometheus {{$labels.instance}} has failed to evaluate {{ printf \"%.0f\" $value }} rules in the last 5m.", + "summary": "Prometheus is failing rule evaluations." + }, + "expr": "increase(prometheus_rule_evaluation_failures_total{job=\"prometheus\"}[5m]) > 0\n", + "for": "15m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "PrometheusMissingRuleEvaluations", + "annotations": { + "description": "Prometheus {{$labels.instance}} has missed {{ printf \"%.0f\" $value }} rule group evaluations in the last 5m.", + "summary": "Prometheus is missing rule evaluations due to slow rule group evaluation." + }, + "expr": "increase(prometheus_rule_group_iterations_missed_total{job=\"prometheus\"}[5m]) > 0\n", + "for": "15m", + "labels": { + "severity": "warning" + } + } + ] + } + ] + } + typhoon.yaml: |- + { + "groups": [ + { + "name": "general.rules", + "rules": [ + { + "alert": "TargetDown", + "annotations": { + "message": "{{ printf \"%.4g\" $value }}% of the {{ $labels.job }} targets are down." + }, + "expr": "100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10", + "for": "10m", + "labels": { + "severity": "warning" + } + } + ] + }, + { + "name": "extra.rules", + "rules": [ + { + "alert": "InactiveRAIDDisk", + "annotations": { + "message": "{{ $value }} RAID disk(s) on node {{ $labels.instance }} are inactive." 
+ }, + "expr": "node_md_disks - node_md_disks_active > 0", + "for": "10m", + "labels": { + "severity": "warning" + } + } + ] + } + ] + } +kind: ConfigMap +metadata: + name: prometheus-rules + namespace: monitoring diff --git a/templates/service-account.yaml b/templates/service-account.yaml new file mode 100644 index 0000000..f4c5f20 --- /dev/null +++ b/templates/service-account.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: monitoring diff --git a/templates/service.yaml b/templates/service.yaml new file mode 100644 index 0000000..c1b9eec --- /dev/null +++ b/templates/service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: monitoring + annotations: + prometheus.io/scrape: 'true' + prometheus.io/port: '9090' +spec: + type: ClusterIP + selector: + name: prometheus + phase: prod + ports: + - name: web + protocol: TCP + port: 80 + targetPort: 9090 diff --git a/vars/main.yml b/vars/main.yml new file mode 100644 index 0000000..e69de29
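The alert groups above ship as JSON strings inside the prometheus-rules ConfigMap, so a syntax slip in one expression only surfaces when Prometheus next loads its rule files. A minimal sanity check, assuming kubectl and promtool are available locally and the ConfigMap has already been applied to the monitoring namespace (JSON is valid YAML, so promtool parses the embedded files as-is):

    # Extract each embedded rules file from the ConfigMap and lint it with promtool.
    # The key names listed here are the ones visible in this patch; any other keys in
    # rules.yaml can be checked the same way.
    for f in node-exporter.yaml prom.yaml typhoon.yaml; do
      kubectl -n monitoring get configmap prometheus-rules \
        -o "jsonpath={.data['$f']}" > "/tmp/$f"
      promtool check rules "/tmp/$f"
    done

The same promtool check can also be run against the rendered templates before they are applied, which keeps a broken rule from ever reaching the running Prometheus.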