Merge of prometheus and grafana

Adrien Reslinger 2020-04-20 18:03:28 +02:00
parent 82c43339cb
commit 2c57657fc9
39 changed files with 26968 additions and 57 deletions

@@ -1,3 +1,4 @@
my_context: kubernetes
my_namespace: monitoring
prometheus_state: present
grafana_state: present
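These defaults can be overridden from the calling playbook. A minimal consumer sketch — the role name "monitoring" is a hypothetical placeholder, and the k8s module used by the role's tasks needs the openshift Python client on the Ansible controller:

- hosts: localhost
  connection: local
  gather_facts: false
  roles:
    - role: monitoring            # hypothetical role name, adjust to the actual role path
      vars:
        my_context: kubernetes    # kubeconfig context to deploy into
        my_namespace: monitoring
        prometheus_state: present
        grafana_state: present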

@@ -1,6 +1,6 @@
galaxy_info:
author: Adrien Reslinger
description: Install Prometheus stack to a kubernetes cluster
description: Install a prometheus / grafana based monitoring stack on a kubernetes cluster
company: Personal
min_ansible_version: 2.6
min_ansible_version: 2.9
galaxy_tags: []

@@ -1,8 +1,17 @@
#!/usr/bin/env bash
cd $(dirname "$0")/../templates
git clone https://github.com/poseidon/typhoon.git
typhoon/addons/prometheus
cd typhoon/addons/prometheus
cp -fr * ../../../prometheus/
for i in $(ls rbac/*.yaml); do echo " - $i"; done
for i in $(ls *.yaml); do echo " - $i"; done
for i in $(ls discovery/*.yaml); do echo " - $i"; done
for i in $(ls exporters/*/*.yaml); do echo " - $i"; done
cd -
cd typhoon/addons/grafana
cp -fr * ../../../grafana/
for i in $(ls *.yaml); do echo " - $i"; done
cd -
rm -fr typhoon

@@ -3,13 +3,22 @@
- name: monitoring setup
block:
- name: Find state of monitoring stack
set_fact:
monitoring_state: absent
when:
- prometheus_state == "absent"
- grafana_state == "absent"
- name: namespace
k8s:
state: "{{ prometheus_state }}"
state: "{{ monitoring_state }}"
context: "{{ my_context }}"
name: "{{ my_namespace }}"
api_version: v1
kind: Namespace
when:
- monitoring_state == "present"
# - name: Create a Secret object for basic authentication
# k8s:
@@ -32,26 +41,63 @@
k8s:
state: "{{ prometheus_state }}"
context: "{{ my_context }}"
merge_type: merge
resource_definition: "{{ lookup('template', item) | from_yaml }}"
with_items:
# - 0-namespace.yaml
- rbac/cluster-role-binding.yaml
- rbac/cluster-role.yaml
- config.yaml
- deployment.yaml
- network-policy.yaml
- rules.yaml
- service-account.yaml
- service.yaml
- discovery/kube-controller-manager.yaml
- discovery/kube-proxy.yaml
- discovery/kube-scheduler.yaml
- exporters/kube-state-metrics/cluster-role-binding.yaml
- exporters/kube-state-metrics/cluster-role.yaml
- exporters/kube-state-metrics/deployment.yaml
- exporters/kube-state-metrics/service-account.yaml
- exporters/kube-state-metrics/service.yaml
- exporters/node-exporter/daemonset.yaml
- exporters/node-exporter/service-account.yaml
- exporters/node-exporter/service.yaml
tags: traefik
# - prometheus/0-namespace.yaml
- prometheus/rbac/cluster-role-binding.yaml
- prometheus/rbac/cluster-role.yaml
- prometheus/config.yaml
- prometheus/deployment.yaml
- prometheus/network-policy.yaml
- prometheus/rules.yaml
- prometheus/service-account.yaml
- prometheus/service.yaml
- prometheus/discovery/kube-controller-manager.yaml
- prometheus/discovery/kube-proxy.yaml
- prometheus/discovery/kube-scheduler.yaml
- prometheus/exporters/kube-state-metrics/cluster-role-binding.yaml
- prometheus/exporters/kube-state-metrics/cluster-role.yaml
- prometheus/exporters/kube-state-metrics/deployment.yaml
- prometheus/exporters/kube-state-metrics/service-account.yaml
- prometheus/exporters/kube-state-metrics/service.yaml
- prometheus/exporters/node-exporter/daemonset.yaml
- prometheus/exporters/node-exporter/service-account.yaml
- prometheus/exporters/node-exporter/service.yaml
tags: monitoring
- name: Grafana files need to be {{ grafana_state }}
k8s:
state: "{{ grafana_state }}"
context: "{{ my_context }}"
merge_type: merge
resource_definition: "{{ lookup('template', item) | from_yaml }}"
with_items:
- grafana/config.yaml
- grafana/dashboards-coredns.yaml
- grafana/dashboards-etcd.yaml
- grafana/dashboards-k8s.yaml
- grafana/dashboards-k8s-nodes.yaml
- grafana/dashboards-k8s-resources-1.yaml
- grafana/dashboards-k8s-resources-2.yaml
- grafana/dashboards-nginx-ingress.yaml
- grafana/dashboards-node-exporter.yaml
- grafana/dashboards-prom.yaml
- grafana/datasources.yaml
- grafana/deployment.yaml
- grafana/providers.yaml
- grafana/service.yaml
tags: grafana
- name: namespace
k8s:
state: "{{ monitoring_state }}"
context: "{{ my_context }}"
name: "{{ my_namespace }}"
api_version: v1
kind: Namespace
when:
- monitoring_state == "absent"
tags:
- monitoring
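Removal hinges on the derived fact: monitoring_state only becomes "absent" when both prometheus_state and grafana_state are "absent", so the namespace task at the end of the block deletes the namespace only once every Prometheus and Grafana manifest has been handled. A teardown sketch, reusing the hypothetical role name from the example above:

- hosts: localhost
  connection: local
  gather_facts: false
  roles:
    - role: monitoring            # hypothetical role name
      vars:
        prometheus_state: absent
        grafana_state: absent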

@@ -0,0 +1,36 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-config
namespace: monitoring
data:
custom.ini: |+
[server]
http_port = 8080
[paths]
data = /var/lib/grafana
plugins = /var/lib/grafana/plugins
provisioning = /etc/grafana/provisioning
[users]
allow_sign_up = false
allow_org_create = false
# viewers can edit/inspect, but not save
viewers_can_edit = true
# Disable login form, since Grafana always creates an admin user
[auth]
disable_login_form = true
# Disable the user/pass login system
[auth.basic]
enabled = false
# Allow anonymous authentication with view-only authorization
[auth.anonymous]
enabled = true
org_role = Viewer
[analytics]
reporting_enabled = false

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,968 @@
apiVersion: v1
data:
nodes.json: |-
{
"__inputs": [
],
"__requires": [
],
"annotations": {
"list": [
]
},
"editable": false,
"gnetId": null,
"graphTooltip": 0,
"hideControls": false,
"id": null,
"links": [
],
"refresh": "",
"rows": [
{
"collapse": false,
"collapsed": false,
"panels": [
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": {
},
"id": 2,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [
],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "(\n (1 - rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"}[$__interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\", instance=\"$instance\"})\n)\n",
"format": "time_series",
"interval": "1m",
"intervalFactor": 5,
"legendFormat": "{{cpu}}",
"refId": "A"
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "CPU Usage",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "percentunit",
"label": null,
"logBase": 1,
"max": 1,
"min": 0,
"show": true
},
{
"format": "percentunit",
"label": null,
"logBase": 1,
"max": 1,
"min": 0,
"show": true
}
]
},
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 0,
"gridPos": {
},
"id": 3,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [
],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "node_load1{job=\"node-exporter\", instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "1m load average",
"refId": "A"
},
{
"expr": "node_load5{job=\"node-exporter\", instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "5m load average",
"refId": "B"
},
{
"expr": "node_load15{job=\"node-exporter\", instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "15m load average",
"refId": "C"
},
{
"expr": "count(node_cpu_seconds_total{job=\"node-exporter\", instance=\"$instance\", mode=\"idle\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "logical cores",
"refId": "D"
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "Load Average",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6",
"type": "row"
},
{
"collapse": false,
"collapsed": false,
"panels": [
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": {
},
"id": 4,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [
],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
],
"spaceLength": 10,
"span": 9,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "(\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n-\n node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "memory used",
"refId": "A"
},
{
"expr": "node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "memory buffers",
"refId": "B"
},
{
"expr": "node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "memory cached",
"refId": "C"
},
{
"expr": "node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "memory free",
"refId": "D"
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "Memory Usage",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
}
]
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(50, 172, 45, 0.97)",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"datasource": "$datasource",
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
},
"id": 5,
"interval": null,
"links": [
],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "100 -\n(\n node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\"}\n/\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n* 100\n)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
}
],
"thresholds": "80, 90",
"title": "Memory Usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6",
"type": "row"
},
{
"collapse": false,
"collapsed": false,
"panels": [
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 0,
"gridPos": {
},
"id": 6,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [
],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
{
"alias": "/ read| written/",
"yaxis": 1
},
{
"alias": "/ io time/",
"yaxis": 2
}
],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!~\"dm.*\"}[$__interval])",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
"expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!~\"dm.*\"}[$__interval])",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device!~\"dm.*\"}[$__interval])",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
"refId": "C"
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "Disk I/O",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": {
},
"id": 7,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [
],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
{
"alias": "used",
"color": "#E0B400"
},
{
"alias": "available",
"color": "#73BF69"
}
],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!~\"tmpfs|nsfs|vfat\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!~\"tmpfs|nsfs|vfat\"}\n )\n)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "used",
"refId": "A"
},
{
"expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!~\"tmpfs|nsfs|vfat\"}\n )\n)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "available",
"refId": "B"
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "Disk Space Usage",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6",
"type": "row"
},
{
"collapse": false,
"collapsed": false,
"panels": [
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 0,
"gridPos": {
},
"id": 8,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [
],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
"legendFormat": "{{device}}",
"refId": "A"
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "Network Received",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
}
]
},
{
"aliasColors": {
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 0,
"gridPos": {
},
"id": 9,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [
],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!=\"lo\"}[$__interval])",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
"legendFormat": "{{device}}",
"refId": "A"
}
],
"thresholds": [
],
"timeFrom": null,
"timeShift": null,
"title": "Network Transmitted",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [
]
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6",
"type": "row"
}
],
"schemaVersion": 14,
"style": "dark",
"tags": [
],
"templating": {
"list": [
{
"current": {
"text": "Prometheus",
"value": "Prometheus"
},
"hide": 0,
"label": null,
"name": "datasource",
"options": [
],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
},
{
"allValue": null,
"current": {
},
"datasource": "$datasource",
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "instance",
"options": [
],
"query": "label_values(node_exporter_build_info{job=\"node-exporter\"}, instance)",
"refresh": 2,
"regex": "",
"sort": 0,
"tagValuesQuery": "",
"tags": [
],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "Nodes",
"uid": "fa49a4706d07a042595b664c87fb33ea",
"version": 0
}
kind: ConfigMap
metadata:
name: grafana-dashboards-node-exporter
namespace: monitoring

File diff suppressed because it is too large

@@ -0,0 +1,24 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasources
namespace: monitoring
data:
prometheus.yaml: |+
apiVersion: 1
datasources:
- name: prometheus
type: prometheus
access: proxy
url: http://prometheus.monitoring.svc.cluster.local
version: 1
editable: false
loki.yaml: |+
apiVersion: 1
datasources:
- name: loki
type: loki
access: proxy
url: http://loki.monitoring.svc.cluster.local
version: 1
editable: false
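Both datasource URLs rely on cluster DNS: <service>.<namespace>.svc.cluster.local resolves to a ClusterIP Service in the monitoring namespace, on port 80 since no port is given. As a sketch only — the prometheus Service manifest is not shown in this commit, so apart from the 9090 listen address visible in the Prometheus deployment below, the names and ports here are assumptions — the prometheus datasource expects something like:

apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
spec:
  type: ClusterIP
  selector:
    name: prometheus          # assumed pod selector
  ports:
    - name: http
      protocol: TCP
      port: 80                # the datasource URL has no port, so Grafana dials 80
      targetPort: 9090        # Prometheus listens on 9090 (--web.listen-address)

The loki datasource likewise assumes a loki Service in the same namespace, which this commit does not create.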

@@ -0,0 +1,112 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: grafana
namespace: monitoring
spec:
replicas: 1
strategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
selector:
matchLabels:
name: grafana
phase: prod
template:
metadata:
labels:
name: grafana
phase: prod
annotations:
seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
spec:
containers:
- name: grafana
image: docker.io/grafana/grafana:6.7.2
env:
- name: GF_PATHS_CONFIG
value: "/etc/grafana/custom.ini"
ports:
- name: http
containerPort: 8080
livenessProbe:
httpGet:
path: /metrics
port: 8080
initialDelaySeconds: 10
readinessProbe:
httpGet:
path: /api/health
port: 8080
initialDelaySeconds: 10
resources:
requests:
cpu: 100m
memory: 100Mi
limits:
cpu: 200m
memory: 200Mi
volumeMounts:
- name: config
mountPath: /etc/grafana
- name: datasources
mountPath: /etc/grafana/provisioning/datasources
- name: providers
mountPath: /etc/grafana/provisioning/dashboards
- name: dashboards-etcd
mountPath: /etc/grafana/dashboards/etcd
- name: dashboards-node-exporter
mountPath: /etc/grafana/dashboards/node-exporter
- name: dashboards-prom
mountPath: /etc/grafana/dashboards/prom
- name: dashboards-k8s
mountPath: /etc/grafana/dashboards/k8s
- name: dashboards-k8s-nodes
mountPath: /etc/grafana/dashboards/k8s-nodes
- name: dashboards-k8s-resources-1
mountPath: /etc/grafana/dashboards/k8s-resources-1
- name: dashboards-k8s-resources-2
mountPath: /etc/grafana/dashboards/k8s-resources-2
- name: dashboards-coredns
mountPath: /etc/grafana/dashboards/coredns
- name: dashboards-nginx-ingress
mountPath: /etc/grafana/dashboards/nginx-ingress
volumes:
- name: config
configMap:
name: grafana-config
- name: datasources
configMap:
name: grafana-datasources
- name: providers
configMap:
name: grafana-providers
- name: dashboards-etcd
configMap:
name: grafana-dashboards-etcd
- name: dashboards-node-exporter
configMap:
name: grafana-dashboards-node-exporter
- name: dashboards-prom
configMap:
name: grafana-dashboards-prom
- name: dashboards-k8s
configMap:
name: grafana-dashboards-k8s
- name: dashboards-k8s-nodes
configMap:
name: grafana-dashboards-k8s-nodes
- name: dashboards-k8s-resources-1
configMap:
name: grafana-dashboards-k8s-resources-1
- name: dashboards-k8s-resources-2
configMap:
name: grafana-dashboards-k8s-resources-2
- name: dashboards-coredns
configMap:
name: grafana-dashboards-coredns
- name: dashboards-nginx-ingress
configMap:
name: grafana-dashboards-nginx-ingress

@@ -0,0 +1,15 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-providers
namespace: monitoring
data:
providers.yaml: |+
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: ''
type: file
options:
path: /etc/grafana/dashboards
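The provider points Grafana at /etc/grafana/dashboards, and the Deployment above mounts each grafana-dashboards-* ConfigMap into its own subdirectory of that path, where the file provider picks the JSON up. Adding a dashboard therefore means one more ConfigMap plus a matching volume and volumeMount; a sketch with hypothetical names:

apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards-custom   # hypothetical name
  namespace: monitoring
data:
  custom.json: |-
    { "title": "Custom", "schemaVersion": 14, "rows": [] }

# and, in the grafana Deployment above:
#   volumeMounts:
#     - name: dashboards-custom
#       mountPath: /etc/grafana/dashboards/custom
#   volumes:
#     - name: dashboards-custom
#       configMap:
#         name: grafana-dashboards-custom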

@@ -0,0 +1,18 @@
apiVersion: v1
kind: Service
metadata:
name: grafana
namespace: monitoring
annotations:
prometheus.io/scrape: 'true'
prometheus.io/port: '8080'
spec:
type: ClusterIP
selector:
name: grafana
phase: prod
ports:
- name: http
protocol: TCP
port: 80
targetPort: 8080

@@ -20,7 +20,7 @@ spec:
serviceAccountName: prometheus
containers:
- name: prometheus
image: quay.io/prometheus/prometheus:v2.17.0
image: quay.io/prometheus/prometheus:v2.17.1
args:
- --web.listen-address=0.0.0.0:9090
- --config.file=/etc/prometheus/prometheus.yaml

@@ -252,25 +252,25 @@ data:
"name": "kube-apiserver.rules",
"rules": [
{
"expr": "sum(rate(apiserver_request_duration_seconds_sum{subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"}[5m])) without(instance, pod)\n/\nsum(rate(apiserver_request_duration_seconds_count{subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"}[5m])) without(instance, pod)\n",
"expr": "sum(rate(apiserver_request_duration_seconds_sum{subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod)\n/\nsum(rate(apiserver_request_duration_seconds_count{subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod)\n",
"record": "cluster:apiserver_request_duration_seconds:mean5m"
},
{
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"}[5m])) without(instance, pod))\n",
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod))\n",
"labels": {
"quantile": "0.99"
},
"record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile"
},
{
"expr": "histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"}[5m])) without(instance, pod))\n",
"expr": "histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod))\n",
"labels": {
"quantile": "0.9"
},
"record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile"
},
{
"expr": "histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"}[5m])) without(instance, pod))\n",
"expr": "histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod))\n",
"labels": {
"quantile": "0.5"
},
@@ -805,6 +805,7 @@ data:
{
"alert": "ErrorBudgetBurn",
"annotations": {
"message": "High requests error budget burn for job=apiserver (current value: {{ $value }})",
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn"
},
"expr": "(\n status_class_5xx:apiserver_request_total:ratio_rate1h{job=\"apiserver\"} > (14.4*0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate5m{job=\"apiserver\"} > (14.4*0.010000)\n)\nor\n(\n status_class_5xx:apiserver_request_total:ratio_rate6h{job=\"apiserver\"} > (6*0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate30m{job=\"apiserver\"} > (6*0.010000)\n)\n",
@@ -816,6 +817,7 @@ data:
{
"alert": "ErrorBudgetBurn",
"annotations": {
"message": "High requests error budget burn for job=apiserver (current value: {{ $value }})",
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn"
},
"expr": "(\n status_class_5xx:apiserver_request_total:ratio_rate1d{job=\"apiserver\"} > (3*0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate2h{job=\"apiserver\"} > (3*0.010000)\n)\nor\n(\n status_class_5xx:apiserver_request_total:ratio_rate3d{job=\"apiserver\"} > (0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate6h{job=\"apiserver\"} > (0.010000)\n)\n",
@@ -853,30 +855,6 @@ data:
"severity": "critical"
}
},
{
"alert": "KubeAPIErrorsHigh",
"annotations": {
"message": "API server is returning errors for {{ $value | humanizePercentage }} of requests.",
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh"
},
"expr": "sum(rate(apiserver_request_total{job=\"apiserver\",code=~\"5..\"}[5m]))\n /\nsum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) > 0.03\n",
"for": "10m",
"labels": {
"severity": "critical"
}
},
{
"alert": "KubeAPIErrorsHigh",
"annotations": {
"message": "API server is returning errors for {{ $value | humanizePercentage }} of requests.",
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh"
},
"expr": "sum(rate(apiserver_request_total{job=\"apiserver\",code=~\"5..\"}[5m]))\n /\nsum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) > 0.01\n",
"for": "10m",
"labels": {
"severity": "warning"
}
},
{
"alert": "KubeAPIErrorsHigh",
"annotations": {
@@ -993,7 +971,7 @@ data:
"message": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.",
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods"
},
"expr": "max(max(kubelet_running_pod_count{job=\"kubelet\"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\"}) by(node) / max(kube_node_status_capacity_pods{job=\"kube-state-metrics\"}) by(node) > 0.95\n",
"expr": "max(max(kubelet_running_pod_count{job=\"kubelet\"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\"}) by(node) / max(kube_node_status_capacity_pods{job=\"kube-state-metrics\"} != 1) by(node) > 0.95\n",
"for": "15m",
"labels": {
"severity": "warning"
@@ -1029,7 +1007,7 @@ data:
"message": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.",
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh"
},
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name > 5\n",
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name > 60\n",
"for": "15m",
"labels": {
"severity": "warning"
@@ -1085,9 +1063,167 @@ data:
}
]
}
loki.yaml: |-
{
"groups": [
{
"name": "loki_rules",
"rules": [
{
"expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job))",
"record": "job:loki_request_duration_seconds:99quantile"
},
{
"expr": "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job))",
"record": "job:loki_request_duration_seconds:50quantile"
},
{
"expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (job) / sum(rate(loki_request_duration_seconds_count[1m])) by (job)",
"record": "job:loki_request_duration_seconds:avg"
},
{
"expr": "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)",
"record": "job:loki_request_duration_seconds_bucket:sum_rate"
},
{
"expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (job)",
"record": "job:loki_request_duration_seconds_sum:sum_rate"
},
{
"expr": "sum(rate(loki_request_duration_seconds_count[1m])) by (job)",
"record": "job:loki_request_duration_seconds_count:sum_rate"
},
{
"expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route))",
"record": "job_route:loki_request_duration_seconds:99quantile"
},
{
"expr": "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route))",
"record": "job_route:loki_request_duration_seconds:50quantile"
},
{
"expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)",
"record": "job_route:loki_request_duration_seconds:avg"
},
{
"expr": "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)",
"record": "job_route:loki_request_duration_seconds_bucket:sum_rate"
},
{
"expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)",
"record": "job_route:loki_request_duration_seconds_sum:sum_rate"
},
{
"expr": "sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)",
"record": "job_route:loki_request_duration_seconds_count:sum_rate"
},
{
"expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route))",
"record": "namespace_job_route:loki_request_duration_seconds:99quantile"
},
{
"expr": "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route))",
"record": "namespace_job_route:loki_request_duration_seconds:50quantile"
},
{
"expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)",
"record": "namespace_job_route:loki_request_duration_seconds:avg"
},
{
"expr": "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route)",
"record": "namespace_job_route:loki_request_duration_seconds_bucket:sum_rate"
},
{
"expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)",
"record": "namespace_job_route:loki_request_duration_seconds_sum:sum_rate"
},
{
"expr": "sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)",
"record": "namespace_job_route:loki_request_duration_seconds_count:sum_rate"
}
]
},
{
"name": "loki_alerts",
"rules": [
{
"alert": "LokiRequestErrors",
"annotations": {
"message": "{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n"
},
"expr": "100 * sum(rate(loki_request_duration_seconds_count{status_code=~\"5..\"}[1m])) by (namespace, job, route)\n /\nsum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)\n > 10\n",
"for": "15m",
"labels": {
"severity": "critical"
}
},
{
"alert": "LokiRequestLatency",
"annotations": {
"message": "{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n"
},
"expr": "namespace_job_route:loki_request_duration_seconds:99quantile{route!~\"(?i).*tail.*\"} > 1\n",
"for": "15m",
"labels": {
"severity": "critical"
}
}
]
}
]
}
node-exporter.yaml: |-
{
"groups": [
{
"name": "node-exporter.rules",
"rules": [
{
"expr": "count without (cpu) (\n count without (mode) (\n node_cpu_seconds_total{job=\"node-exporter\"}\n )\n)\n",
"record": "instance:node_num_cpu:sum"
},
{
"expr": "1 - avg without (cpu, mode) (\n rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\"}[1m])\n)\n",
"record": "instance:node_cpu_utilisation:rate1m"
},
{
"expr": "(\n node_load1{job=\"node-exporter\"}\n/\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n",
"record": "instance:node_load1_per_cpu:ratio"
},
{
"expr": "1 - (\n node_memory_MemAvailable_bytes{job=\"node-exporter\"}\n/\n node_memory_MemTotal_bytes{job=\"node-exporter\"}\n)\n",
"record": "instance:node_memory_utilisation:ratio"
},
{
"expr": "rate(node_vmstat_pgmajfault{job=\"node-exporter\"}[1m])\n",
"record": "instance:node_vmstat_pgmajfault:rate1m"
},
{
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", device!~\"dm.*\"}[1m])\n",
"record": "instance_device:node_disk_io_time_seconds:rate1m"
},
{
"expr": "rate(node_disk_io_time_weighted_seconds_total{job=\"node-exporter\", device!~\"dm.*\"}[1m])\n",
"record": "instance_device:node_disk_io_time_weighted_seconds:rate1m"
},
{
"expr": "sum without (device) (\n rate(node_network_receive_bytes_total{job=\"node-exporter\", device!=\"lo\"}[1m])\n)\n",
"record": "instance:node_network_receive_bytes_excluding_lo:rate1m"
},
{
"expr": "sum without (device) (\n rate(node_network_transmit_bytes_total{job=\"node-exporter\", device!=\"lo\"}[1m])\n)\n",
"record": "instance:node_network_transmit_bytes_excluding_lo:rate1m"
},
{
"expr": "sum without (device) (\n rate(node_network_receive_drop_total{job=\"node-exporter\", device!=\"lo\"}[1m])\n)\n",
"record": "instance:node_network_receive_drop_excluding_lo:rate1m"
},
{
"expr": "sum without (device) (\n rate(node_network_transmit_drop_total{job=\"node-exporter\", device!=\"lo\"}[1m])\n)\n",
"record": "instance:node_network_transmit_drop_excluding_lo:rate1m"
}
]
},
{
"name": "node-exporter",
"rules": [
@@ -1210,6 +1346,41 @@ data:
"labels": {
"severity": "warning"
}
},
{
"alert": "NodeHighNumberConntrackEntriesUsed",
"annotations": {
"description": "{{ $value | humanizePercentage }} of conntrack entries are used",
"summary": "Number of conntrack are getting close to the limit"
},
"expr": "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n",
"labels": {
"severity": "warning"
}
},
{
"alert": "NodeClockSkewDetected",
"annotations": {
"message": "Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.",
"summary": "Clock skew detected."
},
"expr": "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n",
"for": "10m",
"labels": {
"severity": "warning"
}
},
{
"alert": "NodeClockNotSynchronising",
"annotations": {
"message": "Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.",
"summary": "Clock not synchronising."
},
"expr": "min_over_time(node_timex_sync_status[5m]) == 0\n",
"for": "10m",
"labels": {
"severity": "warning"
}
}
]
}

@@ -0,0 +1,2 @@
---
monitoring_state: present