chore(helm): Adding observability chart

Signed-off-by: rjshrjndrn <rjshrjndrn@gmail.com>
2022-06-30 11:58:23 +02:00 · 2022-06-30 11:58:23 +02:00 · 67022f538b
commit 67022f538b
parent d7e100e383
13 changed files with 2648 additions and 3863 deletions
--- a/ee/scripts/helmcharts/manifests/dashboards/nginx-ingress.yaml
+++ b/ee/scripts/helmcharts/manifests/dashboards/nginx-ingress.yaml
--- a/ee/scripts/helmcharts/manifests/dashboards/openreplay-components.yaml
+++ b/ee/scripts/helmcharts/manifests/dashboards/openreplay-components.yaml
--- a/ee/scripts/helmcharts/manifests/monitoring.yaml
+++ b/ee/scripts/helmcharts/manifests/monitoring.yaml
@ -1,168 +0,0 @@
-##################################################
-## Update the following values
-##
-## For example,
-# domainName: &domainName openreplay.supercompany.com
-##################################################
-
-domainName: &domainName "changeme.mycorp.org"
-grafanaAdminPassword: &adminpass "changeNeGrafanaAdminPassword"
-slackWebhookUrl: &slackwebhook "https://hooks.slack.com/services/xxxx/xxxx/xxxxx"       # Slack webhook token for sending alerts
-slackChannel: &slackchannel "changeMeAlertsChannel"                                     # Name of the channel, to alerts to be delivered to.
-
-
-########################################################
-## Custom configuration for Monitoring and logging stack
-########################################################
-
-fullnameOverride: "openreplay"
-grafana:
-  adminPassword: *adminpass
-  env:
-    GF_SERVER_ROOT_URL: http://grafana.local.com/grafana
-  additionalDataSources:
-    - name: loki
-      editable: true
-      type: loki
-      url: http://loki.logging:3100
-  plugins:
-    - grafana-piechart-panel
-    - vertamedia-clickhouse-datasource
-    - digrich-bubblechart-panel
-    - grafana-clock-panel
-  ingress:
-    enabled: true
-    ingressClassName: openreplay
-    hosts:
-      - *domainName
-    annotations:
-      nginx.ingress.kubernetes.io/rewrite-target: /$1
-    path: /grafana/(.*)
-    tls:
-      - hosts:
-        - *domainName
-        secretName: openreplay-ssl
-
-prometheus:
-  prometheusSpec:
-    storageSpec:
-      volumeClaimTemplate:
-        spec:
-          accessModes: ["ReadWriteOnce"]
-          resources:
-            requests:
-              storage: 200Gi
-alertmanager:
-  config:
-    global:
-      resolve_timeout: 5m
-      slack_api_url: *slackwebhook
-    route:
-      # group_by: ['job']
-      group_by: ['alertname','container']
-      group_wait: 30s
-      group_interval: 5m
-      repeat_interval: 12h
-      receiver: 'slack'
-      routes:
-      - match:
-          alertname: Watchdog
-        receiver: 'slack'
-    receivers:
-    - name: slack
-      slack_configs:
-        - channel: *slackchannel
-          color: '{{ template "slack.color" . }}'
-          title: '{{ template "slack.title" . }}'
-          text: '{{ template "slack.text" . }}'
-          send_resolved: true
-          actions:
-            - type: button
-              text: 'Runbook :green_book:'
-              url: '{{ (index .Alerts 0).Annotations.runbook_url }}'
-         #  - type: button
-         #    text: 'Query :mag:'
-         #    url: '{{ (index .Alerts 0).GeneratorURL }}'
-         #  - type: button
-         #    text: 'Dashboard :chart_with_upwards_trend:'
-         #    url: '{{ (index .Alerts 0).Annotations.dashboard_url }}'
-         #  - type: button
-         #    text: 'Silence :no_bell:'
-         #    url: '{{ template "__alert_silence_link" . }}'
-    templates:
-    - /etc/alertmanager/config/*.tmpl
-
-  templateFiles:
-     template_1.tmpl: |-
-       {{/* Alertmanager Silence link */}}
-       {{ define "__alert_silence_link" -}}
-           {{ .ExternalURL }}/#/silences/new?filter=%7B
-           {{- range .CommonLabels.SortedPairs -}}
-               {{- if ne .Name "alertname" -}}
-                   {{- .Name }}%3D"{{- .Value -}}"%2C%20
-               {{- end -}}
-           {{- end -}}
-           alertname%3D"{{- .CommonLabels.alertname -}}"%7D
-       {{- end }}
-       
-       {{/* Severity of the alert */}}
-       {{ define "__alert_severity" -}}
-           {{- if eq .CommonLabels.severity "critical" -}}
-           *Severity:* `Critical`
-           {{- else if eq .CommonLabels.severity "warning" -}}
-           *Severity:* `Warning`
-           {{- else if eq .CommonLabels.severity "info" -}}
-           *Severity:* `Info`
-           {{- else -}}
-           *Severity:* :question: {{ .CommonLabels.severity }}
-           {{- end }}
-       {{- end }}
-       
-       {{/* Title of the Slack alert */}}
-       {{ define "slack.title" -}}
-         [{{ .Status | toUpper -}}
-         {{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
-         ] {{ .CommonLabels.alertname }}
-       {{- end }}
-       
-       
-       {{/* Color of Slack attachment (appears as line next to alert )*/}}
-       {{ define "slack.color" -}}
-           {{ if eq .Status "firing" -}}
-               {{ if eq .CommonLabels.severity "warning" -}}
-                   warning
-               {{- else if eq .CommonLabels.severity "critical" -}}
-                   danger
-               {{- else -}}
-                   #439FE0
-               {{- end -}}
-           {{ else -}}
-           good
-           {{- end }}
-       {{- end }}
-       
-       {{/* The text to display in the alert */}}
-       {{ define "slack.text" -}}
-       
-           {{ template "__alert_severity" . }}
-           {{- if (index .Alerts 0).Annotations.summary }}
-           {{- "\n" -}}
-           *Summary:* {{ (index .Alerts 0).Annotations.summary }}
-           {{- end }}
-       
-           {{ range .Alerts }}
-       
-               {{- if .Annotations.description }}
-               {{- "\n" -}}
-               {{ .Annotations.description }}
-               {{- "\n" -}}
-               {{- end }}
-               {{- if .Annotations.message }}
-               {{- "\n" -}}
-               {{ .Annotations.message }}
-               {{- "\n" -}}
-               {{- end }}
-       
-           {{- end }}
-       
-       {{- end }}
--- a/ee/scripts/helmcharts/manifests/observability-values.yaml
+++ b/ee/scripts/helmcharts/manifests/observability-values.yaml
@ -0,0 +1,300 @@
+# vim: set fdm=indent:
+##################################################
+## Update the following values
+##
+## For example,
+# domainName: &domainName openreplay.supercompany.com
+##################################################
+
+domainName: &domainName "changeme.mycorp.org"
+grafanaAdminPassword: &adminpass "changeNeGrafanaAdminPassword"
+slackWebhookUrl: &slackwebhook "https://hooks.slack.com/services/xxxx/xxxx/xxxxx"       # Slack incoming-webhook URL used for sending alerts
+slackChannel: &slackchannel "changeMeAlertsChannel"                                     # Name of the Slack channel that alerts are delivered to.
+
+
+########################################################
+## Custom configuration for Monitoring and logging stack
+########################################################
+
+observability:
+  kube-prometheus-stack:
+    fullnameOverride: "openreplay"
+    grafana:
+      adminPassword: *adminpass
+      env:
+        GF_SERVER_ROOT_URL: http://grafana.local.com/grafana
+      additionalDataSources:
+        - name: loki
+          editable: true
+          type: loki
+          url: http://loki.observability:3100
+      plugins:
+        - grafana-piechart-panel
+        - vertamedia-clickhouse-datasource
+        - digrich-bubblechart-panel
+        - grafana-clock-panel
+      ingress:
+        enabled: true
+        ingressClassName: openreplay
+        hosts:
+          - *domainName
+        annotations:
+          nginx.ingress.kubernetes.io/rewrite-target: /$1
+        path: /grafana/(.*)
+        tls:
+          - hosts:
+            - *domainName
+            secretName: openreplay-ssl
+
+    prometheus:
+      prometheusSpec:
+        storageSpec:
+          volumeClaimTemplate:
+            spec:
+              accessModes: ["ReadWriteOnce"]
+              resources:
+                requests:
+                  storage: 200Gi
+    alertmanager:
+      config:
+        global:
+          resolve_timeout: 5m
+          slack_api_url: *slackwebhook
+        route:
+          # group_by: ['job']
+          group_by: ['alertname','container']
+          group_wait: 30s
+          group_interval: 5m
+          repeat_interval: 12h
+          receiver: 'slack'
+          routes:
+          - match:
+              alertname: Watchdog
+            receiver: 'slack'
+        receivers:
+        - name: slack
+          slack_configs:
+            - channel: *slackchannel
+              color: '{{ template "slack.color" . }}'
+              title: '{{ template "slack.title" . }}'
+              text: '{{ template "slack.text" . }}'
+              send_resolved: true
+              actions:
+                - type: button
+                  text: 'Runbook :green_book:'
+                  url: '{{ (index .Alerts 0).Annotations.runbook_url }}'
+             #  - type: button
+             #    text: 'Query :mag:'
+             #    url: '{{ (index .Alerts 0).GeneratorURL }}'
+             #  - type: button
+             #    text: 'Dashboard :chart_with_upwards_trend:'
+             #    url: '{{ (index .Alerts 0).Annotations.dashboard_url }}'
+             #  - type: button
+             #    text: 'Silence :no_bell:'
+             #    url: '{{ template "__alert_silence_link" . }}'
+        templates:
+        - /etc/alertmanager/config/*.tmpl
+
+      templateFiles:
+         template_1.tmpl: |-
+           {{/* Alertmanager Silence link */}}
+           {{ define "__alert_silence_link" -}}
+               {{ .ExternalURL }}/#/silences/new?filter=%7B
+               {{- range .CommonLabels.SortedPairs -}}
+                   {{- if ne .Name "alertname" -}}
+                       {{- .Name }}%3D"{{- .Value -}}"%2C%20
+                   {{- end -}}
+               {{- end -}}
+               alertname%3D"{{- .CommonLabels.alertname -}}"%7D
+           {{- end }}
+           
+           {{/* Severity of the alert */}}
+           {{ define "__alert_severity" -}}
+               {{- if eq .CommonLabels.severity "critical" -}}
+               *Severity:* `Critical`
+               {{- else if eq .CommonLabels.severity "warning" -}}
+               *Severity:* `Warning`
+               {{- else if eq .CommonLabels.severity "info" -}}
+               *Severity:* `Info`
+               {{- else -}}
+               *Severity:* :question: {{ .CommonLabels.severity }}
+               {{- end }}
+           {{- end }}
+           
+           {{/* Title of the Slack alert */}}
+           {{ define "slack.title" -}}
+             [{{ .Status | toUpper -}}
+             {{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
+             ] {{ .CommonLabels.alertname }}
+           {{- end }}
+           
+           
+           {{/* Color of Slack attachment (appears as line next to alert )*/}}
+           {{ define "slack.color" -}}
+               {{ if eq .Status "firing" -}}
+                   {{ if eq .CommonLabels.severity "warning" -}}
+                       warning
+                   {{- else if eq .CommonLabels.severity "critical" -}}
+                       danger
+                   {{- else -}}
+                       #439FE0
+                   {{- end -}}
+               {{ else -}}
+               good
+               {{- end }}
+           {{- end }}
+           
+           {{/* The text to display in the alert */}}
+           {{ define "slack.text" -}}
+           
+               {{ template "__alert_severity" . }}
+               {{- if (index .Alerts 0).Annotations.summary }}
+               {{- "\n" -}}
+               *Summary:* {{ (index .Alerts 0).Annotations.summary }}
+               {{- end }}
+           
+               {{ range .Alerts }}
+           
+                   {{- if .Annotations.description }}
+                   {{- "\n" -}}
+                   {{ .Annotations.description }}
+                   {{- "\n" -}}
+                   {{- end }}
+                   {{- if .Annotations.message }}
+                   {{- "\n" -}}
+                   {{ .Annotations.message }}
+                   {{- "\n" -}}
+                   {{- end }}
+           
+               {{- end }}
+           
+           {{- end }}
+  loki:
+    config:
+      # existingSecret:
+      auth_enabled: false
+      ingester:
+        chunk_idle_period: 3m
+        chunk_block_size: 262144
+        chunk_retain_period: 1m
+        max_transfer_retries: 0
+        wal:
+          dir: /data/loki/wal
+        lifecycler:
+          ring:
+            kvstore:
+              store: inmemory
+            replication_factor: 1
+
+      limits_config:
+        enforce_metric_name: false
+        reject_old_samples: true
+        reject_old_samples_max_age: 168h
+      schema_config:
+        configs:
+        - from: 2020-10-24
+          store: boltdb-shipper
+          object_store: filesystem
+          schema: v11
+          index:
+            prefix: index_
+            period: 24h
+      server:
+        http_listen_port: 3100
+      storage_config:
+        boltdb_shipper:
+          active_index_directory: /data/loki/boltdb-shipper-active
+          cache_location: /data/loki/boltdb-shipper-cache
+          cache_ttl: 24h         # Can be increased for faster performance over longer query periods, uses more disk space
+          shared_store: filesystem
+        filesystem:
+          directory: /data/loki/chunks
+      chunk_store_config:
+        max_look_back_period: 0s
+      table_manager:
+        retention_deletes_enabled: false
+        retention_period: 0s
+      compactor:
+        working_directory: /data/loki/boltdb-shipper-compactor
+        shared_store: filesystem
+        retention_enabled: true
+      # Needed for alerting: https://grafana.com/docs/loki/latest/rules/
+      # This is just a simple example; for more details see https://grafana.com/docs/loki/latest/configuration/#ruler_config
+      ruler:
+        storage:
+          type: local
+          local:
+            directory: /rules
+        rule_path: /tmp/scratch
+        alertmanager_url: http://openreplay-alertmanager.observability.svc.cluster.local:9093
+        ring:
+          kvstore:
+            store: inmemory
+        enable_api: true
+
+    persistence:
+      enabled: true
+      accessModes:
+      - ReadWriteOnce
+      size: 100Gi
+
+    serviceMonitor:
+      enabled: true
+      interval: ""
+      additionalLabels:
+        release: monitoring
+      annotations: {}
+      # scrapeTimeout: 10s
+      # path: /metrics
+      prometheusRule:
+        enabled: true
+        additionalLabels:
+          release: monitoring
+        rules: 
+         - alert: LokiProcessTooManyRestarts
+           expr: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2
+           for: 0m
+           labels:
+             severity: warning
+           annotations:
+             summary: Loki process too many restarts (instance {{ $labels.instance }})
+             description: "A loki process had too many restarts (target {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+         - alert: LokiRequestErrors
+           expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10
+           for: 15m
+           labels:
+             severity: critical
+           annotations:
+             summary: Loki request errors (instance {{ $labels.instance }})
+             description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+         - alert: LokiRequestPanic
+           expr: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
+           for: 5m
+           labels:
+             severity: critical
+           annotations:
+             summary: Loki request panic (instance {{ $labels.instance }})
+             description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+         - alert: LokiRequestLatency
+           expr: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le)))  > 1
+           for: 5m
+           labels:
+             severity: critical
+           annotations:
+             summary: Loki request latency (instance {{ $labels.instance }})
+             description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Specify Loki Alerting rules based on this documentation: https://grafana.com/docs/loki/latest/rules/
+    # When specified, you also need to add a ruler config section above. An example is shown in the alerting docs.
+    alerting_groups: 
+      - name: dbZombie
+        rules:
+        - alert: dbZombie
+          expr: absent_over_time({namespace="app", app="db"} |~ "Queue Statistics"[5m]) == 1
+          for: 10m
+          labels:
+            severity: warning
+  promtail:
+    config:
+      clients:
+        - url: http://loki.observability.svc.cluster.local:3100/loki/api/v1/push 
--- a/ee/scripts/helmcharts/manifests/observability/.helmignore
+++ b/ee/scripts/helmcharts/manifests/observability/.helmignore
@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
--- a/ee/scripts/helmcharts/manifests/observability/Chart.lock
+++ b/ee/scripts/helmcharts/manifests/observability/Chart.lock
@ -0,0 +1,12 @@
+dependencies:
+- name: kube-prometheus-stack
+  repository: https://prometheus-community.github.io/helm-charts
+  version: 36.2.0
+- name: loki
+  repository: https://grafana.github.io/helm-charts
+  version: 2.12.2
+- name: promtail
+  repository: https://grafana.github.io/helm-charts
+  version: 6.0.0
+digest: sha256:7f28353afa3626a7aaf59159a877be84f29bfac004e325d12f2ce43f31bd3e90
+generated: "2022-06-30T11:23:29.753889457+02:00"
--- a/ee/scripts/helmcharts/manifests/observability/Chart.yaml
+++ b/ee/scripts/helmcharts/manifests/observability/Chart.yaml
@ -0,0 +1,34 @@
+apiVersion: v2
+name: observability
+description: A Helm chart for Kubernetes
+
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
+
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.1.0
+
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# It is recommended to use it with quotes.
+appVersion: "1.0.0"
+dependencies:
+  - name: kube-prometheus-stack
+    repository: https://prometheus-community.github.io/helm-charts
+    version: 36.2.*
+  - name: loki
+    repository: https://grafana.github.io/helm-charts
+    version: 2.12.*
+  - name: promtail
+    repository: https://grafana.github.io/helm-charts
+    version: 6.0.*
--- a/ee/scripts/helmcharts/manifests/observability/dashboards/nginx-performance.json
+++ b/ee/scripts/helmcharts/manifests/observability/dashboards/nginx-performance.json
@ -0,0 +1,982 @@
+{
+  "__inputs": [
+    {
+      "name": "DS_PROMETHEUS",
+      "label": "Prometheus",
+      "description": "",
+      "type": "datasource",
+      "pluginId": "prometheus",
+      "pluginName": "Prometheus"
+    }
+  ],
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "6.6.0"
+    },
+    {
+      "type": "panel",
+      "id": "graph",
+      "name": "Graph",
+      "version": ""
+    },
+    {
+      "type": "datasource",
+      "id": "prometheus",
+      "name": "Prometheus",
+      "version": "1.0.0"
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "description": "",
+  "editable": true,
+  "gnetId": 9614,
+  "graphTooltip": 1,
+  "id": null,
+  "iteration": 1582146566338,
+  "links": [],
+  "panels": [
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "description": "Total time taken for nginx and upstream servers to process a request and send a response",
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 0
+      },
+      "hiddenSeries": false,
+      "id": 91,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "dataLinks": []
+      },
+      "percentage": false,
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "histogram_quantile(\n  0.5,\n  sum by (le)(\n    rate(\n      nginx_ingress_controller_request_duration_seconds_bucket{\n        ingress =~ \"$ingress\"\n      }[1m]\n    )\n  )\n)",
+          "interval": "",
+          "legendFormat": ".5",
+          "refId": "D"
+        },
+        {
+          "expr": "histogram_quantile(\n  0.95,\n  sum by (le)(\n    rate(\n    nginx_ingress_controller_request_duration_seconds_bucket{\n        ingress =~ \"$ingress\"\n      }[1m]\n    )\n  )\n)",
+          "interval": "",
+          "legendFormat": ".95",
+          "refId": "B"
+        },
+        {
+          "expr": "histogram_quantile(\n  0.99,\n  sum by (le)(\n    rate(\n      nginx_ingress_controller_request_duration_seconds_bucket{\n        ingress =~ \"$ingress\"\n      }[1m]\n    )\n  )\n)",
+          "interval": "",
+          "legendFormat": ".99",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Total request handling time",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "s",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "description": "The time spent on receiving the response from the upstream server",
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 0
+      },
+      "hiddenSeries": false,
+      "id": 94,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "dataLinks": []
+      },
+      "percentage": false,
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "histogram_quantile(\n  0.5,\n  sum by (le)(\n    rate(\n      nginx_ingress_controller_response_duration_seconds_bucket{\n        ingress =~ \"$ingress\"\n      }[1m]\n    )\n  )\n)",
+          "instant": false,
+          "interval": "",
+          "intervalFactor": 1,
+          "legendFormat": ".5",
+          "refId": "D"
+        },
+        {
+          "expr": "histogram_quantile(\n  0.95,\n  sum by (le)(\n    rate(\n    nginx_ingress_controller_response_duration_seconds_bucket{\n        ingress =~ \"$ingress\"\n      }[1m]\n    )\n  )\n)",
+          "interval": "",
+          "legendFormat": ".95",
+          "refId": "B"
+        },
+        {
+          "expr": "histogram_quantile(\n  0.99,\n  sum by (le)(\n    rate(\n      nginx_ingress_controller_response_duration_seconds_bucket{\n        ingress =~ \"$ingress\"\n      }[1m]\n    )\n  )\n)",
+          "interval": "",
+          "legendFormat": ".99",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Upstream response time",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "s",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
+      "hiddenSeries": false,
+      "id": 93,
+      "legend": {
+        "alignAsTable": true,
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "dataLinks": []
+      },
+      "percentage": false,
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "  sum by (path)(\n    rate(\n      nginx_ingress_controller_request_duration_seconds_count{\n        ingress =~ \"$ingress\"\n      }[1m]\n    )\n  )\n",
+          "interval": "",
+          "intervalFactor": 1,
+          "legendFormat": "{{ path }}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Request volume by Path",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "reqps",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "description": "For each path observed, its median upstream response time",
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 8
+      },
+      "hiddenSeries": false,
+      "id": 98,
+      "legend": {
+        "alignAsTable": true,
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "dataLinks": []
+      },
+      "percentage": false,
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "histogram_quantile(\n  .5,\n  sum by (le, path)(\n    rate(\n      nginx_ingress_controller_response_duration_seconds_bucket{\n        ingress =~ \"$ingress\"\n      }[1m]\n    )\n  )\n)",
+          "interval": "",
+          "intervalFactor": 1,
+          "legendFormat": "{{ path }}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Median upstream response time by Path",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "s",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "description": "Percentage of 4xx and 5xx responses among all responses.",
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 16
+      },
+      "hiddenSeries": false,
+      "id": 100,
+      "legend": {
+        "alignAsTable": true,
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null as zero",
+      "options": {
+        "dataLinks": []
+      },
+      "percentage": false,
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum by (path) (rate(nginx_ingress_controller_request_duration_seconds_count{\n  ingress =~ \"$ingress\",\n  status =~ \"[4-5].*\"\n}[1m])) / sum by (path) (rate(nginx_ingress_controller_request_duration_seconds_count{\n  ingress =~ \"$ingress\",\n}[1m]))",
+          "interval": "",
+          "intervalFactor": 1,
+          "legendFormat": "{{ path }}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Response error rate by Path",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "percentunit",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "description": "For each path observed, the sum of upstream request time",
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 16
+      },
+      "hiddenSeries": false,
+      "id": 102,
+      "legend": {
+        "alignAsTable": true,
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "dataLinks": []
+      },
+      "percentage": false,
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum by (path) (rate(nginx_ingress_controller_response_duration_seconds_sum{ingress =~ \"$ingress\"}[1m]))",
+          "interval": "",
+          "intervalFactor": 1,
+          "legendFormat": "{{ path }}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Upstream time consumed by Path",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "s",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 24
+      },
+      "hiddenSeries": false,
+      "id": 101,
+      "legend": {
+        "alignAsTable": true,
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "dataLinks": []
+      },
+      "percentage": false,
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "  sum (\n    rate(\n      nginx_ingress_controller_request_duration_seconds_count{\n        ingress =~ \"$ingress\",\n        status =~\"[4-5].*\",\n      }[1m]\n    )\n  ) by(path, status)\n",
+          "interval": "",
+          "intervalFactor": 1,
+          "legendFormat": "{{ path }} {{ status }}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Response error volume by Path",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "reqps",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 24
+      },
+      "hiddenSeries": false,
+      "id": 99,
+      "legend": {
+        "alignAsTable": true,
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "rightSide": true,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "dataLinks": []
+      },
+      "percentage": false,
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum (\n  rate (\n      nginx_ingress_controller_response_size_sum {\n        ingress =~ \"$ingress\",\n      }[1m]\n  )\n)  by (path) / sum (\n  rate(\n      nginx_ingress_controller_response_size_count {\n        ingress =~ \"$ingress\",\n      }[1m]\n  )\n) by (path)\n",
+          "hide": false,
+          "instant": false,
+          "interval": "",
+          "intervalFactor": 1,
+          "legendFormat": "{{ path }}",
+          "refId": "D"
+        },
+        {
+          "expr": "    sum (rate(nginx_ingress_controller_response_size_bucket{\n        ingress =~ \"$ingress\",\n    }[1m])) by (le)\n",
+          "hide": true,
+          "legendFormat": "{{le}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Average response size by Path",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "decbytes",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": "${DS_PROMETHEUS}",
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 32
+      },
+      "hiddenSeries": false,
+      "id": 96,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "dataLinks": []
+      },
+      "percentage": false,
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum (\n  rate(\n      nginx_ingress_controller_ingress_upstream_latency_seconds_sum {\n        ingress =~ \"$ingress\",\n      }[1m]\n)) / sum (\n  rate(\n      nginx_ingress_controller_ingress_upstream_latency_seconds_count {\n        ingress =~ \"$ingress\",\n      }[1m]\n  )\n)\n",
+          "hide": false,
+          "instant": false,
+          "interval": "",
+          "intervalFactor": 1,
+          "legendFormat": "average",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Upstream service latency",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "s",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 22,
+  "style": "dark",
+  "tags": [
+    "nginx"
+  ],
+  "templating": {
+    "list": [
+      {
+        "hide": 0,
+        "label": "datasource",
+        "name": "DS_PROMETHEUS",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "type": "datasource"
+      },
+      {
+        "allValue": ".*",
+        "current": {},
+        "datasource": "${DS_PROMETHEUS}",
+        "definition": "label_values(nginx_ingress_controller_requests, ingress) ",
+        "hide": 0,
+        "includeAll": true,
+        "label": "Service Ingress",
+        "multi": false,
+        "name": "ingress",
+        "options": [],
+        "query": "label_values(nginx_ingress_controller_requests, ingress) ",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 2,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      }
+    ]
+  },
+  "time": {
+    "from": "now-15m",
+    "to": "now"
+  },
+  "timepicker": {
+    "refresh_intervals": [
+      "5s",
+      "10s",
+      "30s",
+      "2m",
+      "5m",
+      "15m",
+      "30m",
+      "1h",
+      "2h",
+      "1d"
+    ],
+    "time_options": [
+      "5m",
+      "15m",
+      "1h",
+      "6h",
+      "12h",
+      "24h",
+      "2d",
+      "7d",
+      "30d"
+    ]
+  },
+  "timezone": "browser",
+  "title": "Request Handling Performance",
+  "uid": "4GFbkOsZk",
+  "version": 1
+}
+
--- a/ee/scripts/helmcharts/manifests/observability/dashboards/openreplay-components.json
+++ b/ee/scripts/helmcharts/manifests/observability/dashboards/openreplay-components.json
--- a/ee/scripts/helmcharts/manifests/observability/templates/NOTES.txt
+++ b/ee/scripts/helmcharts/manifests/observability/templates/NOTES.txt
@ -0,0 +1 @@
+Observability stack installation complete. Please refer to domain.com/grafana
--- a/ee/scripts/helmcharts/manifests/observability/templates/_helpers.tpl
+++ b/ee/scripts/helmcharts/manifests/observability/templates/_helpers.tpl
@ -0,0 +1,62 @@
+{{/*
+Short chart name: .Values.nameOverride when it is set, otherwise .Chart.Name; cut to 63 characters, then a trailing "-" is removed.
+*/}}
+{{- define "observability.name" -}}
+{{- trimSuffix "-" (trunc 63 (default .Chart.Name .Values.nameOverride)) }}
+{{- end }}
+
+{{/*
+Fully qualified app name, resolved in this order:
+  1. .Values.fullnameOverride, when set;
+  2. the release name alone, when it already contains the chart name (or .Values.nameOverride);
+  3. otherwise "<release name>-<chart name or nameOverride>".
+Every result is cut to 63 characters (some Kubernetes name fields are limited to this by the DNS naming spec)
+and then stripped of a trailing "-".
+*/}}
+{{- define "observability.fullname" -}}
+{{- $base := .Values.nameOverride | default .Chart.Name }}
+{{- if .Values.fullnameOverride }}
+{{- trimSuffix "-" (trunc 63 .Values.fullnameOverride) }}
+{{- else if contains $base .Release.Name }}
+{{- trimSuffix "-" (trunc 63 .Release.Name) }}
+{{- else }}
+{{- trimSuffix "-" (trunc 63 (printf "%s-%s" .Release.Name $base)) }}
+{{- end }}
+{{- end }}
+
+{{/* Value used for the chart label: "<chart name>-<chart version>" with every "+" replaced by "_",
+cut to 63 characters and stripped of a trailing "-". */}}
+{{- define "observability.chart" -}}
+{{- $nameAndVersion := printf "%s-%s" .Chart.Name .Chart.Version }}
+{{- $nameAndVersion | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels: chart label, the selector labels, the app version (only when the chart declares one), and the managing service.
+*/}}
+{{- define "observability.labels" -}}
+helm.sh/chart: {{ include "observability.chart" . }}
+{{ include "observability.selectorLabels" . }}
+{{- with .Chart.AppVersion }}
+app.kubernetes.io/version: {{ quote . }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+{{/*
+Selector labels: the chart name (observability.name) and the release name. Both values are quoted so that a numeric- or boolean-looking release name or nameOverride (e.g. "123", "true") still renders as a YAML string.
+*/}}
+{{- define "observability.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "observability.name" . | quote }}
+app.kubernetes.io/instance: {{ .Release.Name | quote }}
+{{- end }}
+
+{{/*
+Service account name to use: .Values.serviceAccount.name when set; otherwise the chart fullname if .Values.serviceAccount.create is truthy, else "default".
+*/}}
+{{- define "observability.serviceAccountName" -}}
+{{- $fallback := "default" }}
+{{- if .Values.serviceAccount.create }}
+{{- $fallback = include "observability.fullname" . }}
+{{- end }}
+{{- .Values.serviceAccount.name | default $fallback }}
+{{- end }}
--- a/ee/scripts/helmcharts/manifests/observability/templates/dashboard.yaml
+++ b/ee/scripts/helmcharts/manifests/observability/templates/dashboard.yaml
@ -0,0 +1,11 @@
+apiVersion: v1
+kind: ConfigMap  # one data entry per file matched by dashboards/* in the chart
+metadata:
+  name: nginx-ingress  # NOTE(review): carries every dashboard file, not only the nginx one
+  namespace: observability  # NOTE(review): hard-coded; presumably where Grafana's dashboard sidecar looks - confirm
+  labels:
+    app: kube-prometheus-stack-grafana
+    grafana_dashboard: "1"  # presumably the label the Grafana sidecar selects dashboards by - confirm
+    release: monitoring  # NOTE(review): hard-coded release name - confirm it matches the Grafana release
+data:
+  {{- (.Files.Glob "dashboards/*").AsConfig | nindent 2 }}
--- a/ee/scripts/helmcharts/manifests/observability/values.yaml
+++ b/ee/scripts/helmcharts/manifests/observability/values.yaml
				`@ -0,0 +1 @@`
				`Observability stack installation complete. Please refer to domain.com/grafana`