chore(helm): Adding observability chart

Signed-off-by: rjshrjndrn <rjshrjndrn@gmail.com>
This commit is contained in:
rjshrjndrn 2022-06-30 11:58:23 +02:00
parent d7e100e383
commit 67022f538b
13 changed files with 2648 additions and 3863 deletions

File diff suppressed because it is too large Load diff

View file

@ -1,168 +0,0 @@
##################################################
## Update the following values
##
## For example,
# domainName: &domainName openreplay.supercompany.com
##################################################
domainName: &domainName "changeme.mycorp.org"
grafanaAdminPassword: &adminpass "changeNeGrafanaAdminPassword"
slackWebhookUrl: &slackwebhook "https://hooks.slack.com/services/xxxx/xxxx/xxxxx" # Slack webhook token for sending alerts
slackChannel: &slackchannel "changeMeAlertsChannel" # Name of the channel, to alerts to be delivered to.
########################################################
## Custom configuration for Monitoring and logging stack
########################################################
fullnameOverride: "openreplay"
grafana:
adminPassword: *adminpass
env:
GF_SERVER_ROOT_URL: http://grafana.local.com/grafana
additionalDataSources:
- name: loki
editable: true
type: loki
url: http://loki.logging:3100
plugins:
- grafana-piechart-panel
- vertamedia-clickhouse-datasource
- digrich-bubblechart-panel
- grafana-clock-panel
ingress:
enabled: true
ingressClassName: openreplay
hosts:
- *domainName
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /$1
path: /grafana/(.*)
tls:
- hosts:
- *domainName
secretName: openreplay-ssl
prometheus:
prometheusSpec:
storageSpec:
volumeClaimTemplate:
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 200Gi
alertmanager:
config:
global:
resolve_timeout: 5m
slack_api_url: *slackwebhook
route:
# group_by: ['job']
group_by: ['alertname','container']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
receiver: 'slack'
routes:
- match:
alertname: Watchdog
receiver: 'slack'
receivers:
- name: slack
slack_configs:
- channel: *slackchannel
color: '{{ template "slack.color" . }}'
title: '{{ template "slack.title" . }}'
text: '{{ template "slack.text" . }}'
send_resolved: true
actions:
- type: button
text: 'Runbook :green_book:'
url: '{{ (index .Alerts 0).Annotations.runbook_url }}'
# - type: button
# text: 'Query :mag:'
# url: '{{ (index .Alerts 0).GeneratorURL }}'
# - type: button
# text: 'Dashboard :chart_with_upwards_trend:'
# url: '{{ (index .Alerts 0).Annotations.dashboard_url }}'
# - type: button
# text: 'Silence :no_bell:'
# url: '{{ template "__alert_silence_link" . }}'
templates:
- /etc/alertmanager/config/*.tmpl
templateFiles:
template_1.tmpl: |-
{{/* Alertmanager Silence link */}}
{{ define "__alert_silence_link" -}}
{{ .ExternalURL }}/#/silences/new?filter=%7B
{{- range .CommonLabels.SortedPairs -}}
{{- if ne .Name "alertname" -}}
{{- .Name }}%3D"{{- .Value -}}"%2C%20
{{- end -}}
{{- end -}}
alertname%3D"{{- .CommonLabels.alertname -}}"%7D
{{- end }}
{{/* Severity of the alert */}}
{{ define "__alert_severity" -}}
{{- if eq .CommonLabels.severity "critical" -}}
*Severity:* `Critical`
{{- else if eq .CommonLabels.severity "warning" -}}
*Severity:* `Warning`
{{- else if eq .CommonLabels.severity "info" -}}
*Severity:* `Info`
{{- else -}}
*Severity:* :question: {{ .CommonLabels.severity }}
{{- end }}
{{- end }}
{{/* Title of the Slack alert */}}
{{ define "slack.title" -}}
[{{ .Status | toUpper -}}
{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
] {{ .CommonLabels.alertname }}
{{- end }}
{{/* Color of Slack attachment (appears as line next to alert )*/}}
{{ define "slack.color" -}}
{{ if eq .Status "firing" -}}
{{ if eq .CommonLabels.severity "warning" -}}
warning
{{- else if eq .CommonLabels.severity "critical" -}}
danger
{{- else -}}
#439FE0
{{- end -}}
{{ else -}}
good
{{- end }}
{{- end }}
{{/* The text to display in the alert */}}
{{ define "slack.text" -}}
{{ template "__alert_severity" . }}
{{- if (index .Alerts 0).Annotations.summary }}
{{- "\n" -}}
*Summary:* {{ (index .Alerts 0).Annotations.summary }}
{{- end }}
{{ range .Alerts }}
{{- if .Annotations.description }}
{{- "\n" -}}
{{ .Annotations.description }}
{{- "\n" -}}
{{- end }}
{{- if .Annotations.message }}
{{- "\n" -}}
{{ .Annotations.message }}
{{- "\n" -}}
{{- end }}
{{- end }}
{{- end }}

View file

@ -0,0 +1,300 @@
# vim: set fdm=indent:
##################################################
## Update the following values
##
## For example,
# domainName: &domainName openreplay.supercompany.com
##################################################
domainName: &domainName "changeme.mycorp.org"
grafanaAdminPassword: &adminpass "changeNeGrafanaAdminPassword"
slackWebhookUrl: &slackwebhook "https://hooks.slack.com/services/xxxx/xxxx/xxxxx" # Slack webhook token for sending alerts
slackChannel: &slackchannel "changeMeAlertsChannel" # Name of the channel, to alerts to be delivered to.
########################################################
## Custom configuration for Monitoring and logging stack
########################################################
observability:
kube-prometheus-stack:
fullnameOverride: "openreplay"
grafana:
adminPassword: *adminpass
env:
GF_SERVER_ROOT_URL: http://grafana.local.com/grafana
additionalDataSources:
- name: loki
editable: true
type: loki
url: http://loki.observability:3100
plugins:
- grafana-piechart-panel
- vertamedia-clickhouse-datasource
- digrich-bubblechart-panel
- grafana-clock-panel
ingress:
enabled: true
ingressClassName: openreplay
hosts:
- *domainName
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /$1
path: /grafana/(.*)
tls:
- hosts:
- *domainName
secretName: openreplay-ssl
prometheus:
prometheusSpec:
storageSpec:
volumeClaimTemplate:
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 200Gi
alertmanager:
config:
global:
resolve_timeout: 5m
slack_api_url: *slackwebhook
route:
# group_by: ['job']
group_by: ['alertname','container']
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
receiver: 'slack'
routes:
- match:
alertname: Watchdog
receiver: 'slack'
receivers:
- name: slack
slack_configs:
- channel: *slackchannel
color: '{{ template "slack.color" . }}'
title: '{{ template "slack.title" . }}'
text: '{{ template "slack.text" . }}'
send_resolved: true
actions:
- type: button
text: 'Runbook :green_book:'
url: '{{ (index .Alerts 0).Annotations.runbook_url }}'
# - type: button
# text: 'Query :mag:'
# url: '{{ (index .Alerts 0).GeneratorURL }}'
# - type: button
# text: 'Dashboard :chart_with_upwards_trend:'
# url: '{{ (index .Alerts 0).Annotations.dashboard_url }}'
# - type: button
# text: 'Silence :no_bell:'
# url: '{{ template "__alert_silence_link" . }}'
templates:
- /etc/alertmanager/config/*.tmpl
templateFiles:
template_1.tmpl: |-
{{/* Alertmanager Silence link */}}
{{ define "__alert_silence_link" -}}
{{ .ExternalURL }}/#/silences/new?filter=%7B
{{- range .CommonLabels.SortedPairs -}}
{{- if ne .Name "alertname" -}}
{{- .Name }}%3D"{{- .Value -}}"%2C%20
{{- end -}}
{{- end -}}
alertname%3D"{{- .CommonLabels.alertname -}}"%7D
{{- end }}
{{/* Severity of the alert */}}
{{ define "__alert_severity" -}}
{{- if eq .CommonLabels.severity "critical" -}}
*Severity:* `Critical`
{{- else if eq .CommonLabels.severity "warning" -}}
*Severity:* `Warning`
{{- else if eq .CommonLabels.severity "info" -}}
*Severity:* `Info`
{{- else -}}
*Severity:* :question: {{ .CommonLabels.severity }}
{{- end }}
{{- end }}
{{/* Title of the Slack alert */}}
{{ define "slack.title" -}}
[{{ .Status | toUpper -}}
{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
] {{ .CommonLabels.alertname }}
{{- end }}
{{/* Color of Slack attachment (appears as line next to alert )*/}}
{{ define "slack.color" -}}
{{ if eq .Status "firing" -}}
{{ if eq .CommonLabels.severity "warning" -}}
warning
{{- else if eq .CommonLabels.severity "critical" -}}
danger
{{- else -}}
#439FE0
{{- end -}}
{{ else -}}
good
{{- end }}
{{- end }}
{{/* The text to display in the alert */}}
{{ define "slack.text" -}}
{{ template "__alert_severity" . }}
{{- if (index .Alerts 0).Annotations.summary }}
{{- "\n" -}}
*Summary:* {{ (index .Alerts 0).Annotations.summary }}
{{- end }}
{{ range .Alerts }}
{{- if .Annotations.description }}
{{- "\n" -}}
{{ .Annotations.description }}
{{- "\n" -}}
{{- end }}
{{- if .Annotations.message }}
{{- "\n" -}}
{{ .Annotations.message }}
{{- "\n" -}}
{{- end }}
{{- end }}
{{- end }}
loki:
config:
# existingSecret:
auth_enabled: false
ingester:
chunk_idle_period: 3m
chunk_block_size: 262144
chunk_retain_period: 1m
max_transfer_retries: 0
wal:
dir: /data/loki/wal
lifecycler:
ring:
kvstore:
store: inmemory
replication_factor: 1
limits_config:
enforce_metric_name: false
reject_old_samples: true
reject_old_samples_max_age: 168h
schema_config:
configs:
- from: 2020-10-24
store: boltdb-shipper
object_store: filesystem
schema: v11
index:
prefix: index_
period: 24h
server:
http_listen_port: 3100
storage_config:
boltdb_shipper:
active_index_directory: /data/loki/boltdb-shipper-active
cache_location: /data/loki/boltdb-shipper-cache
cache_ttl: 24h # Can be increased for faster performance over longer query periods, uses more disk space
shared_store: filesystem
filesystem:
directory: /data/loki/chunks
chunk_store_config:
max_look_back_period: 0s
table_manager:
retention_deletes_enabled: false
retention_period: 0s
compactor:
working_directory: /data/loki/boltdb-shipper-compactor
shared_store: filesystem
retention_enabled: true
# Needed for Alerting: https://grafana.com/docs/loki/latest/rules/
# This is just a simple example, for more details: https://grafana.com/docs/loki/latest/configuration/#ruler_config
ruler:
storage:
type: local
local:
directory: /rules
rule_path: /tmp/scratch
alertmanager_url: http://openreplay-alertmanager.observability.svc.cluster.local:9093
ring:
kvstore:
store: inmemory
enable_api: true
persistence:
enabled: true
accessModes:
- ReadWriteOnce
size: 100Gi
serviceMonitor:
enabled: true
interval: ""
additionalLabels:
release: monitoring
annotations: {}
# scrapeTimeout: 10s
# path: /metrics
prometheusRule:
enabled: true
additionalLabels:
release: monitoring
rules:
- alert: LokiProcessTooManyRestarts
expr: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2
for: 0m
labels:
severity: warning
annotations:
summary: Loki process too many restarts (instance {{ $labels.instance }})
description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestErrors
expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10
for: 15m
labels:
severity: critical
annotations:
summary: Loki request errors (instance {{ $labels.instance }})
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestPanic
expr: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
for: 5m
labels:
severity: critical
annotations:
summary: Loki request panic (instance {{ $labels.instance }})
description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestLatency
expr: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1
for: 5m
labels:
severity: critical
annotations:
summary: Loki request latency (instance {{ $labels.instance }})
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Specify Loki Alerting rules based on this documentation: https://grafana.com/docs/loki/latest/rules/
# When specified, you also need to add a ruler config section above. An example is shown in the alerting docs.
alerting_groups:
- name: dbZombie
rules:
- alert: dbZombie
expr: absent_over_time({namespace="app", app="db"} |~ "Queue Statistics"[5m]) == 1
for: 10m
labels:
severity: warning
promtail:
config:
clients:
- url: http://loki.observability.svc.cluster.local:3100/loki/api/v1/push

View file

@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/

View file

@ -0,0 +1,12 @@
dependencies:
- name: kube-prometheus-stack
repository: https://prometheus-community.github.io/helm-charts
version: 36.2.0
- name: loki
repository: https://grafana.github.io/helm-charts
version: 2.12.2
- name: promtail
repository: https://grafana.github.io/helm-charts
version: 6.0.0
digest: sha256:7f28353afa3626a7aaf59159a877be84f29bfac004e325d12f2ce43f31bd3e90
generated: "2022-06-30T11:23:29.753889457+02:00"

View file

@ -0,0 +1,34 @@
apiVersion: v2
name: observability
description: A Helm chart for Kubernetes
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.0.0"
dependencies:
- name: kube-prometheus-stack
repository: https://prometheus-community.github.io/helm-charts
version: 36.2.*
- name: loki
repository: https://grafana.github.io/helm-charts
version: 2.12.*
- name: promtail
repository: https://grafana.github.io/helm-charts
version: 6.0.*

View file

@ -0,0 +1,982 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "6.6.0"
},
{
"type": "panel",
"id": "graph",
"name": "Graph",
"version": ""
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "",
"editable": true,
"gnetId": 9614,
"graphTooltip": 1,
"id": null,
"iteration": 1582146566338,
"links": [],
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"description": "Total time taken for nginx and upstream servers to process a request and send a response",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"hiddenSeries": false,
"id": 91,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(\n 0.5,\n sum by (le)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n)",
"interval": "",
"legendFormat": ".5",
"refId": "D"
},
{
"expr": "histogram_quantile(\n 0.95,\n sum by (le)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n)",
"interval": "",
"legendFormat": ".95",
"refId": "B"
},
{
"expr": "histogram_quantile(\n 0.99,\n sum by (le)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n)",
"interval": "",
"legendFormat": ".99",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Total request handling time",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"description": "The time spent on receiving the response from the upstream server",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"hiddenSeries": false,
"id": 94,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(\n 0.5,\n sum by (le)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n)",
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": ".5",
"refId": "D"
},
{
"expr": "histogram_quantile(\n 0.95,\n sum by (le)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n)",
"interval": "",
"legendFormat": ".95",
"refId": "B"
},
{
"expr": "histogram_quantile(\n 0.99,\n sum by (le)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n)",
"interval": "",
"legendFormat": ".99",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Upstream response time",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"hiddenSeries": false,
"id": 93,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": " sum by (path)(\n rate(\n nginx_ingress_controller_request_duration_seconds_count{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n",
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{ path }}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Request volume by Path",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "reqps",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"description": "For each path observed, its median upstream response time",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"hiddenSeries": false,
"id": 98,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(\n .5,\n sum by (le, path)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n)",
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{ path }}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Median upstream response time by Path",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"description": "Percentage of 4xx and 5xx responses among all responses.",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"hiddenSeries": false,
"id": 100,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null as zero",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (path) (rate(nginx_ingress_controller_request_duration_seconds_count{\n ingress =~ \"$ingress\",\n status =~ \"[4-5].*\"\n}[1m])) / sum by (path) (rate(nginx_ingress_controller_request_duration_seconds_count{\n ingress =~ \"$ingress\",\n}[1m]))",
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{ path }}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Response error rate by Path",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percentunit",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"description": "For each path observed, the sum of upstream request time",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"hiddenSeries": false,
"id": 102,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (path) (rate(nginx_ingress_controller_response_duration_seconds_sum{ingress =~ \"$ingress\"}[1m]))",
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{ path }}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Upstream time consumed by Path",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"hiddenSeries": false,
"id": 101,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": " sum (\n rate(\n nginx_ingress_controller_request_duration_seconds_count{\n ingress =~ \"$ingress\",\n status =~\"[4-5].*\",\n }[1m]\n )\n ) by(path, status)\n",
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{ path }} {{ status }}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Response error volume by Path",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "reqps",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 24
},
"hiddenSeries": false,
"id": 99,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum (\n rate (\n nginx_ingress_controller_response_size_sum {\n ingress =~ \"$ingress\",\n }[1m]\n )\n) by (path) / sum (\n rate(\n nginx_ingress_controller_response_size_count {\n ingress =~ \"$ingress\",\n }[1m]\n )\n) by (path)\n",
"hide": false,
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{ path }}",
"refId": "D"
},
{
"expr": " sum (rate(nginx_ingress_controller_response_size_bucket{\n ingress =~ \"$ingress\",\n }[1m])) by (le)\n",
"hide": true,
"legendFormat": "{{le}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Average response size by Path",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "decbytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMETHEUS}",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 32
},
"hiddenSeries": false,
"id": 96,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum (\n rate(\n nginx_ingress_controller_ingress_upstream_latency_seconds_sum {\n ingress =~ \"$ingress\",\n }[1m]\n)) / sum (\n rate(\n nginx_ingress_controller_ingress_upstream_latency_seconds_count {\n ingress =~ \"$ingress\",\n }[1m]\n )\n)\n",
"hide": false,
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "average",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Upstream service latency",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": "30s",
"schemaVersion": 22,
"style": "dark",
"tags": [
"nginx"
],
"templating": {
"list": [
{
"hide": 0,
"label": "datasource",
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
},
{
"allValue": ".*",
"current": {},
"datasource": "${DS_PROMETHEUS}",
"definition": "label_values(nginx_ingress_controller_requests, ingress) ",
"hide": 0,
"includeAll": true,
"label": "Service Ingress",
"multi": false,
"name": "ingress",
"options": [],
"query": "label_values(nginx_ingress_controller_requests, ingress) ",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 2,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-15m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"2m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "Request Handling Performance",
"uid": "4GFbkOsZk",
"version": 1
}

View file

@ -0,0 +1 @@
Obeservibity stack installation complete. Please refer domain.com/grafana

View file

@ -0,0 +1,62 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "observability.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "observability.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "observability.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "observability.labels" -}}
helm.sh/chart: {{ include "observability.chart" . }}
{{ include "observability.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "observability.selectorLabels" -}}
app.kubernetes.io/name: {{ include "observability.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Create the name of the service account to use
*/}}
{{- define "observability.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "observability.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}

View file

@ -0,0 +1,11 @@
apiVersion: v1
data:
{{- (.Files.Glob "dashboards/*").AsConfig | nindent 2 }}
kind: ConfigMap
metadata:
labels:
app: kube-prometheus-stack-grafana
grafana_dashboard: "1"
release: monitoring
name: nginx-igress
namespace: observability