# vim: set fdm=indent:

##################################################
## Update the following values
##
## For example,
# domainName: &domainName openreplay.supercompany.com
##################################################

domainName: &domainName "changeme.mycorp.org"
grafanaAdminPassword: &adminpass "changeMeGrafanaAdminPassword"
slackWebhookUrl: &slackwebhook "https://hooks.slack.com/services/xxxx/xxxx/xxxxx" # Slack webhook URL used for sending alerts
slackChannel: &slackchannel "changeMeAlertsChannel" # Name of the channel alerts will be delivered to
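
# Note: the values above are YAML anchors (&name) reused further down as aliases (*name);
# e.g. *domainName feeds the Grafana ingress hosts and TLS block, *slackwebhook becomes
# Alertmanager's slack_api_url, and *slackchannel the Slack receiver's channel, so each
# value only needs to be set once here.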

########################################################
## Custom configuration for Monitoring and logging stack
########################################################

kube-prometheus-stack:
  fullnameOverride: "openreplay"
  grafana:
    adminPassword: *adminpass
    env:
      GF_SERVER_ROOT_URL: http://grafana.local.com/grafana
    additionalDataSources:
      - name: loki
        editable: true
        type: loki
        url: http://loki.observability:3100
      - name: k8s-db-openreplay-clickhouse
        editable: true
        type: vertamedia-clickhouse-datasource
        url: http://clickhouse-openreplay-clickhouse.db:8123
    plugins:
      - grafana-piechart-panel
      - vertamedia-clickhouse-datasource
      - digrich-bubblechart-panel
      - grafana-clock-panel
    ingress:
      enabled: true
      ingressClassName: openreplay
      hosts:
        - *domainName
      annotations:
        nginx.ingress.kubernetes.io/rewrite-target: /$1
      path: /grafana/(.*)
      tls:
        - hosts:
            - *domainName
          secretName: openreplay-ssl
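
  # With the ingress above, Grafana is served at https://<domainName>/grafana/ and the
  # rewrite-target strips the /grafana prefix before proxying. GF_SERVER_ROOT_URL is
  # normally set to that externally visible URL; the grafana.local.com value above looks
  # like a placeholder to adjust to your domain.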

  prometheus:
    prometheusSpec:
      storageSpec:
        volumeClaimTemplate:
          spec:
            # storageClassName: local-path
            accessModes: ["ReadWriteOnce"]
            resources:
              requests:
                storage: 200Gi

  alertmanager:
    config:
      global:
        resolve_timeout: 5m
        slack_api_url: *slackwebhook
      route:
        # group_by: ['job']
        group_by: ['alertname','container']
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 12h
        receiver: 'slack'
        routes:
          - match:
              alertname: Watchdog
            receiver: 'slack'
      receivers:
        - name: slack
          slack_configs:
            - channel: *slackchannel
              color: '{{ template "slack.color" . }}'
              title: '{{ template "slack.title" . }}'
              text: '{{ template "slack.text" . }}'
              send_resolved: true
              actions:
                - type: button
                  text: 'Runbook :green_book:'
                  url: '{{ (index .Alerts 0).Annotations.runbook_url }}'
                # - type: button
                #   text: 'Query :mag:'
                #   url: '{{ (index .Alerts 0).GeneratorURL }}'
                # - type: button
                #   text: 'Dashboard :chart_with_upwards_trend:'
                #   url: '{{ (index .Alerts 0).Annotations.dashboard_url }}'
                # - type: button
                #   text: 'Silence :no_bell:'
                #   url: '{{ template "__alert_silence_link" . }}'
      templates:
        - /etc/alertmanager/config/*.tmpl
    templateFiles:
      template_1.tmpl: |-
        {{/* Alertmanager Silence link */}}
        {{ define "__alert_silence_link" -}}
            {{ .ExternalURL }}/#/silences/new?filter=%7B
            {{- range .CommonLabels.SortedPairs -}}
                {{- if ne .Name "alertname" -}}
                    {{- .Name }}%3D"{{- .Value -}}"%2C%20
                {{- end -}}
            {{- end -}}
            alertname%3D"{{- .CommonLabels.alertname -}}"%7D
        {{- end }}

        {{/* Severity of the alert */}}
        {{ define "__alert_severity" -}}
            {{- if eq .CommonLabels.severity "critical" -}}
            *Severity:* `Critical`
            {{- else if eq .CommonLabels.severity "warning" -}}
            *Severity:* `Warning`
            {{- else if eq .CommonLabels.severity "info" -}}
            *Severity:* `Info`
            {{- else -}}
            *Severity:* :question: {{ .CommonLabels.severity }}
            {{- end }}
        {{- end }}

        {{/* Title of the Slack alert */}}
        {{ define "slack.title" -}}
            [{{ .Status | toUpper -}}
            {{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
            ] {{ .CommonLabels.alertname }}
        {{- end }}

        {{/* Color of Slack attachment (appears as line next to alert) */}}
        {{ define "slack.color" -}}
            {{ if eq .Status "firing" -}}
                {{ if eq .CommonLabels.severity "warning" -}}
                    warning
                {{- else if eq .CommonLabels.severity "critical" -}}
                    danger
                {{- else -}}
                    #439FE0
                {{- end -}}
            {{ else -}}
                good
            {{- end }}
        {{- end }}

        {{/* The text to display in the alert */}}
        {{ define "slack.text" -}}

            {{ template "__alert_severity" . }}
            {{- if (index .Alerts 0).Annotations.summary }}
            {{- "\n" -}}
            *Summary:* {{ (index .Alerts 0).Annotations.summary }}
            {{- end }}

            {{ range .Alerts }}

                {{- if .Annotations.description }}
                {{- "\n" -}}
                {{ .Annotations.description }}
                {{- "\n" -}}
                {{- end }}
                {{- if .Annotations.message }}
                {{- "\n" -}}
                {{ .Annotations.message }}
                {{- "\n" -}}
                {{- end }}

            {{- end }}

        {{- end }}
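
    # The chart mounts templateFiles into Alertmanager's config directory, so the
    # /etc/alertmanager/config/*.tmpl glob configured under `templates:` above picks up
    # template_1.tmpl; the Slack receiver then renders its messages through the
    # "slack.title", "slack.color" and "slack.text" templates defined there.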

loki:
  fullnameOverride: "loki"
  config:
    # existingSecret:
    auth_enabled: false
    ingester:
      chunk_idle_period: 3m
      chunk_block_size: 262144
      chunk_retain_period: 1m
      max_transfer_retries: 0
      wal:
        dir: /data/loki/wal
      lifecycler:
        ring:
          kvstore:
            store: inmemory
          replication_factor: 1

    limits_config:
      enforce_metric_name: false
      reject_old_samples: true
      reject_old_samples_max_age: 168h
    schema_config:
      configs:
        - from: 2020-10-24
          store: boltdb-shipper
          object_store: filesystem
          schema: v11
          index:
            prefix: index_
            period: 24h
    server:
      http_listen_port: 3100
    storage_config:
      boltdb_shipper:
        active_index_directory: /data/loki/boltdb-shipper-active
        cache_location: /data/loki/boltdb-shipper-cache
        cache_ttl: 24h # Can be increased for faster performance over longer query periods; uses more disk space
        shared_store: filesystem
      filesystem:
        directory: /data/loki/chunks
    chunk_store_config:
      max_look_back_period: 0s
    table_manager:
      retention_deletes_enabled: false
      retention_period: 0s
    compactor:
      working_directory: /data/loki/boltdb-shipper-compactor
      shared_store: filesystem
      retention_enabled: true
    # Needed for Alerting: https://grafana.com/docs/loki/latest/rules/
    # This is just a simple example; for more details see https://grafana.com/docs/loki/latest/configuration/#ruler_config
    ruler:
      storage:
        type: local
        local:
          directory: /rules
      rule_path: /tmp/scratch
      alertmanager_url: http://openreplay-alertmanager.observability.svc.cluster.local:9093
      ring:
        kvstore:
          store: inmemory
      enable_api: true
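    # With enable_api: true the ruler exposes its rules API; the rules it evaluates come
    # from the alerting_groups section further down (which the chart is expected to make
    # available under the /rules directory configured above), and firing alerts are sent
    # to the Alertmanager at alertmanager_url.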

  persistence:
    enabled: true
    accessModes:
      - ReadWriteOnce
    size: 100Gi

  serviceMonitor:
    enabled: true
    interval: ""
    additionalLabels:
      release: monitoring
    annotations: {}
    # scrapeTimeout: 10s
    # path: /metrics
    prometheusRule:
      enabled: true
      additionalLabels:
        release: monitoring
      rules:
        - alert: LokiProcessTooManyRestarts
          expr: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Loki process too many restarts (instance {{ $labels.instance }})
            description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        - alert: LokiRequestErrors
          expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: Loki request errors (instance {{ $labels.instance }})
            description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        - alert: LokiRequestPanic
          expr: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: Loki request panic (instance {{ $labels.instance }})
            description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        - alert: LokiRequestLatency
          expr: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: Loki request latency (instance {{ $labels.instance }})
            description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # Specify Loki Alerting rules based on this documentation: https://grafana.com/docs/loki/latest/rules/
  # When specified, you also need to add a ruler config section above. An example is shown in the alerting docs.
  alerting_groups:
    - name: dbZombie
      rules:
        - alert: dbZombie
          expr: absent_over_time({namespace="app", app="db"} |~ "Queue Statistics"[5m]) == 1
          for: 10m
          labels:
            severity: warning
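
    # A minimal sketch of an additional LogQL alerting group (hypothetical name, selector
    # and threshold; adjust to your own log volume before uncommenting):
    # - name: appHttpErrors
    #   rules:
    #     - alert: AppLogErrorRate
    #       expr: sum(rate({namespace="app"} |= "error" [5m])) > 10
    #       for: 10m
    #       labels:
    #         severity: warning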

promtail:
  fullnameOverride: "promtail"
  config:
    clients:
      - url: http://loki.observability.svc.cluster.local:3100/loki/api/v1/push