# vim: set fdm=indent:
##################################################
## Update the following values
##
## For example,
# domainName: &domainName openreplay.supercompany.com
##################################################

domainName: &domainName "changeme.mycorp.org"
grafanaAdminPassword: &adminpass "changeMeGrafanaAdminPassword"
slackWebhookUrl: &slackwebhook "https://hooks.slack.com/services/xxxx/xxxx/xxxxx" # Slack webhook URL for sending alerts
slackChannel: &slackchannel "changeMeAlertsChannel" # Name of the channel alerts will be delivered to

########################################################
## Custom configuration for Monitoring and logging stack
########################################################

kube-prometheus-stack:
  fullnameOverride: "openreplay"
  grafana:
    adminPassword: *adminpass
    env:
      GF_SERVER_ROOT_URL: http://grafana.local.com/grafana
    additionalDataSources:
      - name: loki
        editable: true
        type: loki
        url: http://loki.observability:3100
      - name: k8s-db-openreplay-clickhouse
        editable: true
        type: vertamedia-clickhouse-datasource
        url: http://clickhouse-openreplay-clickhouse.db:8123
    plugins:
      - grafana-piechart-panel
      - vertamedia-clickhouse-datasource
      - digrich-bubblechart-panel
      - grafana-clock-panel
    ingress:
      enabled: true
      ingressClassName: openreplay
      hosts:
        - *domainName
      annotations:
        nginx.ingress.kubernetes.io/rewrite-target: /$1
      path: /grafana/(.*)
      tls:
        - hosts:
            - *domainName
          secretName: openreplay-ssl
  prometheus:
    prometheusSpec:
      storageSpec:
        volumeClaimTemplate:
          spec:
            # storageClassName: local-path
            accessModes: ["ReadWriteOnce"]
            resources:
              requests:
                storage: 200Gi
  alertmanager:
    config:
      global:
        resolve_timeout: 5m
        slack_api_url: *slackwebhook
      route:
        # group_by: ['job']
        group_by: ['alertname','container']
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 12h
        receiver: 'slack'
        routes:
          - match:
              alertname: Watchdog
            receiver: 'slack'
      receivers:
        - name: slack
          slack_configs:
            - channel: *slackchannel
              color: '{{ template "slack.color" . }}'
              title: '{{ template "slack.title" . }}'
              text: '{{ template "slack.text" . }}'
              send_resolved: true
              actions:
                - type: button
                  text: 'Runbook :green_book:'
                  url: '{{ (index .Alerts 0).Annotations.runbook_url }}'
                # - type: button
                #   text: 'Query :mag:'
                #   url: '{{ (index .Alerts 0).GeneratorURL }}'
                # - type: button
                #   text: 'Dashboard :chart_with_upwards_trend:'
                #   url: '{{ (index .Alerts 0).Annotations.dashboard_url }}'
                # - type: button
                #   text: 'Silence :no_bell:'
                #   url: '{{ template "__alert_silence_link" . }}'
      templates:
        - /etc/alertmanager/config/*.tmpl
    templateFiles:
      template_1.tmpl: |-
        {{/* Alertmanager Silence link */}}
        {{ define "__alert_silence_link" -}}
          {{ .ExternalURL }}/#/silences/new?filter=%7B
          {{- range .CommonLabels.SortedPairs -}}
            {{- if ne .Name "alertname" -}}
              {{- .Name }}%3D"{{- .Value -}}"%2C%20
            {{- end -}}
          {{- end -}}
          alertname%3D"{{- .CommonLabels.alertname -}}"%7D
        {{- end }}

        {{/* Severity of the alert */}}
        {{ define "__alert_severity" -}}
          {{- if eq .CommonLabels.severity "critical" -}}
            *Severity:* `Critical`
          {{- else if eq .CommonLabels.severity "warning" -}}
            *Severity:* `Warning`
          {{- else if eq .CommonLabels.severity "info" -}}
            *Severity:* `Info`
          {{- else -}}
            *Severity:* :question: {{ .CommonLabels.severity }}
          {{- end }}
        {{- end }}

        {{/* Title of the Slack alert */}}
        {{ define "slack.title" -}}
          [{{ .Status | toUpper -}}
          {{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}}
          ] {{ .CommonLabels.alertname }}
        {{- end }}

        {{/* Color of the Slack attachment (appears as a line next to the alert) */}}
        {{ define "slack.color" -}}
          {{ if eq .Status "firing" -}}
            {{ if eq .CommonLabels.severity "warning" -}}
              warning
            {{- else if eq .CommonLabels.severity "critical" -}}
              danger
            {{- else -}}
              #439FE0
            {{- end -}}
          {{ else -}}
            good
          {{- end }}
        {{- end }}

        {{/* The text to display in the alert */}}
        {{ define "slack.text" -}}
          {{ template "__alert_severity" . }}
          {{- if (index .Alerts 0).Annotations.summary }}
            {{- "\n" -}}
            *Summary:* {{ (index .Alerts 0).Annotations.summary }}
          {{- end }}
          {{ range .Alerts }}
            {{- if .Annotations.description }}
              {{- "\n" -}}
              {{ .Annotations.description }}
              {{- "\n" -}}
            {{- end }}
            {{- if .Annotations.message }}
              {{- "\n" -}}
              {{ .Annotations.message }}
              {{- "\n" -}}
            {{- end }}
          {{- end }}
        {{- end }}
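  ## A hypothetical example (the receiver and channel names are placeholders, not
  ## shipped defaults): warning-severity alerts could be sent to a separate Slack
  ## channel by extending the `route.routes` and `receivers` lists under
  ## `kube-prometheus-stack.alertmanager.config` above, e.g.:
  # routes:
  #   - match:
  #       severity: warning
  #     receiver: 'slack-warnings'
  # receivers:
  #   - name: slack-warnings
  #     slack_configs:
  #       - channel: 'changeMeWarningsChannel'
  #         send_resolved: true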
loki:
  fullnameOverride: "loki"
  config:
    # existingSecret:
    auth_enabled: false
    ingester:
      chunk_idle_period: 3m
      chunk_block_size: 262144
      chunk_retain_period: 1m
      max_transfer_retries: 0
      wal:
        dir: /data/loki/wal
      lifecycler:
        ring:
          kvstore:
            store: inmemory
          replication_factor: 1
    limits_config:
      enforce_metric_name: false
      reject_old_samples: true
      reject_old_samples_max_age: 168h
    schema_config:
      configs:
        - from: 2020-10-24
          store: boltdb-shipper
          object_store: filesystem
          schema: v11
          index:
            prefix: index_
            period: 24h
    server:
      http_listen_port: 3100
    storage_config:
      boltdb_shipper:
        active_index_directory: /data/loki/boltdb-shipper-active
        cache_location: /data/loki/boltdb-shipper-cache
        cache_ttl: 24h # Can be increased for faster performance over longer query periods; uses more disk space
        shared_store: filesystem
      filesystem:
        directory: /data/loki/chunks
    chunk_store_config:
      max_look_back_period: 0s
    table_manager:
      retention_deletes_enabled: false
      retention_period: 0s
    compactor:
      working_directory: /data/loki/boltdb-shipper-compactor
      shared_store: filesystem
      retention_enabled: true
    # Needed for Alerting: https://grafana.com/docs/loki/latest/rules/
    # This is just a simple example; for more details see https://grafana.com/docs/loki/latest/configuration/#ruler_config
    ruler:
      storage:
        type: local
        local:
          directory: /rules
      rule_path: /tmp/scratch
      alertmanager_url: http://openreplay-alertmanager.observability.svc.cluster.local:9093
      ring:
        kvstore:
          store: inmemory
      enable_api: true

  persistence:
    enabled: true
    accessModes:
      - ReadWriteOnce
    size: 100Gi

  serviceMonitor:
    enabled: true
    interval: ""
    additionalLabels:
      release: monitoring
    annotations: {}
    # scrapeTimeout: 10s
    # path: /metrics

  prometheusRule:
    enabled: true
    additionalLabels:
      release: monitoring
    rules:
      - alert: LokiProcessTooManyRestarts
        expr: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Loki process too many restarts (instance {{ $labels.instance }})
          description: "A loki process had too many restarts (target {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      - alert: LokiRequestErrors
        expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: Loki request errors (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      - alert: LokiRequestPanic
        expr: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request panic (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      - alert: LokiRequestLatency
        expr: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request latency (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

  # Specify Loki Alerting rules based on this documentation: https://grafana.com/docs/loki/latest/rules/
  # When specified, you also need to add a ruler config section above. An example is shown in the alerting docs.
  alerting_groups:
    - name: dbZombie
      rules:
        - alert: dbZombie
          expr: absent_over_time({namespace="app", app="db"} |~ "Queue Statistics"[5m]) == 1
          for: 10m
          labels:
            severity: warning
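  ## A hypothetical additional log-based rule group that could be appended to
  ## `alerting_groups` above and evaluated by the Loki ruler configured earlier
  ## (the stream selector, match pattern and threshold are placeholders):
  #   - name: appErrors
  #     rules:
  #       - alert: HighAppErrorRate
  #         expr: sum(rate({namespace="app"} |= "error" [5m])) > 10
  #         for: 5m
  #         labels:
  #           severity: warning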
promtail:
  fullnameOverride: "promtail"
  config:
    clients:
      - url: http://loki.observability.svc.cluster.local:3100/loki/api/v1/push
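  ## A hypothetical sketch of attaching a static label to every log line promtail
  ## ships (uses promtail's `external_labels` client setting; the label name and
  ## value below are placeholders):
  # config:
  #   clients:
  #     - url: http://loki.observability.svc.cluster.local:3100/loki/api/v1/push
  #       external_labels:
  #         cluster: changeMeClusterName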