chore(monitoring): Adding enterprise config

Signed-off-by: rjshrjndrn <rjshrjndrn@gmail.com>
This commit is contained in:
rjshrjndrn 2022-06-29 15:07:55 +02:00
parent 99ee5d5cb1
commit d6e03aad52
4 changed files with 216 additions and 0 deletions

View file

@ -0,0 +1,27 @@
---
# Ingress that routes requests under /grafana/ on the host taken from
# .Values.global.domainName to the monitoring-grafana service on port 80.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana
  namespace: monitoring
  annotations:
    # Only the part of the URL captured by (.*) in the path below is
    # forwarded to the backend, i.e. the /grafana/ prefix is stripped.
    nginx.ingress.kubernetes.io/rewrite-target: /$1
spec:
  ingressClassName: "{{ tpl .Values.ingress.className . }}"
  rules:
    # Templated values are quoted so that a wildcard domain (leading `*`)
    # or an empty value still renders as a YAML string.
    - host: "{{ .Values.global.domainName }}"
      http:
        paths:
          # The path is a regular expression, not a literal prefix, so it
          # is declared ImplementationSpecific; Prefix/Exact paths are only
          # meant to carry literal path characters.
          - pathType: ImplementationSpecific
            backend:
              service:
                name: monitoring-grafana
                port:
                  number: 80
            path: /grafana/(.*)
  tls:
    - hosts:
        - "{{ .Values.global.domainName }}"
      # secretName is emitted only when one is configured in the values.
      {{- if .Values.ingress.tls.secretName }}
      secretName: "{{ .Values.ingress.tls.secretName }}"
      {{- end }}

View file

@ -0,0 +1,133 @@
# Loki settings: server configuration, persistence, metrics scraping and
# alert rules.
config:
  # existingSecret:
  auth_enabled: false

  ingester:
    chunk_idle_period: 3m
    chunk_block_size: 262144
    chunk_retain_period: 1m
    max_transfer_retries: 0
    wal:
      dir: /data/loki/wal
    lifecycler:
      # Ring state is kept in memory with a replication factor of 1.
      ring:
        kvstore:
          store: inmemory
        replication_factor: 1

      # Other ring back ends can be configured instead, for example Consul:
      # ring:
      #   store: consul
      #   replication_factor: 1
      #   consul:
      #     host: "consul:8500"
      #     prefix: ""
      #     http_client_timeout: "20s"
      #     consistent_reads: true

  limits_config:
    enforce_metric_name: false
    reject_old_samples: true
    reject_old_samples_max_age: 168h

  # Index in boltdb-shipper, chunks on the local filesystem, 24h index period.
  schema_config:
    configs:
      - from: 2020-10-24
        store: boltdb-shipper
        object_store: filesystem
        schema: v11
        index:
          prefix: index_
          period: 24h

  server:
    http_listen_port: 3100

  storage_config:
    boltdb_shipper:
      active_index_directory: /data/loki/boltdb-shipper-active
      cache_location: /data/loki/boltdb-shipper-cache
      # A larger TTL speeds up queries over longer periods at the cost of
      # more disk space.
      cache_ttl: 24h
      shared_store: filesystem
    filesystem:
      directory: /data/loki/chunks

  chunk_store_config:
    max_look_back_period: 0s

  table_manager:
    retention_deletes_enabled: false
    retention_period: 0s

  compactor:
    working_directory: /data/loki/boltdb-shipper-compactor
    shared_store: filesystem
    retention_enabled: true

  # The ruler is required for alerting: https://grafana.com/docs/loki/latest/rules/
  # This is a minimal setup; the full option list is at
  # https://grafana.com/docs/loki/latest/configuration/#ruler_config
  ruler:
    storage:
      type: local
      local:
        directory: /rules
    rule_path: /tmp/scratch
    alertmanager_url: http://openreplay-alertmanager.monitoring.svc.cluster.local:9093
    ring:
      kvstore:
        store: inmemory
    enable_api: true

persistence:
  enabled: true
  accessModes:
    - ReadWriteOnce
  size: 100Gi

serviceMonitor:
  enabled: true
  interval: ""
  additionalLabels:
    release: monitoring
  annotations: {}
  # scrapeTimeout: 10s
  # path: /metrics
  prometheusRule:
    enabled: true
    additionalLabels:
      release: monitoring
    rules:
      # More than two changes of the process start time within 15 minutes.
      - alert: LokiProcessTooManyRestarts
        expr: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Loki process too many restarts (instance {{ $labels.instance }})
          description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Share of 5xx responses per namespace/job/route above 10 for 15 minutes.
      - alert: LokiRequestErrors
        expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: Loki request errors (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Any increase of loki_panic_total over 10 minutes, per namespace/job.
      - alert: LokiRequestPanic
        expr: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request panic (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 99th percentile request duration above 1, excluding routes that
      # match "tail" (case-insensitive).
      - alert: LokiRequestLatency
        expr: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request latency (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Loki alerting rules, written as described in
# https://grafana.com/docs/loki/latest/rules/
# Defining rules here also requires a ruler section in the Loki config; the
# alerting docs show an example.
alerting_groups:
  - name: dbZombie
    rules:
      # No log line matching "Queue Statistics" from {namespace="app",
      # app="db"} within 5 minutes.
      - alert: dbZombie
        expr: absent_over_time({namespace="app", app="db"} |~ "Queue Statistics"[5m]) == 1
        for: 10m
        labels:
          severity: warning

View file

@ -0,0 +1,56 @@
# Database toggles, ingress-nginx overrides and global settings.

clickhouse:
  # Used by the enterprise edition.
  enabled: false

kafka: &kafka
  # Used by the enterprise edition.
  enabled: true
  kafkaHost: "kafka.db.svc.cluster.local"
  kafkaPort: "9092"
  kafkaUseSsl: "false"
  config: |-
    replica.fetch.max.bytes=3000000
    message.max.bytes=3000000
    zookeeper.connect=databases-zookeeper

redis: &redis
  # Used by the enterprise edition.
  enabled: false

postgresql: &postgres
  # A password can be generated with:
  # `openssl rand -hex 20`
  enabled: false
  postgresqlPassword: "changeMePassword"
  postgresqlHost: "postgresql.db.svc.cluster.local"
  postgresqlPort: "5432"
  postgresqlUser: "postgres"
  postgresqlDatabase: "postgres"

# NOTE(review): the upstream ingress-nginx chart reads service, extraArgs
# and config under a `controller:` key - confirm this nesting is intended.
ingress-nginx: &ingress-nginx
  service:
    externalTrafficPolicy: "Local"
  extraArgs:
    default-ssl-certificate: "app/openreplay-ssl"
  config:
    use-gzip: true
    load-balance: ewma
    enable-real-ip: true
    # Uncomment to trust the forwarded protocol headers set by a load balancer.
    # Ref: https://kubernetes.github.io/ingress-nginx/user-guide/nginx-configuration/configmap/#use-forwarded-headers
    #      https://github.com/nginxinc/kubernetes-ingress/issues/1284#issuecomment-872869354
    # use-forwarded-headers: true

global:
  s3:
    region: "us-east-1"
    endpoint: "http://minio.db.svc.cluster.local:9000"
    assetsBucket: "sessions-assets"
    recordingsBucket: "mobs"
    sourcemapsBucket: "sourcemaps"
    # On a single-node installation that uses the local s3, these two
    # values must be the same as minio.global.minio.accesskey and secretKey.
    accessKey: "changeMeMinioAccessKey"
    secretKey: "changeMeMinioPassword"
  enterpriseEditionLicense: ""
  domainName: ""