chore(monitoring): Adding enterprise config
Signed-off-by: rjshrjndrn <rjshrjndrn@gmail.com>
parent 99ee5d5cb1
commit d6e03aad52
4 changed files with 216 additions and 0 deletions
ee/scripts/helmcharts/manifests/grafana-ingress.yaml  (new file, +27)
@@ -0,0 +1,27 @@
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /$1
spec:
  ingressClassName: "{{ tpl .Values.ingress.className . }}"
  rules:
    - host: {{ .Values.global.domainName }}
      http:
        paths:
          - pathType: Prefix
            backend:
              service:
                name: monitoring-grafana
                port:
                  number: 80
            path: /grafana/(.*)
  tls:
    - hosts:
        - {{ .Values.global.domainName }}
      {{- if .Values.ingress.tls.secretName}}
      secretName: {{ .Values.ingress.tls.secretName }}
      {{- end}}
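For reference, a minimal sketch of how this template could render, assuming hypothetical values ingress.className: "nginx" and global.domainName: "openreplay.example.com", with no ingress.tls.secretName set:

---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /$1
spec:
  ingressClassName: "nginx"            # assumed example for .Values.ingress.className
  rules:
    - host: openreplay.example.com     # assumed example for .Values.global.domainName
      http:
        paths:
          - pathType: Prefix
            backend:
              service:
                name: monitoring-grafana
                port:
                  number: 80
            path: /grafana/(.*)
  tls:
    - hosts:
        - openreplay.example.com       # assumed example value

With the rewrite-target annotation, a request to https://openreplay.example.com/grafana/login matches the /grafana/(.*) path, the captured group becomes $1, and the request reaches the monitoring-grafana service as /login.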
ee/scripts/helmcharts/manifests/logging.yaml  (new file, +133)
@@ -0,0 +1,133 @@
config:
  # existingSecret:
  auth_enabled: false
  ingester:
    chunk_idle_period: 3m
    chunk_block_size: 262144
    chunk_retain_period: 1m
    max_transfer_retries: 0
    wal:
      dir: /data/loki/wal
    lifecycler:
      ring:
        kvstore:
          store: inmemory
        replication_factor: 1

      ## Different ring configs can be used. E.g. Consul
      # ring:
      #   store: consul
      #   replication_factor: 1
      #   consul:
      #     host: "consul:8500"
      #     prefix: ""
      #     http_client_timeout: "20s"
      #     consistent_reads: true
  limits_config:
    enforce_metric_name: false
    reject_old_samples: true
    reject_old_samples_max_age: 168h
  schema_config:
    configs:
      - from: 2020-10-24
        store: boltdb-shipper
        object_store: filesystem
        schema: v11
        index:
          prefix: index_
          period: 24h
  server:
    http_listen_port: 3100
  storage_config:
    boltdb_shipper:
      active_index_directory: /data/loki/boltdb-shipper-active
      cache_location: /data/loki/boltdb-shipper-cache
      cache_ttl: 24h # Can be increased for faster performance over longer query periods, uses more disk space
      shared_store: filesystem
    filesystem:
      directory: /data/loki/chunks
  chunk_store_config:
    max_look_back_period: 0s
  table_manager:
    retention_deletes_enabled: false
    retention_period: 0s
  compactor:
    working_directory: /data/loki/boltdb-shipper-compactor
    shared_store: filesystem
    retention_enabled: true
  # Needed for Alerting: https://grafana.com/docs/loki/latest/rules/
  # This is just a simple example, for more details: https://grafana.com/docs/loki/latest/configuration/#ruler_config
  ruler:
    storage:
      type: local
      local:
        directory: /rules
    rule_path: /tmp/scratch
    alertmanager_url: http://openreplay-alertmanager.monitoring.svc.cluster.local:9093
    ring:
      kvstore:
        store: inmemory
    enable_api: true

persistence:
  enabled: true
  accessModes:
    - ReadWriteOnce
  size: 100Gi

serviceMonitor:
  enabled: true
  interval: ""
  additionalLabels:
    release: monitoring
  annotations: {}
  # scrapeTimeout: 10s
  # path: /metrics

prometheusRule:
  enabled: true
  additionalLabels:
    release: monitoring
  rules:
    - alert: LokiProcessTooManyRestarts
      expr: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Loki process too many restarts (instance {{ $labels.instance }})
        description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: LokiRequestErrors
      expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10
      for: 15m
      labels:
        severity: critical
      annotations:
        summary: Loki request errors (instance {{ $labels.instance }})
        description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: LokiRequestPanic
      expr: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Loki request panic (instance {{ $labels.instance }})
        description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
    - alert: LokiRequestLatency
      expr: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Loki request latency (instance {{ $labels.instance }})
        description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# Specify Loki Alerting rules based on this documentation: https://grafana.com/docs/loki/latest/rules/
# When specified, you also need to add a ruler config section above. An example is shown in the alerting docs.
alerting_groups:
  - name: dbZombie
    rules:
      - alert: dbZombie
        expr: absent_over_time({namespace="app", app="db"} |~ "Queue Statistics"[5m]) == 1
        for: 10m
        labels:
          severity: warning
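The dbZombie group above relies on LogQL's absent_over_time: the alert fires when no log line matching "Queue Statistics" has been seen from the db workload in the app namespace for 5 minutes, sustained for 10 minutes. A hedged sketch of how the same log-absence pattern could be reused for another workload; the app="http" selector and the "Listening on" match string are hypothetical and would need to be replaced with a label set and a log line the real service actually emits:

# Hypothetical additional group following the same absent_over_time pattern.
# The stream selector and matched string below are assumptions, not part of this commit.
alerting_groups:
  - name: httpZombie
    rules:
      - alert: httpZombie
        expr: absent_over_time({namespace="app", app="http"} |~ "Listening on"[5m]) == 1
        for: 10m
        labels:
          severity: warning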
ee/scripts/helmcharts/manifests/monitoring.yaml  (new empty file, +0)
ee/scripts/helmcharts/manifests/values-override.yaml  (new file, +56)
@@ -0,0 +1,56 @@
clickhouse:
  # For enterpriseEdition
  enabled: false

kafka: &kafka
  # For enterpriseEdition
  enabled: true
  kafkaHost: "kafka.db.svc.cluster.local"
  kafkaPort: "9092"
  kafkaUseSsl: "false"
  config: |-
    replica.fetch.max.bytes=3000000
    message.max.bytes=3000000
    zookeeper.connect=databases-zookeeper

redis: &redis
  # For enterpriseEdition
  enabled: false

postgresql: &postgres
  # For generating passwords:
  # `openssl rand -hex 20`
  enabled: false
  postgresqlPassword: "changeMePassword"
  postgresqlHost: "postgresql.db.svc.cluster.local"
  postgresqlPort: "5432"
  postgresqlUser: "postgres"
  postgresqlDatabase: "postgres"

ingress-nginx: &ingress-nginx
  service:
    externalTrafficPolicy: "Local"
  extraArgs:
    default-ssl-certificate: "app/openreplay-ssl"
  config:
    use-gzip: true
    load-balance: ewma
    enable-real-ip: true
    # Enable LB forwarded protocol
    # Ref: https://kubernetes.github.io/ingress-nginx/user-guide/nginx-configuration/configmap/#use-forwarded-headers
    # https://github.com/nginxinc/kubernetes-ingress/issues/1284#issuecomment-872869354
    # use-forwarded-headers: true

global:
  s3:
    region: "us-east-1"
    endpoint: "http://minio.db.svc.cluster.local:9000"
    assetsBucket: "sessions-assets"
    recordingsBucket: "mobs"
    sourcemapsBucket: "sourcemaps"
    # If you're using a single-node installation with the local s3 (minio),
    # make sure these values match minio.global.minio.accessKey and secretKey.
    accessKey: "changeMeMinioAccessKey"
    secretKey: "changeMeMinioPassword"

  enterpriseEditionLicense: ""
  domainName: ""
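The &kafka, &redis, &postgres, and &ingress-nginx YAML anchors only pay off when they are aliased elsewhere in the same values file; this override shows only the anchor definitions. A minimal, hypothetical sketch of how another chart section could reuse them (the chalice key and the shape of its values are assumptions for illustration, not part of this commit):

# Hypothetical consumer of the anchors defined above, in the same values file.
chalice:
  kafka: *kafka          # re-injects the shared Kafka connection settings
  postgresql: *postgres  # re-injects the shared PostgreSQL connection settings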