From d6e03aad522f7debd0bdc95efea197d6ab481873 Mon Sep 17 00:00:00 2001
From: rjshrjndrn
Date: Wed, 29 Jun 2022 15:07:55 +0200
Subject: [PATCH] chore(monitoring): Adding enterprise config

Signed-off-by: rjshrjndrn
---
 .../helmcharts/manifests/grafana-ingress.yaml |  27 ++++
 ee/scripts/helmcharts/manifests/logging.yaml  | 133 ++++++++++++++++++
 .../helmcharts/manifests/monitoring.yaml      |   0
 .../helmcharts/manifests/values-override.yaml |  56 ++++++++
 4 files changed, 216 insertions(+)
 create mode 100644 ee/scripts/helmcharts/manifests/grafana-ingress.yaml
 create mode 100644 ee/scripts/helmcharts/manifests/logging.yaml
 create mode 100644 ee/scripts/helmcharts/manifests/monitoring.yaml
 create mode 100644 ee/scripts/helmcharts/manifests/values-override.yaml

diff --git a/ee/scripts/helmcharts/manifests/grafana-ingress.yaml b/ee/scripts/helmcharts/manifests/grafana-ingress.yaml
new file mode 100644
index 000000000..0226c05ed
--- /dev/null
+++ b/ee/scripts/helmcharts/manifests/grafana-ingress.yaml
@@ -0,0 +1,27 @@
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: grafana
+  namespace: monitoring
+  annotations:
+    nginx.ingress.kubernetes.io/rewrite-target: /$1
+spec:
+  ingressClassName: "{{ tpl .Values.ingress.className . }}"
+  rules:
+    - host: {{ .Values.global.domainName }}
+      http:
+        paths:
+          - pathType: Prefix
+            backend:
+              service:
+                name: monitoring-grafana
+                port:
+                  number: 80
+            path: /grafana/(.*)
+  tls:
+    - hosts:
+        - {{ .Values.global.domainName }}
+      {{- if .Values.ingress.tls.secretName}}
+      secretName: {{ .Values.ingress.tls.secretName }}
+      {{- end}}
diff --git a/ee/scripts/helmcharts/manifests/logging.yaml b/ee/scripts/helmcharts/manifests/logging.yaml
new file mode 100644
index 000000000..422aa0594
--- /dev/null
+++ b/ee/scripts/helmcharts/manifests/logging.yaml
@@ -0,0 +1,133 @@
+config:
+  # existingSecret:
+  auth_enabled: false
+  ingester:
+    chunk_idle_period: 3m
+    chunk_block_size: 262144
+    chunk_retain_period: 1m
+    max_transfer_retries: 0
+    wal:
+      dir: /data/loki/wal
+    lifecycler:
+      ring:
+        kvstore:
+          store: inmemory
+        replication_factor: 1
+
+      ## Different ring configs can be used. E.g. Consul
+      # ring:
+      #   store: consul
+      #   replication_factor: 1
+      #   consul:
+      #     host: "consul:8500"
+      #     prefix: ""
+      #     http_client_timeout: "20s"
+      #     consistent_reads: true
+  limits_config:
+    enforce_metric_name: false
+    reject_old_samples: true
+    reject_old_samples_max_age: 168h
+  schema_config:
+    configs:
+      - from: 2020-10-24
+        store: boltdb-shipper
+        object_store: filesystem
+        schema: v11
+        index:
+          prefix: index_
+          period: 24h
+  server:
+    http_listen_port: 3100
+  storage_config:
+    boltdb_shipper:
+      active_index_directory: /data/loki/boltdb-shipper-active
+      cache_location: /data/loki/boltdb-shipper-cache
+      cache_ttl: 24h  # Can be increased for faster performance over longer query periods, uses more disk space
+      shared_store: filesystem
+    filesystem:
+      directory: /data/loki/chunks
+  chunk_store_config:
+    max_look_back_period: 0s
+  table_manager:
+    retention_deletes_enabled: false
+    retention_period: 0s
+  compactor:
+    working_directory: /data/loki/boltdb-shipper-compactor
+    shared_store: filesystem
+    retention_enabled: true
+# Needed for Alerting: https://grafana.com/docs/loki/latest/rules/
+# This is just a simple example, for more details: https://grafana.com/docs/loki/latest/configuration/#ruler_config
+  ruler:
+    storage:
+      type: local
+      local:
+        directory: /rules
+    rule_path: /tmp/scratch
+    alertmanager_url: http://openreplay-alertmanager.monitoring.svc.cluster.local:9093
+    ring:
+      kvstore:
+        store: inmemory
+    enable_api: true
+
+persistence:
+  enabled: true
+  accessModes:
+    - ReadWriteOnce
+  size: 100Gi
+
+serviceMonitor:
+  enabled: true
+  interval: ""
+  additionalLabels:
+    release: monitoring
+  annotations: {}
+  # scrapeTimeout: 10s
+  # path: /metrics
+  prometheusRule:
+    enabled: true
+    additionalLabels:
+      release: monitoring
+    rules:
+      - alert: LokiProcessTooManyRestarts
+        expr: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Loki process too many restarts (instance {{ $labels.instance }})
+          description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: LokiRequestErrors
+        expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10
+        for: 15m
+        labels:
+          severity: critical
+        annotations:
+          summary: Loki request errors (instance {{ $labels.instance }})
+          description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: LokiRequestPanic
+        expr: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: Loki request panic (instance {{ $labels.instance }})
+          description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      - alert: LokiRequestLatency
+        expr: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: Loki request latency (instance {{ $labels.instance }})
+          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+# Specify Loki Alerting rules based on this documentation: https://grafana.com/docs/loki/latest/rules/
+# When specified, you also need to add a ruler config section above. An example is shown in the alerting docs.
+alerting_groups:
+  - name: dbZombie
+    rules:
+      - alert: dbZombie
+        expr: absent_over_time({namespace="app", app="db"} |~ "Queue Statistics"[5m]) == 1
+        for: 10m
+        labels:
+          severity: warning
diff --git a/ee/scripts/helmcharts/manifests/monitoring.yaml b/ee/scripts/helmcharts/manifests/monitoring.yaml
new file mode 100644
index 000000000..e69de29bb
diff --git a/ee/scripts/helmcharts/manifests/values-override.yaml b/ee/scripts/helmcharts/manifests/values-override.yaml
new file mode 100644
index 000000000..ff7aaaa6c
--- /dev/null
+++ b/ee/scripts/helmcharts/manifests/values-override.yaml
@@ -0,0 +1,56 @@
+clickhouse:
+  # For enterpriseEdition
+  enabled: false
+
+kafka: &kafka
+  # For enterpriseEdition
+  enabled: true
+  kafkaHost: "kafka.db.svc.cluster.local"
+  kafkaPort: "9092"
+  kafkaUseSsl: "false"
+  config: |-
+    replica.fetch.max.bytes=3000000
+    message.max.bytes=3000000
+    zookeeper.connect=databases-zookeeper
+redis: &redis
+  # For enterpriseEdition
+  enabled: false
+
+postgresql: &postgres
+  # For generating passwords
+  # `openssl rand -hex 20`
+  enabled: false
+  postgresqlPassword: "changeMePassword"
+  postgresqlHost: "postgresql.db.svc.cluster.local"
+  postgresqlPort: "5432"
+  postgresqlUser: "postgres"
+  postgresqlDatabase: "postgres"
+
+ingress-nginx: &ingress-nginx
+  service:
+    externalTrafficPolicy: "Local"
+  extraArgs:
+    default-ssl-certificate: "app/openreplay-ssl"
+  config:
+    use-gzip: true
+    load-balance: ewma
+    enable-real-ip: true
+    # Enable LB forwarded protocol
+    # Ref: https://kubernetes.github.io/ingress-nginx/user-guide/nginx-configuration/configmap/#use-forwarded-headers
+    # https://github.com/nginxinc/kubernetes-ingress/issues/1284#issuecomment-872869354
+    # use-forwarded-headers: true
+global:
+  s3:
+    region: "us-east-1"
+    endpoint: "http://minio.db.svc.cluster.local:9000"
+    assetsBucket: "sessions-assets"
+    recordingsBucket: "mobs"
+    sourcemapsBucket: "sourcemaps"
+    # If you're using a one-node installation where you're using
+    # the local s3 (minio), make sure these variables are the same
+    # as minio.global.minio.accesskey and secretKey
+    accessKey: "changeMeMinioAccessKey"
+    secretKey: "changeMeMinioPassword"
+
+  enterpriseEditionLicense: ""
+  domainName: ""
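
Below is a minimal usage sketch for the values files added by this patch, assuming the Loki chart goes into the monitoring namespace and the OpenReplay chart into the app namespace. The helm repo, chart references, release names and namespaces are illustrative assumptions and are not part of the patch itself.

  # Hypothetical commands; repo, chart paths, release names and namespaces are assumptions.
  helm repo add grafana https://grafana.github.io/helm-charts
  helm upgrade --install loki grafana/loki \
    --namespace monitoring --create-namespace \
    -f ee/scripts/helmcharts/manifests/logging.yaml

  helm upgrade --install openreplay ./openreplay \
    --namespace app --create-namespace \
    -f ee/scripts/helmcharts/manifests/values-override.yaml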