chore(monitoring): Adding enterprise config

Signed-off-by: rjshrjndrn <rjshrjndrn@gmail.com>
2022-06-29 15:07:55 +02:00 · 2022-06-29 15:07:55 +02:00 · d6e03aad52
commit d6e03aad52
parent 99ee5d5cb1
4 changed files with 216 additions and 0 deletions
--- a/ee/scripts/helmcharts/manifests/grafana-ingress.yaml
+++ b/ee/scripts/helmcharts/manifests/grafana-ingress.yaml
@ -0,0 +1,27 @@
+---
+# Ingress that publishes the monitoring-grafana service (port 80) under the
+# /grafana/ path of the configured global domain, via the nginx ingress class.
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: grafana
+  namespace: monitoring
+  annotations:
+    # Only the part captured after /grafana/ is forwarded to the backend.
+    nginx.ingress.kubernetes.io/rewrite-target: /$1
+spec:
+  ingressClassName: "{{ tpl .Values.ingress.className . }}"
+  rules:
+    # Quoted so an empty or boolean-looking domain still renders as a string.
+    - host: "{{ .Values.global.domainName }}"
+      http:
+        paths:
+          # The path is a regular expression with a capture group, so it must
+          # use ImplementationSpecific: ingress-nginx's strict path validation
+          # rejects regex characters for the Prefix and Exact path types.
+          - path: /grafana/(.*)
+            pathType: ImplementationSpecific
+            backend:
+              service:
+                name: monitoring-grafana
+                port:
+                  number: 80
+  tls:
+    - hosts:
+        - "{{ .Values.global.domainName }}"
+      # secretName is emitted only when a TLS secret is configured.
+      {{- if .Values.ingress.tls.secretName }}
+      secretName: "{{ .Values.ingress.tls.secretName }}"
+      {{- end }}
--- a/ee/scripts/helmcharts/manifests/logging.yaml
+++ b/ee/scripts/helmcharts/manifests/logging.yaml
@ -0,0 +1,133 @@
+# Loki configuration values.
+config:
+  # existingSecret:
+  auth_enabled: false
+  ingester:
+    chunk_idle_period: 3m
+    chunk_block_size: 262144
+    chunk_retain_period: 1m
+    max_transfer_retries: 0
+    wal:
+      dir: /data/loki/wal
+    lifecycler:
+      # In-memory ring with a replication factor of 1.
+      ring:
+        kvstore:
+          store: inmemory
+        replication_factor: 1
+
+      ## Different ring configs can be used. E.g. Consul
+      # ring:
+      #   store: consul
+      #   replication_factor: 1
+      #   consul:
+      #     host: "consul:8500"
+      #     prefix: ""
+      #     http_client_timeout: "20s"
+      #     consistent_reads: true
+  limits_config:
+    enforce_metric_name: false
+    reject_old_samples: true
+    reject_old_samples_max_age: 168h
+  schema_config:
+    configs:
+      # The start date is quoted so YAML loaders keep it as a string instead
+      # of resolving it to a timestamp/date object.
+      - from: "2020-10-24"
+        store: boltdb-shipper
+        object_store: filesystem
+        schema: v11
+        index:
+          prefix: index_
+          period: 24h
+  server:
+    http_listen_port: 3100
+  # Index (boltdb-shipper) and chunks both live on the local filesystem
+  # under /data/loki.
+  storage_config:
+    boltdb_shipper:
+      active_index_directory: /data/loki/boltdb-shipper-active
+      cache_location: /data/loki/boltdb-shipper-cache
+      cache_ttl: 24h  # Can be increased for faster performance over longer query periods, uses more disk space
+      shared_store: filesystem
+    filesystem:
+      directory: /data/loki/chunks
+  chunk_store_config:
+    max_look_back_period: 0s
+  table_manager:
+    retention_deletes_enabled: false
+    retention_period: 0s
+  # NOTE(review): compactor retention is enabled while table_manager
+  # retention is disabled and no retention period is set in limits_config
+  # here - confirm the intended retention period.
+  compactor:
+    working_directory: /data/loki/boltdb-shipper-compactor
+    shared_store: filesystem
+    retention_enabled: true
+  # Needed for alerting: https://grafana.com/docs/loki/latest/rules/
+  # This is a simple example; for more details see
+  # https://grafana.com/docs/loki/latest/configuration/#ruler_config
+  ruler:
+    storage:
+      type: local
+      local:
+        directory: /rules
+    rule_path: /tmp/scratch
+    alertmanager_url: http://openreplay-alertmanager.monitoring.svc.cluster.local:9093
+    ring:
+      kvstore:
+        store: inmemory
+    enable_api: true
+
+# Persistent storage request: one 100Gi volume with ReadWriteOnce access.
+persistence:
+  enabled: true
+  size: 100Gi
+  accessModes:
+    - ReadWriteOnce
+
+# Metrics scraping for Loki plus the Prometheus alert rules shipped with it.
+# Both objects carry the label `release: monitoring`.
+serviceMonitor:
+  enabled: true
+  interval: ""
+  additionalLabels:
+    release: monitoring
+  annotations: {}
+  # scrapeTimeout: 10s
+  # path: /metrics
+  prometheusRule:
+    enabled: true
+    additionalLabels:
+      release: monitoring
+    # NOTE(review): every rule except LokiProcessTooManyRestarts aggregates
+    # the `instance` label away, so `{{ $labels.instance }}` in those
+    # summaries renders empty - confirm whether the summaries should change.
+    rules:
+      # More than 2 process restarts within 15 minutes.
+      - alert: LokiProcessTooManyRestarts
+        expr: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: Loki process too many restarts (instance {{ $labels.instance }})
+          description: "A loki process had too many restarts (target {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      # More than 10% of requests answered with a 5xx status, per route.
+      - alert: LokiRequestErrors
+        expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10
+        for: 15m
+        labels:
+          severity: critical
+        annotations:
+          summary: Loki request errors (instance {{ $labels.instance }})
+          description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      # Any increase of the panic counter over the last 10 minutes.
+      - alert: LokiRequestPanic
+        expr: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: Loki request panic (instance {{ $labels.instance }})
+          description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+      # 99th percentile latency above 1s, excluding tail routes. The
+      # aggregation keeps namespace, job and route (like LokiRequestErrors)
+      # so the labels used in the description are actually present.
+      - alert: LokiRequestLatency
+        expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le, namespace, job, route)) > 1
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: Loki request latency (instance {{ $labels.instance }})
+          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+# Loki alerting rules; format documented at https://grafana.com/docs/loki/latest/rules/
+# Rules listed here also require a ruler section in the Loki config (an
+# example is shown in the alerting docs).
+alerting_groups:
+  - name: dbZombie
+    rules:
+      # Warns when streams labelled namespace="app", app="db" have produced
+      # no line matching "Queue Statistics" in 5m windows, sustained for 10m.
+      - alert: dbZombie
+        expr: absent_over_time({namespace="app", app="db"} |~ "Queue Statistics"[5m]) == 1
+        for: 10m
+        labels:
+          severity: warning
--- a/ee/scripts/helmcharts/manifests/monitoring.yaml
+++ b/ee/scripts/helmcharts/manifests/monitoring.yaml
--- a/ee/scripts/helmcharts/manifests/values-override.yaml
+++ b/ee/scripts/helmcharts/manifests/values-override.yaml
@ -0,0 +1,56 @@
+# Enterprise edition component; switched off in this override.
+clickhouse:
+  enabled: false
+
+# Enterprise edition component; switched on in this override.
+kafka: &kafka
+  enabled: true
+  # Connection settings are kept as strings.
+  kafkaHost: "kafka.db.svc.cluster.local"
+  kafkaPort: "9092"
+  kafkaUseSsl: "false"
+  # Kafka properties supplied as a single literal multi-line string.
+  config: |-
+    replica.fetch.max.bytes=3000000
+    message.max.bytes=3000000
+    zookeeper.connect=databases-zookeeper
+
+# Enterprise edition component; switched off in this override.
+redis: &redis
+  enabled: false
+
+# PostgreSQL connection settings; `enabled` is false in this override.
+postgresql: &postgres
+  enabled: false
+  postgresqlHost: "postgresql.db.svc.cluster.local"
+  postgresqlPort: "5432"
+  postgresqlDatabase: "postgres"
+  postgresqlUser: "postgres"
+  # Placeholder value. A password can be generated with:
+  #   openssl rand -hex 20
+  postgresqlPassword: "changeMePassword"
+
+# Overrides for the nginx ingress controller.
+# NOTE(review): the upstream ingress-nginx chart reads service/extraArgs/config
+# under a `controller:` key - confirm this nesting matches the chart in use.
+ingress-nginx: &ingress-nginx
+  service:
+    externalTrafficPolicy: "Local"
+  extraArgs:
+    # Default certificate, given as "<namespace>/<secret name>".
+    default-ssl-certificate: "app/openreplay-ssl"
+  config:
+    use-gzip: true
+    load-balance: ewma
+    enable-real-ip: true
+    # To honour the load balancer's forwarded protocol, enable the option below.
+    # Ref: https://kubernetes.github.io/ingress-nginx/user-guide/nginx-configuration/configmap/#use-forwarded-headers
+    # https://github.com/nginxinc/kubernetes-ingress/issues/1284#issuecomment-872869354
+    # use-forwarded-headers: true
+
+# Values shared across charts: object storage, license and public domain.
+global:
+  s3:
+    region: "us-east-1"
+    endpoint: "http://minio.db.svc.cluster.local:9000"
+    assetsBucket: "sessions-assets"
+    recordingsBucket: "mobs"
+    sourcemapsBucket: "sourcemaps"
+    # On a single-node installation that uses the local S3 (MinIO), these two
+    # values must be the same as minio.global.minio.accesskey and secretKey.
+    accessKey: "changeMeMinioAccessKey"
+    secretKey: "changeMeMinioPassword"
+
+  # Both are left empty here and are expected to be filled in per install.
+  enterpriseEditionLicense: ""
+  domainName: ""