updated for infra

2026-03-10 20:40:20 +03:00
parent 9c2b3bf8bd
commit 840a7f85c8
13 changed files with 661 additions and 12 deletions
--- a/infra/monitoring/docker-compose.yml
+++ b/infra/monitoring/docker-compose.yml
@@ -0,0 +1,145 @@
+secrets:  # Mounted into vault-agent-monitoring under /etc/vault/ (presumably Vault AppRole credentials; confirm).
+  monitoring_vault_role_id:
+    external: true  # Not created by this file; must exist before deployment.
+  monitoring_vault_secret_id:
+    external: true  # Not created by this file; must exist before deployment.
+
+networks:
+  cicd:  # The single network every service in this file attaches to.
+    external: true  # Managed outside this file.
+
+volumes:  # Named volumes shared by the services below.
+  loki_data:  # Mounted at /loki in the loki service.
+  grafana_data:  # Mounted at /var/lib/grafana in the grafana service.
+  prometheus_data:  # Mounted at /prometheus in the prometheus service.
+  alertmanager_data:  # Mounted at /alertmanager in the alertmanager service.
+  alertmanager_config:  # tmpfs: mounted rw by vault-agent-monitoring, ro by alertmanager.
+    driver: local
+    driver_opts:
+      type: tmpfs
+      device: tmpfs
+      o: size=8m,uid=0,gid=0,mode=0755  # 8 MiB, owned by uid/gid 0, readable by all users.
+  vault_secrets:  # tmpfs: mounted rw by vault-agent-monitoring, ro by grafana.
+    driver: local
+    driver_opts:
+      type: tmpfs
+      device: tmpfs
+      o: size=32m,uid=472,gid=472,mode=0750  # 32 MiB; uid/gid 472 is presumably the grafana container user (confirm).
+
+services:
+  vault-agent-monitoring:  # Vault Agent; presumably renders grafana.env and alertmanager.yml into the shared tmpfs volumes (confirm in agent.hcl).
+    image: hashicorp/vault:latest  # NOTE(review): floating tag; consider pinning a version.
+    networks: [cicd]
+    cap_add: ["IPC_LOCK"]
+    environment:
+      VAULT_ADDR: "http://vault:8200"  # Plain HTTP (no TLS) to the host named "vault".
+    command: >  # Starts the agent through a login shell with the mounted agent.hcl.
+      sh -lc 'vault agent -config=/etc/vault/agent.hcl'
+    secrets:
+      - source: monitoring_vault_role_id
+        target: /etc/vault/role_id  # Absolute path inside the container.
+      - source: monitoring_vault_secret_id
+        target: /etc/vault/secret_id  # Absolute path inside the container.
+    volumes:
+      - ./vault-agent/agent.hcl:/etc/vault/agent.hcl:ro
+      - ./vault-agent/templates:/etc/vault/templates:ro
+      - vault_secrets:/vault/secrets:rw  # grafana mounts this volume read-only.
+      - alertmanager_config:/vault/alertmanager:rw  # alertmanager mounts this volume read-only.
+    healthcheck:
+      test: ["CMD-SHELL", "test -s /vault/secrets/grafana.env"]  # Healthy once grafana.env exists and is non-empty; alertmanager.yml is not checked.
+      interval: 30s
+      timeout: 5s
+      retries: 3
+    deploy:
+      restart_policy:
+        condition: any  # Restart regardless of exit status.
+
+  prometheus:  # Prometheus server, routed through Traefik at prometheus.sendico.io.
+    image: prom/prometheus:latest  # NOTE(review): floating tag; consider pinning a version.
+    networks: [cicd]
+    command:  # Replaces the image's default arguments, so every required flag is listed here.
+      - --config.file=/etc/prometheus/prometheus.yml
+      - --storage.tsdb.path=/prometheus
+      - --storage.tsdb.retention.time=30d  # Keep 30 days of samples.
+      - --web.enable-lifecycle  # Enables HTTP reload/shutdown endpoints; NOTE(review): this service is routed via Traefik below, confirm access is restricted.
+    volumes:
+      - ./prometheus/config.yml:/etc/prometheus/prometheus.yml:ro  # Local config.yml appears as prometheus.yml in the container.
+      - prometheus_data:/prometheus
+    healthcheck:
+      test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/ready"]  # Probes the readiness endpoint without downloading a body.
+      interval: 30s
+      timeout: 5s
+      retries: 3
+    labels:  # NOTE(review): if Traefik runs with the Swarm provider it reads deploy.labels, not container labels; confirm.
+      - "traefik.enable=true"
+      - "traefik.http.routers.prometheus.rule=Host(`prometheus.sendico.io`)"
+      - "traefik.http.routers.prometheus.entrypoints=websecure"
+      - "traefik.http.routers.prometheus.tls.certresolver=letsencrypt"
+      - "traefik.http.services.prometheus.loadbalancer.server.port=9090"  # Same port the healthcheck probes.
+    deploy:
+      restart_policy:
+        condition: any  # Restart regardless of exit status.
+
+  alertmanager:  # Alertmanager; starts only after alertmanager.yml appears in the alertmanager_config volume.
+    image: prom/alertmanager:latest  # NOTE(review): floating tag; consider pinning a version.
+    networks: [cicd]
+    entrypoint: >  # Must be entrypoint, not command: the image entrypoint is /bin/alertmanager, which would receive "sh -c ..." as arguments.
+      sh -c 'while [ ! -s /vault/alertmanager/alertmanager.yml ]; do echo "⏳ waiting for alertmanager.yml"; sleep 2; done;
+             exec /bin/alertmanager --config.file=/vault/alertmanager/alertmanager.yml --storage.path=/alertmanager'
+    volumes:
+      - alertmanager_data:/alertmanager
+      - alertmanager_config:/vault/alertmanager:ro  # Written by vault-agent-monitoring, read-only here.
+    healthcheck:
+      test: ["CMD", "wget", "--spider", "-q", "http://localhost:9093/-/ready"]  # Probes the readiness endpoint without downloading a body.
+      interval: 30s
+      timeout: 5s
+      retries: 3
+    labels:  # NOTE(review): if Traefik runs with the Swarm provider it reads deploy.labels, not container labels; confirm.
+      - "traefik.enable=true"
+      - "traefik.http.routers.alertmanager.rule=Host(`alertmanager.sendico.io`)"
+      - "traefik.http.routers.alertmanager.entrypoints=websecure"
+      - "traefik.http.routers.alertmanager.tls.certresolver=letsencrypt"
+      - "traefik.http.services.alertmanager.loadbalancer.server.port=9093"  # Same port the healthcheck probes.
+    deploy:
+      restart_policy:
+        condition: any  # Restart regardless of exit status.
+
+  loki:  # Loki log store; not routed through Traefik, reachable only on the cicd network.
+    image: grafana/loki:latest  # NOTE(review): floating tag; consider pinning a version.
+    networks: [cicd]
+    command: ["-config.file=/etc/loki/config.yml"]  # Overrides the image's default command to use the mounted config.
+    volumes:
+      - ./loki/config.yml:/etc/loki/config.yml:ro
+      - loki_data:/loki  # Matches path_prefix and the storage directories in loki/config.yml.
+    healthcheck:
+      test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"]  # NOTE(review): requires wget inside the image; confirm for the tag in use.
+      interval: 30s
+      timeout: 5s
+      retries: 5
+    deploy:
+      restart_policy:
+        condition: any  # Restart regardless of exit status.
+
+  grafana:  # Grafana; waits for grafana.env from the vault_secrets volume, exports it, then starts /run.sh.
+    image: grafana/grafana:latest  # NOTE(review): floating tag; consider pinning a version.
+    networks: [cicd]
+    entrypoint: >  # Must be entrypoint, not command: the image entrypoint /run.sh would forward "sh -c ..." to grafana as arguments.
+      sh -c 'while [ ! -s /vault/secrets/grafana.env ]; do echo "⏳ waiting for grafana.env"; sleep 2; done;
+             set -a; . /vault/secrets/grafana.env; set +a; exec /run.sh'
+    volumes:
+      - grafana_data:/var/lib/grafana
+      - vault_secrets:/vault/secrets:ro  # Written by vault-agent-monitoring, read-only here.
+    healthcheck:
+      test: ["CMD", "wget", "--spider", "-q", "http://localhost:3000/api/health"]  # Probes the health endpoint without downloading a body.
+      interval: 30s
+      timeout: 5s
+      retries: 5
+    labels:  # NOTE(review): if Traefik runs with the Swarm provider it reads deploy.labels, not container labels; confirm.
+      - "traefik.enable=true"
+      - "traefik.http.routers.grafana.rule=Host(`grafana.sendico.io`)"
+      - "traefik.http.routers.grafana.entrypoints=websecure"
+      - "traefik.http.routers.grafana.tls.certresolver=letsencrypt"
+      - "traefik.http.services.grafana.loadbalancer.server.port=3000"  # Same port the healthcheck probes.
+    deploy:
+      restart_policy:
+        condition: any  # Restart regardless of exit status.
--- a/infra/monitoring/loki/config.yml
+++ b/infra/monitoring/loki/config.yml
@@ -0,0 +1,37 @@
+# loki/config.yml — single-binary, filesystem-backed TSDB storage, 7-day retention
+
+server:
+  http_listen_port: 3100  # Port used by the compose healthcheck and the Prometheus "loki" scrape job.
+
+common:  # Defaults shared by all Loki components.
+  instance_addr: 127.0.0.1  # Lives under `common`; `server` has no such field and Loki rejects unknown config keys.
+  path_prefix: /loki
+  storage:
+    filesystem:
+      chunks_directory: /loki/chunks
+      rules_directory: /loki/rules
+  replication_factor: 1  # Single copy of the data.
+  ring:
+    kvstore:
+      store: inmemory  # Ring state is kept in process memory, not in an external KV store.
+
+schema_config:
+  configs:
+    - from: "2025-01-01"  # Start date of this schema period; quoted so it stays a string.
+      store: tsdb  # Index store type.
+      object_store: filesystem  # Chunks are kept on the local filesystem.
+      schema: v13
+      index:
+        prefix: index_
+        period: 24h  # One index table per day.
+
+limits_config:
+  retention_period: 168h  # 7 days, matching the header comment of this file.
+  max_query_lookback: 168h  # Queries cannot reach back further than the retention window.
+  allow_structured_metadata: true
+
+compactor:
+  working_directory: /loki/compactor  # Under the /loki data volume.
+  compaction_interval: 5m
+  retention_enabled: true  # Turns on enforcement of limits_config.retention_period.
+  delete_request_store: filesystem  # Store used for delete requests; set alongside retention_enabled.
--- a/infra/monitoring/prometheus/config.yml
+++ b/infra/monitoring/prometheus/config.yml
@@ -0,0 +1,22 @@
+global:  # Defaults applied to every scrape job unless overridden.
+  scrape_interval: 15s  # How often targets are scraped.
+  evaluation_interval: 15s  # How often rules are evaluated; no rule_files are declared in this file.
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ['alertmanager:9093']  # The alertmanager service from the compose file, on its service port.
+
+scrape_configs:
+  - job_name: prometheus  # Prometheus scrapes its own metrics.
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: loki  # Loki's HTTP port (http_listen_port in loki/config.yml).
+    static_configs:
+      - targets: ['loki:3100']
+
+  # Uncomment if Grafana metrics are enabled:
+  # - job_name: grafana
+  #   static_configs:
+  #     - targets: ['grafana:3000']