---
name: observability
version: 0.1.0
description: Self-hosted observability stack. Prometheus + Grafana + Loki + Alertmanager + cAdvisor + node_exporter + blackbox_exporter. Service-instrumentation patterns (OpenTelemetry, Python/Node), dashboards as code, alerting rules, Telegram delivery directly from Alertmanager (bot_token_file).
command: /observability
---

# Observability

Ты — инженер по мониторингу и наблюдаемости. Стек — Prometheus + Loki + Grafana, всё self-hosted на docker host пользователя. Никаких Datadog/NewRelic.

## Жёсткие инварианты

1. **Три столпа**: metrics (Prometheus), logs (Loki), traces (опционально Tempo). Без metrics нет SLO, без logs нет debug, без traces нет distributed performance.
2. **Метрики — pull-модель**: Prometheus scrape'ит exporters. Push (Pushgateway) — только для batch-job'ов.
3. **Cardinality control**: НИКОГДА не метить метрику high-cardinality лейблами (user_id, request_id). Только bounded set (status_code, method, endpoint_pattern).
4. **Alert hygiene**: алерт = «человек должен немедленно что-то сделать». Если не должен — это лог/дашборд, не алерт.
5. **Retention**: Prometheus локально 30 дней, долговременное — Thanos/Mimir в S3 (опционально). Loki — 90 дней с compression.
6. **Dashboards as code**: каждый дашборд в git как JSON + provisioning.
7. **Все сервисы экспортируют /metrics**: либо нативно (FastAPI + `prometheus_client`), либо через sidecar exporter.

## Базовый стек

```yaml
# /opt/observability/compose.yaml
services:
  prometheus:
    image: prom/prometheus:v2.55.1
    container_name: prometheus
    restart: unless-stopped
    user: "65534:65534"
    # The non-root user must be in the host's docker group to read the socket
    # used by docker_sd_configs. Set DOCKER_GID in .env:
    #   getent group docker | cut -d: -f3
    group_add:
      - "${DOCKER_GID:?set DOCKER_GID in .env}"
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--storage.tsdb.retention.size=50GB'
      - '--web.enable-lifecycle'
      - '--web.enable-remote-write-receiver'
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/rules:/etc/prometheus/rules:ro
      - prometheus_data:/prometheus
      # Required by the docker-services job (docker_sd_configs)
      - /var/run/docker.sock:/var/run/docker.sock:ro
    networks:
      - obs
      - traefik_proxy
    healthcheck:
      test: ["CMD", "wget", "-qO-", "http://localhost:9090/-/healthy"]
      interval: 30s
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.prometheus.rule=Host(`prom.abelentsev.pro`)"
      - "traefik.http.routers.prometheus.entrypoints=websecure"
      - "traefik.http.routers.prometheus.tls.certresolver=cloudflare"
      - "traefik.http.routers.prometheus.middlewares=auth-basic@file,ipallowlist-lan@file"
      - "traefik.http.services.prometheus.loadbalancer.server.port=9090"

  alertmanager:
    image: prom/alertmanager:v0.28.0
    container_name: alertmanager
    restart: unless-stopped
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager_data:/alertmanager
    secrets:
      - tg_bot_token
      - tg_chat_id
    networks:
      - obs

  loki:
    image: grafana/loki:3.3.2
    container_name: loki
    restart: unless-stopped
    user: "10001:10001"
    command: ["-config.file=/etc/loki/loki.yml"]
    volumes:
      - ./loki/loki.yml:/etc/loki/loki.yml:ro
      - loki_data:/loki
    networks:
      - obs

  promtail:
    image: grafana/promtail:3.3.2
    container_name: promtail
    restart: unless-stopped
    command: ["-config.file=/etc/promtail/promtail.yml"]
    volumes:
      - ./promtail/promtail.yml:/etc/promtail/promtail.yml:ro
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - /opt/traefik/logs:/logs/traefik:ro
      # Required by the docker job (docker_sd_configs)
      - /var/run/docker.sock:/var/run/docker.sock:ro
      # Writable, persistent home for the positions file (see promtail.yml)
      - promtail_positions:/var/lib/promtail
    networks:
      - obs

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.49.1
    container_name: cadvisor
    restart: unless-stopped
    privileged: true
    devices:
      - /dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      - /dev/disk:/dev/disk:ro
    networks:
      - obs

  node-exporter:
    image: prom/node-exporter:v1.8.2
    container_name: node-exporter
    restart: unless-stopped
    network_mode: host
    pid: host
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/host/root'
      # "$$" is Compose escaping for a literal "$" in the regex
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/host/root:ro,rslave

  blackbox-exporter:
    image: prom/blackbox-exporter:v0.25.0
    container_name: blackbox-exporter
    restart: unless-stopped
    volumes:
      - ./blackbox/blackbox.yml:/etc/blackbox_exporter/config.yml:ro
    networks:
      - obs

  grafana:
    image: grafana/grafana:11.4.0
    container_name: grafana
    restart: unless-stopped
    user: "472:472"
    environment:
      - GF_SERVER_ROOT_URL=https://grafana.abelentsev.pro
      - GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_AUTH_ANONYMOUS_ENABLED=false
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
    secrets:
      - grafana_admin
    networks:
      - obs
      - traefik_proxy
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.grafana.rule=Host(`grafana.abelentsev.pro`)"
      - "traefik.http.routers.grafana.entrypoints=websecure"
      - "traefik.http.routers.grafana.tls.certresolver=cloudflare"
      - "traefik.http.routers.grafana.middlewares=security-headers@file"
      - "traefik.http.services.grafana.loadbalancer.server.port=3000"

networks:
  obs:
  traefik_proxy:
    external: true

volumes:
  prometheus_data:
  alertmanager_data:
  loki_data:
  grafana_data:
  promtail_positions:

secrets:
  grafana_admin:
    file: ./secrets/grafana_admin.txt
  tg_bot_token:
    file: ./secrets/tg_bot_token.txt
  tg_chat_id:
    file: ./secrets/tg_chat_id.txt
```

## `prometheus.yml`

```yaml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: home
    environment: production

alerting:
  alertmanagers:
    - static_configs:
        - targets: [alertmanager:9093]

rule_files:
  - /etc/prometheus/rules/*.yml

scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets: [localhost:9090]

  - job_name: node-exporter
    static_configs:
      - targets:
          - 192.168.9.147:9100  # docker host
          - 192.168.7.179:9100  # sonar host
          - 192.168.7.195:9100  # win host (windows_exporter)

  - job_name: cadvisor
    static_configs:
      - targets: [cadvisor:8080]

  - job_name: traefik
    metrics_path: /metrics
    static_configs:
      - targets: [traefik:8080]

  - job_name: blackbox-http
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://example.abelentsev.pro
          - https://grafana.abelentsev.pro
          - https://git.h3fq32.golive.ru
    relabel_configs:
      # The probed URL becomes the ?target= parameter and the instance label;
      # the actual scrape goes to the blackbox exporter.
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115

  - job_name: docker-services
    docker_sd_configs:
      # The socket must be mounted into the Prometheus container (see compose.yaml)
      - host: unix:///var/run/docker.sock
        filters:
          - name: label
            values: ["prometheus.scrape=true"]
    relabel_configs:
      # The prometheus.port label carries the full scrape address (host:port)
      - source_labels: [__meta_docker_container_label_prometheus_port]
        target_label: __address__
        regex: (.+)
        replacement: ${1}
      # Docker reports container names with a leading "/"; strip it
      - source_labels: [__meta_docker_container_name]
        regex: /?(.*)
        target_label: container
```

Сервис, который должен скрейпиться:

```yaml
labels:
  - "prometheus.scrape=true"
  - "prometheus.port=myapp:8000"
```

## Alerting rules (`rules/web.yml`)

```yaml
groups:
  - name: web-services
    interval: 30s
    rules:
      - alert: ServiceDown
        expr: probe_success{job="blackbox-http"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.instance }} is down"
          description: "Blackbox probe failed for 2 minutes"

      - alert: HighErrorRate
        expr: |
          sum by (service) (rate(traefik_service_requests_total{code=~"5.."}[5m]))
          /
          sum by (service) (rate(traefik_service_requests_total[5m]))
          > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High 5xx rate on {{ $labels.service }}"
          description: "Error rate is {{ $value | humanizePercentage }} (>5%)"

      - alert: HighLatency
        expr: |
          histogram_quantile(0.95,
            sum by (service, le) (rate(traefik_service_request_duration_seconds_bucket[5m]))
          ) > 1.0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "P95 latency >1s on {{ $labels.service }}"

      - alert: CertExpiringSoon
        expr: probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time() < 7 * 24 * 3600
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "TLS cert {{ $labels.instance }} expires in <7 days"

  - name: host
    rules:
      # Every rule carries a summary: the Telegram templates print it for each alert
      - alert: HostHighCpu
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "CPU usage above 90% on {{ $labels.instance }}"

      - alert: HostLowDisk
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes) < 0.10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Less than 10% disk space left on {{ $labels.instance }} ({{ $labels.mountpoint }})"

      - alert: HostHighMemory
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Memory usage above 90% on {{ $labels.instance }}"

      # cAdvisor counts OOM kills in container_oom_events_total;
      # container_memory_failures_total only tracks page faults.
      - alert: ContainerOomKilled
        expr: increase(container_oom_events_total[5m]) > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} was OOM-killed"

  - name: backups
    rules:
      - alert: BackupMissing
        expr: time() - max(restic_last_successful_backup_timestamp) > 36 * 3600
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "No successful backup in last 36 hours"
```

## Alertmanager → Telegram

```yaml
# alertmanager.yml
global:
  resolve_timeout: 5m

route:
  receiver: telegram-critical
  group_by: [alertname, severity]
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    - matchers:
        - 'severity="critical"'
      receiver: telegram-critical
    - matchers:
        - 'severity="warning"'
      receiver: telegram-warning
      repeat_interval: 12h

receivers:
  - name: telegram-critical
    telegram_configs:
      - bot_token_file: /run/secrets/tg_bot_token
        chat_id: -100123456789  # same value as in the tg_chat_id secret file
        parse_mode: HTML
        message: |
          🔥 CRITICAL: {{ .CommonLabels.alertname }}
          {{ range .Alerts }}
          • {{ .Labels.instance }}: {{ .Annotations.summary }}
          {{ end }}

  - name: telegram-warning
    telegram_configs:
      - bot_token_file: /run/secrets/tg_bot_token
        chat_id: -100123456789
        parse_mode: HTML
        message: |
          ⚠️ {{ .CommonLabels.alertname }}
          {{ range .Alerts }}
          • {{ .Annotations.summary }}
          {{ end }}

inhibit_rules:
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal: [alertname, instance]
```

## Loki `loki.yml`

```yaml
auth_enabled: false

server:
  http_listen_port: 3100

common:
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

schema_config:
  configs:
    - from: 2024-01-01
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

limits_config:
  retention_period: 90d
  max_query_series: 5000

compactor:
  working_directory: /loki/compactor
  delete_request_store: filesystem
  retention_enabled: true
```

## Promtail — Traefik access log

```yaml
# promtail.yml
server:
  http_listen_port: 9080

# The default location is /var/log/positions.yaml, which is mounted read-only
# here; keep read offsets on the dedicated writable volume instead.
positions:
  filename: /var/lib/promtail/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: traefik-access
    static_configs:
      - targets: [localhost]
        labels:
          job: traefik
          __path__: /logs/traefik/access.log
    pipeline_stages:
      - json:
          expressions:
            method: RequestMethod
            host: RequestHost
            status: DownstreamStatus
            duration: Duration
      # Only bounded values become labels. RequestHost is client-controlled
      # (unbounded), so filter on it at query time with `| json` instead.
      - labels:
          method:
          status:

  - job_name: docker
    docker_sd_configs:
      # The socket must be mounted into the Promtail container (see compose.yaml)
      - host: unix:///var/run/docker.sock
    relabel_configs:
      # Docker reports container names with a leading "/"; strip it
      - source_labels: [__meta_docker_container_name]
        regex: /?(.*)
        target_label: container
      - source_labels: [__meta_docker_container_log_stream]
        target_label: stream
```

## Инструментация: FastAPI пример

```python
import time

from fastapi import FastAPI, Request
from prometheus_client import Counter, Histogram, make_asgi_app

REQUESTS = Counter(
    "http_requests_total",
    "HTTP requests",
    ["method", "path", "status"],  # path is the route pattern, never the raw URL
)
LATENCY = Histogram(
    "http_request_duration_seconds",
    "HTTP request latency",
    ["method", "path"],
    buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0),
)

app = FastAPI()
app.mount("/metrics", make_asgi_app())


@app.middleware("http")
async def metrics_middleware(request: Request, call_next):
    """Record request count and latency for every HTTP request.

    Metrics are written in a ``finally`` block so that requests whose handler
    raises are still counted (as status 500) before the exception propagates.
    """
    start = time.perf_counter()
    # No response object exists if the handler raises; report that case as 500.
    status_code = 500
    try:
        response = await call_next(request)
        status_code = response.status_code
        return response
    finally:
        elapsed = time.perf_counter() - start
        # The matched route is only present in the scope after routing has run;
        # unmatched requests fall back to a single "unknown" bucket.
        route = request.scope.get("route")
        path = getattr(route, "path", "unknown")  # pattern, not the raw URL
        REQUESTS.labels(request.method, path, str(status_code)).inc()
        LATENCY.labels(request.method, path).observe(elapsed)


@app.get("/health")
def health():
    return {"status": "ok"}
```

## RED/USE метод

**RED** для request-driven сервисов:

- **R**ate (RPS)
- **E**rrors (error rate)
- **D**uration (P50/P95/P99)

**USE** для ресурсов:

- **U**tilization (% занято)
- **S**aturation (очередь)
- **E**rrors

Эти 6 метрик — минимальный must-have дашборд.

## SLO/SLI пример

Для публичного сайта:

- **SLI availability**: `1 - error_rate` за окно 30 дней
- **SLO**: 99.5% (≈ 3.6h downtime/мес — реалистично для self-hosted)
- **Error budget**: 0.5%

```promql
# SLO availability over 30 days
1 - (
  sum(rate(traefik_service_requests_total{code=~"5..", service="myapp"}[30d]))
  /
  sum(rate(traefik_service_requests_total{service="myapp"}[30d]))
)
```

## Антипаттерны

- Метить метрику user_id, request_id, raw URL — взрыв cardinality (Prometheus умрёт на 1M+ серий).
- Алерт на всё подряд — alert fatigue, перестанут читать.
- Алерт «CPU > 80%» сам по себе — это симптом, не проблема. Алертить надо на user-facing impact (latency, errors).
- Дашборд из 50 графиков — никто не читает. Один экран = 6-10 ключевых метрик.
- Sampling traces без головы — теряются редкие медленные запросы.
- Логи без структуры (plain text) — невозможно агрегировать.
- Loki без retention → диск умрёт.
- Grafana с дефолтным admin/admin — публичный доступ = катастрофа.
- Push в Pushgateway долгоживущих метрик — теряется state на restart.

## Чек-лист для нового сервиса

- [ ] Endpoint `/health` (или `/healthz`) — простой 200 OK
- [ ] Endpoint `/metrics` — Prometheus exposition format
- [ ] Labels `prometheus.scrape=true` и `prometheus.port=...` в Docker-сервисе
- [ ] Blackbox probe в `prometheus.yml` для публичных URL
- [ ] Alerts: `ServiceDown`, `HighErrorRate`, `HighLatency`, `CertExpiringSoon`
- [ ] Grafana dashboard provisioned (JSON в git)
- [ ] Логи структурированные JSON, отправляются в Loki
- [ ] SLO документирован в `creator/obsidian-vault/claude/memory/observability/`

## Команды

```bash
# Reload Prometheus without a restart.
# The Traefik router is HTTPS-only (websecure) and sits behind basic auth.
curl -u "$PROM_USER:$PROM_PASS" -X POST https://prom.abelentsev.pro/-/reload

# Validate alerting rules.
# The glob must expand inside the container, not in the host shell.
docker exec prometheus sh -c 'promtool check rules /etc/prometheus/rules/*.yml'

# List active alerts.
# Port 9090 is not published to the host, so query from inside the container.
docker exec prometheus wget -qO- http://localhost:9090/api/v1/alerts | jq

# LogQL query via CLI (logcli).
# Access logs are JSON; `status` is a label set by Promtail from DownstreamStatus.
docker run --rm -e LOKI_ADDR=http://loki:3100 \
  --network observability_obs grafana/logcli:3.3.2 \
  query '{job="traefik", status="500"}' --limit=100

# Loki storage size
docker exec loki du -sh /loki/chunks

# Check scrape targets
docker exec prometheus wget -qO- http://localhost:9090/api/v1/targets \
  | jq '.data.activeTargets[] | {job: .labels.job, health: .health}'
```

## Интеграция с инфрой пользователя

- **Traefik**: `--metrics.prometheus=true` в static config, скрейпится Prometheus.
- **MikroTik**: SNMP exporter (отдельный контейнер `prom/snmp-exporter`) → метрики маршрутизатора, VPN-каналов, hairpin NAT.
- **PostgreSQL**: `postgres_exporter` sidecar к каждой БД.
- **1С**: `windows_exporter` на `192.168.7.195` + кастомный exporter, читающий `Performance Counters` 1С (через `TGServerService` агент).
- **Telegram**: alerts → Alertmanager → Telegram (НЕ через `TGServerService`, а напрямую — `bot_token_file` Alertmanager'а).
- **Дашборды в git**: `creator/obsidian-vault/claude/memory/observability/dashboards/` (или отдельный репо `homework/grafana-dashboards`).
- **n8n**: weekly Telegram-сводка SLO/error budget по сервисам.