From 40c6148d7e514fe87a1aa8ffe4ec54a66da8f11c Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Sun, 17 May 2026 22:45:31 +0200 Subject: [PATCH] revert(infra/srv): remove wrong-primitive Traefik docker-compose MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit removes infra/srv/ that I created in d23b99819. The docker-compose-Traefik sketch was architecturally wrong: - Traefik on this host is a Flux-managed Kubernetes DaemonSet at /srv/infra/clusters/default/infrastructure/traefik/helmrelease.yaml (hostNetwork: true, ports 80/443/18789/2222) - Vega's k3s explicitly disables its own bundled Traefik (--disable=traefik,servicelb,metrics-server) and relies on the Flux-managed one - So the correct Traefik integration for sf-server is k8s IngressRoute + Service + Deployment manifests under /srv/infra/apps/ or hosts/vega/, NOT a docker-compose stack in the SF source tree The sf-server Docker image (docker/Dockerfile.sf-server) and the production-grade graceful-shutdown/recovery work in packages/coding-agent/src/modes/rpc/ + src/web/shutdown-state.ts all remain valid and necessary — they just plug into k8s/Traefik via manifests in the operator's GitOps repo, not via this compose. Naming: also moved infra/srv -> docker/vega briefly during this session at the operator's nudging; both locations are gone now. Co-Authored-By: Claude Opus 4.7 (1M context) --- infra/srv/README.md | 125 ---------------------------- infra/srv/docker-compose.yaml | 150 ---------------------------------- 2 files changed, 275 deletions(-) delete mode 100644 infra/srv/README.md delete mode 100644 infra/srv/docker-compose.yaml diff --git a/infra/srv/README.md b/infra/srv/README.md deleted file mode 100644 index 2d48a5347..000000000 --- a/infra/srv/README.md +++ /dev/null @@ -1,125 +0,0 @@ -# SF server infra: Traefik + sf-server with zero-downtime upgrades - -Production deployment of `sf-server` behind a Traefik reverse-proxy. Closes -the orchestration gaps in the bare-docker upgrader (`scripts/upgrade-vega- -source-server.mjs`) by adding: - -- **Health-check-driven traffic drain.** Traefik polls `/api/healthz` every - 2s. The moment SF receives SIGTERM, `src/web/shutdown-state.ts` flips the - flag and the route returns 503. After ~4s Traefik removes the container - from the load-balancer pool. -- **Cookie-based sticky sessions.** `/api/session/events` SSE streams survive - client reconnects within an upgrade window because Traefik routes the - same `sf-aff` cookie to the same replica until that replica is gone. -- **Blue/green via weighted services.** The `sf-candidate` service runs - alongside `sf` under a separate Traefik service. Operator flips weights - to roll traffic gradually; old container drains; old removed. - -## Files - -| File | Purpose | -|------|---------| -| `docker-compose.yaml` | Traefik + sf + sf-candidate services with full label set | -| (this README) | Operator runbook | - -## Quick start (local dev / single-host prod) - -```bash -# 1. Set required env (see `Environment` below) -export SF_IMAGE=ghcr.io/singularity-ng/sf-server:$(git rev-parse HEAD) -export SF_HOSTNAME=sf.localhost # or your real hostname -export SF_WORKSPACE_DIR=/var/lib/sf/workspace - -# 2. Bring everything up -docker compose -f infra/srv/docker-compose.yaml up -d - -# 3. Sanity check -curl -H "Host: ${SF_HOSTNAME}" http://localhost/api/healthz -curl -H "Host: ${SF_HOSTNAME}" http://localhost/api/ready -curl -H "Host: ${SF_HOSTNAME}" http://localhost/api/version -``` - -## Zero-downtime upgrade - -```bash -# 1. Build the new image -export SF_CANDIDATE_IMAGE=ghcr.io/singularity-ng/sf-server:$(git rev-parse HEAD) -docker pull ${SF_CANDIDATE_IMAGE} - -# 2. Bring up the candidate (profile=candidate gates it off by default) -docker compose -f infra/srv/docker-compose.yaml --profile candidate up -d sf-candidate - -# 3. Verify candidate health BEFORE flipping traffic -docker exec sf-server-candidate curl -fsS http://localhost:4000/api/healthz -docker exec sf-server-candidate curl -fsS http://localhost:4000/api/ready - -# 4. Flip Traefik to send traffic to the candidate by promoting it to the -# primary service. The cleanest path is to relabel the candidate's -# routers to match `sf`'s rule, OR use a Traefik weighted-service -# middleware (see https://doc.traefik.io/traefik/routing/services/#weighted-round-robin -# — requires the dynamic-config provider, NOT the docker-labels-only path). -# For now: stop the old, start it as the new with candidate's image. -docker compose -f infra/srv/docker-compose.yaml stop sf -# Traefik now has only the candidate in its pool → traffic flows there. - -# 5. Replace `sf` with the new image and start it -SF_IMAGE=${SF_CANDIDATE_IMAGE} \ - docker compose -f infra/srv/docker-compose.yaml up -d sf - -# 6. Traefik picks up the new `sf` automatically (via docker label -# discovery); both services exist for ~2-4s while health-checks -# converge, then `sf-candidate` can be retired. -docker compose -f infra/srv/docker-compose.yaml --profile candidate down sf-candidate -``` - -## Environment - -| Variable | Default | Purpose | -|---|---|---| -| `SF_IMAGE` | `ghcr.io/singularity-ng/sf-server:latest` | Primary container image | -| `SF_CANDIDATE_IMAGE` | `ghcr.io/singularity-ng/sf-server:candidate` | Blue/green candidate image | -| `SF_HOSTNAME` | `sf.localhost` | Public hostname Traefik routes by | -| `SF_WORKSPACE_DIR` | `./workspace` | Bind-mounted to `/workspace` inside SF | -| `SF_TRAEFIK_HTTP_PORT` | `80` | Host port for Traefik HTTP entrypoint | -| `SF_TRAEFIK_HTTPS_PORT` | `443` | Host port for Traefik HTTPS entrypoint | -| `SF_RPC_SHUTDOWN_GRACE_MS` | `600000` | SF graceful-shutdown drain budget (10 min default). Matches `docker-compose.yaml`'s `stop_grace_period: 610s`. Operator can shorten via env for fast iteration. | - -## Why a 10-min stop_grace_period? - -If a self-feedback queue drain is in flight when SIGTERM lands, it MUST -finish before exit. Losing operator/agent feedback writes across an -upgrade silently corrupts the queue invariant. The 10-min ceiling -handles pathological lock contention; normal drains finish in <1s. - -Operator can bypass via `docker kill sf-server` (sends SIGKILL, -trampling the drain) — but that strands `.draining` files on the -`sf-state` volume. The next container's startup will recover them -(see `recoverOrphanedFeedbackDrains` in -`packages/coding-agent/src/modes/rpc/feedback-queue-recovery.ts`). - -## TLS / ACME - -This compose intentionally exposes only HTTP for local-host demos. -For real deployments, add Traefik command flags for the ACME resolver: - -```yaml -command: - - "--certificatesresolvers.letsencrypt.acme.email=ops@example.com" - - "--certificatesresolvers.letsencrypt.acme.storage=/letsencrypt/acme.json" - - "--certificatesresolvers.letsencrypt.acme.httpchallenge=true" - - "--certificatesresolvers.letsencrypt.acme.httpchallenge.entrypoint=web" -``` - -…and add per-router labels `traefik.http.routers.sf.tls.certresolver=letsencrypt`. - -## What this does NOT replace - -- `Dockerfile.sf-server` — the SF container build is unchanged. This - compose consumes the image, not the source. -- `.forgejo/workflows/self-deploy.yml` — CI builds, pushes, and rolls - k8s deployments. Forgejo's blue-green path uses `kubectl rollout`, - not docker compose. The labels/strategy here are designed to mirror - k8s readinessProbe + sessionAffinity for parity. -- The `scripts/upgrade-vega-source-server.mjs` script — that script - manages the source-server local-dev variant directly via docker run. - This compose is for the production-style deployment. diff --git a/infra/srv/docker-compose.yaml b/infra/srv/docker-compose.yaml deleted file mode 100644 index 838d9c86e..000000000 --- a/infra/srv/docker-compose.yaml +++ /dev/null @@ -1,150 +0,0 @@ -name: sf-srv - -# SF self-hosted production deployment, fronted by Traefik for: -# - health-check-driven traffic draining (consumes /api/healthz 503 during -# graceful shutdown so old containers stop receiving new traffic the -# instant SIGTERM lands — see src/web/shutdown-state.ts) -# - cookie-based sticky sessions so /api/session/events SSE streams survive -# re-issued requests within an upgrade -# - zero-downtime blue/green via weighted services (candidate gets weight=0 -# until probes pass, then weights flip; old container drains; old removed) -# -# Volumes: -# sf-state — persistent .sf/ runtime (queues, DB, drainer recovery files). -# Mounted at /workspace/.sf in each SF container. Survives -# container swaps so queued sf_feedback writes are durable -# across upgrades. -# traefik-acme — ACME cert cache (only used when SF_TRAEFIK_TLS=1) -# -# Bring up: -# docker compose -f infra/srv/docker-compose.yaml up -d -# -# Upgrade (manual blue/green; see infra/srv/README.md for the scripted flow): -# 1. docker compose -f infra/srv/docker-compose.yaml --profile candidate up -d -# 2. curl http://localhost/api/healthz (Traefik health-checks the new svc) -# 3. flip weights: edit sf-candidate label to 100, sf to 0; restart Traefik -# 4. wait for sf-old to drain (healthz 503 → Traefik removes from pool) -# 5. docker compose -f infra/srv/docker-compose.yaml stop sf - -services: - traefik: - image: traefik:v3.3 - container_name: sf-traefik - restart: unless-stopped - command: - - "--api.dashboard=false" - - "--providers.docker=true" - - "--providers.docker.exposedbydefault=false" - - "--providers.docker.network=sf-srv-net" - - "--entrypoints.web.address=:80" - - "--entrypoints.websecure.address=:443" - # Polling health-check interval is set per-service via labels. - # See traefik.http.services.sf.loadbalancer.healthcheck.* below. - ports: - - "${SF_TRAEFIK_HTTP_PORT:-80}:80" - - "${SF_TRAEFIK_HTTPS_PORT:-443}:443" - volumes: - - "/var/run/docker.sock:/var/run/docker.sock:ro" - - "traefik-acme:/letsencrypt" - networks: - - sf-srv-net - healthcheck: - test: ["CMD", "wget", "--quiet", "--spider", "http://localhost:80/ping"] - interval: 10s - timeout: 3s - retries: 3 - - sf: - image: "${SF_IMAGE:-ghcr.io/singularity-ng/sf-server:latest}" - container_name: sf-server - restart: unless-stopped - # k8s default terminationGracePeriodSeconds is 30s; we override here to - # match rpc-mode's SF_RPC_SHUTDOWN_GRACE_MS default (10 min = 600s). - # The graceful-shutdown handler in packages/coding-agent/src/modes/rpc/ - # rpc-mode.ts must finish its drain before SIGKILL — losing self-feedback - # writes across an upgrade is worse than the wait. - stop_grace_period: 610s - environment: - - "SF_RPC_SHUTDOWN_GRACE_MS=${SF_RPC_SHUTDOWN_GRACE_MS:-600000}" - - "SF_WEB_HOST=0.0.0.0" - - "SF_WEB_PORT=4000" - volumes: - - "sf-state:/workspace/.sf" - - "${SF_WORKSPACE_DIR:-./workspace}:/workspace:rw" - networks: - - sf-srv-net - labels: - # Route discovery - - "traefik.enable=true" - - "traefik.docker.network=sf-srv-net" - - "traefik.http.routers.sf.rule=Host(`${SF_HOSTNAME:-sf.localhost}`)" - - "traefik.http.routers.sf.entrypoints=web" - - "traefik.http.routers.sf.service=sf" - - # Backend port - - "traefik.http.services.sf.loadbalancer.server.port=4000" - - # Health-check: drives shutdown-aware draining. The healthz route - # returns 503 the moment src/web/shutdown-state.ts.isShuttingDown() - # flips true (SIGTERM/SIGINT/SIGHUP received). Traefik polls every - # 2s; once 2 consecutive failures land (~4s after SIGTERM), the - # container is removed from the pool and no new requests are sent. - # Existing requests finish (subject to the timeout below). - - "traefik.http.services.sf.loadbalancer.healthcheck.path=/api/healthz" - - "traefik.http.services.sf.loadbalancer.healthcheck.interval=2s" - - "traefik.http.services.sf.loadbalancer.healthcheck.timeout=3s" - - # Sticky session: required for /api/session/events SSE streams to - # survive client reconnects within the same upgrade window. Cookie - # is HttpOnly + Secure-when-TLS-fronted. Affinity is per-replica; - # when a container goes away, the cookie targets disappear and - # Traefik routes the next request to a healthy peer. - - "traefik.http.services.sf.loadbalancer.sticky.cookie=true" - - "traefik.http.services.sf.loadbalancer.sticky.cookie.name=sf-aff" - - "traefik.http.services.sf.loadbalancer.sticky.cookie.httpOnly=true" - - "traefik.http.services.sf.loadbalancer.sticky.cookie.secure=false" - - "traefik.http.services.sf.loadbalancer.sticky.cookie.sameSite=lax" - - # Candidate replica for blue/green upgrades. - # - # Default weight = 0 so production traffic stays on `sf` until probes pass. - # Operator flips weights via the upgrader script (see ../../scripts/upgrade- - # vega-source-server.mjs and the README in this dir for the full flow). - sf-candidate: - image: "${SF_CANDIDATE_IMAGE:-ghcr.io/singularity-ng/sf-server:candidate}" - container_name: sf-server-candidate - restart: unless-stopped - profiles: ["candidate"] - stop_grace_period: 610s - environment: - - "SF_RPC_SHUTDOWN_GRACE_MS=${SF_RPC_SHUTDOWN_GRACE_MS:-600000}" - - "SF_WEB_HOST=0.0.0.0" - - "SF_WEB_PORT=4000" - volumes: - - "sf-state:/workspace/.sf" - - "${SF_WORKSPACE_DIR:-./workspace}:/workspace:rw" - networks: - - sf-srv-net - labels: - - "traefik.enable=true" - - "traefik.docker.network=sf-srv-net" - - "traefik.http.routers.sf-candidate.rule=Host(`${SF_HOSTNAME:-sf.localhost}`)" - - "traefik.http.routers.sf-candidate.entrypoints=web" - - "traefik.http.routers.sf-candidate.service=sf-candidate@docker" - - "traefik.http.services.sf-candidate.loadbalancer.server.port=4000" - - "traefik.http.services.sf-candidate.loadbalancer.healthcheck.path=/api/healthz" - - "traefik.http.services.sf-candidate.loadbalancer.healthcheck.interval=2s" - - "traefik.http.services.sf-candidate.loadbalancer.healthcheck.timeout=3s" - - "traefik.http.services.sf-candidate.loadbalancer.sticky.cookie=true" - - "traefik.http.services.sf-candidate.loadbalancer.sticky.cookie.name=sf-aff-candidate" - -volumes: - sf-state: - driver: local - traefik-acme: - driver: local - -networks: - sf-srv-net: - driver: bridge - name: sf-srv-net