From 40e2c01abd9fdc28ae98d2a7f4002d27709ee721 Mon Sep 17 00:00:00 2001
From: Seu Nome
Date: Mon, 8 Dec 2025 15:07:13 -0300
Subject: [PATCH] fix: zero-downtime deploy com start-first e healthcheck
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove docker service update --force que causava downtime
- Agrupa env vars do Convex em um único update (evita múltiplos restarts)
- Adiciona delay: 10s e monitor: 30s no update_config
- Healthcheck do web usa /api/health com timeout
- Ajusta start_period: 180s (web) e 60s (convex)
- Convex backend não é mais forçado a reiniciar após stack deploy

Fluxo correto de deploy:
1. docker stack deploy detecta mudança
2. Novo container é criado (start-first)
3. Swarm espera healthcheck passar
4. Swarm espera monitor period (30s)
5. Container antigo é removido
6. Zero downtime durante todo o processo

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
 .github/workflows/ci-cd-web-desktop.yml | 28 +++++++++++++++----------
 stack.yml                               | 26 ++++++++++++++++-------
 2 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/ci-cd-web-desktop.yml b/.github/workflows/ci-cd-web-desktop.yml
index 138a5b6..208661e 100644
--- a/.github/workflows/ci-cd-web-desktop.yml
+++ b/.github/workflows/ci-cd-web-desktop.yml
@@ -296,26 +296,31 @@ jobs:
           echo "Using APP_DIR (stable)=$APP_DIR_STABLE"
           APP_DIR="$APP_DIR_STABLE" RELEASE_SHA=${{ github.sha }} docker stack deploy --with-registry-auth -c stack.yml sistema
 
-      - name: Ensure Convex service envs and restart
+      - name: Ensure Convex service envs (sem force restart)
         run: |
           cd "$EFFECTIVE_APP_DIR"
           set -o allexport
           if [ -f .env ]; then . ./.env; fi
           set +o allexport
           echo "Ensuring Convex envs on service: sistema_convex_backend"
+          # Collect every --env-add pair in a bash array so a single update applies them all; array elements keep values containing spaces intact
+          UPDATE_ARGS=()
           if [ -n "${MACHINE_PROVISIONING_SECRET:-}" ]; then
-            docker service update --env-add MACHINE_PROVISIONING_SECRET="${MACHINE_PROVISIONING_SECRET}" sistema_convex_backend || true
+            UPDATE_ARGS+=(--env-add "MACHINE_PROVISIONING_SECRET=${MACHINE_PROVISIONING_SECRET}")
           fi
           if [ -n "${MACHINE_TOKEN_TTL_MS:-}" ]; then
-            docker service update --env-add MACHINE_TOKEN_TTL_MS="${MACHINE_TOKEN_TTL_MS}" sistema_convex_backend || true
+            UPDATE_ARGS+=(--env-add "MACHINE_TOKEN_TTL_MS=${MACHINE_TOKEN_TTL_MS}")
           fi
           if [ -n "${FLEET_SYNC_SECRET:-}" ]; then
-            docker service update --env-add FLEET_SYNC_SECRET="${FLEET_SYNC_SECRET}" sistema_convex_backend || true
+            UPDATE_ARGS+=(--env-add "FLEET_SYNC_SECRET=${FLEET_SYNC_SECRET}")
           fi
-          echo "Current envs:"
+          if [ "${#UPDATE_ARGS[@]}" -gt 0 ]; then
+            echo "Applying env updates (will respect update_config.order: start-first)..."
+            docker service update "${UPDATE_ARGS[@]}" sistema_convex_backend || true
+          fi
+          echo "Current envs:"
           docker service inspect sistema_convex_backend --format '{{range .Spec.TaskTemplate.ContainerSpec.Env}}{{println .}}{{end}}' || true
-          echo "Forcing service restart..."
-          docker service update --force sistema_convex_backend || true
+          # Deliberately no --force here, so the start-first strategy from stack.yml is respected
 
       - name: Smoke test — register + heartbeat
         run: |
@@ -375,10 +380,11 @@ jobs:
         run: |
           docker service update --force sistema_web
 
-      - name: Restart Convex backend service (optional)
-        run: |
-          # Fail the job if the convex backend cannot restart
-          docker service update --force sistema_convex_backend
+      # Disabled: stack deploy already updates the services using update_config.order: start-first
+      # Forcing an update here causes downtime because it bypasses the rolling-update strategy
+      # - name: Restart Convex backend service (optional)
+      #   run: |
+      #     docker service update --force sistema_convex_backend
 
   convex_deploy:
     name: Deploy Convex functions
diff --git a/stack.yml b/stack.yml
index a17bf7b..a861f6d 100644
--- a/stack.yml
+++ b/stack.yml
@@ -45,6 +45,10 @@ services:
         # start-first evita downtime: sobe o novo task antes de parar o anterior
         order: start-first
         failure_action: rollback
+        # Delay between task updates so the health check has time to pass
+        delay: 10s
+        # Monitor: how long Swarm watches an updated task for failure before treating the update as stable
+        monitor: 30s
       rollback_config:
         order: start-first
       resources:
@@ -66,11 +70,14 @@ services:
     networks:
       - traefik_public
     healthcheck:
-      test: ["CMD", "node", "-e", "fetch('http://localhost:3000').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"]
+      # Probe /api/health; AbortSignal.timeout aborts a hung request after 2s (fetch has no "timeout" option)
+      test: ["CMD", "node", "-e", "fetch('http://localhost:3000/api/health',{signal:AbortSignal.timeout(2000)}).then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"]
       interval: 10s
-      timeout: 3s
-      retries: 5
-      start_period: 120s
+      timeout: 5s
+      retries: 3
+      # start_period: startup grace time before failed probes count towards retries
+      # The new container only starts serving AFTER it passes the health check
+      start_period: 180s
 
   convex_backend:
     image: sistema_convex_backend:1.29.2
@@ -96,6 +103,9 @@ services:
         parallelism: 1
         order: start-first
         failure_action: rollback
+        # Delay and monitor windows to keep the rollout zero-downtime
+        delay: 10s
+        monitor: 30s
       resources:
         limits:
           # Limite de memória elevado para evitar reinícios por OOM (exit code 137) em cargas de relatórios / índices.
@@ -119,10 +129,10 @@ services:
       - traefik_public
     healthcheck:
       test: ["CMD-SHELL", "curl -sf http://localhost:3210/version >/dev/null || exit 1"]
-      interval: 30s
-      timeout: 10s
-      retries: 10
-      start_period: 120s
+      interval: 15s
+      timeout: 5s
+      retries: 3
+      start_period: 60s
 
   convex_dashboard:
     image: ghcr.io/get-convex/convex-dashboard:latest