From 40e2c01abd9fdc28ae98d2a7f4002d27709ee721 Mon Sep 17 00:00:00 2001
From: Seu Nome <seu-email@exemplo.com>
Date: Mon, 8 Dec 2025 15:07:13 -0300
Subject: [PATCH] fix: zero-downtime deploy com start-first e healthcheck
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove docker service update --force que causava downtime
- Agrupa env vars do Convex em um único update (evita múltiplos restarts)
- Adiciona delay: 10s e monitor: 30s no update_config
- Healthcheck do web usa /api/health com timeout
- Ajusta start_period: 180s (web) e 60s (convex)
- Convex backend não é mais forçado a reiniciar após stack deploy

Fluxo correto de deploy:
1. docker stack deploy detecta mudança
2. Novo container é criado (start-first)
3. Swarm espera healthcheck passar
4. Swarm espera monitor period (30s)
5. Container antigo é removido
6. Zero downtime durante todo o processo

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 .github/workflows/ci-cd-web-desktop.yml | 28 +++++++++++++++----------
 stack.yml                               | 26 ++++++++++++++++-------
 2 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/ci-cd-web-desktop.yml b/.github/workflows/ci-cd-web-desktop.yml
index 138a5b6..208661e 100644
--- a/.github/workflows/ci-cd-web-desktop.yml
+++ b/.github/workflows/ci-cd-web-desktop.yml
@@ -296,26 +296,31 @@ jobs:
           echo "Using APP_DIR (stable)=$APP_DIR_STABLE"
           APP_DIR="$APP_DIR_STABLE" RELEASE_SHA=${{ github.sha }} docker stack deploy --with-registry-auth -c stack.yml sistema
 
-      - name: Ensure Convex service envs and restart
+      - name: Ensure Convex service envs (sem force restart)
         run: |
           cd "$EFFECTIVE_APP_DIR"
           set -o allexport
           if [ -f .env ]; then . ./.env; fi
           set +o allexport
           echo "Ensuring Convex envs on service: sistema_convex_backend"
+          # Acumula todas as env vars em um único update para evitar múltiplos restarts
+          UPDATE_ARGS=""
           if [ -n "${MACHINE_PROVISIONING_SECRET:-}" ]; then
-            docker service update --env-add MACHINE_PROVISIONING_SECRET="${MACHINE_PROVISIONING_SECRET}" sistema_convex_backend || true
+            UPDATE_ARGS="$UPDATE_ARGS --env-add MACHINE_PROVISIONING_SECRET=${MACHINE_PROVISIONING_SECRET}"
           fi
           if [ -n "${MACHINE_TOKEN_TTL_MS:-}" ]; then
-            docker service update --env-add MACHINE_TOKEN_TTL_MS="${MACHINE_TOKEN_TTL_MS}" sistema_convex_backend || true
+            UPDATE_ARGS="$UPDATE_ARGS --env-add MACHINE_TOKEN_TTL_MS=${MACHINE_TOKEN_TTL_MS}"
           fi
           if [ -n "${FLEET_SYNC_SECRET:-}" ]; then
-            docker service update --env-add FLEET_SYNC_SECRET="${FLEET_SYNC_SECRET}" sistema_convex_backend || true
+            UPDATE_ARGS="$UPDATE_ARGS --env-add FLEET_SYNC_SECRET=${FLEET_SYNC_SECRET}"
           fi
-          echo "Current envs:" 
+          if [ -n "$UPDATE_ARGS" ]; then
+            echo "Applying env updates (will respect update_config.order: start-first)..."
+            docker service update $UPDATE_ARGS sistema_convex_backend || true
+          fi
+          echo "Current envs:"
           docker service inspect sistema_convex_backend --format '{{range .Spec.TaskTemplate.ContainerSpec.Env}}{{println .}}{{end}}' || true
-          echo "Forcing service restart..."
-          docker service update --force sistema_convex_backend || true
+          # NÃO fazemos --force aqui para respeitar a estratégia start-first do stack.yml
 
       - name: Smoke test — register + heartbeat
         run: |
@@ -375,10 +380,11 @@ jobs:
         run: |
           docker service update --force sistema_web
 
-      - name: Restart Convex backend service (optional)
-        run: |
-          # Fail the job if the convex backend cannot restart
-          docker service update --force sistema_convex_backend
+      # Comentado: o stack deploy já atualiza os serviços com update_config.order: start-first
+      # Forçar update aqui causa downtime porque ignora a estratégia de rolling update
+      # - name: Restart Convex backend service (optional)
+      #   run: |
+      #     docker service update --force sistema_convex_backend
 
   convex_deploy:
     name: Deploy Convex functions
diff --git a/stack.yml b/stack.yml
index a17bf7b..a861f6d 100644
--- a/stack.yml
+++ b/stack.yml
@@ -45,6 +45,10 @@ services:
         # start-first evita downtime: sobe o novo task antes de parar o anterior
         order: start-first
         failure_action: rollback
+        # Delay entre updates para garantir que o healthcheck passa
+        delay: 10s
+        # Monitor: tempo que o Swarm espera após o deploy para verificar estabilidade
+        monitor: 30s
       rollback_config:
         order: start-first
       resources:
@@ -66,11 +70,14 @@ services:
     networks:
       - traefik_public
     healthcheck:
-      test: ["CMD", "node", "-e", "fetch('http://localhost:3000').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"]
+      # Healthcheck mais robusto: verifica se o servidor responde
+      test: ["CMD", "node", "-e", "fetch('http://localhost:3000/api/health',{timeout:2000}).then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"]
       interval: 10s
-      timeout: 3s
-      retries: 5
-      start_period: 120s
+      timeout: 5s
+      retries: 3
+      # start_period: tempo de inicialização antes de começar a contar falhas
+      # O novo container só entra em serviço APÓS passar no healthcheck
+      start_period: 180s
 
   convex_backend:
     image: sistema_convex_backend:1.29.2
@@ -96,6 +103,9 @@ services:
         parallelism: 1
         order: start-first
         failure_action: rollback
+        # Delay e monitor para garantir zero-downtime
+        delay: 10s
+        monitor: 30s
       resources:
         limits:
           # Limite de memória elevado para evitar reinícios por OOM (exit code 137) em cargas de relatórios / índices.
@@ -119,10 +129,10 @@ services:
       - traefik_public
     healthcheck:
       test: ["CMD-SHELL", "curl -sf http://localhost:3210/version >/dev/null || exit 1"]
-      interval: 30s
-      timeout: 10s
-      retries: 10
-      start_period: 120s
+      interval: 15s
+      timeout: 5s
+      retries: 3
+      start_period: 60s
 
   convex_dashboard:
     image: ghcr.io/get-convex/convex-dashboard:latest