deploy: rolling strategy (per-replica replacement)

Replace the Phase 3 stub with a working rolling implementation. Flow: - Capture previous deployment's per-index container ids up front. - For i = 0..replicas-1: - Start new[i] (gen-suffixed name, coexists with old[i]). - Wait for new[i] healthy (new waitForOneHealthy helper). - On success: stop old[i] if present, continue. - On failure: stop in-flight new[0..i], leave un-replaced old[i..N-1] running (old[i] is only stopped after new[i] turns healthy), mark FAILED. Already-replaced old replicas are not restored — rolling is not reversible; user redeploys to recover. - After the loop: sweep any leftover old replicas (when replica count shrank) and mark the old deployment STOPPED. Resource peak: replicas + 1. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 09:53:52 +02:00
parent 459cdfe427
commit 653f983a08
1 changed files with 114 additions and 5 deletions
--- a/cameleer-server-app/src/main/java/com/cameleer/server/app/runtime/DeploymentExecutor.java
+++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/runtime/DeploymentExecutor.java
@@ -287,13 +287,122 @@ public class DeploymentExecutor {
    /**
     * Rolling strategy: replace replicas one at a time — start new[i], wait
-     * healthy, stop old[i]. On any replica's health failure, stop the failing
+     * healthy, stop old[i]. On any replica's health failure, stop the
-     * new container and abort without touching the remaining (un-replaced) old
+     * in-flight new container, leave remaining old replicas serving, mark
-     * replicas. Implemented in a follow-up phase.
+     * FAILED. Already-replaced old containers are not restored (can't unring
     * that bell) — user redeploys to recover.
     *
     * Resource peak: replicas + 1 (briefly while a new replica warms up
     * before its counterpart is stopped).
     */
    private void deployRolling(DeployCtx ctx) {
-        throw new UnsupportedOperationException(
+        ResolvedContainerConfig config = ctx.config();
-                "Rolling deployment strategy is not yet implemented; use blue-green");
+        Deployment deployment = ctx.deployment();
        // Capture previous deployment's per-index container ids up front.
        Optional<Deployment> previousOpt = deploymentRepository.findActiveByAppIdAndEnvironmentIdExcluding(
                deployment.appId(), deployment.environmentId(), deployment.id());
        Map<Integer, String> oldContainerByIndex = new LinkedHashMap<>();
        if (previousOpt.isPresent() && previousOpt.get().replicaStates() != null) {
            for (Map<String, Object> r : previousOpt.get().replicaStates()) {
                Object idx = r.get("index");
                Object cid = r.get("containerId");
                if (idx instanceof Number n && cid instanceof String s) {
                    oldContainerByIndex.put(n.intValue(), s);
                }
            }
        }
        // === START REPLICAS ===
        updateStage(deployment.id(), DeployStage.START_REPLICAS);
        List<Map<String, Object>> replicaStates = new ArrayList<>();
        List<String> newContainerIds = new ArrayList<>();
        for (int i = 0; i < config.replicas(); i++) {
            // Start new replica i (gen-suffixed name; coexists with old[i]).
            Map<String, Object> state = new LinkedHashMap<>();
            String newCid = startReplica(ctx, i, state);
            newContainerIds.add(newCid);
            replicaStates.add(state);
            pgDeployRepo.updateReplicaStates(deployment.id(), replicaStates);
            // === HEALTH CHECK (per-replica) ===
            updateStage(deployment.id(), DeployStage.HEALTH_CHECK);
            boolean healthy = waitForOneHealthy(newCid, healthCheckTimeout);
            if (!healthy) {
                // Abort: stop this in-flight new replica AND any new replicas
                // started so far. Already-stopped old replicas stay stopped
                // (rolling is not reversible). Remaining un-replaced old
                // replicas keep serving traffic.
                for (String cid : newContainerIds) {
                    try { orchestrator.stopContainer(cid); orchestrator.removeContainer(cid); }
                    catch (Exception e) { log.warn("Cleanup failed for {}: {}", cid, e.getMessage()); }
                }
                pgDeployRepo.updateDeployStage(deployment.id(), null);
                String reason = String.format(
                        "rolling: replica %d failed to reach healthy within %ds; %d previous replicas still running",
                        i, healthCheckTimeout, oldContainerByIndex.size());
                deploymentService.markFailed(deployment.id(), reason);
                serverMetrics.recordDeploymentOutcome("FAILED");
                serverMetrics.recordDeploymentDuration(ctx.deployStart());
                return;
            }
            // Health check passed: update replica status to RUNNING, stop the
            // corresponding old[i] if present, and continue with replica i+1.
            replicaStates = updateReplicaHealth(replicaStates, newContainerIds);
            pgDeployRepo.updateReplicaStates(deployment.id(), replicaStates);
            String oldCid = oldContainerByIndex.remove(i);
            if (oldCid != null) {
                try {
                    orchestrator.stopContainer(oldCid);
                    orchestrator.removeContainer(oldCid);
                    log.info("rolling: replaced replica {} (old={}, new={})", i, oldCid, newCid);
                } catch (Exception e) {
                    log.warn("rolling: failed to stop old replica {} ({}): {}", i, oldCid, e.getMessage());
                }
            }
        }
        // === SWAP TRAFFIC ===
        // Any old replicas with indices >= new.replicas (e.g., when replica
        // count shrank) are still running; sweep them now so the old
        // deployment can be marked STOPPED.
        updateStage(deployment.id(), DeployStage.SWAP_TRAFFIC);
        for (Map.Entry<Integer, String> e : oldContainerByIndex.entrySet()) {
            try {
                orchestrator.stopContainer(e.getValue());
                orchestrator.removeContainer(e.getValue());
                log.info("rolling: stopped leftover old replica {} ({})", e.getKey(), e.getValue());
            } catch (Exception ex) {
                log.warn("rolling: failed to stop leftover old replica {}: {}", e.getKey(), ex.getMessage());
            }
        }
        if (previousOpt.isPresent()) {
            deploymentService.markStopped(previousOpt.get().id());
        }
        // === COMPLETE ===
        updateStage(deployment.id(), DeployStage.COMPLETE);
        persistSnapshotAndMarkRunning(ctx, newContainerIds.get(0));
        log.info("Deployment {} is RUNNING (rolling, {}/{} replicas replaced)",
                deployment.id(), config.replicas(), config.replicas());
    }
    /**
     * Polls a single container until it reports the {@code "healthy"} state or
     * the timeout expires.
     *
     * <p>The container is always checked at least once, and once more right at
     * the deadline: the pause between polls is capped at the time remaining,
     * so a container that turns healthy during the last pause is still seen.
     * Elapsed time is measured with {@link System#nanoTime()} so wall-clock
     * adjustments cannot shorten or stretch the wait.
     *
     * @param containerId    id of the container to poll
     * @param timeoutSeconds maximum time to wait, in seconds; zero or negative
     *                       means a single check with no waiting
     * @return {@code true} once the container is healthy; {@code false} on
     *         timeout or if the thread is interrupted while waiting (the
     *         interrupt flag is restored)
     */
    private boolean waitForOneHealthy(String containerId, int timeoutSeconds) {
        final long pollIntervalMs = 2000L;
        final long timeoutNanos = timeoutSeconds * 1_000_000_000L;
        final long startNanos = System.nanoTime();
        while (true) {
            ContainerStatus status = orchestrator.getContainerStatus(containerId);
            // A missing status is treated as "not healthy yet" rather than an error.
            if (status != null && "healthy".equals(status.state())) {
                return true;
            }
            long remainingNanos = timeoutNanos - (System.nanoTime() - startNanos);
            if (remainingNanos <= 0) {
                return false;
            }
            // Never sleep past the deadline; round sub-millisecond remainders up to 1 ms.
            long sleepMs = Math.min(pollIntervalMs, Math.max(1L, remainingNanos / 1_000_000L));
            try {
                Thread.sleep(sleepMs);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return false;
            }
        }
    }
    /** Start one replica container with the gen-suffixed name and return its