deploy: blue-green strategy (start → health-all → stop old)

Phase 3 of deployment-strategies plan. Refactor executeAsync to dispatch on DeploymentStrategy.fromWire(config.deploymentStrategy()). Blue-green (default): - Start all N new replicas (gen-suffixed names coexist with old). - Wait for ALL healthy (strict — partial-healthy = FAILED, preserves previous deployment untouched). - Only then find + stop the previous deployment. - Final status is always RUNNING; DEGRADED is now reserved for post-deploy replica crashes (set by DockerEventMonitor). Rolling: stub — throws UnsupportedOperationException for now, gets its real implementation in Phase 4. Refactor details: - Extract DeployCtx record to carry 13 per-deploy values around. - Extract startReplica(ctx, i, stateOut) — shared by both strategy paths. - Extract persistSnapshotAndMarkRunning(ctx, primaryCid) — shared finalizer. - Rename waitForAnyHealthy → waitForAllHealthy (the name was misleading; the method already waited for all, just returned partial on timeout). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 09:51:24 +02:00
parent 652346dcd4
commit 459cdfe427
1 changed files with 186 additions and 130 deletions
--- a/cameleer-server-app/src/main/java/com/cameleer/server/app/runtime/DeploymentExecutor.java
+++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/runtime/DeploymentExecutor.java
@@ -96,6 +96,27 @@ public class DeploymentExecutor {
        return deployment.id().toString().substring(0, 8);
    }
    /**
     * Per-deployment context assembled once at the top of executeAsync and passed
     * into strategy handlers. Keeps the strategy methods readable instead of
     * threading 12 positional args.
     */
    private record DeployCtx(
            Deployment deployment,
            App app,
            Environment env,
            ResolvedContainerConfig config,
            String jarPath,
            String resolvedRuntimeType,
            String mainClass,
            String generation,
            String primaryNetwork,
            List<String> additionalNets,
            Map<String, String> baseEnvVars,
            Map<String, String> prometheusLabels,
            long deployStart
    ) {}
    @Async("deploymentTaskExecutor")
    public void executeAsync(Deployment deployment) {
        long deployStart = System.currentTimeMillis();
@@ -152,7 +173,6 @@ public class DeploymentExecutor {
            updateStage(deployment.id(), DeployStage.CREATE_NETWORK);
            // Primary network: use configured CAMELEER_DOCKER_NETWORK (tenant-isolated in SaaS mode)
            String primaryNetwork = dockerNetwork;
            String envNet = null;
            List<String> additionalNets = new ArrayList<>();
            if (networkManager != null) {
                networkManager.ensureNetwork(primaryNetwork);
@@ -160,7 +180,7 @@ public class DeploymentExecutor {
                networkManager.ensureNetwork(DockerNetworkManager.TRAEFIK_NETWORK);
                additionalNets.add(DockerNetworkManager.TRAEFIK_NETWORK);
                // Per-environment network scoped to tenant to prevent cross-tenant collisions
-                envNet = DockerNetworkManager.envNetworkName(tenantId, env.slug());
+                String envNet = DockerNetworkManager.envNetworkName(tenantId, env.slug());
                networkManager.ensureNetwork(envNet);
                additionalNets.add(envNet);
            }
@@ -175,135 +195,21 @@ public class DeploymentExecutor {
                }
            }
-            // === STOP PREVIOUS ACTIVE DEPLOYMENT ===
+            DeployCtx ctx = new DeployCtx(
-            // Container names are deterministic ({tenant}-{env}-{app}-{replica}), so a
+                    deployment, app, env, config, jarPath,
-            // previous active deployment holds the Docker names we need. Stop + remove
+                    resolvedRuntimeType, mainClass, generation,
-            // it before starting new replicas to avoid a 409 name conflict. Excluding
+                    primaryNetwork, additionalNets,
-            // the current deployment id by SQL (not Java) because the newly created
+                    buildEnvVars(app, env, config),
-            // row already has status=STARTING and would otherwise be picked by
+                    PrometheusLabelBuilder.build(resolvedRuntimeType),
-            // findActiveByAppIdAndEnvironmentId ORDER BY created_at DESC LIMIT 1.
+                    deployStart);
-            Optional<Deployment> previous = deploymentRepository.findActiveByAppIdAndEnvironmentIdExcluding(
+
-                    deployment.appId(), deployment.environmentId(), deployment.id());
+            // Dispatch on strategy. Unknown values fall back to BLUE_GREEN via fromWire.
-            if (previous.isPresent()) {
+            DeploymentStrategy strategy = DeploymentStrategy.fromWire(config.deploymentStrategy());
-                log.info("Stopping previous deployment {} before starting new replicas", previous.get().id());
+            switch (strategy) {
-                stopDeploymentContainers(previous.get());
+                case BLUE_GREEN -> deployBlueGreen(ctx);
-                deploymentService.markStopped(previous.get().id());
+                case ROLLING -> deployRolling(ctx);
            }
            // === START REPLICAS ===
            updateStage(deployment.id(), DeployStage.START_REPLICAS);
            Map<String, String> baseEnvVars = buildEnvVars(app, env, config);
            Map<String, String> prometheusLabels = PrometheusLabelBuilder.build(resolvedRuntimeType);
            List<Map<String, Object>> replicaStates = new ArrayList<>();
            List<String> newContainerIds = new ArrayList<>();
            for (int i = 0; i < config.replicas(); i++) {
                String instanceId = env.slug() + "-" + app.slug() + "-" + i + "-" + generation;
                String containerName = tenantId + "-" + instanceId;
                // Per-replica labels (include replica index, generation, and instance-id)
                Map<String, String> labels = TraefikLabelBuilder.build(app.slug(), env.slug(), tenantId, config, i, generation);
                labels.putAll(prometheusLabels);
                // Per-replica env vars (set agent instance ID to match container log identity)
                Map<String, String> replicaEnvVars = new LinkedHashMap<>(baseEnvVars);
                replicaEnvVars.put("CAMELEER_AGENT_INSTANCEID", instanceId);
                String volumeName = jarDockerVolume != null && !jarDockerVolume.isBlank() ? jarDockerVolume : null;
                ContainerRequest request = new ContainerRequest(
                        containerName, baseImage, jarPath,
                        volumeName, jarStoragePath,
                        primaryNetwork,
                        additionalNets,
                        replicaEnvVars, labels,
                        config.memoryLimitBytes(), config.memoryReserveBytes(),
                        config.dockerCpuShares(), config.dockerCpuQuota(),
                        config.exposedPorts(), agentHealthPort,
                        "on-failure", 3,
                        resolvedRuntimeType, config.customArgs(), mainClass
                );
                String containerId = orchestrator.startContainer(request);
                newContainerIds.add(containerId);
                // Connect to additional networks after container is started
                for (String net : additionalNets) {
                    if (networkManager != null) {
                        networkManager.connectContainer(containerId, net);
                    }
                }
                orchestrator.startLogCapture(containerId, instanceId, app.slug(), env.slug(), tenantId);
                replicaStates.add(Map.of(
                        "index", i,
                        "containerId", containerId,
                        "containerName", containerName,
                        "status", "STARTING"
                ));
            }
            pgDeployRepo.updateReplicaStates(deployment.id(), replicaStates);
            // === HEALTH CHECK ===
            updateStage(deployment.id(), DeployStage.HEALTH_CHECK);
            int healthyCount = waitForAnyHealthy(newContainerIds, healthCheckTimeout);
            if (healthyCount == 0) {
                for (String cid : newContainerIds) {
                    try { orchestrator.stopContainer(cid); orchestrator.removeContainer(cid); }
                    catch (Exception e) { log.warn("Cleanup failed for {}: {}", cid, e.getMessage()); }
                }
                pgDeployRepo.updateDeployStage(deployment.id(), null);
                deploymentService.markFailed(deployment.id(), "No replicas passed health check within " + healthCheckTimeout + "s");
                serverMetrics.recordDeploymentOutcome("FAILED");
                serverMetrics.recordDeploymentDuration(deployStart);
                return;
            }
            replicaStates = updateReplicaHealth(replicaStates, newContainerIds);
            pgDeployRepo.updateReplicaStates(deployment.id(), replicaStates);
            // === SWAP TRAFFIC ===
            // Traffic is routed via Traefik Docker labels, so the "swap" happens
            // implicitly once the new replicas are healthy and the old containers
            // are gone. The old deployment was already stopped before START_REPLICAS
            // to free the deterministic container names.
            updateStage(deployment.id(), DeployStage.SWAP_TRAFFIC);
            // === COMPLETE ===
            updateStage(deployment.id(), DeployStage.COMPLETE);
            // Capture config snapshot before marking RUNNING
            ApplicationConfig agentConfig = applicationConfigRepository
                    .findByApplicationAndEnvironment(app.slug(), env.slug())
                    .orElse(null);
            List<String> snapshotSensitiveKeys = agentConfig != null ? agentConfig.getSensitiveKeys() : null;
            DeploymentConfigSnapshot snapshot = new DeploymentConfigSnapshot(
                    deployment.appVersionId(),
                    agentConfig,
                    app.containerConfig(),
                    snapshotSensitiveKeys
            );
            pgDeployRepo.saveDeployedConfigSnapshot(deployment.id(), snapshot);
            String primaryContainerId = newContainerIds.get(0);
            DeploymentStatus finalStatus = healthyCount == config.replicas()
                    ? DeploymentStatus.RUNNING : DeploymentStatus.DEGRADED;
            deploymentService.markRunning(deployment.id(), primaryContainerId);
            if (finalStatus == DeploymentStatus.DEGRADED) {
                deploymentRepository.updateStatus(deployment.id(), DeploymentStatus.DEGRADED,
                        primaryContainerId, null);
            }
            pgDeployRepo.updateDeployStage(deployment.id(), null);
            serverMetrics.recordDeploymentOutcome(finalStatus.name());
            serverMetrics.recordDeploymentDuration(deployStart);
            log.info("Deployment {} is {} ({}/{} replicas healthy)",
                    deployment.id(), finalStatus, healthyCount, config.replicas());
        } catch (Exception e) {
            log.error("Deployment {} FAILED: {}", deployment.id(), e.getMessage(), e);
            pgDeployRepo.updateDeployStage(deployment.id(), null);
@@ -313,6 +219,153 @@ public class DeploymentExecutor {
        }
    }
    /**
     * Blue/green strategy: start all N new replicas (coexisting with the old
     * ones thanks to the gen-suffixed container names), wait for ALL healthy,
     * then stop the previous deployment. Strict all-healthy — partial failure
     * preserves the previous deployment untouched.
     */
    private void deployBlueGreen(DeployCtx ctx) {
        ResolvedContainerConfig config = ctx.config();
        Deployment deployment = ctx.deployment();
        // === START REPLICAS ===
        updateStage(deployment.id(), DeployStage.START_REPLICAS);
        List<Map<String, Object>> replicaStates = new ArrayList<>();
        List<String> newContainerIds = new ArrayList<>();
        for (int i = 0; i < config.replicas(); i++) {
            Map<String, Object> state = new LinkedHashMap<>();
            String containerId = startReplica(ctx, i, state);
            newContainerIds.add(containerId);
            replicaStates.add(state);
        }
        pgDeployRepo.updateReplicaStates(deployment.id(), replicaStates);
        // === HEALTH CHECK ===
        updateStage(deployment.id(), DeployStage.HEALTH_CHECK);
        int healthyCount = waitForAllHealthy(newContainerIds, healthCheckTimeout);
        if (healthyCount < config.replicas()) {
            // Strict abort: tear down new replicas, leave the previous deployment untouched.
            for (String cid : newContainerIds) {
                try { orchestrator.stopContainer(cid); orchestrator.removeContainer(cid); }
                catch (Exception e) { log.warn("Cleanup failed for {}: {}", cid, e.getMessage()); }
            }
            pgDeployRepo.updateDeployStage(deployment.id(), null);
            String reason = String.format(
                    "blue-green: %d/%d replicas healthy within %ds; preserving previous deployment",
                    healthyCount, config.replicas(), healthCheckTimeout);
            deploymentService.markFailed(deployment.id(), reason);
            serverMetrics.recordDeploymentOutcome("FAILED");
            serverMetrics.recordDeploymentDuration(ctx.deployStart());
            return;
        }
        replicaStates = updateReplicaHealth(replicaStates, newContainerIds);
        pgDeployRepo.updateReplicaStates(deployment.id(), replicaStates);
        // === SWAP TRAFFIC ===
        // All new replicas are healthy; Traefik labels are already attracting
        // traffic to them. Stop the previous deployment now — the swap is
        // implicit in the label-driven load balancer.
        updateStage(deployment.id(), DeployStage.SWAP_TRAFFIC);
        Optional<Deployment> previous = deploymentRepository.findActiveByAppIdAndEnvironmentIdExcluding(
                deployment.appId(), deployment.environmentId(), deployment.id());
        if (previous.isPresent()) {
            log.info("blue-green: stopping previous deployment {} now that new replicas are healthy",
                    previous.get().id());
            stopDeploymentContainers(previous.get());
            deploymentService.markStopped(previous.get().id());
        }
        // === COMPLETE ===
        updateStage(deployment.id(), DeployStage.COMPLETE);
        persistSnapshotAndMarkRunning(ctx, newContainerIds.get(0));
        log.info("Deployment {} is RUNNING (blue-green, {}/{} replicas healthy)",
                deployment.id(), healthyCount, config.replicas());
    }
    /**
     * Rolling strategy: replace replicas one at a time — start new[i], wait
     * healthy, stop old[i]. On any replica's health failure, stop the failing
     * new container and abort without touching the remaining (un-replaced) old
     * replicas. Implemented in a follow-up phase.
     */
    private void deployRolling(DeployCtx ctx) {
        throw new UnsupportedOperationException(
                "Rolling deployment strategy is not yet implemented; use blue-green");
    }
    /** Start one replica container with the gen-suffixed name and return its
     * container id. Fills `stateOut` with the replicaStates JSONB row. */
    private String startReplica(DeployCtx ctx, int i, Map<String, Object> stateOut) {
        Environment env = ctx.env();
        App app = ctx.app();
        ResolvedContainerConfig config = ctx.config();
        String instanceId = env.slug() + "-" + app.slug() + "-" + i + "-" + ctx.generation();
        String containerName = tenantId + "-" + instanceId;
        Map<String, String> labels = TraefikLabelBuilder.build(
                app.slug(), env.slug(), tenantId, config, i, ctx.generation());
        labels.putAll(ctx.prometheusLabels());
        Map<String, String> replicaEnvVars = new LinkedHashMap<>(ctx.baseEnvVars());
        replicaEnvVars.put("CAMELEER_AGENT_INSTANCEID", instanceId);
        String volumeName = jarDockerVolume != null && !jarDockerVolume.isBlank() ? jarDockerVolume : null;
        ContainerRequest request = new ContainerRequest(
                containerName, baseImage, ctx.jarPath(),
                volumeName, jarStoragePath,
                ctx.primaryNetwork(),
                ctx.additionalNets(),
                replicaEnvVars, labels,
                config.memoryLimitBytes(), config.memoryReserveBytes(),
                config.dockerCpuShares(), config.dockerCpuQuota(),
                config.exposedPorts(), agentHealthPort,
                "on-failure", 3,
                ctx.resolvedRuntimeType(), config.customArgs(), ctx.mainClass()
        );
        String containerId = orchestrator.startContainer(request);
        // Connect to additional networks after container is started
        for (String net : ctx.additionalNets()) {
            if (networkManager != null) {
                networkManager.connectContainer(containerId, net);
            }
        }
        orchestrator.startLogCapture(containerId, instanceId, app.slug(), env.slug(), tenantId);
        stateOut.put("index", i);
        stateOut.put("containerId", containerId);
        stateOut.put("containerName", containerName);
        stateOut.put("status", "STARTING");
        return containerId;
    }
    /** Persist the deployment snapshot and mark the deployment RUNNING.
     * Finalizes the deploy in a single place shared by all strategy paths. */
    private void persistSnapshotAndMarkRunning(DeployCtx ctx, String primaryContainerId) {
        Deployment deployment = ctx.deployment();
        ApplicationConfig agentConfig = applicationConfigRepository
                .findByApplicationAndEnvironment(ctx.app().slug(), ctx.env().slug())
                .orElse(null);
        List<String> snapshotSensitiveKeys = agentConfig != null ? agentConfig.getSensitiveKeys() : null;
        DeploymentConfigSnapshot snapshot = new DeploymentConfigSnapshot(
                deployment.appVersionId(),
                agentConfig,
                ctx.app().containerConfig(),
                snapshotSensitiveKeys);
        pgDeployRepo.saveDeployedConfigSnapshot(deployment.id(), snapshot);
        deploymentService.markRunning(deployment.id(), primaryContainerId);
        pgDeployRepo.updateDeployStage(deployment.id(), null);
        serverMetrics.recordDeploymentOutcome("RUNNING");
        serverMetrics.recordDeploymentDuration(ctx.deployStart());
    }
    public void stopDeployment(Deployment deployment) {
        pgDeployRepo.updateTargetState(deployment.id(), "STOPPED");
        deploymentRepository.updateStatus(deployment.id(), DeploymentStatus.STOPPING,
@@ -378,7 +431,10 @@ public class DeploymentExecutor {
        return envVars;
    }
-    private int waitForAnyHealthy(List<String> containerIds, int timeoutSeconds) {
+    /** Poll until all containers are healthy or the timeout expires. Returns
     * the healthy count at return time — == ids.size() on full success, less
     * if the timeout won. */
    private int waitForAllHealthy(List<String> containerIds, int timeoutSeconds) {
        long deadline = System.currentTimeMillis() + (timeoutSeconds * 1000L);
        int lastHealthy = 0;
        while (System.currentTimeMillis() < deadline) {