fix(runtime): pre-pull loader image, plug volume-leak windows, document network dep
Pre-pull the loader image at PULL_IMAGE so the implicit pull on first createContainerCmd doesn't bypass the 120s loader-wait timeout. Wrap createAndStartLoader in try/catch so a create/start failure cleans up the just-created volume; same guard around createAndStartMain on phase-2 failures. Folds the wait-error message into the rethrown RuntimeException so the cause chain is visible. Add a @PostConstruct WARN when neither artifactbaseurl nor serverurl is set so the implicit cameleer-server DNS dependency is loud at boot, and document the loader-to-server reachability contract in .claude/rules/docker-orchestration.md. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -48,6 +48,14 @@ When deployed via the cameleer-saas platform, this server orchestrates customer

`DeploymentExecutor` generates the signed URL via `ArtifactDownloadTokenSigner.sign(appVersion.id(), Duration.ofSeconds(artifactTokenTtlSeconds))` and passes `appVersion.id()`, the URL, `appVersion.jarSizeBytes()`, and the loader image into `ContainerRequest`. The host filesystem is no longer involved at deploy time.

**Loader → server reachability**: the loader container hits the Cameleer server over HTTP from inside its
own Docker network. The signed URL is built from `cameleer.server.runtime.artifactbaseurl` (preferred), falling
back to `cameleer.server.runtime.serverurl`, falling back to `http://cameleer-server:8081`. The default works
in SaaS mode because `DockerNetworkManager` adds `cameleer-traefik` as an additional network for tenant
containers, and the server is reachable on that network via the `cameleer-server` DNS alias. For non-SaaS
topologies (server on a different network than tenants), set `CAMELEER_SERVER_RUNTIME_ARTIFACTBASEURL`
explicitly to a URL the loader can reach.

## DeploymentExecutor Details

Primary network for app containers is set via `CAMELEER_SERVER_RUNTIME_DOCKERNETWORK` env var (in SaaS mode: `cameleer-tenant-{slug}`); apps also connect to `cameleer-traefik` (routing) and `cameleer-env-{tenantId}-{envSlug}` (per-environment discovery) as additional networks. Resolves `runtimeType: auto` to concrete type from `AppVersion.detectedRuntimeType` at PRE_FLIGHT (fails deployment if unresolvable). Builds Docker entrypoint per runtime type (all JVM types use `-javaagent:/app/agent.jar -jar`, plain Java uses `-cp` with main class, native runs binary directly). Sets per-replica `CAMELEER_AGENT_INSTANCEID` env var to `{envSlug}-{appSlug}-{replicaIndex}-{generation}` so container logs and agent logs share the same instance identity. Sets `CAMELEER_AGENT_*` env vars from `ResolvedContainerConfig` (routeControlEnabled, replayEnabled, health port). These are startup-only agent properties — changing them requires redeployment.

@@ -106,6 +106,17 @@ public class DeploymentExecutor {
         this.licenseUsageReader = licenseUsageReader;
     }

@jakarta.annotation.PostConstruct
|
||||
public void validateArtifactBaseUrl() {
|
||||
if (artifactBaseUrl.isBlank() && globalServerUrl.isBlank()) {
|
||||
log.warn("Neither cameleer.server.runtime.artifactbaseurl nor cameleer.server.runtime.serverurl is set. "
|
||||
+ "Loader containers will fall back to http://cameleer-server:8081 — this requires the loader's "
|
||||
+ "Docker network to resolve `cameleer-server`. In SaaS mode the server is on `cameleer-traefik` "
|
||||
+ "which is added as an additional network for tenant containers, so this works. For other "
|
||||
+ "deployment topologies, set CAMELEER_SERVER_RUNTIME_ARTIFACTBASEURL explicitly.");
|
||||
}
|
||||
}

    /** Deployment-scoped id suffix — distinguishes container names and
     * CAMELEER_AGENT_INSTANCEID across redeploys so old + new replicas can
     * coexist during a blue/green swap. First 8 chars of the deployment UUID. */
@@ -211,6 +222,7 @@ public class DeploymentExecutor {
         // === PULL IMAGE ===
         updateStage(deployment.id(), DeployStage.PULL_IMAGE);
         orchestrator.pullImage(baseImage);
+        orchestrator.pullImage(loaderImage);
 
         // === CREATE NETWORKS ===
         updateStage(deployment.id(), DeployStage.CREATE_NETWORK);
@@ -138,7 +138,15 @@ public class DockerRuntimeOrchestrator implements RuntimeOrchestrator {
         // the shared volume. Hardened identically to the main container, plus
         // RW bind on /app/jars and the artifact env vars the loader entrypoint
         // expects. We block on its exit code before bringing the main up.
-        String loaderId = createAndStartLoader(request, volumeName);
+        String loaderId;
+        try {
+            loaderId = createAndStartLoader(request, volumeName);
+        } catch (Exception e) {
+            // Volume created but loader never reached the wait/cleanup paths — clean up here.
+            cleanupVolume(volumeName);
+            throw new RuntimeException("Loader create/start failed for " + request.containerName(), e);
+        }
 
         int exitCode;
         try {
             exitCode = dockerClient.waitContainerCmd(loaderId)
@@ -146,7 +154,7 @@ public class DockerRuntimeOrchestrator implements RuntimeOrchestrator {
                 .awaitStatusCode(LOADER_WAIT_TIMEOUT_SECONDS, TimeUnit.SECONDS);
         } catch (Exception e) {
             cleanup(loaderId, volumeName);
-            throw new RuntimeException("Loader wait failed for " + request.containerName(), e);
+            throw new RuntimeException("Loader wait failed for " + request.containerName() + ": " + e.getMessage(), e);
         } finally {
             try {
                 dockerClient.removeContainerCmd(loaderId).withForce(true).exec();
@@ -159,8 +167,14 @@ public class DockerRuntimeOrchestrator implements RuntimeOrchestrator {
             throw new RuntimeException("Loader exited " + exitCode + " for " + request.containerName());
         }
 
-        // Phase 2: Main container — RO on the shared volume.
-        return createAndStartMain(request, volumeName);
+        // Phase 2: Main container — RO on the shared volume. Wrap in try/catch
+        // so a main-create failure cleans up the volume too (loader already gone).
+        try {
+            return createAndStartMain(request, volumeName);
+        } catch (Exception e) {
+            cleanupVolume(volumeName);
+            throw new RuntimeException("Main container create/start failed for " + request.containerName(), e);
+        }
     }
 
     private String createAndStartLoader(ContainerRequest request, String volumeName) {
Reference in New Issue
Block a user