diff --git a/.claude/rules/docker-orchestration.md b/.claude/rules/docker-orchestration.md index b63545e0..5de80ebd 100644 --- a/.claude/rules/docker-orchestration.md +++ b/.claude/rules/docker-orchestration.md @@ -41,7 +41,7 @@ When deployed via the cameleer-saas platform, this server orchestrates customer `startContainer` is now a two-phase op per replica: 1. **Volume create** — `cameleer-jars-{containerName}` named volume (per-replica, deterministic so cleanup in `removeContainer` can derive it). -2. **Loader container** — `loaderImage` (default `gitea.siegeln.net/cameleer/cameleer-runtime-loader:latest`), name `{containerName}-loader`, mount the volume **RW at `/app/jars`**, env vars `ARTIFACT_URL` + `ARTIFACT_EXPECTED_SIZE`. Loader downloads the JAR from the signed URL into the volume and exits 0. Orchestrator blocks on `waitContainerCmd().exec(WaitContainerResultCallback).awaitStatusCode(120, SECONDS)`. Loader container is removed in a `finally` block; on non-zero exit the volume is also removed and `RuntimeException` propagates so `DeploymentExecutor` marks the deployment FAILED. +2. **Loader container** — `loaderImage` (default `gitea.siegeln.net/cameleer/cameleer-runtime-loader:latest`), name `{containerName}-loader`, mount the volume **RW at `/app/jars`**, env vars `ARTIFACT_URL` + `ARTIFACT_EXPECTED_SIZE`. Loader downloads the JAR from the signed URL into the volume and exits 0. Orchestrator blocks on `waitContainerCmd().exec(WaitContainerResultCallback).awaitStatusCode(120, SECONDS)`. Loader container is removed in a `finally` block; on non-zero exit the volume is also removed and `RuntimeException` propagates so `DeploymentExecutor` marks the deployment FAILED. **Loader logs are captured before removal** (`captureLoaderLogs` — `logContainerCmd` with `withTail(50)`, capped at 4096 chars, 5s timeout) and appended to the thrown `RuntimeException` message as `". loader output: "`. 
Best-effort: log-capture failures are swallowed and don't mask the original exit. The loader image's Dockerfile pre-creates `/app/jars` owned by `loader:loader` (UID 1000) so the orchestrator's fresh named volume initialises with that ownership — without it the empty volume comes up as `root:root 0755` and wget exits 1 with "Permission denied". `LoaderHardeningIT` is the regression guard. 3. **Main container** — same hardening contract, mount the same volume **RO at `/app/jars`**, entrypoint reads `/app/jars/app.jar` (Spring Boot/Quarkus: `-jar /app/jars/app.jar`; plain Java: `-cp /app/jars/app.jar `; native: `exec /app/jars/app.jar`). `removeContainer(id)` derives the volume name from the inspected container name (Docker prefixes it with `/`) and removes the volume after the container removes — blue/green doesn't leak volumes. diff --git a/cameleer-server-app/src/main/java/com/cameleer/server/app/runtime/DockerRuntimeOrchestrator.java b/cameleer-server-app/src/main/java/com/cameleer/server/app/runtime/DockerRuntimeOrchestrator.java index 6770685c..7e52f9fe 100644 --- a/cameleer-server-app/src/main/java/com/cameleer/server/app/runtime/DockerRuntimeOrchestrator.java +++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/runtime/DockerRuntimeOrchestrator.java @@ -13,12 +13,14 @@ import com.github.dockerjava.api.model.HealthCheck; import com.github.dockerjava.api.model.HostConfig; import com.github.dockerjava.api.model.RestartPolicy; import com.github.dockerjava.api.model.Volume; +import com.github.dockerjava.core.command.LogContainerResultCallback; import com.github.dockerjava.core.command.WaitContainerResultCallback; import jakarta.annotation.PreDestroy; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -57,6 +59,14 @@ public class DockerRuntimeOrchestrator implements RuntimeOrchestrator { * the JAR before 
giving up. */ private static final long LOADER_WAIT_TIMEOUT_SECONDS = 120; + /** Tail size for loader-failure diagnostics surfaced in the thrown + * RuntimeException. wget's BusyBox failure messages are short (e.g. + * "wget: server returned error: HTTP/1.1 401" or "Permission denied"), + * so 50 lines is plenty without bloating exception strings. */ + private static final int LOADER_LOG_TAIL_LINES = 50; + private static final int LOADER_LOG_MAX_CHARS = 4096; + private static final long LOADER_LOG_FETCH_TIMEOUT_SECONDS = 5; + private final DockerClient dockerClient; private final String dockerRuntime; @@ -148,13 +158,19 @@ public class DockerRuntimeOrchestrator implements RuntimeOrchestrator { } int exitCode; + String loaderLogs = ""; try { exitCode = dockerClient.waitContainerCmd(loaderId) .exec(new WaitContainerResultCallback()) .awaitStatusCode(LOADER_WAIT_TIMEOUT_SECONDS, TimeUnit.SECONDS); + if (exitCode != 0) { + loaderLogs = captureLoaderLogs(loaderId); + } } catch (Exception e) { + loaderLogs = captureLoaderLogs(loaderId); cleanup(loaderId, volumeName); - throw new RuntimeException("Loader wait failed for " + request.containerName() + ": " + e.getMessage(), e); + throw new RuntimeException("Loader wait failed for " + request.containerName() + + appendLogs(loaderLogs) + ": " + e.getMessage(), e); } finally { try { dockerClient.removeContainerCmd(loaderId).withForce(true).exec(); @@ -164,7 +180,8 @@ public class DockerRuntimeOrchestrator implements RuntimeOrchestrator { } if (exitCode != 0) { cleanupVolume(volumeName); - throw new RuntimeException("Loader exited " + exitCode + " for " + request.containerName()); + throw new RuntimeException("Loader exited " + exitCode + " for " + request.containerName() + + appendLogs(loaderLogs)); } // Phase 2: Main container — RO on the shared volume. 
Wrap in try/catch
@@ -271,6 +288,67 @@ public class DockerRuntimeOrchestrator implements RuntimeOrchestrator {
         return id;
     }
 
+    /**
+     * Captures the tail of the loader container's stdout/stderr so the failure
+     * exception explains itself instead of just "Loader exited N".
+     *
+     * <p>Best-effort: a docker-side failure to stream logs must not mask the
+     * original loader exit, so problems are logged and whatever output was
+     * received so far (possibly nothing) is returned.
+     *
+     * @param loaderId id of the loader container whose logs are read
+     * @return the trimmed, most recent output, at most {@code LOADER_LOG_MAX_CHARS}
+     *         characters; empty when nothing was captured
+     */
+    private String captureLoaderLogs(String loaderId) {
+        // StringBuffer rather than StringBuilder: frames are delivered asynchronously
+        // and may still arrive while this thread reads the buffer after a timeout.
+        StringBuffer buf = new StringBuffer();
+        // try-with-resources closes the callback so a timed-out stream is not left open.
+        try (LogContainerResultCallback callback = new LogContainerResultCallback() {
+            @Override
+            public void onNext(Frame frame) {
+                buf.append(new String(frame.getPayload(), StandardCharsets.UTF_8));
+                // Keep only the newest characters: the tail is what the cap below retains.
+                int excess = buf.length() - LOADER_LOG_MAX_CHARS;
+                if (excess > 0) {
+                    buf.delete(0, excess);
+                }
+            }
+        }) {
+            boolean completed = dockerClient.logContainerCmd(loaderId)
+                    .withStdOut(true)
+                    .withStdErr(true)
+                    .withTail(LOADER_LOG_TAIL_LINES)
+                    .exec(callback)
+                    .awaitCompletion(LOADER_LOG_FETCH_TIMEOUT_SECONDS, TimeUnit.SECONDS);
+            if (!completed) {
+                log.warn("Timed out capturing loader logs for {}; output may be partial", loaderId);
+            }
+        } catch (InterruptedException e) {
+            // Restore the flag so callers further up can observe the interruption.
+            Thread.currentThread().interrupt();
+            log.warn("Interrupted while capturing loader logs for {}", loaderId);
+        } catch (Exception e) {
+            log.warn("Failed to capture loader logs for {}: {}", loaderId, e.getMessage());
+        }
+        String text = buf.toString().trim();
+        if (text.length() > LOADER_LOG_MAX_CHARS) {
+            text = text.substring(text.length() - LOADER_LOG_MAX_CHARS);
+        }
+        return text;
+    }
+
+    /**
+     * Formats captured loader output as a suffix for an exception message.
+     *
+     * @param logs captured loader output; may be {@code null} or empty
+     * @return {@code ". loader output: " + logs}, or an empty string when there is no output
+     */
+    private static String appendLogs(String logs) {
+        return logs == null || logs.isEmpty() ? "" : ". loader output: " + logs;
+    }
+
     /** Hardening contract from issue #152 — applied uniformly to loader + main.
*/
     private HostConfig baseHardenedHostConfig() {
         return HostConfig.newHostConfig()
diff --git a/cameleer-server-app/src/test/java/com/cameleer/server/app/runtime/LoaderHardeningIT.java b/cameleer-server-app/src/test/java/com/cameleer/server/app/runtime/LoaderHardeningIT.java
new file mode 100644
index 00000000..8d058aac
--- /dev/null
+++ b/cameleer-server-app/src/test/java/com/cameleer/server/app/runtime/LoaderHardeningIT.java
@@ -0,0 +1,171 @@
+package com.cameleer.server.app.runtime;
+
+import com.github.dockerjava.api.DockerClient;
+import com.github.dockerjava.api.model.AccessMode;
+import com.github.dockerjava.api.model.Bind;
+import com.github.dockerjava.api.model.Capability;
+import com.github.dockerjava.api.model.Volume;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.testcontainers.DockerClientFactory;
+import org.testcontainers.containers.BindMode;
+import org.testcontainers.containers.GenericContainer;
+import org.testcontainers.containers.Network;
+import org.testcontainers.containers.startupcheck.OneShotStartupCheckStrategy;
+import org.testcontainers.images.builder.ImageFromDockerfile;
+import org.testcontainers.junit.jupiter.Testcontainers;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.time.Duration;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
+import java.util.stream.Stream;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Real-Docker IT for the cameleer-runtime-loader image. Closes the regression
+ * window where /app/jars wasn't pre-created in the loader image — the
+ * orchestrator's fresh named volume mounted as root:root 0755 and wget
+ * (running as UID 1000) failed with "Permission denied" only at runtime.
+ *
+ * <p>Replicates the exact hardening shape from
+ * {@link DockerRuntimeOrchestrator}'s {@code baseHardenedHostConfig()} +
+ * loader-specific bind, against a real artifact server, and asserts the
+ * loader exits 0. The contents of the volume are not inspected.
+ */
+@Testcontainers
+class LoaderHardeningIT {
+
+    /** Build context of the loader image: sibling module of the working directory. */
+    private static final Path LOADER_DIR = Paths
+            .get(System.getProperty("user.dir"))
+            .getParent()
+            .resolve("cameleer-runtime-loader");
+
+    /** Size of the zero-filled fixture served as {@code artifact.jar}. */
+    private static final int ARTIFACT_BYTES = 1024;
+
+    private DockerClient dockerClient;
+    private Network network;
+    private GenericContainer<?> fileServer;
+    private GenericContainer<?> loader;
+    private Path fixtureDir;
+    private String volumeName;
+    private String loaderImageId;
+
+    /** Starts the nginx artifact server, builds the loader image and creates the target volume. */
+    @BeforeEach
+    void setUp() throws IOException {
+        dockerClient = DockerClientFactory.instance().client();
+        network = Network.newNetwork();
+
+        fixtureDir = Files.createTempDirectory("loader-it-fixture-");
+        // NOTE(review): temp directories are created owner-only on POSIX hosts; open the
+        // directory up so a non-root server process in the container can presumably read it.
+        fixtureDir.toFile().setReadable(true, false);
+        fixtureDir.toFile().setExecutable(true, false);
+        Path payload = fixtureDir.resolve("artifact.jar");
+        Files.write(payload, new byte[ARTIFACT_BYTES]);
+
+        fileServer = new GenericContainer<>("nginx:alpine")
+                .withNetwork(network)
+                .withNetworkAliases("file-server")
+                .withFileSystemBind(
+                        fixtureDir.toAbsolutePath().toString(),
+                        "/usr/share/nginx/html",
+                        BindMode.READ_ONLY);
+        fileServer.start();
+
+        loaderImageId = new ImageFromDockerfile()
+                .withFileFromPath(".", LOADER_DIR)
+                .get();
+
+        volumeName = "cameleer-loader-it-" + UUID.randomUUID().toString().substring(0, 8);
+        dockerClient.createVolumeCmd().withName(volumeName).exec();
+    }
+
+    /**
+     * Releases everything {@link #setUp()} and the test created. Loader and volume
+     * removal are best-effort; the remaining steps are chained with {@code finally}
+     * so one failing step cannot skip the ones after it.
+     */
+    @AfterEach
+    void tearDown() throws IOException {
+        if (loader != null) {
+            try { loader.stop(); } catch (Exception ignored) { }
+        }
+        if (volumeName != null) {
+            try { dockerClient.removeVolumeCmd(volumeName).exec(); } catch (Exception ignored) { }
+        }
+        try {
+            if (fileServer != null) {
+                fileServer.stop();
+            }
+        } finally {
+            try {
+                if (network != null) {
+                    network.close();
+                }
+            } finally {
+                deleteFixtureDir();
+            }
+        }
+    }
+
+    /** Deletes the fixture directory tree, children before parents; individual failures are ignored. */
+    private void deleteFixtureDir() throws IOException {
+        if (fixtureDir == null) {
+            return;
+        }
+        // Files.walk holds open directory handles until the stream is closed.
+        try (Stream<Path> paths = Files.walk(fixtureDir)) {
+            paths.sorted(Comparator.reverseOrder())
+                    .forEach(p -> {
+                        try {
+                            Files.deleteIfExists(p);
+                        } catch (IOException ignored) {
+                            // best-effort cleanup of a temp directory
+                        }
+                    });
+        }
+    }
+
+    @Test
+    void loaderWritesArtifactUnderHardenedContract() {
+        // OneShotStartupCheckStrategy succeeds only when the container has
+        // exited with status 0. Anything else (non-zero exit, timeout) throws
+        // ContainerLaunchException — the assertion below is a belt-and-braces
+        // explicit check on the resolved exit code.
+        loader = new GenericContainer<>(loaderImageId)
+                .withNetwork(network)
+                .withEnv("ARTIFACT_URL", "http://file-server/artifact.jar")
+                .withEnv("ARTIFACT_EXPECTED_SIZE", String.valueOf(ARTIFACT_BYTES))
+                .withCreateContainerCmdModifier(cmd -> cmd.getHostConfig()
+                        .withCapDrop(Capability.values())
+                        .withSecurityOpts(List.of("no-new-privileges:true", "apparmor=docker-default"))
+                        .withReadonlyRootfs(true)
+                        .withPidsLimit(512L)
+                        .withTmpFs(Map.of("/tmp", "rw,nosuid,size=64m"))
+                        .withBinds(new Bind(volumeName, new Volume("/app/jars"), AccessMode.rw)))
+                .withStartupCheckStrategy(new OneShotStartupCheckStrategy()
+                        .withTimeout(Duration.ofSeconds(60)));
+
+        loader.start();
+
+        Long exit = dockerClient.inspectContainerCmd(loader.getContainerId())
+                .exec()
+                .getState()
+                .getExitCodeLong();
+        assertThat(exit)
+                .as("loader must exit 0 — non-zero indicates the hardening contract "
+                        + "broke the artifact write (e.g. /app/jars not owned by loader UID 1000)")
+                .isZero();
+    }
+}