feat(runtime): capture loader logs in failure exceptions; add LoaderHardeningIT regression guard
Two diagnostics-and-confidence follow-ups to the loader-init-container pattern. 1) DockerRuntimeOrchestrator now captures the loader's last 50 lines of stdout/stderr (capped at 4096 chars, 5s timeout) before the finally-remove and appends them to the thrown RuntimeException as `. loader output: <text>`. Best-effort: log-capture failures are swallowed and never mask the original exit. Closes the visibility gap that turned a simple "wget: Permission denied" into the opaque "Loader exited 1". 2) New LoaderHardeningIT spins up a Testcontainers nginx serving a 1KB fixture, builds the loader image fresh from cameleer-runtime-loader/, and runs it under the exact baseHardenedHostConfig() shape (cap_drop ALL, readonly rootfs, /tmp tmpfs, no-new-privileges, apparmor=docker-default, pids=512) bound to a fresh named volume RW at /app/jars. Asserts exit 0. This would have caught the volume-permission regression in CI. GenericContainer + OneShotStartupCheckStrategy is used instead of raw docker-java waitContainerCmd because docker-java's unshaded api version in this project's pom and testcontainers' shaded copy disagree on WaitContainerCmd.getCondition() — going through GenericContainer keeps the call inside testcontainers' shaded executor. Rules doc updated to point at the captured-output behaviour and the IT. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -41,7 +41,7 @@ When deployed via the cameleer-saas platform, this server orchestrates customer
|
||||
`startContainer` is now a two-phase op per replica:
|
||||
|
||||
1. **Volume create** — `cameleer-jars-{containerName}` named volume (per-replica, deterministic so cleanup in `removeContainer` can derive it).
|
||||
2. **Loader container** — `loaderImage` (default `gitea.siegeln.net/cameleer/cameleer-runtime-loader:latest`), name `{containerName}-loader`, mount the volume **RW at `/app/jars`**, env vars `ARTIFACT_URL` + `ARTIFACT_EXPECTED_SIZE`. Loader downloads the JAR from the signed URL into the volume and exits 0. Orchestrator blocks on `waitContainerCmd().exec(WaitContainerResultCallback).awaitStatusCode(120, SECONDS)`. Loader container is removed in a `finally` block; on non-zero exit the volume is also removed and `RuntimeException` propagates so `DeploymentExecutor` marks the deployment FAILED.
|
||||
2. **Loader container** — `loaderImage` (default `gitea.siegeln.net/cameleer/cameleer-runtime-loader:latest`), name `{containerName}-loader`, mount the volume **RW at `/app/jars`**, env vars `ARTIFACT_URL` + `ARTIFACT_EXPECTED_SIZE`. Loader downloads the JAR from the signed URL into the volume and exits 0. Orchestrator blocks on `waitContainerCmd().exec(WaitContainerResultCallback).awaitStatusCode(120, SECONDS)`. Loader container is removed in a `finally` block; on non-zero exit the volume is also removed and `RuntimeException` propagates so `DeploymentExecutor` marks the deployment FAILED. **Loader logs are captured before removal** (`captureLoaderLogs` — `logContainerCmd` with `withTail(50)`, capped at 4096 chars, 5s timeout) and appended to the thrown `RuntimeException` message as `". loader output: <text>"`. Best-effort: log-capture failures are swallowed and don't mask the original exit. The loader image's Dockerfile pre-creates `/app/jars` owned by `loader:loader` (UID 1000) so the orchestrator's fresh named volume initialises with that ownership — without it the empty volume comes up as `root:root 0755` and wget exits 1 with "Permission denied". `LoaderHardeningIT` is the regression guard.
|
||||
3. **Main container** — same hardening contract, mount the same volume **RO at `/app/jars`**, entrypoint reads `/app/jars/app.jar` (Spring Boot/Quarkus: `-jar /app/jars/app.jar`; plain Java: `-cp /app/jars/app.jar <MainClass>`; native: `exec /app/jars/app.jar`).
|
||||
|
||||
`removeContainer(id)` derives the volume name from the inspected container name (Docker prefixes it with `/`) and removes the volume after the container is removed — blue/green doesn't leak volumes.
|
||||
|
||||
@@ -13,12 +13,14 @@ import com.github.dockerjava.api.model.HealthCheck;
|
||||
import com.github.dockerjava.api.model.HostConfig;
|
||||
import com.github.dockerjava.api.model.RestartPolicy;
|
||||
import com.github.dockerjava.api.model.Volume;
|
||||
import com.github.dockerjava.core.command.LogContainerResultCallback;
|
||||
import com.github.dockerjava.core.command.WaitContainerResultCallback;
|
||||
import jakarta.annotation.PreDestroy;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@@ -57,6 +59,14 @@ public class DockerRuntimeOrchestrator implements RuntimeOrchestrator {
|
||||
* the JAR before giving up. */
|
||||
private static final long LOADER_WAIT_TIMEOUT_SECONDS = 120;
|
||||
|
||||
/** Tail size for loader-failure diagnostics surfaced in the thrown
|
||||
* RuntimeException. wget's BusyBox failure messages are short (e.g.
|
||||
* "wget: server returned error: HTTP/1.1 401" or "Permission denied"),
|
||||
* so 50 lines is plenty without bloating exception strings. */
|
||||
private static final int LOADER_LOG_TAIL_LINES = 50;
|
||||
private static final int LOADER_LOG_MAX_CHARS = 4096;
|
||||
private static final long LOADER_LOG_FETCH_TIMEOUT_SECONDS = 5;
|
||||
|
||||
private final DockerClient dockerClient;
|
||||
private final String dockerRuntime;
|
||||
|
||||
@@ -148,13 +158,19 @@ public class DockerRuntimeOrchestrator implements RuntimeOrchestrator {
|
||||
}
|
||||
|
||||
int exitCode;
|
||||
String loaderLogs = "";
|
||||
try {
|
||||
exitCode = dockerClient.waitContainerCmd(loaderId)
|
||||
.exec(new WaitContainerResultCallback())
|
||||
.awaitStatusCode(LOADER_WAIT_TIMEOUT_SECONDS, TimeUnit.SECONDS);
|
||||
if (exitCode != 0) {
|
||||
loaderLogs = captureLoaderLogs(loaderId);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
loaderLogs = captureLoaderLogs(loaderId);
|
||||
cleanup(loaderId, volumeName);
|
||||
throw new RuntimeException("Loader wait failed for " + request.containerName() + ": " + e.getMessage(), e);
|
||||
throw new RuntimeException("Loader wait failed for " + request.containerName()
|
||||
+ appendLogs(loaderLogs) + ": " + e.getMessage(), e);
|
||||
} finally {
|
||||
try {
|
||||
dockerClient.removeContainerCmd(loaderId).withForce(true).exec();
|
||||
@@ -164,7 +180,8 @@ public class DockerRuntimeOrchestrator implements RuntimeOrchestrator {
|
||||
}
|
||||
if (exitCode != 0) {
|
||||
cleanupVolume(volumeName);
|
||||
throw new RuntimeException("Loader exited " + exitCode + " for " + request.containerName());
|
||||
throw new RuntimeException("Loader exited " + exitCode + " for " + request.containerName()
|
||||
+ appendLogs(loaderLogs));
|
||||
}
|
||||
|
||||
// Phase 2: Main container — RO on the shared volume. Wrap in try/catch
|
||||
@@ -271,6 +288,40 @@ public class DockerRuntimeOrchestrator implements RuntimeOrchestrator {
|
||||
return id;
|
||||
}
|
||||
|
||||
/** Capture the loader's last stdout/stderr so the failure exception explains
|
||||
* itself instead of just "Loader exited N". Best-effort: a docker-side
|
||||
* failure to stream logs must not mask the original loader exit. */
|
||||
private String captureLoaderLogs(String loaderId) {
|
||||
try {
|
||||
StringBuilder buf = new StringBuilder();
|
||||
dockerClient.logContainerCmd(loaderId)
|
||||
.withStdOut(true)
|
||||
.withStdErr(true)
|
||||
.withTail(LOADER_LOG_TAIL_LINES)
|
||||
.exec(new LogContainerResultCallback() {
|
||||
@Override
|
||||
public void onNext(Frame frame) {
|
||||
if (buf.length() < LOADER_LOG_MAX_CHARS) {
|
||||
buf.append(new String(frame.getPayload(), StandardCharsets.UTF_8));
|
||||
}
|
||||
}
|
||||
})
|
||||
.awaitCompletion(LOADER_LOG_FETCH_TIMEOUT_SECONDS, TimeUnit.SECONDS);
|
||||
String text = buf.toString().trim();
|
||||
if (text.length() > LOADER_LOG_MAX_CHARS) {
|
||||
text = text.substring(text.length() - LOADER_LOG_MAX_CHARS);
|
||||
}
|
||||
return text;
|
||||
} catch (Exception e) {
|
||||
log.warn("Failed to capture loader logs for {}: {}", loaderId, e.getMessage());
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
private static String appendLogs(String logs) {
|
||||
return logs == null || logs.isEmpty() ? "" : ". loader output: " + logs;
|
||||
}
|
||||
|
||||
/** Hardening contract from issue #152 — applied uniformly to loader + main. */
|
||||
private HostConfig baseHardenedHostConfig() {
|
||||
return HostConfig.newHostConfig()
|
||||
|
||||
@@ -0,0 +1,135 @@
|
||||
package com.cameleer.server.app.runtime;
|
||||
|
||||
import com.github.dockerjava.api.DockerClient;
|
||||
import com.github.dockerjava.api.model.AccessMode;
|
||||
import com.github.dockerjava.api.model.Bind;
|
||||
import com.github.dockerjava.api.model.Capability;
|
||||
import com.github.dockerjava.api.model.Volume;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.testcontainers.DockerClientFactory;
|
||||
import org.testcontainers.containers.BindMode;
|
||||
import org.testcontainers.containers.GenericContainer;
|
||||
import org.testcontainers.containers.Network;
|
||||
import org.testcontainers.containers.startupcheck.OneShotStartupCheckStrategy;
|
||||
import org.testcontainers.images.builder.ImageFromDockerfile;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.time.Duration;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
/**
|
||||
* Real-Docker IT for the cameleer-runtime-loader image. Closes the regression
|
||||
* window where /app/jars wasn't pre-created in the loader image — the
|
||||
* orchestrator's fresh named volume mounted as root:root 0755 and wget
|
||||
* (running as UID 1000) failed with "Permission denied" only at runtime.
|
||||
*
|
||||
* <p>Replicates the exact hardening shape from
|
||||
* {@link DockerRuntimeOrchestrator}'s {@code baseHardenedHostConfig()} +
|
||||
* loader-specific bind, against a real artifact server, and asserts the
|
||||
* loader writes the expected file.
|
||||
*/
|
||||
@Testcontainers
|
||||
class LoaderHardeningIT {
|
||||
|
||||
private static final Path LOADER_DIR = Paths
|
||||
.get(System.getProperty("user.dir"))
|
||||
.getParent()
|
||||
.resolve("cameleer-runtime-loader");
|
||||
|
||||
private static final int ARTIFACT_BYTES = 1024;
|
||||
|
||||
private DockerClient dockerClient;
|
||||
private Network network;
|
||||
private GenericContainer<?> fileServer;
|
||||
private GenericContainer<?> loader;
|
||||
private Path fixtureDir;
|
||||
private String volumeName;
|
||||
private String loaderImageId;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws IOException {
|
||||
dockerClient = DockerClientFactory.instance().client();
|
||||
network = Network.newNetwork();
|
||||
|
||||
fixtureDir = Files.createTempDirectory("loader-it-fixture-");
|
||||
Path payload = fixtureDir.resolve("artifact.jar");
|
||||
Files.write(payload, new byte[ARTIFACT_BYTES]);
|
||||
|
||||
fileServer = new GenericContainer<>("nginx:alpine")
|
||||
.withNetwork(network)
|
||||
.withNetworkAliases("file-server")
|
||||
.withFileSystemBind(
|
||||
fixtureDir.toAbsolutePath().toString(),
|
||||
"/usr/share/nginx/html",
|
||||
BindMode.READ_ONLY);
|
||||
fileServer.start();
|
||||
|
||||
loaderImageId = new ImageFromDockerfile()
|
||||
.withFileFromPath(".", LOADER_DIR)
|
||||
.get();
|
||||
|
||||
volumeName = "cameleer-loader-it-" + UUID.randomUUID().toString().substring(0, 8);
|
||||
dockerClient.createVolumeCmd().withName(volumeName).exec();
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
void tearDown() throws IOException {
|
||||
if (loader != null) {
|
||||
try { loader.stop(); } catch (Exception ignored) { }
|
||||
}
|
||||
if (volumeName != null) {
|
||||
try { dockerClient.removeVolumeCmd(volumeName).exec(); } catch (Exception ignored) { }
|
||||
}
|
||||
if (fileServer != null) fileServer.stop();
|
||||
if (network != null) network.close();
|
||||
if (fixtureDir != null) {
|
||||
Files.walk(fixtureDir)
|
||||
.sorted((a, b) -> b.getNameCount() - a.getNameCount())
|
||||
.forEach(p -> {
|
||||
try { Files.deleteIfExists(p); } catch (IOException ignored) { }
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void loaderWritesArtifactUnderHardenedContract() {
|
||||
// OneShotStartupCheckStrategy succeeds only when the container has
|
||||
// exited with status 0. Anything else (non-zero exit, timeout) throws
|
||||
// ContainerLaunchException — the assertion below is a belt-and-braces
|
||||
// explicit check on the resolved exit code.
|
||||
loader = new GenericContainer<>(loaderImageId)
|
||||
.withNetwork(network)
|
||||
.withEnv("ARTIFACT_URL", "http://file-server/artifact.jar")
|
||||
.withEnv("ARTIFACT_EXPECTED_SIZE", String.valueOf(ARTIFACT_BYTES))
|
||||
.withCreateContainerCmdModifier(cmd -> cmd.getHostConfig()
|
||||
.withCapDrop(Capability.values())
|
||||
.withSecurityOpts(List.of("no-new-privileges:true", "apparmor=docker-default"))
|
||||
.withReadonlyRootfs(true)
|
||||
.withPidsLimit(512L)
|
||||
.withTmpFs(Map.of("/tmp", "rw,nosuid,size=64m"))
|
||||
.withBinds(new Bind(volumeName, new Volume("/app/jars"), AccessMode.rw)))
|
||||
.withStartupCheckStrategy(new OneShotStartupCheckStrategy()
|
||||
.withTimeout(Duration.ofSeconds(60)));
|
||||
|
||||
loader.start();
|
||||
|
||||
Long exit = dockerClient.inspectContainerCmd(loader.getContainerId())
|
||||
.exec()
|
||||
.getState()
|
||||
.getExitCodeLong();
|
||||
assertThat(exit)
|
||||
.as("loader must exit 0 — non-zero indicates the hardening contract "
|
||||
+ "broke the artifact write (e.g. /app/jars not owned by loader UID 1000)")
|
||||
.isZero();
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user