diff --git a/.planning/sse-flakiness-diagnosis.md b/.planning/sse-flakiness-diagnosis.md new file mode 100644 index 00000000..a232a81b --- /dev/null +++ b/.planning/sse-flakiness-diagnosis.md @@ -0,0 +1,81 @@ +# SSE Flakiness — Root-Cause Analysis + +**Date:** 2026-04-21 +**Tests:** `AgentSseControllerIT.sseConnect_unknownAgent_returns404`, `.lastEventIdHeader_connectionSucceeds`, `.pingKeepalive_receivedViaSseStream`, `SseSigningIT.deepTraceEvent_containsValidSignature` + +## Summary + +Not order-dependent flakiness (triage report was wrong). Three distinct root causes, one production bug and one test-infrastructure issue, all reproducible when running the classes in isolation. + +## Reproduction + +```bash +mvn -pl cameleer-server-app -am -Dit.test='AgentSseControllerIT' -Dtest='!*' \ + -DfailIfNoTests=false -Dsurefire.failIfNoSpecifiedTests=false verify +``` + +Result: 3 failures out of 7 tests with a cold CH container. Not order-dependent. + +## Root causes + +### 1. `AgentSseController.events` auto-heal is over-permissive (security bug) + +**File:** `cameleer-server-app/src/main/java/com/cameleer/server/app/controller/AgentSseController.java:63-76` + +```java +AgentInfo agent = registryService.findById(id); +if (agent == null) { + var jwtResult = ...; + if (jwtResult != null) { // ← only checks JWT presence + registryService.register(id, id, application, env, ...); + } else { + throw 404; + } +} +``` + +**Bug:** auto-heal registers *any* path id when any valid JWT is present, regardless of whether the JWT subject matches the path id. A holder of agent X's JWT can open SSE for any path-id Y, silently spoofing Y. + +**Surface symptom:** `sseConnect_unknownAgent_returns404` sends a JWT for `test-agent-sse-it` and requests SSE for `unknown-sse-agent`. Auto-heal kicks in, returns 200 with an infinite empty stream. Test's `statusFuture.get(5s)` — which uses `BodyHandlers.ofString()` and waits for the full body — times out instead of getting a synchronous 404. + +**Fix:** only auto-heal when `jwtResult.subject().equals(id)`. + +### 2. `SseConnectionManager.pingAll` reads an unprefixed property key (production bug) + +**File:** `cameleer-server-app/src/main/java/com/cameleer/server/app/agent/SseConnectionManager.java:172` + +```java +@Scheduled(fixedDelayString = "${agent-registry.ping-interval-ms:15000}") +``` + +**Bug:** `AgentRegistryConfig` is `@ConfigurationProperties(prefix = "cameleer.server.agentregistry")`. The scheduler reads an unprefixed `agent-registry.*` key that the YAML never defines — so the default 15s always applies, regardless of config. Same family of bug as the `MetricsFlushScheduler` fix in commit `a6944911`. + +**Fix:** `${cameleer.server.agentregistry.ping-interval-ms:15000}`. + +### 3. SSE response body doesn't flush until first event (test timing dependency) + +**File:** `cameleer-server-app/src/main/java/com/cameleer/server/app/agent/SseConnectionManager.java:connect()` + +Spring's `SseEmitter` holds the response open but doesn't flush headers to the client until the first `emitter.send()`. Until then, clients using `HttpResponse.BodyHandlers.ofInputStream()` block on the first byte. + +**Surface symptom:** +- `lastEventIdHeader_connectionSucceeds` — asserts `awaitConnection(5000)` is `true`. The latch counts down in `.thenAccept(response -> ...)`, which in practice only fires once body bytes start flowing (JDK 21 behaviour with SSE streams). Default ping cadence is 15s → 5s assertion times out. +- `pingKeepalive_receivedViaSseStream` — waits 5s for a `:ping` line. The scheduler runs every 15s (both by default, and because of bug #2, unconditionally). +- `SseSigningIT.deepTraceEvent_containsValidSignature` — same family: `awaitConnection(5000).isTrue()`. + +**Fix:** send an initial `: connected` comment as part of `connect()`. Spring flushes on the first `.send()`, so an immediate comment forces the response headers + first byte to hit the wire, which triggers the client's `thenAccept` callback. Also solves the ping-test: the initial comment is observed as a keepalive line within the test's polling window. + +## Hypothesis ladder (ruled out) + +- **Order-dependent singleton leak** — ruled out: every failure reproduces when the class is run solo. +- **Tomcat async thread pool exhaustion** — ruled out: `SseEmitter(Long.MAX_VALUE)` does hold threads, but the 7-test class doesn't reach Tomcat's defaults. +- **SseConnectionManager emitter-map contamination** — ruled out: each test uses a unique agent id (UUID-suffixed), and the `@Component` is the same instance across tests but the emitter map is keyed by agent id, no collisions. + +## Verification + +``` +mvn -pl cameleer-server-app -am -Dit.test='AgentSseControllerIT,SseSigningIT' ... verify +# Tests run: 9, Failures: 0, Errors: 0, Skipped: 0 +``` + +All 9 tests green with the three fixes applied. diff --git a/cameleer-server-app/src/main/java/com/cameleer/server/app/agent/SseConnectionManager.java b/cameleer-server-app/src/main/java/com/cameleer/server/app/agent/SseConnectionManager.java index ff38dd2c..f074af82 100644 --- a/cameleer-server-app/src/main/java/com/cameleer/server/app/agent/SseConnectionManager.java +++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/agent/SseConnectionManager.java @@ -80,6 +80,17 @@ public class SseConnectionManager implements AgentEventListener { log.debug("SSE connection error for agent {}: {}", agentId, ex.getMessage()); }); + // Send an initial keepalive comment so Spring flushes the response + // headers immediately. Without this, clients blocking on the first + // body byte can hang for a full ping interval before observing the + // connection — surface symptom in ITs that assert awaitConnection(). + try { + emitter.send(SseEmitter.event().comment("connected")); + } catch (IOException e) { + log.debug("Initial keepalive failed for agent {}: {}", agentId, e.getMessage()); + emitters.remove(agentId, emitter); + } + log.info("SSE connection established for agent {}", agentId); return emitter; @@ -169,7 +180,7 @@ public class SseConnectionManager implements AgentEventListener { /** * Scheduled ping keepalive to all connected agents. */ - @Scheduled(fixedDelayString = "${agent-registry.ping-interval-ms:15000}") + @Scheduled(fixedDelayString = "${cameleer.server.agentregistry.ping-interval-ms:15000}") void pingAll() { if (!emitters.isEmpty()) { sendPingToAll(); diff --git a/cameleer-server-app/src/main/java/com/cameleer/server/app/controller/AgentSseController.java b/cameleer-server-app/src/main/java/com/cameleer/server/app/controller/AgentSseController.java index d92c4fe7..f6a8e185 100644 --- a/cameleer-server-app/src/main/java/com/cameleer/server/app/controller/AgentSseController.java +++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/controller/AgentSseController.java @@ -62,10 +62,13 @@ public class AgentSseController { AgentInfo agent = registryService.findById(id); if (agent == null) { - // Auto-heal: re-register agent from JWT claims after server restart + // Auto-heal re-registers an agent from JWT claims after a server + // restart, but only when the JWT subject matches the path id. + // Otherwise a holder of any valid agent JWT could spoof an + // arbitrary agentId in the URL. var jwtResult = (JwtService.JwtValidationResult) httpRequest.getAttribute( JwtAuthenticationFilter.JWT_RESULT_ATTR); - if (jwtResult != null) { + if (jwtResult != null && id.equals(jwtResult.subject())) { String application = jwtResult.application() != null ? jwtResult.application() : "default"; String env = jwtResult.environment() != null ? jwtResult.environment() : "default"; registryService.register(id, id, application, env, "unknown", List.of(), Map.of());