From fb54f9cbd2a4c060ca59f4a381f964522f443b1d Mon Sep 17 00:00:00 2001
From: hsiegeln <37154749+hsiegeln@users.noreply.github.com>
Date: Tue, 21 Apr 2026 20:55:47 +0200
Subject: [PATCH] fix(agent): revive DEAD agents on heartbeat (not just STALE)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reproduction: pause a container long enough to cross both the stale and
dead thresholds, then unpause. The agent resumes sending heartbeats but
the server keeps it shown as DEAD. Only a full container restart (which
re-registers) fixes it.

Root cause: AgentRegistryService.heartbeat() only revived STALE → LIVE.
A DEAD agent's heartbeat updated lastHeartbeat but left state unchanged.
checkLifecycle() never downgrades DEAD either (no-op in that branch), so
the agent was permanently stuck in DEAD until a register() call.

Fix: extend the revival branch to also cover DEAD. Same process; a
heartbeat is proof of liveness regardless of the previous state.

Also: AgentLifecycleMonitor.mapTransitionEvent() now emits RECOVERED for
DEAD → LIVE, mirroring its behavior for STALE → LIVE, so the lifecycle
timeline captures the transition.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../server/app/agent/AgentLifecycleMonitor.java | 2 +-
 .../server/core/agent/AgentRegistryService.java | 9 ++++++---
 .../core/agent/AgentRegistryServiceTest.java | 17 +++++++++++++++++
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/cameleer-server-app/src/main/java/com/cameleer/server/app/agent/AgentLifecycleMonitor.java b/cameleer-server-app/src/main/java/com/cameleer/server/app/agent/AgentLifecycleMonitor.java
index 6610fa43..f1893e8a 100644
--- a/cameleer-server-app/src/main/java/com/cameleer/server/app/agent/AgentLifecycleMonitor.java
+++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/agent/AgentLifecycleMonitor.java
@@ -70,7 +70,7 @@ public class AgentLifecycleMonitor {
     private String mapTransitionEvent(AgentState from, AgentState to) {
         if (from == AgentState.LIVE && to == AgentState.STALE) return "WENT_STALE";
         if (from == AgentState.STALE && to == AgentState.DEAD) return "WENT_DEAD";
-        if (from == AgentState.STALE && to == AgentState.LIVE) return "RECOVERED";
+        if (to == AgentState.LIVE && (from == AgentState.STALE || from == AgentState.DEAD)) return "RECOVERED";
         return null;
     }
 }
diff --git a/cameleer-server-core/src/main/java/com/cameleer/server/core/agent/AgentRegistryService.java b/cameleer-server-core/src/main/java/com/cameleer/server/core/agent/AgentRegistryService.java
index 88173dfc..014e503d 100644
--- a/cameleer-server-core/src/main/java/com/cameleer/server/core/agent/AgentRegistryService.java
+++ b/cameleer-server-core/src/main/java/com/cameleer/server/core/agent/AgentRegistryService.java
@@ -72,7 +72,10 @@ public class AgentRegistryService {
     /**
      * Process a heartbeat from an agent.
      * Updates lastHeartbeat, routeIds (if provided), capabilities (if provided),
-     * and transitions STALE agents back to LIVE.
+     * and revives STALE or DEAD agents back to LIVE. A DEAD revival happens
+     * when an agent resumes (e.g. docker unpause) after a pause long enough
+     * to cross both the stale and dead thresholds — the process is the same,
+     * no re-registration is required.
      *
      * @return true if the agent is known, false otherwise
      */
@@ -86,9 +89,9 @@ public class AgentRegistryService {
             if (capabilities != null && !capabilities.isEmpty()) {
                 result = result.withCapabilities(Map.copyOf(capabilities));
             }
-            if (existing.state() == AgentState.STALE) {
+            if (existing.state() == AgentState.STALE || existing.state() == AgentState.DEAD) {
                 result = result.withState(AgentState.LIVE).withStaleTransitionTime(null);
-                log.info("Agent {} revived from STALE to LIVE via heartbeat", id);
+                log.info("Agent {} revived from {} to LIVE via heartbeat", id, existing.state());
             }
             return result;
         });
diff --git a/cameleer-server-core/src/test/java/com/cameleer/server/core/agent/AgentRegistryServiceTest.java b/cameleer-server-core/src/test/java/com/cameleer/server/core/agent/AgentRegistryServiceTest.java
index ad96894e..af0f5173 100644
--- a/cameleer-server-core/src/test/java/com/cameleer/server/core/agent/AgentRegistryServiceTest.java
+++ b/cameleer-server-core/src/test/java/com/cameleer/server/core/agent/AgentRegistryServiceTest.java
@@ -116,6 +116,23 @@ class AgentRegistryServiceTest {
             assertThat(registry.findById("agent-1").state()).isEqualTo(AgentState.LIVE);
             assertThat(registry.findById("agent-1").staleTransitionTime()).isNull();
         }
+
+        @Test
+        void heartbeatDeadAgent_transitionsToLive() {
+            // A paused docker container can exceed both the stale and dead thresholds
+            // before resuming. When heartbeats arrive again the registry must revive
+            // the agent — otherwise the UI shows DEAD forever despite an alive agent.
+            registry.register("agent-1", "agent-1", "group", "default", "1.0.0", List.of(), Map.of());
+            registry.transitionState("agent-1", AgentState.STALE);
+            registry.transitionState("agent-1", AgentState.DEAD);
+
+            assertThat(registry.findById("agent-1").state()).isEqualTo(AgentState.DEAD);
+
+            registry.heartbeat("agent-1");
+
+            assertThat(registry.findById("agent-1").state()).isEqualTo(AgentState.LIVE);
+            assertThat(registry.findById("agent-1").staleTransitionTime()).isNull();
+        }
     }

     @Nested