fix(agent): revive DEAD agents on heartbeat (not just STALE)
Reproduction: pause a container long enough to cross both the stale and dead thresholds, then unpause. The agent resumes sending heartbeats, but the server keeps showing it as DEAD. Only a full container restart (which re-registers the agent) fixes it.

Root cause: AgentRegistryService.heartbeat() only revived STALE → LIVE. A heartbeat from a DEAD agent updated lastHeartbeat but left the state unchanged. checkLifecycle() never moves an agent out of DEAD either (that branch is a no-op), so the agent stayed permanently stuck in DEAD until a register() call.

Fix: extend the revival branch to also cover DEAD. It is the same agent process, and a heartbeat is proof of liveness regardless of the previous state.

Also: AgentLifecycleMonitor.mapTransitionEvent() now emits RECOVERED for DEAD → LIVE, mirroring its behavior for STALE → LIVE, so the lifecycle timeline captures the transition.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -70,7 +70,7 @@ public class AgentLifecycleMonitor {
|
|||||||
private String mapTransitionEvent(AgentState from, AgentState to) {
|
private String mapTransitionEvent(AgentState from, AgentState to) {
|
||||||
if (from == AgentState.LIVE && to == AgentState.STALE) return "WENT_STALE";
|
if (from == AgentState.LIVE && to == AgentState.STALE) return "WENT_STALE";
|
||||||
if (from == AgentState.STALE && to == AgentState.DEAD) return "WENT_DEAD";
|
if (from == AgentState.STALE && to == AgentState.DEAD) return "WENT_DEAD";
|
||||||
if (from == AgentState.STALE && to == AgentState.LIVE) return "RECOVERED";
|
if (to == AgentState.LIVE && (from == AgentState.STALE || from == AgentState.DEAD)) return "RECOVERED";
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -72,7 +72,10 @@ public class AgentRegistryService {
|
|||||||
/**
|
/**
|
||||||
* Process a heartbeat from an agent.
|
* Process a heartbeat from an agent.
|
||||||
* Updates lastHeartbeat, routeIds (if provided), capabilities (if provided),
|
* Updates lastHeartbeat, routeIds (if provided), capabilities (if provided),
|
||||||
* and transitions STALE agents back to LIVE.
|
* and revives STALE or DEAD agents back to LIVE. A DEAD revival happens
|
||||||
|
* when an agent resumes (e.g. docker unpause) after a pause long enough
|
||||||
|
* to cross both the stale and dead thresholds — the process is the same,
|
||||||
|
* no re-registration is required.
|
||||||
*
|
*
|
||||||
* @return true if the agent is known, false otherwise
|
* @return true if the agent is known, false otherwise
|
||||||
*/
|
*/
|
||||||
@@ -86,9 +89,9 @@ public class AgentRegistryService {
|
|||||||
if (capabilities != null && !capabilities.isEmpty()) {
|
if (capabilities != null && !capabilities.isEmpty()) {
|
||||||
result = result.withCapabilities(Map.copyOf(capabilities));
|
result = result.withCapabilities(Map.copyOf(capabilities));
|
||||||
}
|
}
|
||||||
if (existing.state() == AgentState.STALE) {
|
if (existing.state() == AgentState.STALE || existing.state() == AgentState.DEAD) {
|
||||||
result = result.withState(AgentState.LIVE).withStaleTransitionTime(null);
|
result = result.withState(AgentState.LIVE).withStaleTransitionTime(null);
|
||||||
log.info("Agent {} revived from STALE to LIVE via heartbeat", id);
|
log.info("Agent {} revived from {} to LIVE via heartbeat", id, existing.state());
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -116,6 +116,23 @@ class AgentRegistryServiceTest {
|
|||||||
assertThat(registry.findById("agent-1").state()).isEqualTo(AgentState.LIVE);
|
assertThat(registry.findById("agent-1").state()).isEqualTo(AgentState.LIVE);
|
||||||
assertThat(registry.findById("agent-1").staleTransitionTime()).isNull();
|
assertThat(registry.findById("agent-1").staleTransitionTime()).isNull();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void heartbeatDeadAgent_transitionsToLive() {
|
||||||
|
// A paused docker container can exceed both the stale and dead thresholds
|
||||||
|
// before resuming. When heartbeats arrive again the registry must revive
|
||||||
|
// the agent — otherwise the UI shows DEAD forever despite an alive agent.
|
||||||
|
registry.register("agent-1", "agent-1", "group", "default", "1.0.0", List.of(), Map.of());
|
||||||
|
registry.transitionState("agent-1", AgentState.STALE);
|
||||||
|
registry.transitionState("agent-1", AgentState.DEAD);
|
||||||
|
|
||||||
|
assertThat(registry.findById("agent-1").state()).isEqualTo(AgentState.DEAD);
|
||||||
|
|
||||||
|
registry.heartbeat("agent-1");
|
||||||
|
|
||||||
|
assertThat(registry.findById("agent-1").state()).isEqualTo(AgentState.LIVE);
|
||||||
|
assertThat(registry.findById("agent-1").staleTransitionTime()).isNull();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Nested
|
@Nested
|
||||||
|
|||||||
Reference in New Issue
Block a user