feat(alerting): AGENT_LIFECYCLE condition kind with per-subject fire mode

Allows alert rules to fire on agent-lifecycle events — REGISTERED,
RE_REGISTERED, DEREGISTERED, WENT_STALE, WENT_DEAD, RECOVERED — rather
than only on current state. Each matching `(agent, eventType, timestamp)`
becomes its own ackable AlertInstance, so outages on distinct agents are
independently routable.

Core:
- New `ConditionKind.AGENT_LIFECYCLE` + `AgentLifecycleCondition` record
  (scope, eventTypes, withinSeconds). Compact ctor rejects empty
  eventTypes and withinSeconds<1.
- Strict allowlist enum `AgentLifecycleEventType` (six entries matching
  the server-emitted types in `AgentRegistrationController` and
  `AgentLifecycleMonitor`). Custom agent-emitted event types tracked in
  backlog issue #145.
- `AgentEventRepository.findInWindow(env, appSlug, agentId, eventTypes,
  from, to, limit)` — new read path ordered `(timestamp ASC, insert_id
  ASC)` used by the evaluator. Implemented on
  `ClickHouseAgentEventRepository` with tenant + env filter mandatory.

App:
- `AgentLifecycleEvaluator` queries events in the last `withinSeconds`
  window and returns `EvalResult.Batch` with one `Firing` per row.
  Every Firing carries a canonical `_subjectFingerprint` of
  `"<agentId>:<eventType>:<tsMillis>"` in context plus `agent` / `event`
  subtrees for Mustache templating.
- `NotificationContextBuilder` gains an `AGENT_LIFECYCLE` branch that
  exposes `{{agent.id}}`, `{{agent.app}}`, `{{event.type}}`,
  `{{event.timestamp}}`, `{{event.detail}}`.
- Validation is delegated to the record compact ctor + enum at Jackson
  deserialization time — matches the existing policy of keeping
  controller validators focused on env-scoped / SQL-injection concerns.

Schema:
- V16 migration generalises the V15 per-exchange discriminator on
  `alert_instances_open_rule_uq` to prefer `_subjectFingerprint` with a
  fallback to the legacy `exchange.id` expression. Scalar kinds still
  resolve to `''` and keep one-open-per-rule. Duplicate-key path in
  `PostgresAlertInstanceRepository.save` is unchanged — the index is
  the deduper.

UI:
- New `AgentLifecycleForm.tsx` wizard form with multi-select chips for
  the six allowed event types + `withinSeconds` input. Wired into
  `ConditionStep`, `form-state` (validation + defaults: WENT_DEAD,
  300 s), and `enums.ts` options. Tests in `enums.test.ts` pin the
  new option array.
- `alert-variables.ts` registers `{{agent.app}}`, `{{event.type}}`,
  `{{event.timestamp}}`, `{{event.detail}}` leaves for the new kind,
  and extends `agent.id`'s availability list to include `AGENT_LIFECYCLE`.

Tests (all passing):
- 5 new JSON-roundtrip cases on `AlertConditionJsonTest` (positive +
  empty/zero/unknown-type rejection).
- 5 new evaluator unit tests on `AgentLifecycleEvaluatorTest` (kind,
  empty window, multi-agent fingerprint shape, scope forwarding,
  missing env).
- `NotificationContextBuilderTest` switch now covers the new kind.
- 119 alerting unit tests + 71 UI tests green.

Docs: `.claude/rules/{core,app,ui}` and CLAUDE.md migration list updated.
This commit is contained in:
hsiegeln
2026-04-21 14:52:08 +02:00
parent 23d02ba6a0
commit 414f7204bf
24 changed files with 601 additions and 20 deletions

View File

@@ -0,0 +1,130 @@
package com.cameleer.server.app.alerting.eval;
import com.cameleer.server.core.agent.AgentEventRecord;
import com.cameleer.server.core.agent.AgentEventRepository;
import com.cameleer.server.core.alerting.*;
import com.cameleer.server.core.runtime.Environment;
import com.cameleer.server.core.runtime.EnvironmentRepository;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.time.Instant;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyInt;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
/**
 * Unit tests for {@link AgentLifecycleEvaluator}.
 *
 * <p>Both collaborators ({@link AgentEventRepository} and
 * {@link EnvironmentRepository}) are Mockito mocks, so these tests only pin
 * how the evaluator maps repository rows to {@link EvalResult}s and which
 * arguments it hands to the repository.
 */
class AgentLifecycleEvaluatorTest {

    private static final UUID ENV_ID = UUID.fromString("bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb");
    private static final UUID RULE_ID = UUID.fromString("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa");
    private static final String ENV_SLUG = "prod";
    /** Fixed evaluation instant handed to the evaluator through {@link #ctx()}. */
    private static final Instant NOW = Instant.parse("2026-04-19T10:00:00Z");

    private AgentEventRepository events;
    private EnvironmentRepository envRepo;
    private AgentLifecycleEvaluator eval;

    /** Creates fresh mocks and an evaluator whose environment lookup resolves {@code ENV_ID}. */
    @BeforeEach
    void setUp() {
        events = mock(AgentEventRepository.class);
        envRepo = mock(EnvironmentRepository.class);
        when(envRepo.findById(ENV_ID)).thenReturn(Optional.of(
                new Environment(ENV_ID, ENV_SLUG, "Prod", true, true, Map.of(), 5, Instant.EPOCH)));
        eval = new AgentLifecycleEvaluator(events, envRepo);
    }

    /** Builds an enabled CRITICAL rule in {@code ENV_ID} that wraps the given condition. */
    private AlertRule ruleWith(AlertCondition condition) {
        return new AlertRule(RULE_ID, ENV_ID, "lifecycle test", null,
                AlertSeverity.CRITICAL, true, condition.kind(), condition,
                60, 0, 0, null, null, List.of(), List.of(),
                null, null, null, Map.of(), null, null, null, null);
    }

    /** Evaluation context pinned to {@code NOW} with a fresh {@link TickCache}. */
    private EvalContext ctx() {
        return new EvalContext("default", NOW, new TickCache());
    }

    @Test
    void kindIsAgentLifecycle() {
        assertThat(eval.kind()).isEqualTo(ConditionKind.AGENT_LIFECYCLE);
    }

    @Test
    void emptyWindowYieldsEmptyBatch() {
        var condition = new AgentLifecycleCondition(
                new AlertScope(null, null, null),
                List.of(AgentLifecycleEventType.WENT_DEAD),
                300);
        when(events.findInWindow(eq(ENV_SLUG), any(), any(), any(), any(), any(), anyInt()))
                .thenReturn(List.of());

        EvalResult r = eval.evaluate(condition, ruleWith(condition), ctx());

        assertThat(r).isInstanceOf(EvalResult.Batch.class);
        assertThat(((EvalResult.Batch) r).firings()).isEmpty();
    }

    @Test
    void emitsOneFiringPerEventWithFingerprint() {
        Instant ts1 = NOW.minusSeconds(30);
        Instant ts2 = NOW.minusSeconds(10);
        when(events.findInWindow(eq(ENV_SLUG), any(), any(), any(), any(), any(), anyInt()))
                .thenReturn(List.of(
                        new AgentEventRecord(0, "agent-A", "orders", "WENT_DEAD", "A went dead", ts1),
                        new AgentEventRecord(0, "agent-B", "orders", "WENT_DEAD", "B went dead", ts2)
                ));
        var condition = new AgentLifecycleCondition(
                new AlertScope(null, null, null),
                List.of(AgentLifecycleEventType.WENT_DEAD), 60);

        EvalResult r = eval.evaluate(condition, ruleWith(condition), ctx());

        // Assert the type before casting so a wrong result kind fails with a
        // readable assertion message instead of a ClassCastException.
        assertThat(r).isInstanceOf(EvalResult.Batch.class);
        var batch = (EvalResult.Batch) r;
        assertThat(batch.firings()).hasSize(2);

        var f0 = batch.firings().get(0);
        assertThat(f0.context()).containsKey("_subjectFingerprint");
        assertThat((String) f0.context().get("_subjectFingerprint"))
                .isEqualTo("agent-A:WENT_DEAD:" + ts1.toEpochMilli());
        @SuppressWarnings("unchecked")
        Map<String, Object> agent0 = (Map<String, Object>) f0.context().get("agent");
        assertThat(agent0).containsEntry("id", "agent-A").containsEntry("app", "orders");
        @SuppressWarnings("unchecked")
        Map<String, Object> event0 = (Map<String, Object>) f0.context().get("event");
        assertThat(event0).containsEntry("type", "WENT_DEAD");

        var f1 = batch.firings().get(1);
        assertThat((String) f1.context().get("_subjectFingerprint"))
                .isEqualTo("agent-B:WENT_DEAD:" + ts2.toEpochMilli());
    }

    @Test
    void forwardsScopeFiltersToRepo() {
        when(events.findInWindow(eq(ENV_SLUG), eq("orders"), eq("agent-A"), any(), any(), any(), anyInt()))
                .thenReturn(List.of());
        var condition = new AgentLifecycleCondition(
                new AlertScope("orders", null, "agent-A"),
                List.of(AgentLifecycleEventType.REGISTERED), 120);

        eval.evaluate(condition, ruleWith(condition), ctx());

        // A stub alone proves nothing: an unstubbed call on a mock also returns
        // an empty list, so the test would pass even if the scope were dropped.
        // Verify explicitly that the scoped arguments reached the repository.
        verify(events).findInWindow(
                eq(ENV_SLUG), eq("orders"), eq("agent-A"), any(), any(), any(), anyInt());
    }

    @Test
    void clearsWhenEnvIsMissing() {
        // envRepo returns empty → should Clear, not throw.
        EnvironmentRepository emptyEnvRepo = mock(EnvironmentRepository.class);
        when(emptyEnvRepo.findById(ENV_ID)).thenReturn(Optional.empty());
        AgentLifecycleEvaluator localEval = new AgentLifecycleEvaluator(events, emptyEnvRepo);
        var condition = new AgentLifecycleCondition(
                new AlertScope(null, null, null),
                List.of(AgentLifecycleEventType.WENT_DEAD), 60);

        EvalResult r = localEval.evaluate(condition, ruleWith(condition), ctx());

        assertThat(r).isEqualTo(EvalResult.Clear.INSTANCE);
    }
}

View File

@@ -43,6 +43,10 @@ class NotificationContextBuilderTest {
case AGENT_STATE -> new AgentStateCondition(
new AlertScope(null, null, null),
"DEAD", 0);
case AGENT_LIFECYCLE -> new AgentLifecycleCondition(
new AlertScope(null, null, null),
List.of(AgentLifecycleEventType.WENT_DEAD),
60);
case DEPLOYMENT_STATE -> new DeploymentStateCondition(
new AlertScope("my-app", null, null),
List.of("FAILED"));