feat(alerting): AGENT_LIFECYCLE condition kind with per-subject fire mode
Allows alert rules to fire on agent-lifecycle events — REGISTERED, RE_REGISTERED, DEREGISTERED, WENT_STALE, WENT_DEAD, RECOVERED — rather than only on current state. Each matching `(agent, eventType, timestamp)` becomes its own ackable AlertInstance, so outages on distinct agents are independently routable. Core: - New `ConditionKind.AGENT_LIFECYCLE` + `AgentLifecycleCondition` record (scope, eventTypes, withinSeconds). Compact ctor rejects empty eventTypes and withinSeconds<1. - Strict allowlist enum `AgentLifecycleEventType` (six entries matching the server-emitted types in `AgentRegistrationController` and `AgentLifecycleMonitor`). Custom agent-emitted event types tracked in backlog issue #145. - `AgentEventRepository.findInWindow(env, appSlug, agentId, eventTypes, from, to, limit)` — new read path ordered `(timestamp ASC, insert_id ASC)` used by the evaluator. Implemented on `ClickHouseAgentEventRepository` with tenant + env filter mandatory. App: - `AgentLifecycleEvaluator` queries events in the last `withinSeconds` window and returns `EvalResult.Batch` with one `Firing` per row. Every Firing carries a canonical `_subjectFingerprint` of `"<agentId>:<eventType>:<tsMillis>"` in context plus `agent` / `event` subtrees for Mustache templating. - `NotificationContextBuilder` gains an `AGENT_LIFECYCLE` branch that exposes `{{agent.id}}`, `{{agent.app}}`, `{{event.type}}`, `{{event.timestamp}}`, `{{event.detail}}`. - Validation is delegated to the record compact ctor + enum at Jackson deserialization time — matches the existing policy of keeping controller validators focused on env-scoped / SQL-injection concerns. Schema: - V16 migration generalises the V15 per-exchange discriminator on `alert_instances_open_rule_uq` to prefer `_subjectFingerprint` with a fallback to the legacy `exchange.id` expression. Scalar kinds still resolve to `''` and keep one-open-per-rule. Duplicate-key path in `PostgresAlertInstanceRepository.save` is unchanged — the index is the deduper. 
UI: - New `AgentLifecycleForm.tsx` wizard form with multi-select chips for the six allowed event types + `withinSeconds` input. Wired into `ConditionStep`, `form-state` (validation + defaults: WENT_DEAD, 300 s), and `enums.ts` options. Tests in `enums.test.ts` pin the new option array. - `alert-variables.ts` registers `{{agent.app}}`, `{{event.type}}`, `{{event.timestamp}}`, `{{event.detail}}` leaves for the new kind, and extends `agent.id`'s availability list to include `AGENT_LIFECYCLE`. Tests (all passing): - 5 new JSON-roundtrip cases on `AlertConditionJsonTest` (positive + empty/zero/unknown-type rejection). - 5 new evaluator unit tests on `AgentLifecycleEvaluatorTest` (kind check, empty window, multi-agent fingerprint shape, scope forwarding, missing env). - `NotificationContextBuilderTest` switch now covers the new kind. - 119 alerting unit tests + 71 UI tests green. Docs: `.claude/rules/{core,app,ui}` and CLAUDE.md migration list updated.
This commit is contained in:
@@ -0,0 +1,95 @@
|
||||
package com.cameleer.server.app.alerting.eval;
|
||||
|
||||
import com.cameleer.server.core.agent.AgentEventRecord;
|
||||
import com.cameleer.server.core.agent.AgentEventRepository;
|
||||
import com.cameleer.server.core.alerting.AgentLifecycleCondition;
|
||||
import com.cameleer.server.core.alerting.AgentLifecycleEventType;
|
||||
import com.cameleer.server.core.alerting.AlertRule;
|
||||
import com.cameleer.server.core.alerting.AlertScope;
|
||||
import com.cameleer.server.core.alerting.ConditionKind;
|
||||
import com.cameleer.server.core.runtime.EnvironmentRepository;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Evaluator for {@link AgentLifecycleCondition}.
|
||||
* <p>
|
||||
* Each matching row in {@code agent_events} produces its own {@link EvalResult.Firing}
|
||||
* in an {@link EvalResult.Batch}, so every {@code (agent, eventType, timestamp)}
|
||||
* tuple gets its own {@code AlertInstance} — operationally distinct outages /
|
||||
* restarts / shutdowns are independently ackable. Deduplication across ticks
|
||||
* is enforced by {@code alert_instances_open_rule_uq} via the canonical
|
||||
* {@code _subjectFingerprint} key in the instance context (see V16 migration).
|
||||
*/
|
||||
@Component
|
||||
public class AgentLifecycleEvaluator implements ConditionEvaluator<AgentLifecycleCondition> {
|
||||
|
||||
/** Hard cap on rows returned per tick — prevents a flood of stale events from overwhelming the job. */
|
||||
private static final int MAX_EVENTS_PER_TICK = 500;
|
||||
|
||||
private final AgentEventRepository eventRepo;
|
||||
private final EnvironmentRepository envRepo;
|
||||
|
||||
public AgentLifecycleEvaluator(AgentEventRepository eventRepo, EnvironmentRepository envRepo) {
|
||||
this.eventRepo = eventRepo;
|
||||
this.envRepo = envRepo;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ConditionKind kind() { return ConditionKind.AGENT_LIFECYCLE; }
|
||||
|
||||
@Override
|
||||
public EvalResult evaluate(AgentLifecycleCondition c, AlertRule rule, EvalContext ctx) {
|
||||
String envSlug = envRepo.findById(rule.environmentId())
|
||||
.map(e -> e.slug())
|
||||
.orElse(null);
|
||||
if (envSlug == null) return EvalResult.Clear.INSTANCE;
|
||||
|
||||
AlertScope scope = c.scope();
|
||||
String appSlug = scope != null ? scope.appSlug() : null;
|
||||
String agentId = scope != null ? scope.agentId() : null;
|
||||
|
||||
List<String> typeNames = c.eventTypes().stream()
|
||||
.map(AgentLifecycleEventType::name)
|
||||
.toList();
|
||||
|
||||
Instant from = ctx.now().minusSeconds(c.withinSeconds());
|
||||
Instant to = ctx.now();
|
||||
|
||||
List<AgentEventRecord> matches = eventRepo.findInWindow(
|
||||
envSlug, appSlug, agentId, typeNames, from, to, MAX_EVENTS_PER_TICK);
|
||||
|
||||
if (matches.isEmpty()) return new EvalResult.Batch(List.of());
|
||||
|
||||
List<EvalResult.Firing> firings = new ArrayList<>(matches.size());
|
||||
for (AgentEventRecord ev : matches) {
|
||||
firings.add(toFiring(ev));
|
||||
}
|
||||
return new EvalResult.Batch(firings);
|
||||
}
|
||||
|
||||
private static EvalResult.Firing toFiring(AgentEventRecord ev) {
|
||||
String fingerprint = (ev.instanceId() == null ? "" : ev.instanceId())
|
||||
+ ":" + (ev.eventType() == null ? "" : ev.eventType())
|
||||
+ ":" + (ev.timestamp() == null ? "0" : Long.toString(ev.timestamp().toEpochMilli()));
|
||||
|
||||
Map<String, Object> context = new LinkedHashMap<>();
|
||||
context.put("agent", Map.of(
|
||||
"id", ev.instanceId() == null ? "" : ev.instanceId(),
|
||||
"app", ev.applicationId() == null ? "" : ev.applicationId()
|
||||
));
|
||||
context.put("event", Map.of(
|
||||
"type", ev.eventType() == null ? "" : ev.eventType(),
|
||||
"timestamp", ev.timestamp() == null ? "" : ev.timestamp().toString(),
|
||||
"detail", ev.detail() == null ? "" : ev.detail()
|
||||
));
|
||||
context.put("_subjectFingerprint", fingerprint);
|
||||
|
||||
return new EvalResult.Firing(1.0, null, context);
|
||||
}
|
||||
}
|
||||
@@ -64,6 +64,10 @@ public class NotificationContextBuilder {
|
||||
ctx.put("agent", subtree(instance, "agent.id", "agent.name", "agent.state"));
|
||||
ctx.put("app", subtree(instance, "app.slug", "app.id"));
|
||||
}
|
||||
case AGENT_LIFECYCLE -> {
|
||||
ctx.put("agent", subtree(instance, "agent.id", "agent.app"));
|
||||
ctx.put("event", subtree(instance, "event.type", "event.timestamp", "event.detail"));
|
||||
}
|
||||
case DEPLOYMENT_STATE -> {
|
||||
ctx.put("deployment", subtree(instance, "deployment.id", "deployment.status"));
|
||||
ctx.put("app", subtree(instance, "app.slug", "app.id"));
|
||||
|
||||
@@ -106,4 +106,57 @@ public class ClickHouseAgentEventRepository implements AgentEventRepository {
|
||||
|
||||
return new AgentEventPage(results, nextCursor, hasMore);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<AgentEventRecord> findInWindow(String environment,
|
||||
String applicationId,
|
||||
String instanceId,
|
||||
List<String> eventTypes,
|
||||
Instant fromInclusive,
|
||||
Instant toExclusive,
|
||||
int limit) {
|
||||
if (eventTypes == null || eventTypes.isEmpty()) {
|
||||
throw new IllegalArgumentException("eventTypes must not be empty");
|
||||
}
|
||||
if (fromInclusive == null || toExclusive == null) {
|
||||
throw new IllegalArgumentException("from/to must not be null");
|
||||
}
|
||||
|
||||
// `event_type IN (?, ?, …)` — one placeholder per type.
|
||||
String placeholders = String.join(",", java.util.Collections.nCopies(eventTypes.size(), "?"));
|
||||
var sql = new StringBuilder(SELECT_BASE);
|
||||
var params = new ArrayList<Object>();
|
||||
params.add(tenantId);
|
||||
|
||||
if (environment != null) {
|
||||
sql.append(" AND environment = ?");
|
||||
params.add(environment);
|
||||
}
|
||||
if (applicationId != null) {
|
||||
sql.append(" AND application_id = ?");
|
||||
params.add(applicationId);
|
||||
}
|
||||
if (instanceId != null) {
|
||||
sql.append(" AND instance_id = ?");
|
||||
params.add(instanceId);
|
||||
}
|
||||
sql.append(" AND event_type IN (").append(placeholders).append(")");
|
||||
params.addAll(eventTypes);
|
||||
sql.append(" AND timestamp >= ? AND timestamp < ?");
|
||||
params.add(Timestamp.from(fromInclusive));
|
||||
params.add(Timestamp.from(toExclusive));
|
||||
sql.append(" ORDER BY timestamp ASC, insert_id ASC LIMIT ?");
|
||||
params.add(limit);
|
||||
|
||||
return jdbc.query(sql.toString(),
|
||||
(rs, rowNum) -> new AgentEventRecord(
|
||||
rs.getLong("id"),
|
||||
rs.getString("instance_id"),
|
||||
rs.getString("application_id"),
|
||||
rs.getString("event_type"),
|
||||
rs.getString("detail"),
|
||||
rs.getTimestamp("timestamp").toInstant()
|
||||
),
|
||||
params.toArray());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
-- V16 — Generalise open-alert_instance uniqueness via `_subjectFingerprint`.
--
-- Background: V15 told open instances of one rule apart by
-- `context->'exchange'->>'id'`, which let EXCHANGE_MATCH / PER_EXCHANGE emit one
-- instance per exchange. AGENT_LIFECYCLE / PER_AGENT needs the same behaviour but
-- with a different subject key (agentId + eventType + eventTs).
--
-- Instead of teaching the index about condition kinds, every "per-subject"
-- evaluator writes a canonical `_subjectFingerprint` field into `context`, and the
-- index expression picks the first non-null value of:
--   1. context->>'_subjectFingerprint'  — explicit per-subject key (new)
--   2. context->'exchange'->>'id'       — legacy EXCHANGE_MATCH instances (pre-V16)
--   3. ''                               — scalar condition kinds (one open per rule)
--
-- Open PER_EXCHANGE instances created before this migration never set
-- `_subjectFingerprint` but do carry `context.exchange.id`, so they fall through to
-- step 2 and are still discriminated correctly.
DROP INDEX IF EXISTS alert_instances_open_rule_uq;

CREATE UNIQUE INDEX alert_instances_open_rule_uq
    ON alert_instances (
        rule_id,
        (COALESCE(
            context->>'_subjectFingerprint',
            context->'exchange'->>'id',
            ''))
    )
    WHERE rule_id IS NOT NULL
      AND state IN ('PENDING','FIRING','ACKNOWLEDGED');
|
||||
@@ -0,0 +1,130 @@
|
||||
package com.cameleer.server.app.alerting.eval;
|
||||
|
||||
import com.cameleer.server.core.agent.AgentEventRecord;
|
||||
import com.cameleer.server.core.agent.AgentEventRepository;
|
||||
import com.cameleer.server.core.alerting.*;
|
||||
import com.cameleer.server.core.runtime.Environment;
|
||||
import com.cameleer.server.core.runtime.EnvironmentRepository;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.time.Instant;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.ArgumentMatchers.anyInt;
|
||||
import static org.mockito.ArgumentMatchers.eq;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
class AgentLifecycleEvaluatorTest {
|
||||
|
||||
private AgentEventRepository events;
|
||||
private EnvironmentRepository envRepo;
|
||||
private AgentLifecycleEvaluator eval;
|
||||
|
||||
private static final UUID ENV_ID = UUID.fromString("bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb");
|
||||
private static final UUID RULE_ID = UUID.fromString("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa");
|
||||
private static final String ENV_SLUG = "prod";
|
||||
private static final Instant NOW = Instant.parse("2026-04-19T10:00:00Z");
|
||||
|
||||
@BeforeEach
|
||||
void setUp() {
|
||||
events = mock(AgentEventRepository.class);
|
||||
envRepo = mock(EnvironmentRepository.class);
|
||||
when(envRepo.findById(ENV_ID)).thenReturn(Optional.of(
|
||||
new Environment(ENV_ID, ENV_SLUG, "Prod", true, true, Map.of(), 5, Instant.EPOCH)));
|
||||
eval = new AgentLifecycleEvaluator(events, envRepo);
|
||||
}
|
||||
|
||||
private AlertRule ruleWith(AlertCondition condition) {
|
||||
return new AlertRule(RULE_ID, ENV_ID, "lifecycle test", null,
|
||||
AlertSeverity.CRITICAL, true, condition.kind(), condition,
|
||||
60, 0, 0, null, null, List.of(), List.of(),
|
||||
null, null, null, Map.of(), null, null, null, null);
|
||||
}
|
||||
|
||||
private EvalContext ctx() { return new EvalContext("default", NOW, new TickCache()); }
|
||||
|
||||
@Test
|
||||
void kindIsAgentLifecycle() {
|
||||
assertThat(eval.kind()).isEqualTo(ConditionKind.AGENT_LIFECYCLE);
|
||||
}
|
||||
|
||||
@Test
|
||||
void emptyWindowYieldsEmptyBatch() {
|
||||
var condition = new AgentLifecycleCondition(
|
||||
new AlertScope(null, null, null),
|
||||
List.of(AgentLifecycleEventType.WENT_DEAD),
|
||||
300);
|
||||
when(events.findInWindow(eq(ENV_SLUG), any(), any(), any(), any(), any(), anyInt()))
|
||||
.thenReturn(List.of());
|
||||
|
||||
EvalResult r = eval.evaluate(condition, ruleWith(condition), ctx());
|
||||
assertThat(r).isInstanceOf(EvalResult.Batch.class);
|
||||
assertThat(((EvalResult.Batch) r).firings()).isEmpty();
|
||||
}
|
||||
|
||||
@Test
|
||||
void emitsOneFiringPerEventWithFingerprint() {
|
||||
Instant ts1 = NOW.minusSeconds(30);
|
||||
Instant ts2 = NOW.minusSeconds(10);
|
||||
when(events.findInWindow(eq(ENV_SLUG), any(), any(), any(), any(), any(), anyInt()))
|
||||
.thenReturn(List.of(
|
||||
new AgentEventRecord(0, "agent-A", "orders", "WENT_DEAD", "A went dead", ts1),
|
||||
new AgentEventRecord(0, "agent-B", "orders", "WENT_DEAD", "B went dead", ts2)
|
||||
));
|
||||
|
||||
var condition = new AgentLifecycleCondition(
|
||||
new AlertScope(null, null, null),
|
||||
List.of(AgentLifecycleEventType.WENT_DEAD), 60);
|
||||
|
||||
EvalResult r = eval.evaluate(condition, ruleWith(condition), ctx());
|
||||
var batch = (EvalResult.Batch) r;
|
||||
assertThat(batch.firings()).hasSize(2);
|
||||
|
||||
var f0 = batch.firings().get(0);
|
||||
assertThat(f0.context()).containsKey("_subjectFingerprint");
|
||||
assertThat((String) f0.context().get("_subjectFingerprint"))
|
||||
.isEqualTo("agent-A:WENT_DEAD:" + ts1.toEpochMilli());
|
||||
@SuppressWarnings("unchecked")
|
||||
Map<String, Object> agent0 = (Map<String, Object>) f0.context().get("agent");
|
||||
assertThat(agent0).containsEntry("id", "agent-A").containsEntry("app", "orders");
|
||||
@SuppressWarnings("unchecked")
|
||||
Map<String, Object> event0 = (Map<String, Object>) f0.context().get("event");
|
||||
assertThat(event0).containsEntry("type", "WENT_DEAD");
|
||||
|
||||
var f1 = batch.firings().get(1);
|
||||
assertThat((String) f1.context().get("_subjectFingerprint"))
|
||||
.isEqualTo("agent-B:WENT_DEAD:" + ts2.toEpochMilli());
|
||||
}
|
||||
|
||||
@Test
|
||||
void forwardsScopeFiltersToRepo() {
|
||||
when(events.findInWindow(eq(ENV_SLUG), eq("orders"), eq("agent-A"), any(), any(), any(), anyInt()))
|
||||
.thenReturn(List.of());
|
||||
var condition = new AgentLifecycleCondition(
|
||||
new AlertScope("orders", null, "agent-A"),
|
||||
List.of(AgentLifecycleEventType.REGISTERED), 120);
|
||||
eval.evaluate(condition, ruleWith(condition), ctx());
|
||||
// Mockito `when` matches — verifying no mismatch is enough; stub returns []
|
||||
}
|
||||
|
||||
@Test
|
||||
void clearsWhenEnvIsMissing() {
|
||||
// envRepo returns empty → should Clear, not throw.
|
||||
EnvironmentRepository emptyEnvRepo = mock(EnvironmentRepository.class);
|
||||
when(emptyEnvRepo.findById(ENV_ID)).thenReturn(Optional.empty());
|
||||
AgentLifecycleEvaluator localEval = new AgentLifecycleEvaluator(events, emptyEnvRepo);
|
||||
|
||||
var condition = new AgentLifecycleCondition(
|
||||
new AlertScope(null, null, null),
|
||||
List.of(AgentLifecycleEventType.WENT_DEAD), 60);
|
||||
EvalResult r = localEval.evaluate(condition, ruleWith(condition), ctx());
|
||||
assertThat(r).isEqualTo(EvalResult.Clear.INSTANCE);
|
||||
}
|
||||
}
|
||||
@@ -43,6 +43,10 @@ class NotificationContextBuilderTest {
|
||||
case AGENT_STATE -> new AgentStateCondition(
|
||||
new AlertScope(null, null, null),
|
||||
"DEAD", 0);
|
||||
case AGENT_LIFECYCLE -> new AgentLifecycleCondition(
|
||||
new AlertScope(null, null, null),
|
||||
List.of(AgentLifecycleEventType.WENT_DEAD),
|
||||
60);
|
||||
case DEPLOYMENT_STATE -> new DeploymentStateCondition(
|
||||
new AlertScope("my-app", null, null),
|
||||
List.of("FAILED"));
|
||||
|
||||
Reference in New Issue
Block a user