feat(alerting): AGENT_LIFECYCLE condition kind with per-subject fire mode

Allows alert rules to fire on agent-lifecycle events — REGISTERED,
RE_REGISTERED, DEREGISTERED, WENT_STALE, WENT_DEAD, RECOVERED — rather
than only on current state. Each matching `(agent, eventType, timestamp)`
becomes its own ackable AlertInstance, so outages on distinct agents are
independently routable.

Core:
- New `ConditionKind.AGENT_LIFECYCLE` + `AgentLifecycleCondition` record
  (scope, eventTypes, withinSeconds). Compact ctor rejects empty
  eventTypes and withinSeconds<1.
- Strict allowlist enum `AgentLifecycleEventType` (six entries matching
  the server-emitted types in `AgentRegistrationController` and
  `AgentLifecycleMonitor`). Custom agent-emitted event types tracked in
  backlog issue #145.
- `AgentEventRepository.findInWindow(env, appSlug, agentId, eventTypes,
  from, to, limit)` — new read path ordered `(timestamp ASC, insert_id
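  ASC)` used by the evaluator. Implemented on
  `ClickHouseAgentEventRepository`: the tenant filter is always applied;
  the env, app and agent filters are applied when non-null (the
  evaluator always passes a resolved env slug).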

App:
- `AgentLifecycleEvaluator` queries events in the last `withinSeconds`
  window and returns `EvalResult.Batch` with one `Firing` per row.
  Every Firing carries a canonical `_subjectFingerprint` of
  `"<agentId>:<eventType>:<tsMillis>"` in context plus `agent` / `event`
  subtrees for Mustache templating.
- `NotificationContextBuilder` gains an `AGENT_LIFECYCLE` branch that
  exposes `{{agent.id}}`, `{{agent.app}}`, `{{event.type}}`,
  `{{event.timestamp}}`, `{{event.detail}}`.
- Validation is delegated to the record compact ctor + enum at Jackson
  deserialization time — matches the existing policy of keeping
  controller validators focused on env-scoped / SQL-injection concerns.

Schema:
- V16 migration generalises the V15 per-exchange discriminator on
  `alert_instances_open_rule_uq` to prefer `_subjectFingerprint` with a
  fallback to the legacy `exchange.id` expression. Scalar kinds still
  resolve to `''` and keep one-open-per-rule. Duplicate-key path in
  `PostgresAlertInstanceRepository.save` is unchanged — the index is
  the deduper.

UI:
- New `AgentLifecycleForm.tsx` wizard form with multi-select chips for
  the six allowed event types + `withinSeconds` input. Wired into
  `ConditionStep`, `form-state` (validation + defaults: WENT_DEAD,
  300 s), and `enums.ts` options. Tests in `enums.test.ts` pin the
  new option array.
- `alert-variables.ts` registers `{{agent.app}}`, `{{event.type}}`,
  `{{event.timestamp}}`, `{{event.detail}}` leaves for the new kind,
  and extends `agent.id`'s availability list to include `AGENT_LIFECYCLE`.

Tests (all passing):
- 5 new JSON-roundtrip cases on `AlertConditionJsonTest` (positive +
  empty/zero/unknown-type rejection).
- 5 new evaluator unit tests on `AgentLifecycleEvaluatorTest` (empty
  window, multi-agent fingerprint shape, scope forwarding, missing env).
- `NotificationContextBuilderTest` switch now covers the new kind.
- 119 alerting unit tests + 71 UI tests green.

Docs: `.claude/rules/{core,app,ui}` and CLAUDE.md migration list updated.
This commit is contained in:
hsiegeln
2026-04-21 14:52:08 +02:00
parent 23d02ba6a0
commit 414f7204bf
24 changed files with 601 additions and 20 deletions

View File

@@ -0,0 +1,95 @@
package com.cameleer.server.app.alerting.eval;
import com.cameleer.server.core.agent.AgentEventRecord;
import com.cameleer.server.core.agent.AgentEventRepository;
import com.cameleer.server.core.alerting.AgentLifecycleCondition;
import com.cameleer.server.core.alerting.AgentLifecycleEventType;
import com.cameleer.server.core.alerting.AlertRule;
import com.cameleer.server.core.alerting.AlertScope;
import com.cameleer.server.core.alerting.ConditionKind;
import com.cameleer.server.core.runtime.EnvironmentRepository;
import org.springframework.stereotype.Component;
import java.time.Instant;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
 * {@link ConditionEvaluator} implementation for {@link AgentLifecycleCondition} rules.
 * <p>
 * On each evaluation the agent events of the configured types that fall inside the
 * trailing {@code withinSeconds} window are fetched, and every returned row is turned
 * into its own {@link EvalResult.Firing} inside one {@link EvalResult.Batch}. Each
 * {@code (agent, eventType, timestamp)} tuple therefore maps to a separate
 * {@code AlertInstance} that can be acknowledged independently of the others.
 * <p>
 * The same event is seen again on later ticks for as long as it stays inside the
 * window; repeated open instances are prevented by the
 * {@code alert_instances_open_rule_uq} index, which keys on the
 * {@code _subjectFingerprint} entry this evaluator writes into the firing context
 * (see the V16 migration).
 */
@Component
public class AgentLifecycleEvaluator implements ConditionEvaluator<AgentLifecycleCondition> {

    /** Upper bound on the rows fetched per tick, so a burst of events cannot swamp the evaluation job. */
    private static final int MAX_EVENTS_PER_TICK = 500;

    private final AgentEventRepository eventRepo;
    private final EnvironmentRepository envRepo;

    public AgentLifecycleEvaluator(AgentEventRepository eventRepo, EnvironmentRepository envRepo) {
        this.eventRepo = eventRepo;
        this.envRepo = envRepo;
    }

    @Override
    public ConditionKind kind() {
        return ConditionKind.AGENT_LIFECYCLE;
    }

    /**
     * Evaluates the condition against the events of the rule's environment.
     *
     * @param c    the condition: optional scope, event types and window length in seconds
     * @param rule the owning rule; only its environment id is read here
     * @param ctx  evaluation context supplying the current instant
     * @return {@link EvalResult.Clear#INSTANCE} when no environment slug can be resolved,
     *         otherwise a {@link EvalResult.Batch} holding one firing per matching event
     *         (an empty batch when nothing matched)
     */
    @Override
    public EvalResult evaluate(AgentLifecycleCondition c, AlertRule rule, EvalContext ctx) {
        String envSlug = envRepo.findById(rule.environmentId())
                .map(env -> env.slug())
                .orElse(null);
        if (envSlug == null) {
            // Unknown environment (or one without a slug): nothing can be queried.
            return EvalResult.Clear.INSTANCE;
        }

        // A missing scope means "no app / agent restriction".
        String appSlug = null;
        String agentId = null;
        AlertScope scope = c.scope();
        if (scope != null) {
            appSlug = scope.appSlug();
            agentId = scope.agentId();
        }

        List<String> typeNames = c.eventTypes().stream()
                .map(AgentLifecycleEventType::name)
                .toList();

        // Window is [now - withinSeconds, now).
        Instant windowStart = ctx.now().minusSeconds(c.withinSeconds());
        Instant windowEnd = ctx.now();

        List<AgentEventRecord> events = eventRepo.findInWindow(
                envSlug, appSlug, agentId, typeNames, windowStart, windowEnd, MAX_EVENTS_PER_TICK);
        if (events.isEmpty()) {
            return new EvalResult.Batch(List.of());
        }

        List<EvalResult.Firing> firings = new ArrayList<>(events.size());
        events.forEach(event -> firings.add(toFiring(event)));
        return new EvalResult.Batch(firings);
    }

    /**
     * Builds the firing for a single event row.
     * <p>
     * The context carries an {@code agent} map ({@code id}, {@code app}), an
     * {@code event} map ({@code type}, {@code timestamp}, {@code detail}) and the
     * {@code _subjectFingerprint} string {@code <agentId>:<eventType>:<epochMillis>}.
     * Null fields are rendered as empty strings, except a null timestamp, which
     * contributes {@code 0} to the fingerprint.
     */
    private static EvalResult.Firing toFiring(AgentEventRecord ev) {
        String agentId = orEmpty(ev.instanceId());
        String eventType = orEmpty(ev.eventType());

        Instant ts = ev.timestamp();
        String tsMillis;
        String tsText;
        if (ts == null) {
            tsMillis = "0";
            tsText = "";
        } else {
            tsMillis = Long.toString(ts.toEpochMilli());
            tsText = ts.toString();
        }

        // Map.of rejects null values, hence the normalisation above.
        Map<String, Object> agent = Map.of(
                "id", agentId,
                "app", orEmpty(ev.applicationId()));
        Map<String, Object> event = Map.of(
                "type", eventType,
                "timestamp", tsText,
                "detail", orEmpty(ev.detail()));

        Map<String, Object> context = new LinkedHashMap<>();
        context.put("agent", agent);
        context.put("event", event);
        context.put("_subjectFingerprint", String.join(":", agentId, eventType, tsMillis));
        return new EvalResult.Firing(1.0, null, context);
    }

    /** Returns {@code value}, or the empty string when it is {@code null}. */
    private static String orEmpty(String value) {
        return value != null ? value : "";
    }
}

View File

@@ -64,6 +64,10 @@ public class NotificationContextBuilder {
ctx.put("agent", subtree(instance, "agent.id", "agent.name", "agent.state"));
ctx.put("app", subtree(instance, "app.slug", "app.id"));
}
case AGENT_LIFECYCLE -> {
ctx.put("agent", subtree(instance, "agent.id", "agent.app"));
ctx.put("event", subtree(instance, "event.type", "event.timestamp", "event.detail"));
}
case DEPLOYMENT_STATE -> {
ctx.put("deployment", subtree(instance, "deployment.id", "deployment.status"));
ctx.put("app", subtree(instance, "app.slug", "app.id"));

View File

@@ -106,4 +106,57 @@ public class ClickHouseAgentEventRepository implements AgentEventRepository {
return new AgentEventPage(results, nextCursor, hasMore);
}
/**
 * Returns the events of the given types whose timestamp lies in
 * {@code [fromInclusive, toExclusive)}, ordered by {@code (timestamp ASC, insert_id ASC)}
 * and capped at {@code limit} rows.
 * <p>
 * The tenant id is always bound as the first parameter (the matching placeholder is
 * expected to come from {@code SELECT_BASE}). {@code environment},
 * {@code applicationId} and {@code instanceId} each add an equality filter only when
 * they are non-null.
 *
 * @param environment   environment to match, or {@code null} for no environment filter
 * @param applicationId application to match, or {@code null} for no application filter
 * @param instanceId    agent instance to match, or {@code null} for no instance filter
 * @param eventTypes    event type names to match; must contain at least one entry
 * @param fromInclusive lower bound of the window (inclusive)
 * @param toExclusive   upper bound of the window (exclusive)
 * @param limit         maximum number of rows to return
 * @return the matching events, oldest first
 * @throws IllegalArgumentException if {@code eventTypes} is null or empty, or if either
 *                                  window bound is null
 */
@Override
public List<AgentEventRecord> findInWindow(String environment,
        String applicationId,
        String instanceId,
        List<String> eventTypes,
        Instant fromInclusive,
        Instant toExclusive,
        int limit) {
    if (eventTypes == null || eventTypes.isEmpty()) {
        throw new IllegalArgumentException("eventTypes must not be empty");
    }
    if (fromInclusive == null || toExclusive == null) {
        throw new IllegalArgumentException("from/to must not be null");
    }

    StringBuilder query = new StringBuilder(SELECT_BASE);
    List<Object> args = new ArrayList<>();
    args.add(tenantId);

    // Optional equality filters; argument order must follow placeholder order.
    if (environment != null) {
        query.append(" AND environment = ?");
        args.add(environment);
    }
    if (applicationId != null) {
        query.append(" AND application_id = ?");
        args.add(applicationId);
    }
    if (instanceId != null) {
        query.append(" AND instance_id = ?");
        args.add(instanceId);
    }

    // `event_type IN (?, ?, …)` with one bound placeholder per requested type.
    String typePlaceholders = String.join(",", java.util.Collections.nCopies(eventTypes.size(), "?"));
    query.append(" AND event_type IN (").append(typePlaceholders).append(")");
    args.addAll(eventTypes);

    // Half-open time window, then deterministic ordering and the row cap.
    query.append(" AND timestamp >= ? AND timestamp < ?")
            .append(" ORDER BY timestamp ASC, insert_id ASC LIMIT ?");
    args.add(Timestamp.from(fromInclusive));
    args.add(Timestamp.from(toExclusive));
    args.add(limit);

    return jdbc.query(
            query.toString(),
            (row, index) -> new AgentEventRecord(
                    row.getLong("id"),
                    row.getString("instance_id"),
                    row.getString("application_id"),
                    row.getString("event_type"),
                    row.getString("detail"),
                    row.getTimestamp("timestamp").toInstant()),
            args.toArray());
}
}

View File

@@ -0,0 +1,27 @@
-- V16 — Generalise open-alert_instance uniqueness via `_subjectFingerprint`.
--
-- Purpose: recreate the unique index `alert_instances_open_rule_uq` with a
-- discriminator expression that prefers `context->>'_subjectFingerprint'`.
--
-- V15 discriminated open instances by `context->'exchange'->>'id'` so that
-- EXCHANGE_MATCH / PER_EXCHANGE could emit one instance per exchange. The new
-- AGENT_LIFECYCLE / PER_AGENT condition has the same shape but a different
-- subject key (agentId + eventType + eventTs). Rather than bolt condition-kind
-- knowledge into the index, we introduce a canonical `_subjectFingerprint`
-- field in `context` that every "per-subject" evaluator writes. The index
-- prefers it over the legacy exchange.id discriminator.
--
-- Precedence in the COALESCE:
-- 1. context->>'_subjectFingerprint' — explicit per-subject key (new)
-- 2. context->'exchange'->>'id' — legacy EXCHANGE_MATCH instances (pre-V16)
-- 3. '' — scalar condition kinds (one open per rule)
--
-- Existing open PER_EXCHANGE instances keep working because they never set
-- `_subjectFingerprint` but do carry `context.exchange.id`, so the index
-- still discriminates them correctly.
--
-- The indexed expression changes, so the index is dropped and recreated under
-- the same name.
-- NOTE(review): between the DROP and the CREATE nothing enforces uniqueness
-- unless both statements run in one transaction — confirm the migration
-- runner wraps this script transactionally.
DROP INDEX IF EXISTS alert_instances_open_rule_uq;
-- Uniqueness key: (rule_id, discriminator). The index is partial: it only
-- covers rows that belong to a rule (rule_id IS NOT NULL) and are in one of
-- the open states PENDING / FIRING / ACKNOWLEDGED, so rows in any other state
-- never conflict with a new open instance.
CREATE UNIQUE INDEX alert_instances_open_rule_uq
ON alert_instances (rule_id, (COALESCE(
context->>'_subjectFingerprint',
context->'exchange'->>'id',
'')))
WHERE rule_id IS NOT NULL
AND state IN ('PENDING','FIRING','ACKNOWLEDGED');