feat(alerting): AGENT_LIFECYCLE condition kind with per-subject fire mode

Allows alert rules to fire on agent-lifecycle events — REGISTERED,
RE_REGISTERED, DEREGISTERED, WENT_STALE, WENT_DEAD, RECOVERED — rather
than only on current state. Each matching `(agent, eventType, timestamp)`
becomes its own ackable AlertInstance, so outages on distinct agents are
independently routable.

Core:
- New `ConditionKind.AGENT_LIFECYCLE` + `AgentLifecycleCondition` record
  (scope, eventTypes, withinSeconds). Compact ctor rejects empty
  eventTypes and withinSeconds<1.
- Strict allowlist enum `AgentLifecycleEventType` (six entries matching
  the server-emitted types in `AgentRegistrationController` and
  `AgentLifecycleMonitor`). Custom agent-emitted event types tracked in
  backlog issue #145.
- `AgentEventRepository.findInWindow(env, appSlug, agentId, eventTypes,
  from, to, limit)` — new read path ordered `(timestamp ASC, insert_id
  ASC)` used by the evaluator. Implemented on
  `ClickHouseAgentEventRepository` with tenant + env filter mandatory.

App:
- `AgentLifecycleEvaluator` queries events in the last `withinSeconds`
  window and returns `EvalResult.Batch` with one `Firing` per row.
  Every Firing carries a canonical `_subjectFingerprint` of
  `"<agentId>:<eventType>:<tsMillis>"` in context plus `agent` / `event`
  subtrees for Mustache templating.
- `NotificationContextBuilder` gains an `AGENT_LIFECYCLE` branch that
  exposes `{{agent.id}}`, `{{agent.app}}`, `{{event.type}}`,
  `{{event.timestamp}}`, `{{event.detail}}`.
- Validation is delegated to the record compact ctor + enum at Jackson
  deserialization time — matches the existing policy of keeping
  controller validators focused on env-scoped / SQL-injection concerns.

Schema:
- V16 migration generalises the V15 per-exchange discriminator on
  `alert_instances_open_rule_uq` to prefer `_subjectFingerprint` with a
  fallback to the legacy `exchange.id` expression. Scalar kinds still
  resolve to `''` and keep one-open-per-rule. Duplicate-key path in
  `PostgresAlertInstanceRepository.save` is unchanged — the index is
  the deduper.

UI:
- New `AgentLifecycleForm.tsx` wizard form with multi-select chips for
  the six allowed event types + `withinSeconds` input. Wired into
  `ConditionStep`, `form-state` (validation + defaults: WENT_DEAD,
  300 s), and `enums.ts` options. Tests in `enums.test.ts` pin the
  new option array.
- `alert-variables.ts` registers `{{agent.app}}`, `{{event.type}}`,
  `{{event.timestamp}}`, `{{event.detail}}` leaves for the new kind,
  and extends `agent.id`'s availability list to include `AGENT_LIFECYCLE`.

Tests (all passing):
- 5 new JSON-roundtrip cases on `AlertConditionJsonTest` (positive +
  empty/zero/unknown-type rejection).
- 5 new evaluator unit tests on `AgentLifecycleEvaluatorTest` (empty
  window, multi-agent fingerprint shape, scope forwarding, missing env).
- `NotificationContextBuilderTest` switch now covers the new kind.
- 119 alerting unit tests + 71 UI tests green.

Docs: `.claude/rules/{core,app,ui}` and CLAUDE.md migration list updated.
This commit is contained in:
hsiegeln
2026-04-21 14:52:08 +02:00
parent 23d02ba6a0
commit 414f7204bf
24 changed files with 601 additions and 20 deletions

View File

@@ -2221,6 +2221,16 @@ export interface components {
/** Format: date-time */
createdAt?: string;
};
AgentLifecycleCondition: {
kind: "AgentLifecycleCondition";
} & (Omit<components["schemas"]["AlertCondition"], "kind"> & {
scope?: components["schemas"]["AlertScope"];
eventTypes?: ("REGISTERED" | "RE_REGISTERED" | "DEREGISTERED" | "WENT_STALE" | "WENT_DEAD" | "RECOVERED")[];
/** Format: int32 */
withinSeconds?: number;
/** @enum {string} */
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
});
AgentStateCondition: {
kind: "AgentStateCondition";
} & (Omit<components["schemas"]["AlertCondition"], "kind"> & {
@@ -2229,11 +2239,11 @@ export interface components {
/** Format: int32 */
forSeconds?: number;
/** @enum {string} */
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
});
AlertCondition: {
/** @enum {string} */
kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
};
AlertRuleRequest: {
name?: string;
@@ -2241,8 +2251,8 @@ export interface components {
/** @enum {string} */
severity: "CRITICAL" | "WARNING" | "INFO";
/** @enum {string} */
conditionKind: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
condition: components["schemas"]["AgentStateCondition"] | components["schemas"]["DeploymentStateCondition"] | components["schemas"]["ExchangeMatchCondition"] | components["schemas"]["JvmMetricCondition"] | components["schemas"]["LogPatternCondition"] | components["schemas"]["RouteMetricCondition"];
conditionKind: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
condition: components["schemas"]["AgentLifecycleCondition"] | components["schemas"]["AgentStateCondition"] | components["schemas"]["DeploymentStateCondition"] | components["schemas"]["ExchangeMatchCondition"] | components["schemas"]["JvmMetricCondition"] | components["schemas"]["LogPatternCondition"] | components["schemas"]["RouteMetricCondition"];
/** Format: int32 */
evaluationIntervalSeconds?: number;
/** Format: int32 */
@@ -2274,7 +2284,7 @@ export interface components {
scope?: components["schemas"]["AlertScope"];
states?: string[];
/** @enum {string} */
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
});
ExchangeFilter: {
status?: string;
@@ -2296,7 +2306,7 @@ export interface components {
/** Format: int32 */
perExchangeLingerSeconds?: number;
/** @enum {string} */
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
});
JvmMetricCondition: {
kind: "JvmMetricCondition";
@@ -2312,7 +2322,7 @@ export interface components {
/** Format: int32 */
windowSeconds?: number;
/** @enum {string} */
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
});
LogPatternCondition: {
kind: "LogPatternCondition";
@@ -2325,7 +2335,7 @@ export interface components {
/** Format: int32 */
windowSeconds?: number;
/** @enum {string} */
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
});
RouteMetricCondition: {
kind: "RouteMetricCondition";
@@ -2340,7 +2350,7 @@ export interface components {
/** Format: int32 */
windowSeconds?: number;
/** @enum {string} */
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
});
WebhookBindingRequest: {
/** Format: uuid */
@@ -2361,8 +2371,8 @@ export interface components {
severity?: "CRITICAL" | "WARNING" | "INFO";
enabled?: boolean;
/** @enum {string} */
conditionKind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
condition?: components["schemas"]["AgentStateCondition"] | components["schemas"]["DeploymentStateCondition"] | components["schemas"]["ExchangeMatchCondition"] | components["schemas"]["JvmMetricCondition"] | components["schemas"]["LogPatternCondition"] | components["schemas"]["RouteMetricCondition"];
conditionKind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
condition?: components["schemas"]["AgentLifecycleCondition"] | components["schemas"]["AgentStateCondition"] | components["schemas"]["DeploymentStateCondition"] | components["schemas"]["ExchangeMatchCondition"] | components["schemas"]["JvmMetricCondition"] | components["schemas"]["LogPatternCondition"] | components["schemas"]["RouteMetricCondition"];
/** Format: int32 */
evaluationIntervalSeconds?: number;
/** Format: int32 */