feat(alerting): AGENT_LIFECYCLE condition kind with per-subject fire mode

Allows alert rules to fire on agent-lifecycle events — REGISTERED,
RE_REGISTERED, DEREGISTERED, WENT_STALE, WENT_DEAD, RECOVERED — rather
than only on current state. Each matching `(agent, eventType, timestamp)`
becomes its own ackable AlertInstance, so outages on distinct agents are
independently routable.

Core:
- New `ConditionKind.AGENT_LIFECYCLE` + `AgentLifecycleCondition` record
  (scope, eventTypes, withinSeconds). Compact ctor rejects empty
  eventTypes and withinSeconds<1.
- Strict allowlist enum `AgentLifecycleEventType` (six entries matching
  the server-emitted types in `AgentRegistrationController` and
  `AgentLifecycleMonitor`). Custom agent-emitted event types tracked in
  backlog issue #145.
- `AgentEventRepository.findInWindow(env, appSlug, agentId, eventTypes,
  from, to, limit)` — new read path ordered `(timestamp ASC, insert_id
  ASC)` used by the evaluator. Implemented on
  `ClickHouseAgentEventRepository` with tenant + env filter mandatory.

App:
- `AgentLifecycleEvaluator` queries events from the trailing
  `withinSeconds`-second window and returns an `EvalResult.Batch` with one
  `Firing` per matching row.
  Every Firing carries a canonical `_subjectFingerprint` of
  `"<agentId>:<eventType>:<tsMillis>"` in context plus `agent` / `event`
  subtrees for Mustache templating.
- `NotificationContextBuilder` gains an `AGENT_LIFECYCLE` branch that
  exposes `{{agent.id}}`, `{{agent.app}}`, `{{event.type}}`,
  `{{event.timestamp}}`, `{{event.detail}}`.
- Validation is delegated to the record compact ctor + enum at Jackson
  deserialization time — matches the existing policy of keeping
  controller validators focused on env-scoped / SQL-injection concerns.

Schema:
- V16 migration generalises the V15 per-exchange discriminator on
  `alert_instances_open_rule_uq` to prefer `_subjectFingerprint` with a
  fallback to the legacy `exchange.id` expression. Scalar kinds still
  resolve to `''` and keep one-open-per-rule. Duplicate-key path in
  `PostgresAlertInstanceRepository.save` is unchanged — the index is
  the deduper.

UI:
- New `AgentLifecycleForm.tsx` wizard form with multi-select chips for
  the six allowed event types + `withinSeconds` input. Wired into
  `ConditionStep`, `form-state` (validation + defaults: WENT_DEAD,
  300 s), and `enums.ts` options. Tests in `enums.test.ts` pin the
  new option array.
- `alert-variables.ts` registers `{{agent.app}}`, `{{event.type}}`,
  `{{event.timestamp}}`, `{{event.detail}}` leaves for the new kind,
  and extends `agent.id`'s availability list to include `AGENT_LIFECYCLE`.

Tests (all passing):
- 5 new JSON-roundtrip cases on `AlertConditionJsonTest` (positive +
  empty/zero/unknown-type rejection).
- 5 new evaluator unit tests on `AgentLifecycleEvaluatorTest` (empty
  window, multi-agent fingerprint shape, scope forwarding, missing env).
- `NotificationContextBuilderTest` switch now covers the new kind.
- 119 alerting unit tests + 71 UI tests green.

Docs: `.claude/rules/{core,app,ui}` and CLAUDE.md migration list updated.
This commit is contained in:
hsiegeln
2026-04-21 14:52:08 +02:00
parent 23d02ba6a0
commit 414f7204bf
24 changed files with 601 additions and 20 deletions

File diff suppressed because one or more lines are too long

View File

@@ -2221,6 +2221,16 @@ export interface components {
/** Format: date-time */
createdAt?: string;
};
AgentLifecycleCondition: {
kind: "AgentLifecycleCondition";
} & (Omit<components["schemas"]["AlertCondition"], "kind"> & {
scope?: components["schemas"]["AlertScope"];
eventTypes?: ("REGISTERED" | "RE_REGISTERED" | "DEREGISTERED" | "WENT_STALE" | "WENT_DEAD" | "RECOVERED")[];
/** Format: int32 */
withinSeconds?: number;
/** @enum {string} */
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
});
AgentStateCondition: {
kind: "AgentStateCondition";
} & (Omit<components["schemas"]["AlertCondition"], "kind"> & {
@@ -2229,11 +2239,11 @@ export interface components {
/** Format: int32 */
forSeconds?: number;
/** @enum {string} */
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
});
AlertCondition: {
/** @enum {string} */
kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
};
AlertRuleRequest: {
name?: string;
@@ -2241,8 +2251,8 @@ export interface components {
/** @enum {string} */
severity: "CRITICAL" | "WARNING" | "INFO";
/** @enum {string} */
conditionKind: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
condition: components["schemas"]["AgentStateCondition"] | components["schemas"]["DeploymentStateCondition"] | components["schemas"]["ExchangeMatchCondition"] | components["schemas"]["JvmMetricCondition"] | components["schemas"]["LogPatternCondition"] | components["schemas"]["RouteMetricCondition"];
conditionKind: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
condition: components["schemas"]["AgentLifecycleCondition"] | components["schemas"]["AgentStateCondition"] | components["schemas"]["DeploymentStateCondition"] | components["schemas"]["ExchangeMatchCondition"] | components["schemas"]["JvmMetricCondition"] | components["schemas"]["LogPatternCondition"] | components["schemas"]["RouteMetricCondition"];
/** Format: int32 */
evaluationIntervalSeconds?: number;
/** Format: int32 */
@@ -2274,7 +2284,7 @@ export interface components {
scope?: components["schemas"]["AlertScope"];
states?: string[];
/** @enum {string} */
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
});
ExchangeFilter: {
status?: string;
@@ -2296,7 +2306,7 @@ export interface components {
/** Format: int32 */
perExchangeLingerSeconds?: number;
/** @enum {string} */
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
});
JvmMetricCondition: {
kind: "JvmMetricCondition";
@@ -2312,7 +2322,7 @@ export interface components {
/** Format: int32 */
windowSeconds?: number;
/** @enum {string} */
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
});
LogPatternCondition: {
kind: "LogPatternCondition";
@@ -2325,7 +2335,7 @@ export interface components {
/** Format: int32 */
windowSeconds?: number;
/** @enum {string} */
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
});
RouteMetricCondition: {
kind: "RouteMetricCondition";
@@ -2340,7 +2350,7 @@ export interface components {
/** Format: int32 */
windowSeconds?: number;
/** @enum {string} */
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
readonly kind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
});
WebhookBindingRequest: {
/** Format: uuid */
@@ -2361,8 +2371,8 @@ export interface components {
severity?: "CRITICAL" | "WARNING" | "INFO";
enabled?: boolean;
/** @enum {string} */
conditionKind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
condition?: components["schemas"]["AgentStateCondition"] | components["schemas"]["DeploymentStateCondition"] | components["schemas"]["ExchangeMatchCondition"] | components["schemas"]["JvmMetricCondition"] | components["schemas"]["LogPatternCondition"] | components["schemas"]["RouteMetricCondition"];
conditionKind?: "ROUTE_METRIC" | "EXCHANGE_MATCH" | "AGENT_STATE" | "AGENT_LIFECYCLE" | "DEPLOYMENT_STATE" | "LOG_PATTERN" | "JVM_METRIC";
condition?: components["schemas"]["AgentLifecycleCondition"] | components["schemas"]["AgentStateCondition"] | components["schemas"]["DeploymentStateCondition"] | components["schemas"]["ExchangeMatchCondition"] | components["schemas"]["JvmMetricCondition"] | components["schemas"]["LogPatternCondition"] | components["schemas"]["RouteMetricCondition"];
/** Format: int32 */
evaluationIntervalSeconds?: number;
/** Format: int32 */

View File

@@ -42,6 +42,16 @@ export const ALERT_VARIABLES: AlertVariable[] = [
{ path: 'app.id', type: 'uuid', description: 'App UUID', sampleValue: '33333333-...',
availableForKinds: ['ROUTE_METRIC', 'EXCHANGE_MATCH', 'AGENT_STATE', 'DEPLOYMENT_STATE', 'LOG_PATTERN', 'JVM_METRIC'], mayBeNull: true },
// AGENT_LIFECYCLE — agent + event subtree (distinct from AGENT_STATE's agent.* leaves)
{ path: 'agent.app', type: 'string', description: 'Agent app slug', sampleValue: 'orders',
availableForKinds: ['AGENT_LIFECYCLE'] },
{ path: 'event.type', type: 'string', description: 'Lifecycle event type', sampleValue: 'WENT_DEAD',
availableForKinds: ['AGENT_LIFECYCLE'] },
{ path: 'event.timestamp', type: 'Instant', description: 'When the event happened', sampleValue: '2026-04-20T14:33:10Z',
availableForKinds: ['AGENT_LIFECYCLE'] },
{ path: 'event.detail', type: 'string', description: 'Free-text event detail', sampleValue: 'orders-0 STALE -> DEAD',
availableForKinds: ['AGENT_LIFECYCLE'], mayBeNull: true },
// ROUTE_METRIC + EXCHANGE_MATCH share route.*
{ path: 'route.id', type: 'string', description: 'Route ID', sampleValue: 'route-1',
availableForKinds: ['ROUTE_METRIC', 'EXCHANGE_MATCH'] },
@@ -56,7 +66,7 @@ export const ALERT_VARIABLES: AlertVariable[] = [
// AGENT_STATE + JVM_METRIC share agent.id/name (agent.id is also available for AGENT_LIFECYCLE); AGENT_STATE adds agent.state
{ path: 'agent.id', type: 'string', description: 'Agent instance ID', sampleValue: 'prod-orders-0',
availableForKinds: ['AGENT_STATE', 'JVM_METRIC'] },
availableForKinds: ['AGENT_STATE', 'AGENT_LIFECYCLE', 'JVM_METRIC'] },
{ path: 'agent.name', type: 'string', description: 'Agent display name', sampleValue: 'orders-0',
availableForKinds: ['AGENT_STATE', 'JVM_METRIC'] },
{ path: 'agent.state', type: 'string', description: 'Agent state', sampleValue: 'DEAD',

View File

@@ -3,6 +3,7 @@ import type { FormState } from './form-state';
import { RouteMetricForm } from './condition-forms/RouteMetricForm';
import { ExchangeMatchForm } from './condition-forms/ExchangeMatchForm';
import { AgentStateForm } from './condition-forms/AgentStateForm';
import { AgentLifecycleForm } from './condition-forms/AgentLifecycleForm';
import { DeploymentStateForm } from './condition-forms/DeploymentStateForm';
import { LogPatternForm } from './condition-forms/LogPatternForm';
import { JvmMetricForm } from './condition-forms/JvmMetricForm';
@@ -23,6 +24,13 @@ export function ConditionStep({ form, setForm }: { form: FormState; setForm: (f:
base.perExchangeLingerSeconds = 300;
base.filter = {};
}
if (kind === 'AGENT_LIFECYCLE') {
// Sensible defaults so a rule can be saved without touching every sub-field.
// WENT_DEAD is the most "alert-worthy" event out of the six; a 5-minute
// window matches the registry's STALE→DEAD cadence + slack for tick jitter.
base.eventTypes = ['WENT_DEAD'];
base.withinSeconds = 300;
}
setForm({
...form,
conditionKind: kind,
@@ -42,6 +50,7 @@ export function ConditionStep({ form, setForm }: { form: FormState; setForm: (f:
{form.conditionKind === 'ROUTE_METRIC' && <RouteMetricForm form={form} setForm={setForm} />}
{form.conditionKind === 'EXCHANGE_MATCH' && <ExchangeMatchForm form={form} setForm={setForm} />}
{form.conditionKind === 'AGENT_STATE' && <AgentStateForm form={form} setForm={setForm} />}
{form.conditionKind === 'AGENT_LIFECYCLE' && <AgentLifecycleForm form={form} setForm={setForm} />}
{form.conditionKind === 'DEPLOYMENT_STATE' && <DeploymentStateForm form={form} setForm={setForm} />}
{form.conditionKind === 'LOG_PATTERN' && <LogPatternForm form={form} setForm={setForm} />}
{form.conditionKind === 'JVM_METRIC' && <JvmMetricForm form={form} setForm={setForm} />}

View File

@@ -0,0 +1,72 @@
import { FormField, Input } from '@cameleer/design-system';
import type { FormState } from '../form-state';
import {
AGENT_LIFECYCLE_EVENT_TYPE_OPTIONS,
type AgentLifecycleEventType,
} from '../../enums';
/**
 * Form for `AGENT_LIFECYCLE` conditions. Users pick one or more event types
 * (allowlist only) and a lookback window in seconds. The evaluator queries
 * `agent_events` with those filters; each matching row produces its own
 * `AlertInstance`.
 *
 * @param form - Current wizard state; only `form.condition` is read here.
 * @param setForm - Called with a copy of `form` whose condition carries the edit.
 */
export function AgentLifecycleForm({ form, setForm }: { form: FormState; setForm: (f: FormState) => void }) {
  const c = form.condition as Record<string, unknown>;
  // Tolerate a condition that has no (or a malformed) eventTypes field yet.
  const selected = new Set<AgentLifecycleEventType>(
    Array.isArray(c.eventTypes) ? (c.eventTypes as AgentLifecycleEventType[]) : [],
  );

  // Shallow-merge a partial condition into the form state.
  const patch = (p: Record<string, unknown>) =>
    setForm({
      ...form,
      condition: { ...(form.condition as Record<string, unknown>), ...p } as FormState['condition'],
    });

  // Flip membership of one event type; the Set keeps insertion order, so
  // already-selected types stay in place when another chip is toggled.
  const toggle = (t: AgentLifecycleEventType) => {
    const next = new Set(selected);
    if (next.has(t)) next.delete(t); else next.add(t);
    patch({ eventTypes: [...next] });
  };

  return (
    <>
      <FormField
        label="Event types"
        hint="Fires one alert per matching event. Pick at least one."
      >
        <div role="group" aria-label="Event types" style={{ display: 'flex', flexWrap: 'wrap', gap: 6 }}>
          {AGENT_LIFECYCLE_EVENT_TYPE_OPTIONS.map((opt) => {
            const active = selected.has(opt.value);
            return (
              <button
                key={opt.value}
                type="button"
                // Selection is otherwise signalled by colour alone; expose it
                // to assistive technology as a toggle-button state.
                aria-pressed={active}
                onClick={() => toggle(opt.value)}
                style={{
                  border: `1px solid ${active ? 'var(--amber)' : 'var(--border-subtle)'}`,
                  background: active ? 'var(--amber-bg)' : 'transparent',
                  color: active ? 'var(--text-primary)' : 'var(--text-secondary)',
                  borderRadius: 999,
                  padding: '4px 10px',
                  fontSize: 12,
                  cursor: 'pointer',
                }}
              >
                {opt.label}
              </button>
            );
          })}
        </div>
      </FormField>
      <FormField label="Lookback window (seconds)" hint="How far back to search for matching events each tick.">
        <Input
          type="number"
          min={1}
          value={(c.withinSeconds as number | undefined) ?? 300}
          onChange={(e) => patch({ withinSeconds: Number(e.target.value) })}
        />
      </FormField>
    </>
  );
}

View File

@@ -160,6 +160,13 @@ export function validateStep(step: WizardStep, f: FormState): string[] {
if (c.windowSeconds == null) errs.push('Window (seconds) is required for COUNT_IN_WINDOW.');
}
}
if (f.conditionKind === 'AGENT_LIFECYCLE') {
  const c = f.condition as Record<string, unknown>;
  // Treat a missing or non-array eventTypes as "nothing selected".
  const types = Array.isArray(c.eventTypes) ? (c.eventTypes as string[]) : [];
  if (types.length === 0) errs.push('Pick at least one event type.');
  const within = c.withinSeconds as number | undefined;
  // NaN compares false against everything, so `within < 1` alone would let it
  // through; guard it explicitly so an unparseable value is reported here.
  if (within == null || Number.isNaN(within) || within < 1) {
    errs.push('Lookback window must be \u2265 1 second.');
  }
}
}
if (step === 'trigger') {
if (f.evaluationIntervalSeconds < 5) errs.push('Evaluation interval must be \u2265 5 s.');

View File

@@ -7,6 +7,7 @@ import {
JVM_AGGREGATION_OPTIONS,
EXCHANGE_FIRE_MODE_OPTIONS,
TARGET_KIND_OPTIONS,
AGENT_LIFECYCLE_EVENT_TYPE_OPTIONS,
} from './enums';
/**
@@ -25,12 +26,24 @@ describe('alerts/enums option arrays', () => {
{ value: 'ROUTE_METRIC', label: 'Route metric (error rate, latency, throughput)' },
{ value: 'EXCHANGE_MATCH', label: 'Exchange match (specific failures)' },
{ value: 'AGENT_STATE', label: 'Agent state (DEAD / STALE)' },
{ value: 'AGENT_LIFECYCLE', label: 'Agent lifecycle (register / restart / stale / dead)' },
{ value: 'DEPLOYMENT_STATE', label: 'Deployment state (FAILED / DEGRADED)' },
{ value: 'LOG_PATTERN', label: 'Log pattern (count of matching logs)' },
{ value: 'JVM_METRIC', label: 'JVM metric (heap, GC, inflight)' },
]);
});
// Pins the exact values, labels and ordering of the lifecycle event-type
// options; `toEqual` on an array is order-sensitive, so any reordering fails.
it('AGENT_LIFECYCLE_EVENT_TYPE_OPTIONS', () => {
expect(AGENT_LIFECYCLE_EVENT_TYPE_OPTIONS).toEqual([
{ value: 'WENT_STALE', label: 'Went stale (heartbeat missed)' },
{ value: 'WENT_DEAD', label: 'Went dead (extended silence)' },
{ value: 'RECOVERED', label: 'Recovered (stale → live)' },
{ value: 'REGISTERED', label: 'Registered (first check-in)' },
{ value: 'RE_REGISTERED', label: 'Re-registered (app restart)' },
{ value: 'DEREGISTERED', label: 'Deregistered (graceful shutdown)' },
]);
});
it('SEVERITY_OPTIONS', () => {
expect(SEVERITY_OPTIONS).toEqual([
{ value: 'CRITICAL', label: 'Critical' },

View File

@@ -44,6 +44,13 @@ export type RouteMetric = 'ERROR_RATE' | 'AVG_DURATION_MS' | 'P99_LATENCY_M
export type Comparator = 'GT' | 'GTE' | 'LT' | 'LTE' | 'EQ';
export type JvmAggregation = 'MAX' | 'MIN' | 'AVG' | 'LATEST';
export type ExchangeFireMode = 'PER_EXCHANGE' | 'COUNT_IN_WINDOW';
/** Agent lifecycle event types that an `AGENT_LIFECYCLE` condition may select. */
export type AgentLifecycleEventType =
  'REGISTERED' | 'RE_REGISTERED' | 'DEREGISTERED' | 'WENT_STALE' | 'WENT_DEAD' | 'RECOVERED';
export interface Option<T extends string> { value: T; label: string }
@@ -73,6 +80,7 @@ const CONDITION_KIND_LABELS: Record<ConditionKind, string> = {
ROUTE_METRIC: 'Route metric (error rate, latency, throughput)',
EXCHANGE_MATCH: 'Exchange match (specific failures)',
AGENT_STATE: 'Agent state (DEAD / STALE)',
AGENT_LIFECYCLE: 'Agent lifecycle (register / restart / stale / dead)',
DEPLOYMENT_STATE: 'Deployment state (FAILED / DEGRADED)',
LOG_PATTERN: 'Log pattern (count of matching logs)',
JVM_METRIC: 'JVM metric (heap, GC, inflight)',
@@ -114,6 +122,15 @@ const EXCHANGE_FIRE_MODE_LABELS: Record<ExchangeFireMode, string> = {
COUNT_IN_WINDOW: 'Threshold: N matches in window',
};
// Human-readable label for every lifecycle event type. The Record type forces
// an entry per union member. NOTE(review): key order here matches the option
// order pinned in enums.test.ts — presumably toOptions preserves it; confirm
// before reordering.
const AGENT_LIFECYCLE_EVENT_TYPE_LABELS: Record<AgentLifecycleEventType, string> = {
WENT_STALE: 'Went stale (heartbeat missed)',
WENT_DEAD: 'Went dead (extended silence)',
RECOVERED: 'Recovered (stale → live)',
REGISTERED: 'Registered (first check-in)',
RE_REGISTERED: 'Re-registered (app restart)',
DEREGISTERED: 'Deregistered (graceful shutdown)',
};
const TARGET_KIND_LABELS: Record<TargetKind, string> = {
USER: 'User',
GROUP: 'Group',
@@ -147,3 +164,5 @@ export const COMPARATOR_OPTIONS: Option<Comparator>[] = toOptions
export const JVM_AGGREGATION_OPTIONS: Option<JvmAggregation>[] = toOptions(JVM_AGGREGATION_LABELS, JVM_AGGREGATION_HIDDEN);
export const EXCHANGE_FIRE_MODE_OPTIONS: Option<ExchangeFireMode>[] = toOptions(EXCHANGE_FIRE_MODE_LABELS);
export const TARGET_KIND_OPTIONS: Option<TargetKind>[] = toOptions(TARGET_KIND_LABELS);
/** Value/label options for the lifecycle event-type picker, built from the label map. */
export const AGENT_LIFECYCLE_EVENT_TYPE_OPTIONS: Option<AgentLifecycleEventType>[] =
toOptions(AGENT_LIFECYCLE_EVENT_TYPE_LABELS);