feat(alerting): AGENT_LIFECYCLE condition kind with per-subject fire mode
Allows alert rules to fire on agent-lifecycle events — REGISTERED, RE_REGISTERED, DEREGISTERED, WENT_STALE, WENT_DEAD, RECOVERED — rather than only on current state. Each matching `(agent, eventType, timestamp)` becomes its own ackable AlertInstance, so outages on distinct agents are independently routable.

Core:
- New `ConditionKind.AGENT_LIFECYCLE` + `AgentLifecycleCondition` record (scope, eventTypes, withinSeconds). Compact ctor rejects empty eventTypes and withinSeconds<1.
- Strict allowlist enum `AgentLifecycleEventType` (six entries matching the server-emitted types in `AgentRegistrationController` and `AgentLifecycleMonitor`). Custom agent-emitted event types tracked in backlog issue #145.
- `AgentEventRepository.findInWindow(env, appSlug, agentId, eventTypes, from, to, limit)` — new read path ordered `(timestamp ASC, insert_id ASC)` used by the evaluator. Implemented on `ClickHouseAgentEventRepository` with tenant + env filter mandatory.

App:
- `AgentLifecycleEvaluator` queries events in the last `withinSeconds` window and returns `EvalResult.Batch` with one `Firing` per row. Every Firing carries a canonical `_subjectFingerprint` of `"<agentId>:<eventType>:<tsMillis>"` in context plus `agent` / `event` subtrees for Mustache templating.
- `NotificationContextBuilder` gains an `AGENT_LIFECYCLE` branch that exposes `{{agent.id}}`, `{{agent.app}}`, `{{event.type}}`, `{{event.timestamp}}`, `{{event.detail}}`.
- Validation is delegated to the record compact ctor + enum at Jackson deserialization time — matches the existing policy of keeping controller validators focused on env-scoped / SQL-injection concerns.

Schema:
- V16 migration generalises the V15 per-exchange discriminator on `alert_instances_open_rule_uq` to prefer `_subjectFingerprint` with a fallback to the legacy `exchange.id` expression. Scalar kinds still resolve to `''` and keep one-open-per-rule. Duplicate-key path in `PostgresAlertInstanceRepository.save` is unchanged — the index is the deduper.
UI:
- New `AgentLifecycleForm.tsx` wizard form with multi-select chips for the six allowed event types + `withinSeconds` input. Wired into `ConditionStep`, `form-state` (validation + defaults: WENT_DEAD, 300 s), and `enums.ts` options. Tests in `enums.test.ts` pin the new option array.
- `alert-variables.ts` registers `{{agent.app}}`, `{{event.type}}`, `{{event.timestamp}}`, `{{event.detail}}` leaves for the new kind, and extends `agent.id`'s availability list to include `AGENT_LIFECYCLE`.

Tests (all passing):
- 5 new JSON-roundtrip cases on `AlertConditionJsonTest` (positive + empty/zero/unknown-type rejection).
- 5 new evaluator unit tests on `AgentLifecycleEvaluatorTest` (empty window, multi-agent fingerprint shape, scope forwarding, missing env).
- `NotificationContextBuilderTest` switch now covers the new kind.
- 119 alerting unit tests + 71 UI tests green.

Docs: `.claude/rules/{core,app,ui}` and CLAUDE.md migration list updated.
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package com.cameleer.server.core.agent;
|
||||
|
||||
import java.time.Instant;
|
||||
import java.util.List;
|
||||
|
||||
public interface AgentEventRepository {
|
||||
|
||||
@@ -13,4 +14,19 @@ public interface AgentEventRepository {
|
||||
*/
|
||||
AgentEventPage queryPage(String applicationId, String instanceId, String environment,
|
||||
Instant from, Instant to, String cursor, int limit);
|
||||
|
||||
/**
|
||||
* Inclusive-exclusive window query ordered by (timestamp ASC, instance_id ASC)
|
||||
* used by the AGENT_LIFECYCLE alert evaluator. {@code eventTypes} is required
|
||||
* and must be non-empty; the implementation filters via {@code event_type IN (...)}.
|
||||
* Scope filters ({@code applicationId}, {@code instanceId}) are optional. The
|
||||
* returned list is capped at {@code limit} rows.
|
||||
*/
|
||||
List<AgentEventRecord> findInWindow(String environment,
|
||||
String applicationId,
|
||||
String instanceId,
|
||||
List<String> eventTypes,
|
||||
Instant fromInclusive,
|
||||
Instant toExclusive,
|
||||
int limit);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
package com.cameleer.server.core.alerting;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Fires one {@code AlertInstance} per matching {@code agent_events} row in the
|
||||
* lookback window. Per-subject fire mode (see
|
||||
* {@link AgentLifecycleEventType}) — each {@code (agent, eventType, timestamp)}
|
||||
* tuple is independently ackable, driven by a canonical
|
||||
* {@code _subjectFingerprint} in the instance context and the partial unique
|
||||
* index on {@code alert_instances}.
|
||||
*/
|
||||
public record AgentLifecycleCondition(
|
||||
AlertScope scope,
|
||||
List<AgentLifecycleEventType> eventTypes,
|
||||
int withinSeconds
|
||||
) implements AlertCondition {
|
||||
|
||||
public AgentLifecycleCondition {
|
||||
if (eventTypes == null || eventTypes.isEmpty()) {
|
||||
throw new IllegalArgumentException("eventTypes must not be empty");
|
||||
}
|
||||
if (withinSeconds < 1) {
|
||||
throw new IllegalArgumentException("withinSeconds must be >= 1");
|
||||
}
|
||||
eventTypes = List.copyOf(eventTypes);
|
||||
}
|
||||
|
||||
@Override
|
||||
@JsonProperty(value = "kind", access = JsonProperty.Access.READ_ONLY)
|
||||
public ConditionKind kind() { return ConditionKind.AGENT_LIFECYCLE; }
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
package com.cameleer.server.core.alerting;
|
||||
|
||||
/**
 * Allowlist of agent-lifecycle event types that may appear in an
 * {@link AgentLifecycleCondition}.
 * <p>
 * The set matches exactly the events the server writes to {@code agent_events}:
 * the registration controller emits {@link #REGISTERED}, {@link #RE_REGISTERED}
 * and {@link #DEREGISTERED}; the lifecycle monitor emits {@link #WENT_STALE},
 * {@link #WENT_DEAD} and {@link #RECOVERED}.
 * <p>
 * Custom agent-emitted event types (via {@code POST /api/v1/data/events})
 * are intentionally excluded — see backlog issue #145.
 */
public enum AgentLifecycleEventType {
    // Emitted by the registration controller.
    REGISTERED,
    RE_REGISTERED,
    DEREGISTERED,

    // Emitted by the lifecycle monitor.
    WENT_STALE,
    WENT_DEAD,
    RECOVERED
}
|
||||
@@ -9,13 +9,15 @@ import com.fasterxml.jackson.annotation.JsonTypeInfo;
|
||||
@JsonSubTypes.Type(value = RouteMetricCondition.class, name = "ROUTE_METRIC"),
|
||||
@JsonSubTypes.Type(value = ExchangeMatchCondition.class, name = "EXCHANGE_MATCH"),
|
||||
@JsonSubTypes.Type(value = AgentStateCondition.class, name = "AGENT_STATE"),
|
||||
@JsonSubTypes.Type(value = AgentLifecycleCondition.class, name = "AGENT_LIFECYCLE"),
|
||||
@JsonSubTypes.Type(value = DeploymentStateCondition.class, name = "DEPLOYMENT_STATE"),
|
||||
@JsonSubTypes.Type(value = LogPatternCondition.class, name = "LOG_PATTERN"),
|
||||
@JsonSubTypes.Type(value = JvmMetricCondition.class, name = "JVM_METRIC")
|
||||
})
|
||||
public sealed interface AlertCondition permits
|
||||
RouteMetricCondition, ExchangeMatchCondition, AgentStateCondition,
|
||||
DeploymentStateCondition, LogPatternCondition, JvmMetricCondition {
|
||||
AgentLifecycleCondition, DeploymentStateCondition, LogPatternCondition,
|
||||
JvmMetricCondition {
|
||||
|
||||
@JsonProperty("kind")
|
||||
ConditionKind kind();
|
||||
|
||||
@@ -1,3 +1,11 @@
|
||||
package com.cameleer.server.core.alerting;
|
||||
|
||||
public enum ConditionKind { ROUTE_METRIC, EXCHANGE_MATCH, AGENT_STATE, DEPLOYMENT_STATE, LOG_PATTERN, JVM_METRIC }
|
||||
/**
 * Discriminator for the kinds of alert condition. Every {@code AlertCondition}
 * reports its kind through {@code kind()}, which is serialized as the JSON
 * property {@code kind}.
 */
public enum ConditionKind {
    ROUTE_METRIC,
    EXCHANGE_MATCH,
    AGENT_STATE,
    /** Reported by {@code AgentLifecycleCondition}. */
    AGENT_LIFECYCLE,
    DEPLOYMENT_STATE,
    LOG_PATTERN,
    JVM_METRIC
}
|
||||
|
||||
@@ -101,4 +101,50 @@ class AlertConditionJsonTest {
|
||||
AlertCondition parsed = om.readValue(om.writeValueAsString((AlertCondition) c), AlertCondition.class);
|
||||
assertThat(parsed).isInstanceOf(JvmMetricCondition.class);
|
||||
}
|
||||
|
||||
@Test
|
||||
void roundtripAgentLifecycle() throws Exception {
|
||||
var c = new AgentLifecycleCondition(
|
||||
new AlertScope("orders", null, null),
|
||||
List.of(AgentLifecycleEventType.WENT_DEAD, AgentLifecycleEventType.DEREGISTERED),
|
||||
300);
|
||||
AlertCondition parsed = om.readValue(om.writeValueAsString((AlertCondition) c), AlertCondition.class);
|
||||
assertThat(parsed).isInstanceOf(AgentLifecycleCondition.class);
|
||||
var alc = (AgentLifecycleCondition) parsed;
|
||||
assertThat(alc.eventTypes()).containsExactly(
|
||||
AgentLifecycleEventType.WENT_DEAD, AgentLifecycleEventType.DEREGISTERED);
|
||||
assertThat(alc.withinSeconds()).isEqualTo(300);
|
||||
assertThat(alc.kind()).isEqualTo(ConditionKind.AGENT_LIFECYCLE);
|
||||
}
|
||||
|
||||
@Test
|
||||
void agentLifecycleRejectsEmptyEventTypes() {
|
||||
assertThatThrownBy(() -> new AgentLifecycleCondition(
|
||||
new AlertScope(null, null, null), List.of(), 60))
|
||||
.isInstanceOf(IllegalArgumentException.class)
|
||||
.hasMessageContaining("eventTypes");
|
||||
}
|
||||
|
||||
@Test
|
||||
void agentLifecycleRejectsZeroWindow() {
|
||||
assertThatThrownBy(() -> new AgentLifecycleCondition(
|
||||
new AlertScope(null, null, null),
|
||||
List.of(AgentLifecycleEventType.WENT_DEAD), 0))
|
||||
.isInstanceOf(IllegalArgumentException.class)
|
||||
.hasMessageContaining("withinSeconds");
|
||||
}
|
||||
|
||||
@Test
|
||||
void agentLifecycleRejectsUnknownEventTypeOnDeserialization() {
|
||||
String json = """
|
||||
{
|
||||
"kind": "AGENT_LIFECYCLE",
|
||||
"scope": {},
|
||||
"eventTypes": ["REGISTERED", "BOGUS_EVENT"],
|
||||
"withinSeconds": 60
|
||||
}
|
||||
""";
|
||||
assertThatThrownBy(() -> om.readValue(json, AlertCondition.class))
|
||||
.hasMessageContaining("BOGUS_EVENT");
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user