fix(alerting): allow multiple open alert_instances per rule for PER_EXCHANGE
All checks were successful
CI / cleanup-branch (push) Has been skipped
CI / build (push) Successful in 1m51s
CI / docker (push) Successful in 1m17s
CI / deploy-feature (push) Has been skipped
CI / deploy (push) Successful in 41s

V13 added a partial unique index on alert_instances(rule_id) WHERE state
IN (PENDING,FIRING,ACKNOWLEDGED). Correct for scalar condition kinds
(ROUTE_METRIC / AGENT_STATE / DEPLOYMENT_STATE / LOG_PATTERN / JVM_METRIC
/ EXCHANGE_MATCH in COUNT_IN_WINDOW) but wrong for EXCHANGE_MATCH /
PER_EXCHANGE, which by design emits one alert_instance per matching
exchange. Under V13 every PER_EXCHANGE tick with >1 match logged
"Skipped duplicate open alert_instance for rule …" at evaluator cadence
and silently lost alert fidelity — only the first matching exchange per
tick got an AlertInstance + webhook dispatch.

V15 drops the rule_id-only constraint and recreates it with a
discriminator on context->'exchange'->>'id'. Scalar kinds emit
Map.of() as context, so their expression resolves to '' — "one open per
rule" preserved. ExchangeMatchEvaluator.evaluatePerExchange always
populates exchange.id, so per-exchange instances coexist cleanly.

Two new PostgresAlertInstanceRepositoryIT tests:
  - multiple open instances for same rule + distinct exchanges all land
  - second open for identical (rule, exchange) still dedups via the
    DuplicateKeyException fallback in save() — defense-in-depth kept

Also fixes pre-existing PostgresAlertReadRepositoryIT brokenness: its
setup() inserted 3 open instances sharing one rule_id, which V13 blocked
on arrival. Migrate to one rule_id per instance (pattern already used
across other storage ITs).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-20 22:26:19 +02:00
parent e7ce1a73d0
commit 037a27d405
3 changed files with 108 additions and 8 deletions

View File

@@ -0,0 +1,24 @@
-- V15 — Discriminate open-alert_instance uniqueness by exchange id for PER_EXCHANGE rules.
--
-- V13 enforced "one open alert_instance per rule" via a partial unique on
-- (rule_id). That is correct for every scalar condition kind (ROUTE_METRIC,
-- AGENT_STATE, DEPLOYMENT_STATE, LOG_PATTERN, JVM_METRIC, and EXCHANGE_MATCH
-- in COUNT_IN_WINDOW mode) but wrong for EXCHANGE_MATCH / PER_EXCHANGE, which
-- by design emits one alert_instance per matching exchange. Under V13 every
-- PER_EXCHANGE tick with >1 match logged "Skipped duplicate open alert_instance
-- for rule …" at evaluator cadence and silently lost alert fidelity — only the
-- first matching exchange per tick got an AlertInstance + webhook dispatch.
--
-- New discriminator: (rule_id, COALESCE(context->'exchange'->>'id', '')).
-- Scalar evaluators emit Map.of() (no exchange subtree), the expression
-- resolves to '' for all of them, so they continue to get "one open per rule".
-- ExchangeMatchEvaluator.evaluatePerExchange emits {exchange.id = <execId>}
-- per firing, so PER_EXCHANGE instances for distinct exchanges coexist.
-- Two attempts to open an instance for the same (rule, exchange) still collapse
-- to one — the repo's DuplicateKeyException fallback preserves defense-in-depth.
DROP INDEX IF EXISTS alert_instances_open_rule_uq;
CREATE UNIQUE INDEX alert_instances_open_rule_uq
ON alert_instances (rule_id, (COALESCE(context->'exchange'->>'id', '')))
WHERE rule_id IS NOT NULL
AND state IN ('PENDING','FIRING','ACKNOWLEDGED');

View File

@@ -243,6 +243,61 @@ class PostgresAlertInstanceRepositoryIT extends AbstractPostgresIT {
// Extra rules are cleaned up by @AfterEach via env-scoped DELETE // Extra rules are cleaned up by @AfterEach via env-scoped DELETE
} }
@Test
void save_allowsMultipleOpenInstancesForSameRuleWithDifferentExchangeIds() {
// PER_EXCHANGE mode (EXCHANGE_MATCH rules) emits one alert_instance per
// matching exchange — the design requires many open instances per rule,
// each carrying a distinct exchange.id in its context. The open-rule
// partial unique must therefore discriminate on (rule_id, exchange.id),
// not on rule_id alone. Regression: V13's original (rule_id)-only
// constraint masked this by catching DuplicateKeyException and silently
// dropping every per-exchange firing after the first — logs filled with
// "Skipped duplicate open alert_instance for rule …" at evaluator cadence
// and PER_EXCHANGE notifications were never dispatched.
var exchangeA = newInstanceWithExchange(ruleId, "exchange-aaa");
var exchangeB = newInstanceWithExchange(ruleId, "exchange-bbb");
var exchangeC = newInstanceWithExchange(ruleId, "exchange-ccc");
repo.save(exchangeA);
repo.save(exchangeB);
repo.save(exchangeC);
assertThat(repo.findById(exchangeA.id())).isPresent();
assertThat(repo.findById(exchangeB.id())).isPresent();
assertThat(repo.findById(exchangeC.id())).isPresent();
Long count = jdbcTemplate.queryForObject(
"SELECT COUNT(*) FROM alert_instances " +
" WHERE rule_id = ? AND state IN ('PENDING','FIRING','ACKNOWLEDGED')",
Long.class, ruleId);
assertThat(count).isEqualTo(3L);
}
@Test
void save_rejectsSecondOpenInstanceForSameRuleAndExchange() {
// The partial unique still prevents genuine duplicates: two open instances
// for the same (rule, exchange) pair must collapse to one. The repo's
// DuplicateKeyException fallback returns the already-persisted instance.
var first = newInstanceWithExchange(ruleId, "exchange-dup");
var second = newInstanceWithExchange(ruleId, "exchange-dup");
repo.save(first);
var returned = repo.save(second);
// Second insert was suppressed; the repo returned the existing instance.
assertThat(returned.id()).isEqualTo(first.id());
assertThat(repo.findById(first.id())).isPresent();
assertThat(repo.findById(second.id())).isEmpty();
Long count = jdbcTemplate.queryForObject(
"SELECT COUNT(*) FROM alert_instances " +
" WHERE rule_id = ? AND state IN ('PENDING','FIRING','ACKNOWLEDGED')",
Long.class, ruleId);
assertThat(count).isEqualTo(1L);
}
@Test @Test
void markSilenced_togglesToTrue() { void markSilenced_togglesToTrue() {
var inst = newInstance(ruleId, List.of(userId), List.of(), List.of()); var inst = newInstance(ruleId, List.of(userId), List.of(), List.of());
@@ -276,6 +331,23 @@ class PostgresAlertInstanceRepositoryIT extends AbstractPostgresIT {
userIds, groupIds, roleNames); userIds, groupIds, roleNames);
} }
/**
* Creates an open FIRING instance carrying a PER_EXCHANGE-shaped context:
* {@code context.exchange.id = <exchangeId>}. Mirrors what
* {@code ExchangeMatchEvaluator.evaluatePerExchange} emits for each matching
* exchange in a Batch result.
*/
private AlertInstance newInstanceWithExchange(UUID ruleId, String exchangeId) {
return new AlertInstance(
UUID.randomUUID(), ruleId, Map.of(), envId,
AlertState.FIRING, AlertSeverity.WARNING,
Instant.now(), null, null, null, null,
false, null, null,
Map.of("exchange", Map.of("id", exchangeId)),
"title", "message",
List.of(userId), List.of(), List.of());
}
/** Inserts a minimal alert_rule with the given severity. */ /** Inserts a minimal alert_rule with the given severity. */
private UUID seedRuleWithSeverity(String name, AlertSeverity severity) { private UUID seedRuleWithSeverity(String name, AlertSeverity severity) {
UUID id = UUID.randomUUID(); UUID id = UUID.randomUUID();

View File

@@ -31,7 +31,6 @@ class PostgresAlertReadRepositoryIT extends AbstractPostgresIT {
instanceId1 = UUID.randomUUID(); instanceId1 = UUID.randomUUID();
instanceId2 = UUID.randomUUID(); instanceId2 = UUID.randomUUID();
instanceId3 = UUID.randomUUID(); instanceId3 = UUID.randomUUID();
UUID ruleId = UUID.randomUUID();
jdbcTemplate.update( jdbcTemplate.update(
"INSERT INTO environments (id, slug, display_name) VALUES (?, ?, ?)", "INSERT INTO environments (id, slug, display_name) VALUES (?, ?, ?)",
@@ -41,18 +40,23 @@ class PostgresAlertReadRepositoryIT extends AbstractPostgresIT {
jdbcTemplate.update( jdbcTemplate.update(
"INSERT INTO users (user_id, provider, email) VALUES (?, 'local', ?) ON CONFLICT (user_id) DO NOTHING", "INSERT INTO users (user_id, provider, email) VALUES (?, 'local', ?) ON CONFLICT (user_id) DO NOTHING",
userId, userId + "@example.com"); userId, userId + "@example.com");
jdbcTemplate.update(
"INSERT INTO alert_rules (id, environment_id, name, severity, condition_kind, condition, " +
"notification_title_tmpl, notification_message_tmpl, created_by, updated_by) " +
"VALUES (?, ?, 'rule', 'WARNING', 'AGENT_STATE', '{}'::jsonb, 't', 'm', 'sys-user', 'sys-user')",
ruleId, envId);
for (UUID id : List.of(instanceId1, instanceId2, instanceId3)) { // Each open alert_instance needs its own rule_id — the alert_instances_open_rule_uq
// partial unique forbids multiple open instances sharing the same rule_id + exchange
// discriminator (V13/V15). Three separate rules let all three instances coexist
// in FIRING state so alert_reads tests can target each one independently.
for (UUID instanceId : List.of(instanceId1, instanceId2, instanceId3)) {
UUID ruleId = UUID.randomUUID();
jdbcTemplate.update(
"INSERT INTO alert_rules (id, environment_id, name, severity, condition_kind, condition, " +
"notification_title_tmpl, notification_message_tmpl, created_by, updated_by) " +
"VALUES (?, ?, ?, 'WARNING', 'AGENT_STATE', '{}'::jsonb, 't', 'm', 'sys-user', 'sys-user')",
ruleId, envId, "rule-" + instanceId);
jdbcTemplate.update( jdbcTemplate.update(
"INSERT INTO alert_instances (id, rule_id, rule_snapshot, environment_id, state, severity, " + "INSERT INTO alert_instances (id, rule_id, rule_snapshot, environment_id, state, severity, " +
"fired_at, context, title, message) VALUES (?, ?, '{}'::jsonb, ?, 'FIRING', 'WARNING', " + "fired_at, context, title, message) VALUES (?, ?, '{}'::jsonb, ?, 'FIRING', 'WARNING', " +
"now(), '{}'::jsonb, 'title', 'msg')", "now(), '{}'::jsonb, 'title', 'msg')",
id, ruleId, envId); instanceId, ruleId, envId);
} }
} }