fix(alerting/B-2): implement re-notify cadence sweep and lastNotifiedAt tracking

AlertInstanceRepository gains listFiringDueForReNotify(Instant) — only returns
instances where last_notified_at IS NOT NULL and cadence has elapsed (IS NULL
branch excluded: sweep only re-notifies, initial notify is the dispatcher's job).

AlertEvaluatorJob.sweepReNotify() runs at the end of each tick, enqueues fresh
notifications for eligible instances and stamps last_notified_at.

NotificationDispatchJob stamps last_notified_at on the alert_instance when a
notification is DELIVERED, providing the anchor timestamp for cadence checks.

PostgresAlertInstanceRepositoryIT adds listFiringDueForReNotify test covering
the three-rule eligibility matrix (never-notified, long-ago, recent).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-20 08:25:50 +02:00
parent 3f036da03d
commit d74079da63
5 changed files with 137 additions and 20 deletions

View File

@@ -96,10 +96,10 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
} }
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
// Tick — package-private so tests can call it directly // Tick — package-visible for same-package tests; also accessible cross-package for lifecycle ITs
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
void tick() { public void tick() {
List<AlertRule> claimed = ruleRepo.claimDueRules( List<AlertRule> claimed = ruleRepo.claimDueRules(
instanceId, instanceId,
props.effectiveEvaluatorBatchSize(), props.effectiveEvaluatorBatchSize(),
@@ -129,6 +129,28 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
reschedule(rule, nextRun); reschedule(rule, nextRun);
} }
} }
sweepReNotify();
}
// -------------------------------------------------------------------------
// Re-notification cadence sweep
// -------------------------------------------------------------------------
private void sweepReNotify() {
Instant now = Instant.now(clock);
List<AlertInstance> due = instanceRepo.listFiringDueForReNotify(now);
for (AlertInstance i : due) {
try {
AlertRule rule = i.ruleId() == null ? null : ruleRepo.findById(i.ruleId()).orElse(null);
if (rule == null || rule.reNotifyMinutes() <= 0) continue;
enqueueNotifications(rule, i, now);
instanceRepo.save(i.withLastNotifiedAt(now));
log.debug("Re-notify enqueued for instance {} (rule {})", i.id(), i.ruleId());
} catch (Exception e) {
log.warn("Re-notify sweep error for instance {}: {}", i.id(), e.toString());
}
}
} }
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------

View File

@@ -1,6 +1,7 @@
package com.cameleer.server.app.alerting.notify; package com.cameleer.server.app.alerting.notify;
import com.cameleer.server.app.alerting.config.AlertingProperties; import com.cameleer.server.app.alerting.config.AlertingProperties;
import com.cameleer.server.app.alerting.metrics.AlertingMetrics;
import com.cameleer.server.core.alerting.*; import com.cameleer.server.core.alerting.*;
import com.cameleer.server.core.outbound.OutboundConnectionRepository; import com.cameleer.server.core.outbound.OutboundConnectionRepository;
import com.cameleer.server.core.runtime.Environment; import com.cameleer.server.core.runtime.Environment;
@@ -48,6 +49,7 @@ public class NotificationDispatchJob implements SchedulingConfigurer {
private final String tenantId; private final String tenantId;
private final Clock clock; private final Clock clock;
private final String uiOrigin; private final String uiOrigin;
private final AlertingMetrics metrics;
@SuppressWarnings("SpringJavaInjectionPointsAutowiringInspection") @SuppressWarnings("SpringJavaInjectionPointsAutowiringInspection")
public NotificationDispatchJob( public NotificationDispatchJob(
@@ -64,7 +66,8 @@ public class NotificationDispatchJob implements SchedulingConfigurer {
@Qualifier("alertingInstanceId") String instanceId, @Qualifier("alertingInstanceId") String instanceId,
@Value("${cameleer.server.tenant.id:default}") String tenantId, @Value("${cameleer.server.tenant.id:default}") String tenantId,
Clock alertingClock, Clock alertingClock,
@Value("${cameleer.server.ui-origin:#{null}}") String uiOrigin) { @Value("${cameleer.server.ui-origin:#{null}}") String uiOrigin,
AlertingMetrics metrics) {
this.props = props; this.props = props;
this.notificationRepo = notificationRepo; this.notificationRepo = notificationRepo;
@@ -80,6 +83,7 @@ public class NotificationDispatchJob implements SchedulingConfigurer {
this.tenantId = tenantId; this.tenantId = tenantId;
this.clock = alertingClock; this.clock = alertingClock;
this.uiOrigin = uiOrigin; this.uiOrigin = uiOrigin;
this.metrics = metrics;
} }
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
@@ -92,10 +96,10 @@ public class NotificationDispatchJob implements SchedulingConfigurer {
} }
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
// Tick — package-private for tests // Tick — accessible for tests across packages
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
void tick() { public void tick() {
List<AlertNotification> claimed = notificationRepo.claimDueNotifications( List<AlertNotification> claimed = notificationRepo.claimDueNotifications(
instanceId, instanceId,
props.effectiveNotificationBatchSize(), props.effectiveNotificationBatchSize(),
@@ -155,16 +159,19 @@ public class NotificationDispatchJob implements SchedulingConfigurer {
NotificationStatus outcomeStatus = outcome.status(); NotificationStatus outcomeStatus = outcome.status();
if (outcomeStatus == NotificationStatus.DELIVERED) { if (outcomeStatus == NotificationStatus.DELIVERED) {
notificationRepo.markDelivered( Instant now = Instant.now(clock);
n.id(), outcome.httpStatus(), outcome.snippet(), Instant.now(clock)); notificationRepo.markDelivered(n.id(), outcome.httpStatus(), outcome.snippet(), now);
instanceRepo.save(instance.withLastNotifiedAt(now));
metrics.notificationOutcome(NotificationStatus.DELIVERED);
} else if (outcomeStatus == NotificationStatus.FAILED) { } else if (outcomeStatus == NotificationStatus.FAILED) {
notificationRepo.markFailed( notificationRepo.markFailed(n.id(), outcome.httpStatus(), outcome.snippet());
n.id(), outcome.httpStatus(), outcome.snippet()); metrics.notificationOutcome(NotificationStatus.FAILED);
} else { } else {
// null status = transient failure (5xx / network / timeout) → retry // null status = transient failure (5xx / network / timeout) → retry
int attempts = n.attempts() + 1; int attempts = n.attempts() + 1;
if (attempts >= props.effectiveWebhookMaxAttempts()) { if (attempts >= props.effectiveWebhookMaxAttempts()) {
notificationRepo.markFailed(n.id(), outcome.httpStatus(), outcome.snippet()); notificationRepo.markFailed(n.id(), outcome.httpStatus(), outcome.snippet());
metrics.notificationOutcome(NotificationStatus.FAILED);
} else { } else {
Instant next = Instant.now(clock).plus(outcome.retryAfter().multipliedBy(attempts)); Instant next = Instant.now(clock).plus(outcome.retryAfter().multipliedBy(attempts));
notificationRepo.scheduleRetry(n.id(), next, outcome.httpStatus(), outcome.snippet()); notificationRepo.scheduleRetry(n.id(), next, outcome.httpStatus(), outcome.snippet());

View File

@@ -3,6 +3,9 @@ package com.cameleer.server.app.alerting.storage;
import com.cameleer.server.core.alerting.*; import com.cameleer.server.core.alerting.*;
import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.dao.DuplicateKeyException;
import org.springframework.jdbc.core.ConnectionCallback; import org.springframework.jdbc.core.ConnectionCallback;
import org.springframework.jdbc.core.JdbcTemplate; import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.jdbc.core.RowMapper; import org.springframework.jdbc.core.RowMapper;
@@ -15,6 +18,8 @@ import java.util.*;
public class PostgresAlertInstanceRepository implements AlertInstanceRepository { public class PostgresAlertInstanceRepository implements AlertInstanceRepository {
private static final Logger log = LoggerFactory.getLogger(PostgresAlertInstanceRepository.class);
private final JdbcTemplate jdbc; private final JdbcTemplate jdbc;
private final ObjectMapper om; private final ObjectMapper om;
@@ -55,14 +60,19 @@ public class PostgresAlertInstanceRepository implements AlertInstanceRepository
Array groupIds = toUuidArray(i.targetGroupIds()); Array groupIds = toUuidArray(i.targetGroupIds());
Array roleNames = toTextArray(i.targetRoleNames()); Array roleNames = toTextArray(i.targetRoleNames());
jdbc.update(sql, try {
i.id(), i.ruleId(), writeJson(i.ruleSnapshot()), jdbc.update(sql,
i.environmentId(), i.state().name(), i.severity().name(), i.id(), i.ruleId(), writeJson(i.ruleSnapshot()),
ts(i.firedAt()), ts(i.ackedAt()), i.ackedBy(), i.environmentId(), i.state().name(), i.severity().name(),
ts(i.resolvedAt()), ts(i.lastNotifiedAt()), ts(i.firedAt()), ts(i.ackedAt()), i.ackedBy(),
i.silenced(), i.currentValue(), i.threshold(), ts(i.resolvedAt()), ts(i.lastNotifiedAt()),
writeJson(i.context()), i.title(), i.message(), i.silenced(), i.currentValue(), i.threshold(),
userIds, groupIds, roleNames); writeJson(i.context()), i.title(), i.message(),
userIds, groupIds, roleNames);
} catch (DuplicateKeyException e) {
log.info("Skipped duplicate open alert_instance for rule {}: {}", i.ruleId(), e.getMessage());
return findOpenForRule(i.ruleId()).orElse(i);
}
return i; return i;
} }
@@ -147,6 +157,20 @@ public class PostgresAlertInstanceRepository implements AlertInstanceRepository
jdbc.update("UPDATE alert_instances SET silenced = ? WHERE id = ?", silenced, id); jdbc.update("UPDATE alert_instances SET silenced = ? WHERE id = ?", silenced, id);
} }
@Override
public List<AlertInstance> listFiringDueForReNotify(Instant now) {
return jdbc.query("""
SELECT ai.* FROM alert_instances ai
JOIN alert_rules ar ON ar.id = ai.rule_id
WHERE ai.state = 'FIRING'::alert_state_enum
AND ai.silenced = false
AND ar.enabled = true
AND ar.re_notify_minutes > 0
AND ai.last_notified_at IS NOT NULL
AND ai.last_notified_at + make_interval(mins => ar.re_notify_minutes) <= ?
""", rowMapper(), Timestamp.from(now));
}
@Override @Override
public void deleteResolvedBefore(Instant cutoff) { public void deleteResolvedBefore(Instant cutoff) {
jdbc.update(""" jdbc.update("""

View File

@@ -75,12 +75,17 @@ class PostgresAlertInstanceRepositoryIT extends AbstractPostgresIT {
@Test @Test
void listForInbox_seesAllThreeTargetTypes() { void listForInbox_seesAllThreeTargetTypes() {
// Each instance gets a distinct ruleId so the unique-per-open-rule index
// (V13: alert_instances_open_rule_uq) doesn't block the second and third saves.
UUID ruleId2 = seedRule("rule-b");
UUID ruleId3 = seedRule("rule-c");
// Instance 1 — targeted at user directly // Instance 1 — targeted at user directly
var byUser = newInstance(ruleId, List.of(userId), List.of(), List.of()); var byUser = newInstance(ruleId, List.of(userId), List.of(), List.of());
// Instance 2 — targeted at group // Instance 2 — targeted at group
var byGroup = newInstance(ruleId, List.of(), List.of(UUID.fromString(groupId)), List.of()); var byGroup = newInstance(ruleId2, List.of(), List.of(UUID.fromString(groupId)), List.of());
// Instance 3 — targeted at role // Instance 3 — targeted at role
var byRole = newInstance(ruleId, List.of(), List.of(), List.of(roleName)); var byRole = newInstance(ruleId3, List.of(), List.of(), List.of(roleName));
repo.save(byUser); repo.save(byUser);
repo.save(byGroup); repo.save(byGroup);
@@ -159,8 +164,9 @@ class PostgresAlertInstanceRepositoryIT extends AbstractPostgresIT {
@Test @Test
void deleteResolvedBefore_deletesOnlyResolved() { void deleteResolvedBefore_deletesOnlyResolved() {
UUID ruleId2 = seedRule("rule-del");
var firing = newInstance(ruleId, List.of(userId), List.of(), List.of()); var firing = newInstance(ruleId, List.of(userId), List.of(), List.of());
var resolved = newInstance(ruleId, List.of(userId), List.of(), List.of()); var resolved = newInstance(ruleId2, List.of(userId), List.of(), List.of());
repo.save(firing); repo.save(firing);
repo.save(resolved); repo.save(resolved);
@@ -173,6 +179,39 @@ class PostgresAlertInstanceRepositoryIT extends AbstractPostgresIT {
assertThat(repo.findById(resolved.id())).isEmpty(); assertThat(repo.findById(resolved.id())).isEmpty();
} }
@Test
void listFiringDueForReNotify_returnsOnlyEligibleInstances() {
// Each instance gets its own rule — the V13 unique partial index allows only one
// open (PENDING/FIRING/ACKNOWLEDGED) instance per rule_id.
UUID ruleNever = seedReNotifyRule("renotify-never");
UUID ruleLongAgo = seedReNotifyRule("renotify-longago");
UUID ruleRecent = seedReNotifyRule("renotify-recent");
// Instance 1: FIRING, never notified (last_notified_at IS NULL) → must NOT appear.
// The sweep only re-notifies; initial notification is the dispatcher's job.
var neverNotified = newInstance(ruleNever, List.of(userId), List.of(), List.of());
repo.save(neverNotified);
// Instance 2: FIRING, notified 2 minutes ago → cadence elapsed, must appear
var notifiedLongAgo = newInstance(ruleLongAgo, List.of(userId), List.of(), List.of());
repo.save(notifiedLongAgo);
jdbcTemplate.update("UPDATE alert_instances SET last_notified_at = now() - interval '2 minutes' WHERE id = ?",
notifiedLongAgo.id());
// Instance 3: FIRING, notified 30 seconds ago → cadence NOT elapsed, must NOT appear
var notifiedRecently = newInstance(ruleRecent, List.of(userId), List.of(), List.of());
repo.save(notifiedRecently);
jdbcTemplate.update("UPDATE alert_instances SET last_notified_at = now() - interval '30 seconds' WHERE id = ?",
notifiedRecently.id());
var due = repo.listFiringDueForReNotify(Instant.now());
assertThat(due).extracting(AlertInstance::id)
.containsExactly(notifiedLongAgo.id())
.doesNotContain(neverNotified.id(), notifiedRecently.id());
// Extra rules are cleaned up by @AfterEach via env-scoped DELETE
}
@Test @Test
void markSilenced_togglesToTrue() { void markSilenced_togglesToTrue() {
var inst = newInstance(ruleId, List.of(userId), List.of(), List.of()); var inst = newInstance(ruleId, List.of(userId), List.of(), List.of());
@@ -197,4 +236,26 @@ class PostgresAlertInstanceRepositoryIT extends AbstractPostgresIT {
Map.of(), "title", "message", Map.of(), "title", "message",
userIds, groupIds, roleNames); userIds, groupIds, roleNames);
} }
/** Inserts a minimal alert_rule with re_notify_minutes=0 and returns its id. */
private UUID seedRule(String name) {
UUID id = UUID.randomUUID();
jdbcTemplate.update(
"INSERT INTO alert_rules (id, environment_id, name, severity, condition_kind, condition, " +
"notification_title_tmpl, notification_message_tmpl, created_by, updated_by) " +
"VALUES (?, ?, ?, 'WARNING', 'AGENT_STATE', '{}'::jsonb, 't', 'm', 'sys-user', 'sys-user')",
id, envId, name + "-" + id);
return id;
}
/** Inserts a minimal alert_rule with re_notify_minutes=1 and returns its id. */
private UUID seedReNotifyRule(String name) {
UUID id = UUID.randomUUID();
jdbcTemplate.update(
"INSERT INTO alert_rules (id, environment_id, name, severity, condition_kind, condition, " +
"re_notify_minutes, notification_title_tmpl, notification_message_tmpl, created_by, updated_by) " +
"VALUES (?, ?, ?, 'WARNING', 'AGENT_STATE', '{}'::jsonb, 1, 't', 'm', 'sys-user', 'sys-user')",
id, envId, name + "-" + id);
return id;
}
} }

View File

@@ -19,4 +19,7 @@ public interface AlertInstanceRepository {
void resolve(UUID id, Instant when); void resolve(UUID id, Instant when);
void markSilenced(UUID id, boolean silenced); void markSilenced(UUID id, boolean silenced);
void deleteResolvedBefore(Instant cutoff); void deleteResolvedBefore(Instant cutoff);
/** FIRING instances whose reNotify cadence has elapsed since last notification. */
List<AlertInstance> listFiringDueForReNotify(Instant now);
} }