feat(alerting): NotificationDispatchJob outbox loop with silence + retry

Claim-polling SchedulingConfigurer: claims due notifications, resolves
instance/connection/rule, checks active silences, dispatches via
WebhookDispatcher, classifies outcomes into DELIVERED/FAILED/retry.
Guards null rule/env after deletion. 5 Testcontainers ITs: 200/503/404
outcomes, active silence suppression, deleted connection fast-fail.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-19 20:24:54 +02:00
parent 466aceb920
commit 6b48bc63bf
2 changed files with 397 additions and 0 deletions

View File

@@ -0,0 +1,174 @@
package com.cameleer.server.app.alerting.notify;
import com.cameleer.server.app.alerting.config.AlertingProperties;
import com.cameleer.server.core.alerting.*;
import com.cameleer.server.core.outbound.OutboundConnectionRepository;
import com.cameleer.server.core.runtime.Environment;
import com.cameleer.server.core.runtime.EnvironmentRepository;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.SchedulingConfigurer;
import org.springframework.scheduling.config.ScheduledTaskRegistrar;
import org.springframework.stereotype.Component;
import java.time.Clock;
import java.time.Instant;
import java.util.List;
import java.util.Map;
/**
 * Claim-polling outbox loop that dispatches {@link AlertNotification} records.
 * <p>
 * On each tick, claims a batch of due notifications, resolves the backing
 * {@link AlertInstance} and {@link com.cameleer.server.core.outbound.OutboundConnection},
 * checks active silences, delegates to {@link WebhookDispatcher}, and persists the outcome.
 * <p>
 * Retry backoff for transient dispatch failures: {@code retryAfter × attempts}, where
 * {@code retryAfter} is taken from the dispatcher outcome (e.g. 30 s, 60 s, 90 s, … for a
 * 30 s base). Once the attempt count reaches
 * {@link AlertingProperties#effectiveWebhookMaxAttempts()} the notification is marked FAILED
 * instead of being rescheduled. Unexpected exceptions raised while processing a notification
 * are subject to the same attempt cap and are retried after a fixed delay.
 */
@Component
public class NotificationDispatchJob implements SchedulingConfigurer {

    private static final Logger log = LoggerFactory.getLogger(NotificationDispatchJob.class);

    /** Delay before retrying a notification whose processing threw an unexpected exception. */
    private static final long ERROR_RETRY_DELAY_SECONDS = 30;

    /** Status value persisted when the failure came from an exception, not an HTTP response. */
    private static final int ERROR_STATUS = -1;

    private final AlertingProperties props;
    private final AlertNotificationRepository notificationRepo;
    private final AlertInstanceRepository instanceRepo;
    private final AlertRuleRepository ruleRepo;
    private final AlertSilenceRepository silenceRepo;
    private final OutboundConnectionRepository outboundRepo;
    private final EnvironmentRepository envRepo;
    private final WebhookDispatcher dispatcher;
    private final SilenceMatcherService silenceMatcher;
    private final NotificationContextBuilder contextBuilder;
    private final String instanceId;
    private final String tenantId;
    private final Clock clock;
    private final String uiOrigin;

    /**
     * Creates the job with all collaborators injected by Spring.
     *
     * @param instanceId    value passed to the repository when claiming notifications
     * @param tenantId      tenant used when looking up outbound connections
     * @param alertingClock clock used for every "now" computed by this job
     * @param uiOrigin      UI origin handed to the context builder; may be {@code null}
     */
    @SuppressWarnings("SpringJavaInjectionPointsAutowiringInspection")
    public NotificationDispatchJob(
            AlertingProperties props,
            AlertNotificationRepository notificationRepo,
            AlertInstanceRepository instanceRepo,
            AlertRuleRepository ruleRepo,
            AlertSilenceRepository silenceRepo,
            OutboundConnectionRepository outboundRepo,
            EnvironmentRepository envRepo,
            WebhookDispatcher dispatcher,
            SilenceMatcherService silenceMatcher,
            NotificationContextBuilder contextBuilder,
            @Qualifier("alertingInstanceId") String instanceId,
            @Value("${cameleer.server.tenant.id:default}") String tenantId,
            Clock alertingClock,
            @Value("${cameleer.server.ui-origin:#{null}}") String uiOrigin) {
        this.props = props;
        this.notificationRepo = notificationRepo;
        this.instanceRepo = instanceRepo;
        this.ruleRepo = ruleRepo;
        this.silenceRepo = silenceRepo;
        this.outboundRepo = outboundRepo;
        this.envRepo = envRepo;
        this.dispatcher = dispatcher;
        this.silenceMatcher = silenceMatcher;
        this.contextBuilder = contextBuilder;
        this.instanceId = instanceId;
        this.tenantId = tenantId;
        this.clock = alertingClock;
        this.uiOrigin = uiOrigin;
    }

    // -------------------------------------------------------------------------
    // SchedulingConfigurer
    // -------------------------------------------------------------------------

    /** Registers {@link #tick()} as a fixed-delay task using the configured tick interval. */
    @Override
    public void configureTasks(ScheduledTaskRegistrar registrar) {
        registrar.addFixedDelayTask(this::tick, props.effectiveNotificationTickIntervalMs());
    }

    // -------------------------------------------------------------------------
    // Tick — package-private for tests
    // -------------------------------------------------------------------------

    /**
     * Claims one batch of due notifications and processes each of them in turn.
     * A failure while processing one notification never prevents the remaining
     * claimed notifications from being processed.
     */
    void tick() {
        List<AlertNotification> claimed = notificationRepo.claimDueNotifications(
                instanceId,
                props.effectiveNotificationBatchSize(),
                props.effectiveClaimTtlSeconds());
        for (AlertNotification n : claimed) {
            try {
                processOne(n);
            } catch (Exception e) {
                handleProcessingError(n, e);
            }
        }
    }

    /**
     * Records an unexpected processing failure: reschedules the notification after a fixed
     * delay, or marks it FAILED once the attempt cap is reached. Never throws, so the
     * caller's batch loop keeps going even when the failure state cannot be persisted.
     */
    private void handleProcessingError(AlertNotification n, Exception e) {
        // Trailing throwable argument makes SLF4J log the stack trace as well.
        log.warn("Notification dispatch error for {}: {}", n.id(), e.toString(), e);
        // getMessage() is null for many exceptions (e.g. NullPointerException).
        String reason = e.getMessage() != null ? e.getMessage() : e.toString();
        try {
            if (n.attempts() + 1 >= props.effectiveWebhookMaxAttempts()) {
                // Same cap as the transient-failure path; avoids retrying a poison record forever.
                notificationRepo.markFailed(n.id(), ERROR_STATUS, reason);
            } else {
                notificationRepo.scheduleRetry(
                        n.id(),
                        Instant.now(clock).plusSeconds(ERROR_RETRY_DELAY_SECONDS),
                        ERROR_STATUS,
                        reason);
            }
        } catch (Exception persistFailure) {
            log.error("Could not persist failure state for notification {}", n.id(), persistFailure);
        }
    }

    // -------------------------------------------------------------------------
    // Per-notification processing
    // -------------------------------------------------------------------------

    /**
     * Resolves everything a single notification needs, applies the silence check,
     * dispatches it and persists the result.
     */
    private void processOne(AlertNotification n) {
        // 1. Resolve alert instance
        AlertInstance instance = instanceRepo.findById(n.alertInstanceId()).orElse(null);
        if (instance == null) {
            notificationRepo.markFailed(n.id(), 0, "instance deleted");
            return;
        }

        // 2. Resolve outbound connection
        var conn = outboundRepo.findById(tenantId, n.outboundConnectionId()).orElse(null);
        if (conn == null) {
            notificationRepo.markFailed(n.id(), 0, "outbound connection deleted");
            return;
        }

        // 3. Resolve rule (may be null after deletion); the silence matcher takes it as input
        AlertRule rule = instance.ruleId() == null
                ? null
                : ruleRepo.findById(instance.ruleId()).orElse(null);

        // 4. Silence check — done before the context is built so that silenced
        //    notifications do not pay for the environment lookup and context build
        if (isSilenced(instance, rule)) {
            instanceRepo.markSilenced(instance.id(), true);
            notificationRepo.markFailed(n.id(), 0, "silenced");
            return;
        }

        // 5. Build Mustache context (guard: rule or env may be null after deletion)
        Environment env = envRepo.findById(instance.environmentId()).orElse(null);
        Map<String, Object> context = (rule != null && env != null)
                ? contextBuilder.build(rule, instance, env, uiOrigin)
                : Map.of();

        // 6. Dispatch and persist the outcome
        WebhookDispatcher.Outcome outcome = dispatcher.dispatch(n, rule, instance, conn, context);
        persistOutcome(n, outcome);
    }

    /** Returns whether any silence active right now in the instance's environment matches it. */
    private boolean isSilenced(AlertInstance instance, AlertRule rule) {
        List<AlertSilence> activeSilences =
                silenceRepo.listActive(instance.environmentId(), Instant.now(clock));
        for (AlertSilence s : activeSilences) {
            if (silenceMatcher.matches(s.matcher(), instance, rule)) {
                return true;
            }
        }
        return false;
    }

    /** Persists a dispatch outcome as DELIVERED, FAILED, or a scheduled retry. */
    private void persistOutcome(AlertNotification n, WebhookDispatcher.Outcome outcome) {
        NotificationStatus outcomeStatus = outcome.status();
        if (outcomeStatus == NotificationStatus.DELIVERED) {
            notificationRepo.markDelivered(
                    n.id(), outcome.httpStatus(), outcome.snippet(), Instant.now(clock));
            return;
        }
        if (outcomeStatus == NotificationStatus.FAILED) {
            notificationRepo.markFailed(n.id(), outcome.httpStatus(), outcome.snippet());
            return;
        }

        // Any other status (the dispatcher is expected to return null for
        // 5xx / network / timeout) is treated as a transient failure → retry.
        int attempts = n.attempts() + 1;
        if (attempts >= props.effectiveWebhookMaxAttempts()) {
            notificationRepo.markFailed(n.id(), outcome.httpStatus(), outcome.snippet());
        } else {
            // Linear backoff: the outcome's retryAfter multiplied by the attempt number.
            Instant next = Instant.now(clock).plus(outcome.retryAfter().multipliedBy(attempts));
            notificationRepo.scheduleRetry(n.id(), next, outcome.httpStatus(), outcome.snippet());
        }
    }
}