alerting(eval): atomic per-rule batch commit via @Transactional — Phase 2 close

Wraps instance writes, notification enqueues, and cursor advance in one transactional boundary per rule tick. Rollback leaves the rule replayable on next tick. Turns the Phase 2 atomicity IT green (see AlertEvaluatorJobIT #tickRollback_faultOnSecondNotificationInsert_leavesCursorUnchanged).
2026-04-22 17:03:07 +02:00
parent 989dde23eb
commit ba4e2bb68f
3 changed files with 215 additions and 57 deletions
--- a/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/eval/AlertEvaluatorJob.java
+++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/eval/AlertEvaluatorJob.java
@@ -47,6 +47,7 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
    private final NotificationContextBuilder contextBuilder;
    private final EnvironmentRepository environmentRepo;
    private final ObjectMapper objectMapper;
    private final BatchResultApplier batchResultApplier;
    private final String instanceId;
    private final String tenantId;
    private final Clock clock;
@@ -64,6 +65,7 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
            NotificationContextBuilder contextBuilder,
            EnvironmentRepository environmentRepo,
            ObjectMapper objectMapper,
            BatchResultApplier batchResultApplier,
            @Qualifier("alertingInstanceId") String instanceId,
            @Value("${cameleer.server.tenant.id:default}") String tenantId,
            Clock alertingClock,
@@ -80,6 +82,7 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
        this.contextBuilder     = contextBuilder;
        this.environmentRepo    = environmentRepo;
        this.objectMapper       = objectMapper;
        this.batchResultApplier = batchResultApplier;
        this.instanceId         = instanceId;
        this.tenantId           = tenantId;
        this.clock              = alertingClock;
@@ -112,33 +115,61 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
        for (AlertRule rule : claimed) {
            Instant nextRun = Instant.now(clock).plusSeconds(rule.evaluationIntervalSeconds());
            Map<String, Object> nextEvalState = rule.evalState();
            try {
            if (circuitBreaker.isOpen(rule.conditionKind())) {
                log.debug("Circuit breaker open for {}; skipping rule {}", rule.conditionKind(), rule.id());
                reschedule(rule, nextRun);
                continue;
            }
-                EvalResult result = metrics.evalDuration(rule.conditionKind())
+
            EvalResult result;
            try {
                result = metrics.evalDuration(rule.conditionKind())
                        .recordCallable(() -> evaluateSafely(rule, ctx));
                if (result instanceof EvalResult.Batch b) {
                    applyResult(rule, b);
                    // Guard: only advance cursor when the batch returned one.
                    // Empty ticks (no matches) return Map.of() — overwriting would
                    // wipe the persisted cursor and force the next tick to re-scan
                    // from rule.createdAt, re-alerting on every historical exchange.
                    if (!b.nextEvalState().isEmpty()) {
                        nextEvalState = b.nextEvalState();
                    }
                } else {
                    applyResult(rule, result);
                }
                circuitBreaker.recordSuccess(rule.conditionKind());
            } catch (Exception e) {
                metrics.evalError(rule.conditionKind(), rule.id());
                circuitBreaker.recordFailure(rule.conditionKind());
                log.warn("Evaluator error for rule {} ({}): {}", rule.id(), rule.conditionKind(), e.toString());
                // Evaluation itself failed — release the claim so the rule can be
                // retried on the next tick. Cursor stays put.
                reschedule(rule, nextRun);
                continue;
            }
            if (result instanceof EvalResult.Batch b) {
                // Phase 2: the Batch path is atomic. The @Transactional apply() on
                // BatchResultApplier wraps instance writes, notification enqueues,
                // AND the cursor advance + releaseClaim into a single tx. A
                // mid-batch fault rolls everything back — including the cursor —
                // so the next tick replays the whole batch exactly once.
                try {
                    batchResultApplier.apply(rule, b, nextRun);
                    circuitBreaker.recordSuccess(rule.conditionKind());
                } catch (Exception e) {
                    metrics.evalError(rule.conditionKind(), rule.id());
                    circuitBreaker.recordFailure(rule.conditionKind());
                    log.warn("Batch apply failed for rule {} ({}): {} — rolling back; next tick will retry",
                            rule.id(), rule.conditionKind(), e.toString());
                    // The transaction rolled back. Do NOT call reschedule here —
                    // leaving claim + next_evaluation_at as they were means the
                    // claim TTL takes over and the rule becomes due on its own.
                    // Rethrowing is unnecessary for correctness — the cursor
                    // stayed put, so exactly-once-per-exchange is preserved.
                }
            } else {
                // Non-Batch path (FIRING / Clear / Error): classic apply + rule
                // reschedule. Not wrapped in a single tx — semantics unchanged
                // from pre-Phase-2.
                try {
                    applyResult(rule, result);
                    circuitBreaker.recordSuccess(rule.conditionKind());
                } catch (Exception e) {
                    metrics.evalError(rule.conditionKind(), rule.id());
                    circuitBreaker.recordFailure(rule.conditionKind());
                    log.warn("applyResult failed for rule {} ({}): {}",
                            rule.id(), rule.conditionKind(), e.toString());
                } finally {
-                reschedule(rule.withEvalState(nextEvalState), nextRun);
+                    reschedule(rule, nextRun);
                }
            }
        }
@@ -183,14 +214,10 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
    // -------------------------------------------------------------------------
    private void applyResult(AlertRule rule, EvalResult result) {
-        if (result instanceof EvalResult.Batch b) {
+        // Note: the Batch path is handled by BatchResultApplier (transactional) —
-            // PER_EXCHANGE mode: each Firing in the batch creates its own AlertInstance
+        // tick() routes Batch results there directly and never calls applyResult
-            for (EvalResult.Firing f : b.firings()) {
+        // for them. This method only handles FIRING / Clear / Error state-machine
-                applyBatchFiring(rule, f);
+        // transitions for the classic (non-PER_EXCHANGE) path.
            }
            return;
        }
        AlertInstance current = instanceRepo.findOpenForRule(rule.id()).orElse(null);
        Instant now = Instant.now(clock);
@@ -211,19 +238,6 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
        });
    }
    /**
     * Batch (PER_EXCHANGE) mode: always create a fresh FIRING instance per Firing entry.
     * No forDuration check — each exchange is its own event.
     */
    private void applyBatchFiring(AlertRule rule, EvalResult.Firing f) {
        Instant now = Instant.now(clock);
        AlertInstance instance = AlertStateTransitions.newInstance(rule, f, AlertState.FIRING, now)
                .withRuleSnapshot(snapshotRule(rule));
        AlertInstance enriched = enrichTitleMessage(rule, instance);
        AlertInstance persisted = instanceRepo.save(enriched);
        enqueueNotifications(rule, persisted, now);
    }
    // -------------------------------------------------------------------------
    // Title / message rendering
    // -------------------------------------------------------------------------
--- a/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/eval/BatchResultApplier.java
+++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/eval/BatchResultApplier.java
@@ -0,0 +1,144 @@
 package com.cameleer.server.app.alerting.eval;
 import com.cameleer.server.app.alerting.notify.MustacheRenderer;
 import com.cameleer.server.app.alerting.notify.NotificationContextBuilder;
 import com.cameleer.server.core.alerting.*;
 import com.cameleer.server.core.runtime.Environment;
 import com.cameleer.server.core.runtime.EnvironmentRepository;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.stereotype.Component;
 import org.springframework.transaction.annotation.Transactional;
 import java.time.Clock;
 import java.time.Instant;
 import java.util.LinkedHashMap;
 import java.util.Map;
 import java.util.UUID;
 /**
 * Applies a {@link EvalResult.Batch} result to persistent state inside a single
 * transaction: instance writes, notification enqueues, and the rule's cursor
 * advance + {@code releaseClaim} either all commit or all roll back together.
 * <p>
 * Lives in its own bean so the {@code @Transactional} annotation engages via the
 * Spring proxy when invoked from {@link AlertEvaluatorJob#tick()}; calling it as
 * {@code this.apply(...)} from {@code AlertEvaluatorJob} (a bean calling its own
 * method) would bypass the proxy and silently disable the transaction.
 * <p>
 * Phase 2 of the per-exchange exactly-once plan (see
 * {@code docs/superpowers/plans/2026-04-22-per-exchange-exactly-once.md}).
 */
@Component
 public class BatchResultApplier {
    private static final Logger log = LoggerFactory.getLogger(BatchResultApplier.class);
    private final AlertRuleRepository ruleRepo;
    private final AlertInstanceRepository instanceRepo;
    private final AlertNotificationRepository notificationRepo;
    private final MustacheRenderer renderer;
    private final NotificationContextBuilder contextBuilder;
    private final EnvironmentRepository environmentRepo;
    private final ObjectMapper objectMapper;
    private final Clock clock;
    public BatchResultApplier(
            AlertRuleRepository ruleRepo,
            AlertInstanceRepository instanceRepo,
            AlertNotificationRepository notificationRepo,
            MustacheRenderer renderer,
            NotificationContextBuilder contextBuilder,
            EnvironmentRepository environmentRepo,
            ObjectMapper objectMapper,
            Clock alertingClock) {
        this.ruleRepo         = ruleRepo;
        this.instanceRepo     = instanceRepo;
        this.notificationRepo = notificationRepo;
        this.renderer         = renderer;
        this.contextBuilder   = contextBuilder;
        this.environmentRepo  = environmentRepo;
        this.objectMapper     = objectMapper;
        this.clock            = alertingClock;
    }
    /**
     * Atomically apply a Batch result for a single rule:
     * <ol>
     *   <li>persist a FIRING instance per firing + enqueue its notifications</li>
     *   <li>advance the rule's cursor ({@code evalState}) iff the batch supplied one</li>
     *   <li>release the claim with the new {@code nextRun} + {@code evalState}</li>
     * </ol>
     * Any exception thrown from the repo calls rolls back every write — including
     * the cursor advance — so the rule is replayable on the next tick.
     */
    @Transactional
    public void apply(AlertRule rule, EvalResult.Batch batch, Instant nextRun) {
        for (EvalResult.Firing f : batch.firings()) {
            applyBatchFiring(rule, f);
        }
        Map<String, Object> nextEvalState =
                batch.nextEvalState().isEmpty() ? rule.evalState() : batch.nextEvalState();
        ruleRepo.releaseClaim(rule.id(), nextRun, nextEvalState);
    }
    /**
     * Batch (PER_EXCHANGE) mode: always create a fresh FIRING instance per Firing entry.
     * No forDuration check — each exchange is its own event.
     */
    private void applyBatchFiring(AlertRule rule, EvalResult.Firing f) {
        Instant now = Instant.now(clock);
        AlertInstance instance = AlertStateTransitions.newInstance(rule, f, AlertState.FIRING, now)
                .withRuleSnapshot(snapshotRule(rule));
        AlertInstance enriched = enrichTitleMessage(rule, instance);
        AlertInstance persisted = instanceRepo.save(enriched);
        enqueueNotifications(rule, persisted, now);
    }
    private AlertInstance enrichTitleMessage(AlertRule rule, AlertInstance instance) {
        Environment env = environmentRepo.findById(rule.environmentId()).orElse(null);
        Map<String, Object> ctx = contextBuilder.build(rule, instance, env, null);
        String title   = renderer.render(rule.notificationTitleTmpl(), ctx);
        String message = renderer.render(rule.notificationMessageTmpl(), ctx);
        return instance.withTitleMessage(title, message);
    }
    private void enqueueNotifications(AlertRule rule, AlertInstance instance, Instant now) {
        for (WebhookBinding w : rule.webhooks()) {
            Map<String, Object> payload = buildPayload(rule, instance);
            notificationRepo.save(new AlertNotification(
                    UUID.randomUUID(),
                    instance.id(),
                    w.id(),
                    w.outboundConnectionId(),
                    NotificationStatus.PENDING,
                    0,
                    now,
                    null, null, null, null,
                    payload,
                    null,
                    now));
        }
    }
    private Map<String, Object> buildPayload(AlertRule rule, AlertInstance instance) {
        Environment env = environmentRepo.findById(rule.environmentId()).orElse(null);
        return contextBuilder.build(rule, instance, env, null);
    }
    @SuppressWarnings("unchecked")
    private Map<String, Object> snapshotRule(AlertRule rule) {
        try {
            Map<String, Object> raw = objectMapper.convertValue(rule, Map.class);
            // Map.copyOf (used in AlertInstance compact ctor) rejects null values —
            // strip them so the snapshot is safe to store.
            Map<String, Object> safe = new LinkedHashMap<>();
            raw.forEach((k, v) -> { if (v != null) safe.put(k, v); });
            return safe;
        } catch (Exception e) {
            log.warn("Failed to snapshot rule {}: {}", rule.id(), e.getMessage());
            return Map.of("id", rule.id().toString(), "name", rule.name());
        }
    }
 }
--- a/cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/eval/AlertEvaluatorJobIT.java
+++ b/cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/eval/AlertEvaluatorJobIT.java
@@ -28,7 +28,6 @@ import java.util.UUID;
 import java.util.concurrent.atomic.AtomicInteger;
 import static org.assertj.core.api.Assertions.assertThat;
 import static org.assertj.core.api.Assertions.assertThatThrownBy;
 import static org.mockito.Mockito.when;
 /**
@@ -441,6 +440,7 @@ class AlertEvaluatorJobIT extends AbstractPostgresIT {
        } catch (RuntimeException expectedAfterPhase2) {
            // Phase 2 may choose to rethrow; either way the rollback assertions
            // below are what pin the contract.
            // intentionally empty — fault-injection swallow/rethrow tolerance; see comment above
        }
        // Post-rollback: zero instances, zero notifications, cursor unchanged,