alerting(eval): atomic per-rule batch commit via @Transactional — Phase 2 close
Wraps instance writes, notification enqueues, and cursor advance in one transactional boundary per rule tick. Rollback leaves the rule replayable on next tick. Turns the Phase 2 atomicity IT green (see AlertEvaluatorJobIT #tickRollback_faultOnSecondNotificationInsert_leavesCursorUnchanged).
This commit is contained in:
@@ -47,6 +47,7 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
|
|||||||
private final NotificationContextBuilder contextBuilder;
|
private final NotificationContextBuilder contextBuilder;
|
||||||
private final EnvironmentRepository environmentRepo;
|
private final EnvironmentRepository environmentRepo;
|
||||||
private final ObjectMapper objectMapper;
|
private final ObjectMapper objectMapper;
|
||||||
|
private final BatchResultApplier batchResultApplier;
|
||||||
private final String instanceId;
|
private final String instanceId;
|
||||||
private final String tenantId;
|
private final String tenantId;
|
||||||
private final Clock clock;
|
private final Clock clock;
|
||||||
@@ -64,6 +65,7 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
|
|||||||
NotificationContextBuilder contextBuilder,
|
NotificationContextBuilder contextBuilder,
|
||||||
EnvironmentRepository environmentRepo,
|
EnvironmentRepository environmentRepo,
|
||||||
ObjectMapper objectMapper,
|
ObjectMapper objectMapper,
|
||||||
|
BatchResultApplier batchResultApplier,
|
||||||
@Qualifier("alertingInstanceId") String instanceId,
|
@Qualifier("alertingInstanceId") String instanceId,
|
||||||
@Value("${cameleer.server.tenant.id:default}") String tenantId,
|
@Value("${cameleer.server.tenant.id:default}") String tenantId,
|
||||||
Clock alertingClock,
|
Clock alertingClock,
|
||||||
@@ -80,6 +82,7 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
|
|||||||
this.contextBuilder = contextBuilder;
|
this.contextBuilder = contextBuilder;
|
||||||
this.environmentRepo = environmentRepo;
|
this.environmentRepo = environmentRepo;
|
||||||
this.objectMapper = objectMapper;
|
this.objectMapper = objectMapper;
|
||||||
|
this.batchResultApplier = batchResultApplier;
|
||||||
this.instanceId = instanceId;
|
this.instanceId = instanceId;
|
||||||
this.tenantId = tenantId;
|
this.tenantId = tenantId;
|
||||||
this.clock = alertingClock;
|
this.clock = alertingClock;
|
||||||
@@ -112,33 +115,61 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
|
|||||||
|
|
||||||
for (AlertRule rule : claimed) {
|
for (AlertRule rule : claimed) {
|
||||||
Instant nextRun = Instant.now(clock).plusSeconds(rule.evaluationIntervalSeconds());
|
Instant nextRun = Instant.now(clock).plusSeconds(rule.evaluationIntervalSeconds());
|
||||||
Map<String, Object> nextEvalState = rule.evalState();
|
|
||||||
try {
|
|
||||||
if (circuitBreaker.isOpen(rule.conditionKind())) {
|
if (circuitBreaker.isOpen(rule.conditionKind())) {
|
||||||
log.debug("Circuit breaker open for {}; skipping rule {}", rule.conditionKind(), rule.id());
|
log.debug("Circuit breaker open for {}; skipping rule {}", rule.conditionKind(), rule.id());
|
||||||
|
reschedule(rule, nextRun);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
EvalResult result = metrics.evalDuration(rule.conditionKind())
|
|
||||||
|
EvalResult result;
|
||||||
|
try {
|
||||||
|
result = metrics.evalDuration(rule.conditionKind())
|
||||||
.recordCallable(() -> evaluateSafely(rule, ctx));
|
.recordCallable(() -> evaluateSafely(rule, ctx));
|
||||||
if (result instanceof EvalResult.Batch b) {
|
|
||||||
applyResult(rule, b);
|
|
||||||
// Guard: only advance cursor when the batch returned one.
|
|
||||||
// Empty ticks (no matches) return Map.of() — overwriting would
|
|
||||||
// wipe the persisted cursor and force the next tick to re-scan
|
|
||||||
// from rule.createdAt, re-alerting on every historical exchange.
|
|
||||||
if (!b.nextEvalState().isEmpty()) {
|
|
||||||
nextEvalState = b.nextEvalState();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
applyResult(rule, result);
|
|
||||||
}
|
|
||||||
circuitBreaker.recordSuccess(rule.conditionKind());
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
metrics.evalError(rule.conditionKind(), rule.id());
|
metrics.evalError(rule.conditionKind(), rule.id());
|
||||||
circuitBreaker.recordFailure(rule.conditionKind());
|
circuitBreaker.recordFailure(rule.conditionKind());
|
||||||
log.warn("Evaluator error for rule {} ({}): {}", rule.id(), rule.conditionKind(), e.toString());
|
log.warn("Evaluator error for rule {} ({}): {}", rule.id(), rule.conditionKind(), e.toString());
|
||||||
|
// Evaluation itself failed — release the claim so the rule can be
|
||||||
|
// retried on the next tick. Cursor stays put.
|
||||||
|
reschedule(rule, nextRun);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (result instanceof EvalResult.Batch b) {
|
||||||
|
// Phase 2: the Batch path is atomic. The @Transactional apply() on
|
||||||
|
// BatchResultApplier wraps instance writes, notification enqueues,
|
||||||
|
// AND the cursor advance + releaseClaim into a single tx. A
|
||||||
|
// mid-batch fault rolls everything back — including the cursor —
|
||||||
|
// so the next tick replays the whole batch exactly once.
|
||||||
|
try {
|
||||||
|
batchResultApplier.apply(rule, b, nextRun);
|
||||||
|
circuitBreaker.recordSuccess(rule.conditionKind());
|
||||||
|
} catch (Exception e) {
|
||||||
|
metrics.evalError(rule.conditionKind(), rule.id());
|
||||||
|
circuitBreaker.recordFailure(rule.conditionKind());
|
||||||
|
log.warn("Batch apply failed for rule {} ({}): {} — rolling back; next tick will retry",
|
||||||
|
rule.id(), rule.conditionKind(), e.toString());
|
||||||
|
// The transaction rolled back. Do NOT call reschedule here —
|
||||||
|
// leaving claim + next_evaluation_at as they were means the
|
||||||
|
// claim TTL takes over and the rule becomes due on its own.
|
||||||
|
// Rethrowing is unnecessary for correctness — the cursor
|
||||||
|
// stayed put, so exactly-once-per-exchange is preserved.
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Non-Batch path (FIRING / Clear / Error): classic apply + rule
|
||||||
|
// reschedule. Not wrapped in a single tx — semantics unchanged
|
||||||
|
// from pre-Phase-2.
|
||||||
|
try {
|
||||||
|
applyResult(rule, result);
|
||||||
|
circuitBreaker.recordSuccess(rule.conditionKind());
|
||||||
|
} catch (Exception e) {
|
||||||
|
metrics.evalError(rule.conditionKind(), rule.id());
|
||||||
|
circuitBreaker.recordFailure(rule.conditionKind());
|
||||||
|
log.warn("applyResult failed for rule {} ({}): {}",
|
||||||
|
rule.id(), rule.conditionKind(), e.toString());
|
||||||
} finally {
|
} finally {
|
||||||
reschedule(rule.withEvalState(nextEvalState), nextRun);
|
reschedule(rule, nextRun);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -183,14 +214,10 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
|
|||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
private void applyResult(AlertRule rule, EvalResult result) {
|
private void applyResult(AlertRule rule, EvalResult result) {
|
||||||
if (result instanceof EvalResult.Batch b) {
|
// Note: the Batch path is handled by BatchResultApplier (transactional) —
|
||||||
// PER_EXCHANGE mode: each Firing in the batch creates its own AlertInstance
|
// tick() routes Batch results there directly and never calls applyResult
|
||||||
for (EvalResult.Firing f : b.firings()) {
|
// for them. This method only handles FIRING / Clear / Error state-machine
|
||||||
applyBatchFiring(rule, f);
|
// transitions for the classic (non-PER_EXCHANGE) path.
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
AlertInstance current = instanceRepo.findOpenForRule(rule.id()).orElse(null);
|
AlertInstance current = instanceRepo.findOpenForRule(rule.id()).orElse(null);
|
||||||
Instant now = Instant.now(clock);
|
Instant now = Instant.now(clock);
|
||||||
|
|
||||||
@@ -211,19 +238,6 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Batch (PER_EXCHANGE) mode: always create a fresh FIRING instance per Firing entry.
|
|
||||||
* No forDuration check — each exchange is its own event.
|
|
||||||
*/
|
|
||||||
private void applyBatchFiring(AlertRule rule, EvalResult.Firing f) {
|
|
||||||
Instant now = Instant.now(clock);
|
|
||||||
AlertInstance instance = AlertStateTransitions.newInstance(rule, f, AlertState.FIRING, now)
|
|
||||||
.withRuleSnapshot(snapshotRule(rule));
|
|
||||||
AlertInstance enriched = enrichTitleMessage(rule, instance);
|
|
||||||
AlertInstance persisted = instanceRepo.save(enriched);
|
|
||||||
enqueueNotifications(rule, persisted, now);
|
|
||||||
}
|
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
// Title / message rendering
|
// Title / message rendering
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -0,0 +1,144 @@
|
|||||||
|
package com.cameleer.server.app.alerting.eval;
|
||||||
|
|
||||||
|
import com.cameleer.server.app.alerting.notify.MustacheRenderer;
|
||||||
|
import com.cameleer.server.app.alerting.notify.NotificationContextBuilder;
|
||||||
|
import com.cameleer.server.core.alerting.*;
|
||||||
|
import com.cameleer.server.core.runtime.Environment;
|
||||||
|
import com.cameleer.server.core.runtime.EnvironmentRepository;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
import org.springframework.transaction.annotation.Transactional;
|
||||||
|
|
||||||
|
import java.time.Clock;
|
||||||
|
import java.time.Instant;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.UUID;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Applies a {@link EvalResult.Batch} result to persistent state inside a single
|
||||||
|
* transaction: instance writes, notification enqueues, and the rule's cursor
|
||||||
|
* advance + {@code releaseClaim} either all commit or all roll back together.
|
||||||
|
* <p>
|
||||||
|
* Lives in its own bean so the {@code @Transactional} annotation engages via the
|
||||||
|
* Spring proxy when invoked from {@link AlertEvaluatorJob#tick()}; calling it as
|
||||||
|
* {@code this.apply(...)} from {@code AlertEvaluatorJob} (a bean calling its own
|
||||||
|
* method) would bypass the proxy and silently disable the transaction.
|
||||||
|
* <p>
|
||||||
|
* Phase 2 of the per-exchange exactly-once plan (see
|
||||||
|
* {@code docs/superpowers/plans/2026-04-22-per-exchange-exactly-once.md}).
|
||||||
|
*/
|
||||||
|
@Component
|
||||||
|
public class BatchResultApplier {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(BatchResultApplier.class);
|
||||||
|
|
||||||
|
private final AlertRuleRepository ruleRepo;
|
||||||
|
private final AlertInstanceRepository instanceRepo;
|
||||||
|
private final AlertNotificationRepository notificationRepo;
|
||||||
|
private final MustacheRenderer renderer;
|
||||||
|
private final NotificationContextBuilder contextBuilder;
|
||||||
|
private final EnvironmentRepository environmentRepo;
|
||||||
|
private final ObjectMapper objectMapper;
|
||||||
|
private final Clock clock;
|
||||||
|
|
||||||
|
public BatchResultApplier(
|
||||||
|
AlertRuleRepository ruleRepo,
|
||||||
|
AlertInstanceRepository instanceRepo,
|
||||||
|
AlertNotificationRepository notificationRepo,
|
||||||
|
MustacheRenderer renderer,
|
||||||
|
NotificationContextBuilder contextBuilder,
|
||||||
|
EnvironmentRepository environmentRepo,
|
||||||
|
ObjectMapper objectMapper,
|
||||||
|
Clock alertingClock) {
|
||||||
|
this.ruleRepo = ruleRepo;
|
||||||
|
this.instanceRepo = instanceRepo;
|
||||||
|
this.notificationRepo = notificationRepo;
|
||||||
|
this.renderer = renderer;
|
||||||
|
this.contextBuilder = contextBuilder;
|
||||||
|
this.environmentRepo = environmentRepo;
|
||||||
|
this.objectMapper = objectMapper;
|
||||||
|
this.clock = alertingClock;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically apply a Batch result for a single rule:
|
||||||
|
* <ol>
|
||||||
|
* <li>persist a FIRING instance per firing + enqueue its notifications</li>
|
||||||
|
* <li>advance the rule's cursor ({@code evalState}) iff the batch supplied one</li>
|
||||||
|
* <li>release the claim with the new {@code nextRun} + {@code evalState}</li>
|
||||||
|
* </ol>
|
||||||
|
* Any exception thrown from the repo calls rolls back every write — including
|
||||||
|
* the cursor advance — so the rule is replayable on the next tick.
|
||||||
|
*/
|
||||||
|
@Transactional
|
||||||
|
public void apply(AlertRule rule, EvalResult.Batch batch, Instant nextRun) {
|
||||||
|
for (EvalResult.Firing f : batch.firings()) {
|
||||||
|
applyBatchFiring(rule, f);
|
||||||
|
}
|
||||||
|
Map<String, Object> nextEvalState =
|
||||||
|
batch.nextEvalState().isEmpty() ? rule.evalState() : batch.nextEvalState();
|
||||||
|
ruleRepo.releaseClaim(rule.id(), nextRun, nextEvalState);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Batch (PER_EXCHANGE) mode: always create a fresh FIRING instance per Firing entry.
|
||||||
|
* No forDuration check — each exchange is its own event.
|
||||||
|
*/
|
||||||
|
private void applyBatchFiring(AlertRule rule, EvalResult.Firing f) {
|
||||||
|
Instant now = Instant.now(clock);
|
||||||
|
AlertInstance instance = AlertStateTransitions.newInstance(rule, f, AlertState.FIRING, now)
|
||||||
|
.withRuleSnapshot(snapshotRule(rule));
|
||||||
|
AlertInstance enriched = enrichTitleMessage(rule, instance);
|
||||||
|
AlertInstance persisted = instanceRepo.save(enriched);
|
||||||
|
enqueueNotifications(rule, persisted, now);
|
||||||
|
}
|
||||||
|
|
||||||
|
private AlertInstance enrichTitleMessage(AlertRule rule, AlertInstance instance) {
|
||||||
|
Environment env = environmentRepo.findById(rule.environmentId()).orElse(null);
|
||||||
|
Map<String, Object> ctx = contextBuilder.build(rule, instance, env, null);
|
||||||
|
String title = renderer.render(rule.notificationTitleTmpl(), ctx);
|
||||||
|
String message = renderer.render(rule.notificationMessageTmpl(), ctx);
|
||||||
|
return instance.withTitleMessage(title, message);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void enqueueNotifications(AlertRule rule, AlertInstance instance, Instant now) {
|
||||||
|
for (WebhookBinding w : rule.webhooks()) {
|
||||||
|
Map<String, Object> payload = buildPayload(rule, instance);
|
||||||
|
notificationRepo.save(new AlertNotification(
|
||||||
|
UUID.randomUUID(),
|
||||||
|
instance.id(),
|
||||||
|
w.id(),
|
||||||
|
w.outboundConnectionId(),
|
||||||
|
NotificationStatus.PENDING,
|
||||||
|
0,
|
||||||
|
now,
|
||||||
|
null, null, null, null,
|
||||||
|
payload,
|
||||||
|
null,
|
||||||
|
now));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Map<String, Object> buildPayload(AlertRule rule, AlertInstance instance) {
|
||||||
|
Environment env = environmentRepo.findById(rule.environmentId()).orElse(null);
|
||||||
|
return contextBuilder.build(rule, instance, env, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
private Map<String, Object> snapshotRule(AlertRule rule) {
|
||||||
|
try {
|
||||||
|
Map<String, Object> raw = objectMapper.convertValue(rule, Map.class);
|
||||||
|
// Map.copyOf (used in AlertInstance compact ctor) rejects null values —
|
||||||
|
// strip them so the snapshot is safe to store.
|
||||||
|
Map<String, Object> safe = new LinkedHashMap<>();
|
||||||
|
raw.forEach((k, v) -> { if (v != null) safe.put(k, v); });
|
||||||
|
return safe;
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.warn("Failed to snapshot rule {}: {}", rule.id(), e.getMessage());
|
||||||
|
return Map.of("id", rule.id().toString(), "name", rule.name());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -28,7 +28,6 @@ import java.util.UUID;
|
|||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
import static org.assertj.core.api.Assertions.assertThatThrownBy;
|
|
||||||
import static org.mockito.Mockito.when;
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -441,6 +440,7 @@ class AlertEvaluatorJobIT extends AbstractPostgresIT {
|
|||||||
} catch (RuntimeException expectedAfterPhase2) {
|
} catch (RuntimeException expectedAfterPhase2) {
|
||||||
// Phase 2 may choose to rethrow; either way the rollback assertions
|
// Phase 2 may choose to rethrow; either way the rollback assertions
|
||||||
// below are what pin the contract.
|
// below are what pin the contract.
|
||||||
|
// intentionally empty — fault-injection swallow/rethrow tolerance; see comment above
|
||||||
}
|
}
|
||||||
|
|
||||||
// Post-rollback: zero instances, zero notifications, cursor unchanged,
|
// Post-rollback: zero instances, zero notifications, cursor unchanged,
|
||||||
|
|||||||
Reference in New Issue
Block a user