alerting(it): RED test pinning Phase 2 tick-atomicity contract

Fault-injection IT asserts that a crash mid-batch rolls back every
instance + notification write AND leaves the cursor unchanged. Fails
against current (Phase 1 only) code — turns green when Task 2.2
wraps batch processing in @Transactional.
This commit is contained in:
hsiegeln
2026-04-22 16:51:09 +02:00
parent 3c3d90c45b
commit 989dde23eb

View File

@@ -13,15 +13,22 @@ import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.boot.test.context.TestConfiguration;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Import;
import org.springframework.context.annotation.Primary;
import org.springframework.jdbc.core.JdbcTemplate;
import java.time.Instant;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.mockito.Mockito.when;
/**
@@ -33,6 +40,7 @@ import static org.mockito.Mockito.when;
* {@code AgentRegistryService} is mocked so tests can control which agents
* are DEAD without depending on in-memory timing.
*/
@Import(AlertEvaluatorJobIT.FaultInjectingNotificationRepoConfig.class)
class AlertEvaluatorJobIT extends AbstractPostgresIT {
@MockBean(name = "clickHouseLogStore") ClickHouseLogStore clickHouseLogStore;
@@ -42,6 +50,7 @@ class AlertEvaluatorJobIT extends AbstractPostgresIT {
@Autowired private AlertRuleRepository ruleRepo;
@Autowired private AlertInstanceRepository instanceRepo;
@Autowired private AlertNotificationRepository notificationRepo;
@Autowired private FaultInjectingNotificationRepository faultInjectingNotificationRepo;
@Autowired private ClickHouseExecutionStore executionStore;
@Autowired @Qualifier("clickHouseJdbcTemplate") private JdbcTemplate clickHouseJdbc;
@@ -92,6 +101,10 @@ class AlertEvaluatorJobIT extends AbstractPostgresIT {
@AfterEach
void cleanup() {
// Always reset the fault-injector — a prior @Test may have left it armed,
// and Spring reuses the same context (and thus the same decorator bean)
// across tests in this class.
faultInjectingNotificationRepo.clearFault();
jdbcTemplate.update("DELETE FROM alert_notifications WHERE alert_instance_id IN " +
"(SELECT id FROM alert_instances WHERE environment_id = ?)", envId);
jdbcTemplate.update("DELETE FROM alert_instances WHERE environment_id = ?", envId);
@@ -389,4 +402,183 @@ class AlertEvaluatorJobIT extends AbstractPostgresIT {
jdbcTemplate.update("DELETE FROM alert_instances WHERE rule_id = ?", perExRuleId);
jdbcTemplate.update("DELETE FROM alert_rules WHERE id = ?", perExRuleId);
}
// -------------------------------------------------------------------------
// Tick atomicity regression pin — a crash mid-batch must roll back every
// instance + notification write AND leave the cursor unchanged so the
// next tick re-processes the entire batch exactly once.
// See: docs/superpowers/plans/2026-04-22-per-exchange-exactly-once.md (Task 2.1)
// -------------------------------------------------------------------------
@Test
void tickRollback_faultOnSecondNotificationInsert_leavesCursorUnchanged() {
// Seed 3 FAILED executions so the rule's PER_EXCHANGE batch has 3 firings.
// Relative-to-now timestamps so they fall inside [rule.createdAt .. ctx.now()].
Instant t0 = Instant.now().minusSeconds(30);
seedFailedExecution("exec-1", t0);
seedFailedExecution("exec-2", t0.plusSeconds(1));
seedFailedExecution("exec-3", t0.plusSeconds(2));
UUID perExRuleId = createPerExchangeRuleWithWebhook();
var ruleBefore = ruleRepo.findById(perExRuleId).orElseThrow();
Object cursorBefore = ruleBefore.evalState().get("lastExchangeCursor"); // null on first run
Instant nextRunBefore = ruleBefore.nextEvaluationAt();
// Arm the fault injector: the 2nd notification save() throws.
// (Instance saves are NOT counted — the injector is scoped to notification saves.)
faultInjectingNotificationRepo.failOnSave(2);
// Today (Phase 1, non-transactional): the evaluator catches the exception
// per-rule and logs a warning — see AlertEvaluatorJob#tick's try/catch
// around applyResult. So tick() itself does NOT rethrow. That is exactly
// why this IT is RED pre-Phase-2: post-rollback asserts expect 0 instances
// and 0 notifications, but the current code will have persisted
// firing #1 (instance + notification) and firing #2's instance before the
// fault on firing #2's notification. Phase 2 wraps the per-rule body in
// @Transactional so the single-rule failure rolls back all of its writes.
try {
job.tick();
} catch (RuntimeException expectedAfterPhase2) {
// Phase 2 may choose to rethrow; either way the rollback assertions
// below are what pin the contract.
}
// Post-rollback: zero instances, zero notifications, cursor unchanged,
// nextRunAt unchanged (Phase 2 will hold the claim so the next tick retries).
assertThat(countInstancesByRule(perExRuleId))
.as("Phase 2 contract: mid-batch fault rolls back every instance write")
.isZero();
assertThat(listNotificationsByRule(perExRuleId))
.as("Phase 2 contract: mid-batch fault rolls back every notification write")
.isEmpty();
var ruleAfter = ruleRepo.findById(perExRuleId).orElseThrow();
assertThat(ruleAfter.evalState().get("lastExchangeCursor"))
.as("Phase 2 contract: cursor MUST NOT advance when the tick fails")
.isEqualTo(cursorBefore);
assertThat(ruleAfter.nextEvaluationAt())
.as("Phase 2 contract: nextEvaluationAt MUST NOT advance when the tick fails")
.isEqualTo(nextRunBefore);
// Recover: clear the fault, reopen the claim, tick again.
// All 3 firings must land on the second tick — exactly-once-per-exchange.
faultInjectingNotificationRepo.clearFault();
jdbcTemplate.update(
"UPDATE alert_rules SET next_evaluation_at = now() - interval '1 second', " +
"claimed_by = NULL, claimed_until = NULL WHERE id = ?", perExRuleId);
job.tick();
assertThat(countInstancesByRule(perExRuleId))
.as("after recovery: all 3 exchanges produce exactly one instance each")
.isEqualTo(3);
assertThat(listNotificationsByRule(perExRuleId))
.as("after recovery: all 3 instances produce exactly one notification each")
.hasSize(3);
assertThat(ruleRepo.findById(perExRuleId).orElseThrow()
.evalState().get("lastExchangeCursor"))
.as("after recovery: cursor advanced past exec-3")
.isNotEqualTo(cursorBefore);
// Clean up the extra rule (setup-created rule is handled by @AfterEach).
jdbcTemplate.update("DELETE FROM alert_notifications WHERE alert_instance_id IN " +
"(SELECT id FROM alert_instances WHERE rule_id = ?)", perExRuleId);
jdbcTemplate.update("DELETE FROM alert_instances WHERE rule_id = ?", perExRuleId);
jdbcTemplate.update("DELETE FROM alert_rules WHERE id = ?", perExRuleId);
}
// -------------------------------------------------------------------------
// Fault-injecting AlertNotificationRepository decorator
//
// Delegates all calls to the real Postgres-backed repository except
// {@link #save(AlertNotification)} — that method increments a counter and
// throws a RuntimeException when the configured trigger-count is reached.
// Only notification saves are counted; instance saves go through a separate
// repo and are unaffected.
// -------------------------------------------------------------------------
static class FaultInjectingNotificationRepository implements AlertNotificationRepository {
private final AlertNotificationRepository delegate;
private final AtomicInteger saveCount = new AtomicInteger(0);
private volatile int failOnNthSave = -1; // -1 = disabled
FaultInjectingNotificationRepository(AlertNotificationRepository delegate) {
this.delegate = delegate;
}
/** Arms the fault: the {@code n}-th call to {@link #save} (1-indexed) throws. */
void failOnSave(int n) {
saveCount.set(0);
failOnNthSave = n;
}
/** Disarms the fault and resets the counter. */
void clearFault() {
failOnNthSave = -1;
saveCount.set(0);
}
@Override
public AlertNotification save(AlertNotification n) {
int current = saveCount.incrementAndGet();
if (failOnNthSave > 0 && current == failOnNthSave) {
throw new RuntimeException(
"FaultInjectingNotificationRepository: injected failure on save #" + current);
}
return delegate.save(n);
}
@Override
public Optional<AlertNotification> findById(UUID id) { return delegate.findById(id); }
@Override
public List<AlertNotification> listForInstance(UUID alertInstanceId) {
return delegate.listForInstance(alertInstanceId);
}
@Override
public List<AlertNotification> claimDueNotifications(String instanceId, int batchSize, int claimTtlSeconds) {
return delegate.claimDueNotifications(instanceId, batchSize, claimTtlSeconds);
}
@Override
public void markDelivered(UUID id, int status, String snippet, Instant when) {
delegate.markDelivered(id, status, snippet, when);
}
@Override
public void scheduleRetry(UUID id, Instant nextAttemptAt, int status, String snippet) {
delegate.scheduleRetry(id, nextAttemptAt, status, snippet);
}
@Override
public void markFailed(UUID id, int status, String snippet) {
delegate.markFailed(id, status, snippet);
}
@Override
public void resetForRetry(UUID id, Instant nextAttemptAt) {
delegate.resetForRetry(id, nextAttemptAt);
}
@Override
public void deleteSettledBefore(Instant cutoff) {
delegate.deleteSettledBefore(cutoff);
}
}
/**
* {@link TestConfiguration} that installs the fault-injecting decorator as the
* {@code @Primary} {@link AlertNotificationRepository}. The real Postgres repo is
* still registered (via {@code AlertingBeanConfig}) and is injected into the
* decorator as the delegate, so every non-instrumented call path exercises real SQL.
*/
@TestConfiguration
static class FaultInjectingNotificationRepoConfig {
@Bean
@Primary
FaultInjectingNotificationRepository faultInjectingNotificationRepository(
@Qualifier("alertNotificationRepository") AlertNotificationRepository realRepo) {
return new FaultInjectingNotificationRepository(realRepo);
}
}
}