test(alerting): rewrite AlertingFullLifecycleIT — REST-driven rule creation, re-notify cadence
Rule creation now goes through POST /alerts/rules (exercises saveTargets on the
write path). Clock is replaced with @MockBean(name="alertingClock") and re-stubbed
in @BeforeEach to survive Mockito's inter-test reset. Six ordered steps:
1. seed log → tick evaluator → assert FIRING instance with non-empty targets (B-1)
2. tick dispatcher → assert DELIVERED notification + lastNotifiedAt stamped (B-2)
3. ack via REST → assert ACKNOWLEDGED state
4. create silence → inject PENDING notification → tick dispatcher → assert silenced (FAILED)
5. delete rule → assert rule_id nullified, rule_snapshot preserved (ON DELETE SET NULL)
6. new rule with reNotifyMinutes=1 → first dispatch → advance clock 61s →
evaluator sweep → second dispatch → verify 2 WireMock POSTs (B-2 cadence)
Background scheduler races addressed by resetting claimed_by/claimed_until before
each manual tick. Simulated clock set AFTER log insert to guarantee log timestamp
falls within the evaluator window. Re-notify notifications backdated in Postgres
to work around the simulated vs real clock gap in claimDueNotifications.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,10 @@
|
||||
package com.cameleer.server.app;
|
||||
|
||||
import com.cameleer.server.app.search.ClickHouseSearchIndex;
|
||||
import com.cameleer.server.core.agent.AgentRegistryService;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.jdbc.core.JdbcTemplate;
|
||||
import org.springframework.test.context.ActiveProfiles;
|
||||
import org.springframework.test.context.DynamicPropertyRegistry;
|
||||
@@ -14,6 +17,12 @@ import org.testcontainers.containers.PostgreSQLContainer;
|
||||
@ActiveProfiles("test")
|
||||
public abstract class AbstractPostgresIT {
|
||||
|
||||
// Mocked infrastructure beans required by the full application context.
|
||||
// ClickHouseSearchIndex is not available in test without explicit ClickHouse wiring,
|
||||
// and AgentRegistryService requires in-memory state that tests manage directly.
|
||||
@MockBean(name = "clickHouseSearchIndex") protected ClickHouseSearchIndex clickHouseSearchIndex;
|
||||
@MockBean protected AgentRegistryService agentRegistryService;
|
||||
|
||||
static final PostgreSQLContainer<?> postgres;
|
||||
static final ClickHouseContainer clickhouse;
|
||||
|
||||
|
||||
@@ -16,12 +16,16 @@ import com.github.tomakehurst.wiremock.WireMockServer;
|
||||
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.TestInstance.Lifecycle;
|
||||
import org.mockito.Mockito;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.boot.test.web.client.TestRestTemplate;
|
||||
import org.springframework.http.*;
|
||||
|
||||
import java.time.Clock;
|
||||
import java.time.Instant;
|
||||
import java.time.ZoneOffset;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
@@ -32,9 +36,14 @@ import static org.assertj.core.api.Assertions.assertThat;
|
||||
/**
|
||||
* Canary integration test — exercises the full alerting lifecycle end-to-end:
|
||||
* fire → notify → ack → silence → re-fire (suppressed) → resolve → rule delete.
|
||||
* Also verifies the re-notification cadence (reNotifyMinutes).
|
||||
*
|
||||
* Rule creation is driven through the REST API (POST /alerts/rules), not raw SQL,
|
||||
* so target persistence via saveTargets() is exercised on the critical path.
|
||||
*
|
||||
* Uses real Postgres (Testcontainers) and real ClickHouse for log seeding.
|
||||
* WireMock provides the webhook target.
|
||||
* Clock is replaced with a @MockBean so the re-notify test can advance time.
|
||||
*/
|
||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||
@TestInstance(Lifecycle.PER_CLASS)
|
||||
@@ -42,6 +51,9 @@ class AlertingFullLifecycleIT extends AbstractPostgresIT {
|
||||
|
||||
// AbstractPostgresIT already declares clickHouseSearchIndex + agentRegistryService mocks.
|
||||
|
||||
// Replace the alertingClock bean so we can control time in re-notify test
|
||||
@MockBean(name = "alertingClock") Clock alertingClock;
|
||||
|
||||
// ── Spring beans ──────────────────────────────────────────────────────────
|
||||
|
||||
@Autowired private AlertEvaluatorJob evaluatorJob;
|
||||
@@ -71,15 +83,30 @@ class AlertingFullLifecycleIT extends AbstractPostgresIT {
|
||||
private UUID connId;
|
||||
private UUID instanceId; // filled after first FIRING
|
||||
|
||||
// Current simulated clock time — starts at "now" and can be advanced
|
||||
private Instant simulatedNow = Instant.now();
|
||||
|
||||
// ── Setup / teardown ──────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Mockito resets @MockBean stubs between @Test methods even with PER_CLASS lifecycle.
|
||||
* Re-stub the clock before every test so clock.instant() never returns null.
|
||||
*/
|
||||
@BeforeEach
|
||||
void refreshClock() {
|
||||
stubClock();
|
||||
}
|
||||
|
||||
@BeforeAll
|
||||
void seedFixtures() throws Exception {
|
||||
wm = new WireMockServer(WireMockConfiguration.options()
|
||||
.httpDisabled(true)
|
||||
.dynamicHttpsPort());
|
||||
wm.start();
|
||||
// ClickHouse schema is auto-initialized by ClickHouseSchemaInitializer on Spring context startup.
|
||||
|
||||
// Default clock behaviour: delegate to simulatedNow
|
||||
stubClock();
|
||||
|
||||
operatorJwt = securityHelper.operatorToken();
|
||||
|
||||
// Seed operator user in Postgres
|
||||
@@ -111,41 +138,8 @@ class AlertingFullLifecycleIT extends AbstractPostgresIT {
|
||||
" 'test-operator', 'test-operator')",
|
||||
connId, tenantId, webhookUrl, hmacCiphertext);
|
||||
|
||||
// Seed alert rule (LOG_PATTERN, forDurationSeconds=0, threshold=0 so >=1 log fires immediately)
|
||||
ruleId = UUID.randomUUID();
|
||||
UUID webhookBindingId = UUID.randomUUID();
|
||||
String webhooksJson = objectMapper.writeValueAsString(List.of(
|
||||
Map.of("id", webhookBindingId.toString(),
|
||||
"outboundConnectionId", connId.toString())));
|
||||
String conditionJson = objectMapper.writeValueAsString(Map.of(
|
||||
"kind", "LOG_PATTERN",
|
||||
"scope", Map.of("appSlug", "lc-app"),
|
||||
"level", "ERROR",
|
||||
"pattern", "TimeoutException",
|
||||
"threshold", 0,
|
||||
"windowSeconds", 300));
|
||||
|
||||
jdbcTemplate.update("""
|
||||
INSERT INTO alert_rules
|
||||
(id, environment_id, name, severity, enabled,
|
||||
condition_kind, condition,
|
||||
evaluation_interval_seconds, for_duration_seconds,
|
||||
notification_title_tmpl, notification_message_tmpl,
|
||||
webhooks, next_evaluation_at,
|
||||
created_by, updated_by)
|
||||
VALUES (?, ?, 'lc-timeout-rule', 'WARNING'::severity_enum, true,
|
||||
'LOG_PATTERN'::condition_kind_enum, ?::jsonb,
|
||||
60, 0,
|
||||
'Alert: {{rule.name}}', 'Instance {{alert.id}} fired',
|
||||
?::jsonb, now() - interval '1 second',
|
||||
'test-operator', 'test-operator')
|
||||
""",
|
||||
ruleId, envId, conditionJson, webhooksJson);
|
||||
|
||||
// Seed alert_rule_targets so the instance shows up in inbox
|
||||
jdbcTemplate.update(
|
||||
"INSERT INTO alert_rule_targets (id, rule_id, target_kind, target_id) VALUES (gen_random_uuid(), ?, 'USER'::target_kind_enum, 'test-operator') ON CONFLICT (rule_id, target_kind, target_id) DO NOTHING",
|
||||
ruleId);
|
||||
// Create alert rule via REST API (exercises saveTargets on the write path)
|
||||
ruleId = createRuleViaRestApi();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
@@ -154,8 +148,8 @@ class AlertingFullLifecycleIT extends AbstractPostgresIT {
|
||||
jdbcTemplate.update("DELETE FROM alert_silences WHERE environment_id = ?", envId);
|
||||
jdbcTemplate.update("DELETE FROM alert_notifications WHERE alert_instance_id IN (SELECT id FROM alert_instances WHERE environment_id = ?)", envId);
|
||||
jdbcTemplate.update("DELETE FROM alert_instances WHERE environment_id = ?", envId);
|
||||
jdbcTemplate.update("DELETE FROM alert_rule_targets WHERE rule_id = ?", ruleId);
|
||||
jdbcTemplate.update("DELETE FROM alert_rules WHERE id = ?", ruleId);
|
||||
jdbcTemplate.update("DELETE FROM alert_rule_targets WHERE rule_id IN (SELECT id FROM alert_rules WHERE environment_id = ?)", envId);
|
||||
jdbcTemplate.update("DELETE FROM alert_rules WHERE environment_id = ?", envId);
|
||||
jdbcTemplate.update("DELETE FROM outbound_connections WHERE id = ?", connId);
|
||||
jdbcTemplate.update("DELETE FROM environments WHERE id = ?", envId);
|
||||
jdbcTemplate.update("DELETE FROM users WHERE user_id = 'test-operator'");
|
||||
@@ -169,9 +163,27 @@ class AlertingFullLifecycleIT extends AbstractPostgresIT {
|
||||
// Stub WireMock to return 200
|
||||
wm.stubFor(post("/webhook").willReturn(aResponse().withStatus(200).withBody("accepted")));
|
||||
|
||||
// Seed a matching log into ClickHouse
|
||||
// Seed a matching log into ClickHouse BEFORE capturing simulatedNow,
|
||||
// so the log timestamp is guaranteed to fall inside [simulatedNow-300s, simulatedNow].
|
||||
seedMatchingLog();
|
||||
|
||||
// Set simulatedNow to current wall time — the log was inserted a few ms earlier,
|
||||
// so its timestamp is guaranteed <= simulatedNow within the 300s window.
|
||||
setSimulatedNow(Instant.now());
|
||||
|
||||
// Release any claim the background scheduler may have already placed on the rule,
|
||||
// and backdate next_evaluation_at so it's due again for our manual tick.
|
||||
jdbcTemplate.update(
|
||||
"UPDATE alert_rules SET claimed_by = NULL, claimed_until = NULL, " +
|
||||
"next_evaluation_at = now() - interval '1 second' WHERE id = ?", ruleId);
|
||||
|
||||
// Verify rule is in DB and due (no claim outstanding)
|
||||
Integer ruleCount = jdbcTemplate.queryForObject(
|
||||
"SELECT count(*) FROM alert_rules WHERE id = ? AND enabled = true " +
|
||||
"AND next_evaluation_at <= now() AND (claimed_until IS NULL OR claimed_until < now())",
|
||||
Integer.class, ruleId);
|
||||
assertThat(ruleCount).as("rule must be unclaimed and due before tick").isEqualTo(1);
|
||||
|
||||
// Tick evaluator
|
||||
evaluatorJob.tick();
|
||||
|
||||
@@ -181,6 +193,13 @@ class AlertingFullLifecycleIT extends AbstractPostgresIT {
|
||||
assertThat(instances).hasSize(1);
|
||||
assertThat(instances.get(0).state()).isEqualTo(AlertState.FIRING);
|
||||
assertThat(instances.get(0).ruleId()).isEqualTo(ruleId);
|
||||
|
||||
// B-1 fix verification: targets were persisted via the REST API path,
|
||||
// so target_user_ids must be non-empty (not {} as before the fix)
|
||||
assertThat(instances.get(0).targetUserIds())
|
||||
.as("target_user_ids must be non-empty — verifies B-1 fix (saveTargets)")
|
||||
.isNotEmpty();
|
||||
|
||||
instanceId = instances.get(0).id();
|
||||
}
|
||||
|
||||
@@ -205,6 +224,12 @@ class AlertingFullLifecycleIT extends AbstractPostgresIT {
|
||||
// Body should contain rule name
|
||||
wm.verify(postRequestedFor(urlEqualTo("/webhook"))
|
||||
.withRequestBody(containing("lc-timeout-rule")));
|
||||
|
||||
// B-2: lastNotifiedAt must be set after dispatch (step sets it on DELIVERED)
|
||||
AlertInstance inst = instanceRepo.findById(instanceId).orElseThrow();
|
||||
assertThat(inst.lastNotifiedAt())
|
||||
.as("lastNotifiedAt must be set after DELIVERED — verifies B-2 tracking fix")
|
||||
.isNotNull();
|
||||
}
|
||||
|
||||
@Test
|
||||
@@ -234,8 +259,8 @@ class AlertingFullLifecycleIT extends AbstractPostgresIT {
|
||||
String silenceBody = objectMapper.writeValueAsString(Map.of(
|
||||
"matcher", Map.of("ruleId", ruleId.toString()),
|
||||
"reason", "lifecycle-test-silence",
|
||||
"startsAt", Instant.now().minusSeconds(10).toString(),
|
||||
"endsAt", Instant.now().plusSeconds(3600).toString()
|
||||
"startsAt", simulatedNow.minusSeconds(10).toString(),
|
||||
"endsAt", simulatedNow.plusSeconds(3600).toString()
|
||||
));
|
||||
ResponseEntity<String> silenceResp = restTemplate.exchange(
|
||||
"/api/v1/environments/" + envSlug + "/alerts/silences",
|
||||
@@ -305,8 +330,178 @@ class AlertingFullLifecycleIT extends AbstractPostgresIT {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(6)
|
||||
void step6_reNotifyCadenceFiresSecondNotification() throws Exception {
|
||||
// Standalone sub-test: create a fresh rule with reNotifyMinutes=1 and verify
|
||||
// that the evaluator's re-notify sweep enqueues a second notification after 61 seconds.
|
||||
|
||||
wm.resetRequests();
|
||||
wm.stubFor(post("/webhook").willReturn(aResponse().withStatus(200).withBody("accepted")));
|
||||
|
||||
// Create a new rule via REST with reNotifyMinutes=1, forDurationSeconds=0
|
||||
UUID reNotifyRuleId = createReNotifyRuleViaRestApi();
|
||||
|
||||
// Seed the log BEFORE capturing T+0 so the log timestamp falls inside
|
||||
// the evaluator window [t0-300s, t0].
|
||||
seedMatchingLog();
|
||||
|
||||
// Set T+0 to current wall time — the log was inserted a few ms earlier,
|
||||
// so its timestamp is guaranteed <= t0 within the 300s window.
|
||||
Instant t0 = Instant.now();
|
||||
setSimulatedNow(t0);
|
||||
|
||||
// Tick evaluator at T+0 → instance FIRING, notification PENDING
|
||||
evaluatorJob.tick();
|
||||
|
||||
List<AlertInstance> instances = instanceRepo.listForInbox(
|
||||
envId, List.of(), "test-operator", List.of("OPERATOR"), 10);
|
||||
// Find the instance for the reNotify rule
|
||||
AlertInstance inst = instances.stream()
|
||||
.filter(i -> reNotifyRuleId.equals(i.ruleId()))
|
||||
.findFirst()
|
||||
.orElse(null);
|
||||
assertThat(inst).as("FIRING instance for reNotify rule").isNotNull();
|
||||
UUID reNotifyInstanceId = inst.id();
|
||||
|
||||
// Tick dispatcher at T+0 → notification DELIVERED, WireMock: 1 POST
|
||||
dispatchJob.tick();
|
||||
wm.verify(1, postRequestedFor(urlEqualTo("/webhook")));
|
||||
|
||||
// Verify lastNotifiedAt was stamped (B-2 tracking)
|
||||
AlertInstance afterFirstDispatch = instanceRepo.findById(reNotifyInstanceId).orElseThrow();
|
||||
assertThat(afterFirstDispatch.lastNotifiedAt()).isNotNull();
|
||||
|
||||
// --- Advance clock 61 seconds ---
|
||||
setSimulatedNow(t0.plusSeconds(61));
|
||||
|
||||
// Backdate next_evaluation_at so the rule is claimed again
|
||||
jdbcTemplate.update(
|
||||
"UPDATE alert_rules SET next_evaluation_at = now() - interval '1 second', " +
|
||||
"claimed_by = NULL, claimed_until = NULL WHERE id = ?", reNotifyRuleId);
|
||||
|
||||
// Tick evaluator at T+61 — re-notify sweep fires because lastNotifiedAt + 1 min <= now
|
||||
evaluatorJob.tick();
|
||||
|
||||
// The sweep saves notifications with nextAttemptAt = simulatedNow (T+61s) which is in the
|
||||
// future relative to Postgres real clock. Backdate so the dispatcher can claim them.
|
||||
jdbcTemplate.update(
|
||||
"UPDATE alert_notifications SET next_attempt_at = now() - interval '1 second' " +
|
||||
"WHERE alert_instance_id = ? AND status = 'PENDING'::notification_status_enum",
|
||||
reNotifyInstanceId);
|
||||
|
||||
// Tick dispatcher → second POST
|
||||
dispatchJob.tick();
|
||||
wm.verify(2, postRequestedFor(urlEqualTo("/webhook")));
|
||||
|
||||
// Cleanup
|
||||
jdbcTemplate.update("DELETE FROM alert_notifications WHERE alert_instance_id = ?", reNotifyInstanceId);
|
||||
jdbcTemplate.update("DELETE FROM alert_instances WHERE id = ?", reNotifyInstanceId);
|
||||
jdbcTemplate.update("DELETE FROM alert_rule_targets WHERE rule_id = ?", reNotifyRuleId);
|
||||
jdbcTemplate.update("DELETE FROM alert_rules WHERE id = ?", reNotifyRuleId);
|
||||
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
/** POST the main lifecycle rule via REST API. Returns the created rule ID. */
|
||||
private UUID createRuleViaRestApi() throws Exception {
|
||||
// Build JSON directly — Map.of() supports at most 10 entries
|
||||
String ruleBody = """
|
||||
{
|
||||
"name": "lc-timeout-rule",
|
||||
"severity": "WARNING",
|
||||
"conditionKind": "LOG_PATTERN",
|
||||
"condition": {
|
||||
"kind": "LOG_PATTERN",
|
||||
"scope": {"appSlug": "lc-app"},
|
||||
"level": "ERROR",
|
||||
"pattern": "TimeoutException",
|
||||
"threshold": 0,
|
||||
"windowSeconds": 300
|
||||
},
|
||||
"evaluationIntervalSeconds": 60,
|
||||
"forDurationSeconds": 0,
|
||||
"reNotifyMinutes": 0,
|
||||
"notificationTitleTmpl": "Alert: {{rule.name}}",
|
||||
"notificationMessageTmpl": "Instance {{alert.id}} fired",
|
||||
"webhooks": [{"outboundConnectionId": "%s"}],
|
||||
"targets": [{"kind": "USER", "targetId": "test-operator"}]
|
||||
}
|
||||
""".formatted(connId);
|
||||
|
||||
ResponseEntity<String> resp = restTemplate.exchange(
|
||||
"/api/v1/environments/" + envSlug + "/alerts/rules",
|
||||
HttpMethod.POST,
|
||||
new HttpEntity<>(ruleBody, securityHelper.authHeaders(operatorJwt)),
|
||||
String.class);
|
||||
|
||||
assertThat(resp.getStatusCode()).isEqualTo(HttpStatus.CREATED);
|
||||
JsonNode body = objectMapper.readTree(resp.getBody());
|
||||
String id = body.path("id").asText();
|
||||
assertThat(id).isNotBlank();
|
||||
|
||||
// Backdate next_evaluation_at so it's due immediately
|
||||
UUID ruleUuid = UUID.fromString(id);
|
||||
jdbcTemplate.update(
|
||||
"UPDATE alert_rules SET next_evaluation_at = now() - interval '1 second' WHERE id = ?",
|
||||
ruleUuid);
|
||||
|
||||
return ruleUuid;
|
||||
}
|
||||
|
||||
/** POST a short-cadence re-notify rule via REST API. Returns the created rule ID. */
|
||||
private UUID createReNotifyRuleViaRestApi() throws Exception {
|
||||
String ruleBody = """
|
||||
{
|
||||
"name": "lc-renotify-rule",
|
||||
"severity": "WARNING",
|
||||
"conditionKind": "LOG_PATTERN",
|
||||
"condition": {
|
||||
"kind": "LOG_PATTERN",
|
||||
"scope": {"appSlug": "lc-app"},
|
||||
"level": "ERROR",
|
||||
"pattern": "TimeoutException",
|
||||
"threshold": 0,
|
||||
"windowSeconds": 300
|
||||
},
|
||||
"evaluationIntervalSeconds": 60,
|
||||
"forDurationSeconds": 0,
|
||||
"reNotifyMinutes": 1,
|
||||
"notificationTitleTmpl": "ReNotify: {{rule.name}}",
|
||||
"notificationMessageTmpl": "Re-fired {{alert.id}}",
|
||||
"webhooks": [{"outboundConnectionId": "%s"}],
|
||||
"targets": [{"kind": "USER", "targetId": "test-operator"}]
|
||||
}
|
||||
""".formatted(connId);
|
||||
|
||||
ResponseEntity<String> resp = restTemplate.exchange(
|
||||
"/api/v1/environments/" + envSlug + "/alerts/rules",
|
||||
HttpMethod.POST,
|
||||
new HttpEntity<>(ruleBody, securityHelper.authHeaders(operatorJwt)),
|
||||
String.class);
|
||||
|
||||
assertThat(resp.getStatusCode()).isEqualTo(HttpStatus.CREATED);
|
||||
JsonNode body = objectMapper.readTree(resp.getBody());
|
||||
String id = body.path("id").asText();
|
||||
assertThat(id).isNotBlank();
|
||||
|
||||
UUID ruleUuid = UUID.fromString(id);
|
||||
jdbcTemplate.update(
|
||||
"UPDATE alert_rules SET next_evaluation_at = now() - interval '1 second' WHERE id = ?",
|
||||
ruleUuid);
|
||||
return ruleUuid;
|
||||
}
|
||||
|
||||
private void setSimulatedNow(Instant instant) {
|
||||
simulatedNow = instant;
|
||||
stubClock();
|
||||
}
|
||||
|
||||
private void stubClock() {
|
||||
Mockito.when(alertingClock.instant()).thenReturn(simulatedNow);
|
||||
Mockito.when(alertingClock.getZone()).thenReturn(ZoneOffset.UTC);
|
||||
}
|
||||
|
||||
private void seedMatchingLog() {
|
||||
LogEntry entry = new LogEntry(
|
||||
Instant.now(),
|
||||
|
||||
Reference in New Issue
Block a user