feat(alerting): AlertEvaluatorJob with claim-polling + circuit breaker

- AlertEvaluatorJob implements SchedulingConfigurer; fixed-delay tick from AlertingProperties.effectiveEvaluatorTickIntervalMs (5 s floor)
- Claim-polling via AlertRuleRepository.claimDueRules (FOR UPDATE SKIP LOCKED)
- Per-kind circuit breaker guards each evaluator; failures recorded, open kinds skipped and rescheduled without evaluation
- Single-Firing path delegates to AlertStateTransitions; new FIRING instances enqueue AlertNotification rows per rule.webhooks()
- Batch (PER_EXCHANGE) path creates one FIRING AlertInstance per Firing entry
- PENDING→FIRING promotion handled in applyResult via state machine
- Title/message rendered via MustacheRenderer + NotificationContextBuilder; environment resolved from EnvironmentRepository.findById per tick
- AlertEvaluatorJobIT (4 tests): uses named @MockBean replacements for ClickHouseSearchIndex + ClickHouseLogStore; @MockBean AgentRegistryService drives Clear/Firing/resolve cycle without timing sensitivity

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,199 @@
|
||||
package com.cameleer.server.app.alerting.eval;
|
||||
|
||||
import com.cameleer.server.app.AbstractPostgresIT;
|
||||
import com.cameleer.server.app.search.ClickHouseLogStore;
|
||||
import com.cameleer.server.app.search.ClickHouseSearchIndex;
|
||||
import com.cameleer.server.core.agent.AgentInfo;
|
||||
import com.cameleer.server.core.agent.AgentRegistryService;
|
||||
import com.cameleer.server.core.agent.AgentState;
|
||||
import com.cameleer.server.core.alerting.*;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
|
||||
import java.time.Instant;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
/**
|
||||
* Integration test for {@link AlertEvaluatorJob}.
|
||||
* <p>
|
||||
* Uses real Postgres (Testcontainers) for the full claim→persist pipeline.
|
||||
* {@code ClickHouseSearchIndex} and {@code ClickHouseLogStore} are mocked so
|
||||
* {@code ExchangeMatchEvaluator} and {@code LogPatternEvaluator} wire up even
|
||||
* though those concrete types are not directly registered as Spring beans.
|
||||
* {@code AgentRegistryService} is mocked so tests can control which agents
|
||||
* are DEAD without depending on in-memory timing.
|
||||
*/
|
||||
class AlertEvaluatorJobIT extends AbstractPostgresIT {
|
||||
|
||||
// Replace the named beans so ExchangeMatchEvaluator / LogPatternEvaluator can wire their
|
||||
// concrete-type constructor args without duplicating the SearchIndex / LogIndex beans.
|
||||
@MockBean(name = "clickHouseSearchIndex") ClickHouseSearchIndex clickHouseSearchIndex;
|
||||
@MockBean(name = "clickHouseLogStore") ClickHouseLogStore clickHouseLogStore;
|
||||
|
||||
// Control agent state per test without timing sensitivity
|
||||
@MockBean AgentRegistryService agentRegistryService;
|
||||
|
||||
@Autowired private AlertEvaluatorJob job;
|
||||
@Autowired private AlertRuleRepository ruleRepo;
|
||||
@Autowired private AlertInstanceRepository instanceRepo;
|
||||
|
||||
private UUID envId;
|
||||
private UUID ruleId;
|
||||
private static final String SYS_USER = "sys-eval-it";
|
||||
private static final String APP_SLUG = "orders";
|
||||
private static final String AGENT_ID = "test-agent-01";
|
||||
|
||||
@BeforeEach
|
||||
void setup() {
|
||||
// Default: empty registry — all evaluators return Clear
|
||||
when(agentRegistryService.findAll()).thenReturn(List.of());
|
||||
|
||||
envId = UUID.randomUUID();
|
||||
ruleId = UUID.randomUUID();
|
||||
|
||||
jdbcTemplate.update(
|
||||
"INSERT INTO environments (id, slug, display_name) VALUES (?, ?, ?)",
|
||||
envId, "eval-it-env-" + envId, "Eval IT Env");
|
||||
jdbcTemplate.update(
|
||||
"INSERT INTO users (user_id, provider, email) VALUES (?, 'local', ?) ON CONFLICT (user_id) DO NOTHING",
|
||||
SYS_USER, SYS_USER + "@test.example.com");
|
||||
|
||||
// Rule: AGENT_STATE = DEAD, forSeconds=60, forDurationSeconds=0 (straight to FIRING)
|
||||
var condition = new AgentStateCondition(
|
||||
new AlertScope(APP_SLUG, null, null), "DEAD", 60);
|
||||
var rule = new AlertRule(
|
||||
ruleId, envId, "dead-agent-rule", "fires when orders agent is dead",
|
||||
AlertSeverity.WARNING, true, ConditionKind.AGENT_STATE, condition,
|
||||
60, 0, 60,
|
||||
"Agent dead: {{agent.name}}", "Agent {{agent.id}} is {{agent.state}}",
|
||||
List.of(), List.of(),
|
||||
Instant.now().minusSeconds(5), // due now
|
||||
null, null, Map.of(),
|
||||
Instant.now(), SYS_USER, Instant.now(), SYS_USER);
|
||||
ruleRepo.save(rule);
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
void cleanup() {
|
||||
jdbcTemplate.update("DELETE FROM alert_notifications WHERE alert_instance_id IN " +
|
||||
"(SELECT id FROM alert_instances WHERE environment_id = ?)", envId);
|
||||
jdbcTemplate.update("DELETE FROM alert_instances WHERE environment_id = ?", envId);
|
||||
jdbcTemplate.update("DELETE FROM alert_rules WHERE environment_id = ?", envId);
|
||||
jdbcTemplate.update("DELETE FROM environments WHERE id = ?", envId);
|
||||
jdbcTemplate.update("DELETE FROM users WHERE user_id = ?", SYS_USER);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
private AgentInfo deadAgent(Instant lastHeartbeat) {
|
||||
return new AgentInfo(AGENT_ID, "orders-service", APP_SLUG,
|
||||
envId.toString(), "1.0", List.of(), Map.of(),
|
||||
AgentState.DEAD, lastHeartbeat.minusSeconds(300), lastHeartbeat, null);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Tests
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
void noMatchingAgentProducesNoInstance() {
|
||||
// Registry empty → evaluator returns Clear → no alert_instance
|
||||
when(agentRegistryService.findAll()).thenReturn(List.of());
|
||||
|
||||
job.tick();
|
||||
|
||||
assertThat(instanceRepo.findOpenForRule(ruleId)).isEmpty();
|
||||
}
|
||||
|
||||
@Test
|
||||
void deadAgentProducesFiringInstance() {
|
||||
// Agent has been DEAD for 2 minutes (> forSeconds=60) → FIRING
|
||||
when(agentRegistryService.findAll())
|
||||
.thenReturn(List.of(deadAgent(Instant.now().minusSeconds(120))));
|
||||
|
||||
job.tick();
|
||||
|
||||
assertThat(instanceRepo.findOpenForRule(ruleId)).hasValueSatisfying(i -> {
|
||||
assertThat(i.state()).isEqualTo(AlertState.FIRING);
|
||||
assertThat(i.ruleId()).isEqualTo(ruleId);
|
||||
assertThat(i.environmentId()).isEqualTo(envId);
|
||||
assertThat(i.severity()).isEqualTo(AlertSeverity.WARNING);
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
void claimDueResolveCycle() {
|
||||
// Tick 1: dead agent → FIRING
|
||||
when(agentRegistryService.findAll())
|
||||
.thenReturn(List.of(deadAgent(Instant.now().minusSeconds(120))));
|
||||
job.tick();
|
||||
assertThat(instanceRepo.findOpenForRule(ruleId)).hasValueSatisfying(i ->
|
||||
assertThat(i.state()).isEqualTo(AlertState.FIRING));
|
||||
|
||||
// Bump next_evaluation_at so rule is due again
|
||||
jdbcTemplate.update(
|
||||
"UPDATE alert_rules SET next_evaluation_at = now() - interval '1 second', " +
|
||||
"claimed_by = NULL, claimed_until = NULL WHERE id = ?", ruleId);
|
||||
|
||||
// Tick 2: empty registry → Clear → RESOLVED
|
||||
when(agentRegistryService.findAll()).thenReturn(List.of());
|
||||
job.tick();
|
||||
|
||||
assertThat(instanceRepo.findOpenForRule(ruleId)).isEmpty();
|
||||
long resolvedCount = jdbcTemplate.queryForObject(
|
||||
"SELECT count(*) FROM alert_instances WHERE rule_id = ? AND state = 'RESOLVED'",
|
||||
Long.class, ruleId);
|
||||
assertThat(resolvedCount).isEqualTo(1L);
|
||||
}
|
||||
|
||||
@Test
|
||||
void firingWithForDurationCreatesPendingThenPromotes() {
|
||||
UUID ruleId2 = UUID.randomUUID();
|
||||
var condition = new AgentStateCondition(new AlertScope(APP_SLUG, null, null), "DEAD", 60);
|
||||
var ruleWithDuration = new AlertRule(
|
||||
ruleId2, envId, "pending-rule", null,
|
||||
AlertSeverity.WARNING, true, ConditionKind.AGENT_STATE, condition,
|
||||
60, 60, 60, // forDurationSeconds = 60
|
||||
"title", "msg",
|
||||
List.of(), List.of(),
|
||||
Instant.now().minusSeconds(5),
|
||||
null, null, Map.of(),
|
||||
Instant.now(), SYS_USER, Instant.now(), SYS_USER);
|
||||
ruleRepo.save(ruleWithDuration);
|
||||
|
||||
// Dead agent for both rules
|
||||
when(agentRegistryService.findAll())
|
||||
.thenReturn(List.of(deadAgent(Instant.now().minusSeconds(120))));
|
||||
job.tick();
|
||||
|
||||
// ruleId2 has forDuration=60 → PENDING
|
||||
assertThat(instanceRepo.findOpenForRule(ruleId2)).hasValueSatisfying(i ->
|
||||
assertThat(i.state()).isEqualTo(AlertState.PENDING));
|
||||
|
||||
// Backdate firedAt so promotion window is met
|
||||
jdbcTemplate.update(
|
||||
"UPDATE alert_instances SET fired_at = now() - interval '90 seconds' WHERE rule_id = ?",
|
||||
ruleId2);
|
||||
jdbcTemplate.update(
|
||||
"UPDATE alert_rules SET next_evaluation_at = now() - interval '1 second', " +
|
||||
"claimed_by = NULL, claimed_until = NULL WHERE id = ?", ruleId2);
|
||||
|
||||
job.tick();
|
||||
|
||||
assertThat(instanceRepo.findOpenForRule(ruleId2)).hasValueSatisfying(i ->
|
||||
assertThat(i.state()).isEqualTo(AlertState.FIRING));
|
||||
|
||||
jdbcTemplate.update("DELETE FROM alert_instances WHERE rule_id = ?", ruleId2);
|
||||
jdbcTemplate.update("DELETE FROM alert_rules WHERE id = ?", ruleId2);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user