feat(alerting): AlertingProperties + AlertStateTransitions state machine

- AlertingProperties @ConfigurationProperties with effective*() accessors and
  5000 ms floor clamp on evaluatorTickIntervalMs; warn logged at startup
- AlertStateTransitions pure static state machine: Clear/Firing/Batch/Error
  branches, PENDING→FIRING promotion on forDuration elapsed; Batch delegated
  to job
- AlertInstance wither helpers: withState, withFiredAt, withResolvedAt, withAck,
  withSilenced, withTitleMessage, withLastNotifiedAt, withContext
- AlertingBeanConfig gains @EnableConfigurationProperties(AlertingProperties),
  alertingInstanceId bean (hostname:pid), alertingClock bean,
  PerKindCircuitBreaker bean wired from props
- 12 unit tests in AlertStateTransitionsTest covering all transitions

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-19 19:58:12 +02:00
parent f8cd3f3ee4
commit 657dc2d407
5 changed files with 461 additions and 0 deletions

View File

@@ -1,15 +1,25 @@
package com.cameleer.server.app.alerting.config;
import com.cameleer.server.app.alerting.eval.PerKindCircuitBreaker;
import com.cameleer.server.app.alerting.storage.*;
import com.cameleer.server.core.alerting.*;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.jdbc.core.JdbcTemplate;
import java.net.InetAddress;
import java.time.Clock;
@Configuration
@EnableConfigurationProperties(AlertingProperties.class)
public class AlertingBeanConfig {
private static final Logger log = LoggerFactory.getLogger(AlertingBeanConfig.class);
@Bean
public AlertRuleRepository alertRuleRepository(JdbcTemplate jdbc, ObjectMapper om) {
return new PostgresAlertRuleRepository(jdbc, om);
@@ -34,4 +44,33 @@ public class AlertingBeanConfig {
public AlertReadRepository alertReadRepository(JdbcTemplate jdbc) {
    // Build the concrete PostgresAlertReadRepository on top of the supplied JdbcTemplate
    // and expose it under the AlertReadRepository type.
    final AlertReadRepository readRepository = new PostgresAlertReadRepository(jdbc);
    return readRepository;
}
/**
 * Supplies the {@link Clock} bean named {@code alertingClock}.
 *
 * @return the clock obtained from {@link Clock#systemDefaultZone()}
 */
@Bean
public Clock alertingClock() {
    final Clock systemClock = Clock.systemDefaultZone();
    return systemClock;
}
/**
 * Builds the identifier of this server process, formatted as {@code hostname:pid}.
 * <p>
 * Host name resolution is best-effort: when it fails, the literal {@code "unknown"} is used
 * so that bean creation never fails. The failure is logged rather than silently dropped,
 * because an {@code unknown:<pid>} identifier is otherwise hard to trace back to its cause.
 *
 * @return the local host name (or {@code "unknown"}), a {@code ':'}, and the current process id
 */
@Bean("alertingInstanceId")
public String alertingInstanceId() {
    String hostname;
    try {
        hostname = InetAddress.getLocalHost().getHostName();
    } catch (Exception e) {
        // Deliberately broad catch: whatever goes wrong during lookup must not abort startup.
        log.warn("Could not resolve local hostname for alertingInstanceId; falling back to 'unknown'", e);
        hostname = "unknown";
    }
    return hostname + ":" + ProcessHandle.current().pid();
}
/**
 * Creates the {@link PerKindCircuitBreaker} bean from the bound alerting properties.
 * <p>
 * As a side effect, logs a warning when the raw {@code evaluatorTickIntervalMs} property is set
 * below 5000. This method only reports the condition; it does not clamp anything itself.
 *
 * @param props bound alerting configuration properties
 * @return a circuit breaker built from {@code cbFailThreshold()}, {@code cbWindowSeconds()}
 *         and {@code cbCooldownSeconds()}
 */
@Bean
public PerKindCircuitBreaker perKindCircuitBreaker(AlertingProperties props) {
    final Integer configuredTickMs = props.evaluatorTickIntervalMs();
    final boolean belowFloor = configuredTickMs != null && configuredTickMs < 5000;
    if (belowFloor) {
        log.warn("cameleer.server.alerting.evaluatorTickIntervalMs={} is below the 5000 ms floor; clamping to 5000 ms",
                configuredTickMs);
    }

    final int failThreshold = props.cbFailThreshold();
    final int windowSeconds = props.cbWindowSeconds();
    final int cooldownSeconds = props.cbCooldownSeconds();
    return new PerKindCircuitBreaker(failThreshold, windowSeconds, cooldownSeconds);
}
}

View File

@@ -0,0 +1,73 @@
package com.cameleer.server.app.alerting.config;
import org.springframework.boot.context.properties.ConfigurationProperties;
@ConfigurationProperties("cameleer.server.alerting")
public record AlertingProperties(
        Integer evaluatorTickIntervalMs,
        Integer evaluatorBatchSize,
        Integer claimTtlSeconds,
        Integer notificationTickIntervalMs,
        Integer notificationBatchSize,
        Boolean inTickCacheEnabled,
        Integer circuitBreakerFailThreshold,
        Integer circuitBreakerWindowSeconds,
        Integer circuitBreakerCooldownSeconds,
        Integer eventRetentionDays,
        Integer notificationRetentionDays,
        Integer webhookTimeoutMs,
        Integer webhookMaxAttempts) {

    // Every component is a nullable boxed value; null means "not configured".
    // The accessors below substitute a built-in default for null and return primitives.
    // Only the evaluator tick interval is additionally clamped (to at least 5000 ms);
    // all other values are returned exactly as configured.

    /** Returns {@code configured} unless it is {@code null}, in which case {@code fallback} is returned. */
    private static int orDefault(Integer configured, int fallback) {
        if (configured != null) {
            return configured;
        }
        return fallback;
    }

    /** Evaluator tick interval in ms: 5000 when unset, and never lower than 5000. */
    public int effectiveEvaluatorTickIntervalMs() {
        final int floorMs = 5000;
        final int requested = orDefault(evaluatorTickIntervalMs, floorMs);
        return Math.max(floorMs, requested);
    }

    /** Evaluator batch size; 20 when unset. */
    public int effectiveEvaluatorBatchSize() {
        return orDefault(evaluatorBatchSize, 20);
    }

    /** Claim TTL in seconds; 30 when unset. */
    public int effectiveClaimTtlSeconds() {
        return orDefault(claimTtlSeconds, 30);
    }

    /** Notification tick interval in ms; 5000 when unset (no floor is applied). */
    public int effectiveNotificationTickIntervalMs() {
        return orDefault(notificationTickIntervalMs, 5000);
    }

    /** Notification batch size; 50 when unset. */
    public int effectiveNotificationBatchSize() {
        return orDefault(notificationBatchSize, 50);
    }

    /** In-tick cache flag; {@code true} unless explicitly configured as {@code false}. */
    public boolean effectiveInTickCacheEnabled() {
        return !Boolean.FALSE.equals(inTickCacheEnabled);
    }

    /** Event retention in days; 90 when unset. */
    public int effectiveEventRetentionDays() {
        return orDefault(eventRetentionDays, 90);
    }

    /** Notification retention in days; 30 when unset. */
    public int effectiveNotificationRetentionDays() {
        return orDefault(notificationRetentionDays, 30);
    }

    /** Webhook timeout in ms; 5000 when unset. */
    public int effectiveWebhookTimeoutMs() {
        return orDefault(webhookTimeoutMs, 5000);
    }

    /** Maximum webhook attempts; 3 when unset. */
    public int effectiveWebhookMaxAttempts() {
        return orDefault(webhookMaxAttempts, 3);
    }

    /** Circuit-breaker failure threshold; 5 when unset. */
    public int cbFailThreshold() {
        return orDefault(circuitBreakerFailThreshold, 5);
    }

    /** Circuit-breaker window in seconds; 30 when unset. */
    public int cbWindowSeconds() {
        return orDefault(circuitBreakerWindowSeconds, 30);
    }

    /** Circuit-breaker cooldown in seconds; 60 when unset. */
    public int cbCooldownSeconds() {
        return orDefault(circuitBreakerCooldownSeconds, 60);
    }
}

View File

@@ -0,0 +1,123 @@
package com.cameleer.server.app.alerting.eval;
import com.cameleer.server.core.alerting.AlertInstance;
import com.cameleer.server.core.alerting.AlertRule;
import com.cameleer.server.core.alerting.AlertSeverity;
import com.cameleer.server.core.alerting.AlertState;
import java.time.Instant;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
/**
 * Static, side-effect-free transition rules for alert instances.
 * <p>
 * {@link #apply} takes the currently open instance (may be {@code null}) together with an
 * {@code EvalResult} and returns either the instance to persist (new or updated copy) or
 * {@link Optional#empty()} when nothing has to change.
 * <p>
 * Only {@code Clear} and {@code Firing} results are acted upon here; every other result,
 * including {@code Batch} and {@code Error}, yields empty and is left to the caller.
 */
public final class AlertStateTransitions {

    private AlertStateTransitions() {
        // static helpers only
    }

    /**
     * Computes the outcome of one evaluation for a single rule.
     *
     * @param current the open instance for this rule, or {@code null} when there is none
     * @param result  the evaluator outcome
     * @param rule    the rule that was evaluated
     * @param now     the instant of the current tick
     * @return the instance to persist, or empty when no change is required
     */
    public static Optional<AlertInstance> apply(
            AlertInstance current, EvalResult result, AlertRule rule, Instant now) {
        if (result instanceof EvalResult.Clear) {
            return onClear(current, now);
        } else if (result instanceof EvalResult.Firing firing) {
            return onFiring(current, firing, rule, now);
        }
        // Anything else (Error, Batch, ...) is not handled by this helper.
        return Optional.empty();
    }

    // -------------------------------------------------------------------------
    // Clear
    // -------------------------------------------------------------------------

    /**
     * Resolves {@code current} at {@code now}, unless there is no instance or it is already RESOLVED.
     */
    private static Optional<AlertInstance> onClear(AlertInstance current, Instant now) {
        final boolean nothingToResolve = current == null || current.state() == AlertState.RESOLVED;
        if (nothingToResolve) {
            return Optional.empty();
        }
        // Every remaining state is closed the same way: state -> RESOLVED, resolvedAt -> now.
        final AlertInstance resolved = current
                .withState(AlertState.RESOLVED)
                .withResolvedAt(now);
        return Optional.of(resolved);
    }

    // -------------------------------------------------------------------------
    // Firing
    // -------------------------------------------------------------------------

    /**
     * Handles a Firing result: opens a new instance when none exists, promotes a PENDING
     * instance once its for-duration has elapsed, and otherwise changes nothing.
     */
    private static Optional<AlertInstance> onFiring(
            AlertInstance current, EvalResult.Firing f, AlertRule rule, Instant now) {
        if (current == null) {
            // First sighting: rules with a positive for-duration start in PENDING,
            // all others go straight to FIRING.
            final AlertState initial;
            if (rule.forDurationSeconds() > 0) {
                initial = AlertState.PENDING;
            } else {
                initial = AlertState.FIRING;
            }
            return Optional.of(newInstance(rule, f, initial, now));
        }

        return switch (current.state()) {
            case PENDING -> promoteIfDue(current, rule, now);
            // FIRING / ACKNOWLEDGED: already open, nothing to persist from here.
            // RESOLVED: not expected as the open instance, treated as a no-op as well.
            case FIRING, ACKNOWLEDGED, RESOLVED -> Optional.empty();
        };
    }

    /**
     * Promotes a PENDING instance to FIRING once {@code firedAt + forDurationSeconds} is not
     * after {@code now}; returns empty while that point still lies in the future.
     * <p>
     * Note that promotion overwrites {@code firedAt} with {@code now}, i.e. after promotion
     * {@code firedAt} holds the promotion instant rather than the instant the PENDING
     * instance was created.
     */
    private static Optional<AlertInstance> promoteIfDue(
            AlertInstance pending, AlertRule rule, Instant now) {
        final Instant promoteAt = pending.firedAt().plusSeconds(rule.forDurationSeconds());
        if (now.isBefore(promoteAt)) {
            // Still inside the for-duration window: stay PENDING, nothing to persist.
            return Optional.empty();
        }
        final AlertInstance promoted = pending
                .withState(AlertState.FIRING)
                .withFiredAt(now);
        return Optional.of(promoted);
    }

    // -------------------------------------------------------------------------
    // Factory
    // -------------------------------------------------------------------------

    /**
     * Builds a fresh AlertInstance for {@code rule} from a Firing result.
     * <p>
     * The rule snapshot is an empty map and title/message are empty strings; per the original
     * notes these are filled in afterwards by the calling job. A rule without severity is
     * given {@link AlertSeverity#WARNING}, and a null result context becomes an empty map.
     */
    static AlertInstance newInstance(AlertRule rule, EvalResult.Firing f, AlertState state, Instant now) {
        final UUID instanceId = UUID.randomUUID();
        final AlertSeverity severity =
                rule.severity() != null ? rule.severity() : AlertSeverity.WARNING;

        return new AlertInstance(
                instanceId,
                rule.id(),
                Map.of(),               // ruleSnapshot: populated later by the caller
                rule.environmentId(),
                state,
                severity,
                now,                    // firedAt
                null,                   // ackedAt
                null,                   // ackedBy
                null,                   // resolvedAt
                null,                   // lastNotifiedAt
                false,                  // silenced
                f.currentValue(),
                f.threshold(),
                f.context() != null ? f.context() : Map.of(),
                "",                     // title: rendered later by the caller
                "",                     // message: rendered later by the caller
                List.of(),
                List.of(),
                List.of());
    }
}

View File

@@ -0,0 +1,168 @@
package com.cameleer.server.app.alerting.eval;
import com.cameleer.server.core.alerting.*;
import org.junit.jupiter.api.Test;
import java.time.Instant;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Unit tests for {@link AlertStateTransitions#apply}: one test per combination of
 * evaluation result and current-instance state exercised below.
 */
class AlertStateTransitionsTest {

    /** Fixed "current tick" instant used by every test. */
    private static final Instant NOW = Instant.parse("2026-04-19T12:00:00Z");

    /** Shared Firing result (current value 2500.0, threshold 2000.0, empty context). */
    private static final EvalResult.Firing FIRING_RESULT =
            new EvalResult.Firing(2500.0, 2000.0, Map.of());

    // -------------------------------------------------------------------------
    // Fixtures
    // -------------------------------------------------------------------------

    /** Builds an AGENT_STATE rule whose only varying input is {@code forDurationSeconds}. */
    private AlertRule ruleWith(int forDurationSeconds) {
        var scope = new AlertScope(null, null, null);
        var condition = new AgentStateCondition(scope, "DEAD", 60);
        return new AlertRule(
                UUID.randomUUID(), UUID.randomUUID(), "test-rule", null,
                AlertSeverity.WARNING, true, ConditionKind.AGENT_STATE,
                condition,
                60, forDurationSeconds, 60,
                "{{rule.name}} fired", "Alert: {{alert.state}}",
                List.of(), List.of(),
                NOW, null, null, Map.of(),
                NOW, "u1", NOW, "u1");
    }

    /** Builds an existing instance in the given state with the given firedAt / ackedBy. */
    private AlertInstance openInstance(AlertState state, Instant firedAt, String ackedBy) {
        return new AlertInstance(
                UUID.randomUUID(), UUID.randomUUID(), Map.of(), UUID.randomUUID(),
                state, AlertSeverity.WARNING,
                firedAt, null, ackedBy, null, null, false,
                1.0, null, Map.of(), "title", "msg",
                List.of(), List.of(), List.of());
    }

    // -------------------------------------------------------------------------
    // Clear results
    // -------------------------------------------------------------------------

    @Test
    void clearWithNoOpenInstanceIsNoOp() {
        var outcome = AlertStateTransitions.apply(null, EvalResult.Clear.INSTANCE, ruleWith(0), NOW);

        assertThat(outcome).isEmpty();
    }

    @Test
    void clearWithAlreadyResolvedInstanceIsNoOp() {
        var alreadyResolved = openInstance(AlertState.RESOLVED, NOW.minusSeconds(120), null);

        var outcome = AlertStateTransitions.apply(
                alreadyResolved, EvalResult.Clear.INSTANCE, ruleWith(0), NOW);

        assertThat(outcome).isEmpty();
    }

    @Test
    void firingClearTransitionsToResolved() {
        var firingInstance = openInstance(AlertState.FIRING, NOW.minusSeconds(90), null);

        var outcome = AlertStateTransitions.apply(
                firingInstance, EvalResult.Clear.INSTANCE, ruleWith(0), NOW);

        assertThat(outcome).isPresent();
        assertThat(outcome.map(AlertInstance::state)).contains(AlertState.RESOLVED);
        assertThat(outcome.map(AlertInstance::resolvedAt)).contains(NOW);
    }

    @Test
    void ackedInstanceClearsToResolved() {
        var ackedInstance = openInstance(AlertState.ACKNOWLEDGED, NOW.minusSeconds(30), "alice");

        var outcome = AlertStateTransitions.apply(
                ackedInstance, EvalResult.Clear.INSTANCE, ruleWith(0), NOW);

        assertThat(outcome).isPresent();
        assertThat(outcome.map(AlertInstance::state)).contains(AlertState.RESOLVED);
        assertThat(outcome.map(AlertInstance::resolvedAt)).contains(NOW);
        // The acknowledging user must survive the transition.
        assertThat(outcome.map(AlertInstance::ackedBy)).contains("alice");
    }

    // -------------------------------------------------------------------------
    // Firing results, no open instance
    // -------------------------------------------------------------------------

    @Test
    void firingWithNoOpenInstanceCreatesPendingIfForDuration() {
        var ruleWithForDuration = ruleWith(60);

        var outcome = AlertStateTransitions.apply(null, FIRING_RESULT, ruleWithForDuration, NOW);

        assertThat(outcome).isPresent();
        assertThat(outcome.map(AlertInstance::state)).contains(AlertState.PENDING);
        assertThat(outcome.map(AlertInstance::firedAt)).contains(NOW);
        assertThat(outcome.map(AlertInstance::ruleId)).contains(ruleWithForDuration.id());
    }

    @Test
    void firingWithNoForDurationGoesStraightToFiring() {
        var immediateRule = ruleWith(0);
        var firingWithoutThreshold = new EvalResult.Firing(1.0, null, Map.of());

        var outcome = AlertStateTransitions.apply(null, firingWithoutThreshold, immediateRule, NOW);

        assertThat(outcome).isPresent();
        assertThat(outcome.map(AlertInstance::state)).contains(AlertState.FIRING);
        assertThat(outcome.map(AlertInstance::firedAt)).contains(NOW);
    }

    // -------------------------------------------------------------------------
    // Firing results, current instance is PENDING
    // -------------------------------------------------------------------------

    @Test
    void pendingStaysWhenForDurationNotElapsed() {
        var ruleWithForDuration = ruleWith(60);
        // firedAt is NOW-10s and forDuration is 60s, so promotion is due at NOW+50s.
        var pendingInstance = openInstance(AlertState.PENDING, NOW.minusSeconds(10), null);

        var outcome = AlertStateTransitions.apply(
                pendingInstance, FIRING_RESULT, ruleWithForDuration, NOW);

        assertThat(outcome).isEmpty();
    }

    @Test
    void pendingPromotesToFiringAfterForDuration() {
        var ruleWithForDuration = ruleWith(60);
        // firedAt is NOW-120s and forDuration is 60s, so promotion was due at NOW-60s.
        var pendingInstance = openInstance(AlertState.PENDING, NOW.minusSeconds(120), null);

        var outcome = AlertStateTransitions.apply(
                pendingInstance, FIRING_RESULT, ruleWithForDuration, NOW);

        assertThat(outcome).isPresent();
        assertThat(outcome.map(AlertInstance::state)).contains(AlertState.FIRING);
        // Promotion stamps firedAt with the promotion instant.
        assertThat(outcome.map(AlertInstance::firedAt)).contains(NOW);
    }

    // -------------------------------------------------------------------------
    // Firing results, current instance is FIRING / ACKNOWLEDGED
    // -------------------------------------------------------------------------

    @Test
    void firingWhenAlreadyFiringIsNoOp() {
        var firingInstance = openInstance(AlertState.FIRING, NOW.minusSeconds(120), null);

        var outcome = AlertStateTransitions.apply(firingInstance, FIRING_RESULT, ruleWith(0), NOW);

        assertThat(outcome).isEmpty();
    }

    @Test
    void firingWhenAcknowledgedIsNoOp() {
        var ackedInstance = openInstance(AlertState.ACKNOWLEDGED, NOW.minusSeconds(30), "alice");

        var outcome = AlertStateTransitions.apply(ackedInstance, FIRING_RESULT, ruleWith(0), NOW);

        assertThat(outcome).isEmpty();
    }

    // -------------------------------------------------------------------------
    // Batch and Error results
    // -------------------------------------------------------------------------

    @Test
    void batchResultAlwaysEmpty() {
        var batchResult = new EvalResult.Batch(List.of(FIRING_RESULT));

        var outcome = AlertStateTransitions.apply(null, batchResult, ruleWith(0), NOW);

        assertThat(outcome).isEmpty();
    }

    @Test
    void errorResultAlwaysEmpty() {
        var errorResult = new EvalResult.Error(new RuntimeException("fail"));

        var outcome = AlertStateTransitions.apply(null, errorResult, ruleWith(0), NOW);

        assertThat(outcome).isEmpty();
    }
}