feat(alerting): AlertingProperties + AlertStateTransitions state machine

- AlertingProperties @ConfigurationProperties with effective*() accessors and
  5000 ms floor clamp on evaluatorTickIntervalMs; warn logged at startup
- AlertStateTransitions pure static state machine: Clear/Firing/Batch/Error
  branches, PENDING→FIRING promotion on forDuration elapsed; Batch delegated
  to job
- AlertInstance wither helpers: withState, withFiredAt, withResolvedAt, withAck,
  withSilenced, withTitleMessage, withLastNotifiedAt, withContext
- AlertingBeanConfig gains @EnableConfigurationProperties(AlertingProperties),
  alertingInstanceId bean (hostname:pid), alertingClock bean,
  PerKindCircuitBreaker bean wired from props
- 12 unit tests in AlertStateTransitionsTest covering all transitions

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-19 19:58:12 +02:00
parent f8cd3f3ee4
commit 657dc2d407
5 changed files with 461 additions and 0 deletions

View File

@@ -1,15 +1,25 @@
package com.cameleer.server.app.alerting.config;
import com.cameleer.server.app.alerting.eval.PerKindCircuitBreaker;
import com.cameleer.server.app.alerting.storage.*;
import com.cameleer.server.core.alerting.*;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.jdbc.core.JdbcTemplate;
import java.net.InetAddress;
import java.time.Clock;
@Configuration
@EnableConfigurationProperties(AlertingProperties.class)
public class AlertingBeanConfig {
private static final Logger log = LoggerFactory.getLogger(AlertingBeanConfig.class);
@Bean
public AlertRuleRepository alertRuleRepository(JdbcTemplate jdbc, ObjectMapper om) {
return new PostgresAlertRuleRepository(jdbc, om);
@@ -34,4 +44,33 @@ public class AlertingBeanConfig {
public AlertReadRepository alertReadRepository(JdbcTemplate jdbc) {
// Exposes the Postgres-backed AlertReadRepository implementation, built on the injected JdbcTemplate.
return new PostgresAlertReadRepository(jdbc);
}
/**
 * Registers the {@link Clock} bean named {@code alertingClock}.
 *
 * @return a clock backed by the system clock, using the JVM's default time zone
 */
@Bean
public Clock alertingClock() {
    Clock systemClock = Clock.systemDefaultZone();
    return systemClock;
}
/**
 * Builds the identifier of this server process in the form {@code hostname:pid}.
 * <p>
 * The hostname lookup is best-effort: when it fails, the literal {@code "unknown"} is used
 * so that bean creation does not fail because of name resolution. The failure is logged
 * so that an {@code unknown:<pid>} identifier can be traced back to its cause.
 *
 * @return {@code <hostname>:<pid>}, or {@code unknown:<pid>} when the local hostname
 *         cannot be determined
 */
@Bean("alertingInstanceId")
public String alertingInstanceId() {
    String hostname;
    try {
        hostname = InetAddress.getLocalHost().getHostName();
    } catch (Exception e) {
        // Deliberately broad: any lookup failure degrades to the placeholder instead of
        // aborting startup, but it must not pass silently.
        log.warn("Could not resolve local hostname for alertingInstanceId; falling back to 'unknown': {}",
                e.toString());
        hostname = "unknown";
    }
    return hostname + ":" + ProcessHandle.current().pid();
}
/**
 * Creates the {@link PerKindCircuitBreaker} from the circuit-breaker settings in
 * {@link AlertingProperties}.
 * <p>
 * As a side effect, logs a warning when the configured evaluator tick interval is set
 * and lower than 5000 ms.
 *
 * @param props bound alerting configuration
 * @return circuit breaker built from the fail threshold, window and cooldown values
 */
@Bean
public PerKindCircuitBreaker perKindCircuitBreaker(AlertingProperties props) {
    Integer configuredTickMs = props.evaluatorTickIntervalMs();
    boolean belowFloor = configuredTickMs != null && configuredTickMs < 5000;
    if (belowFloor) {
        log.warn("cameleer.server.alerting.evaluatorTickIntervalMs={} is below the 5000 ms floor; clamping to 5000 ms",
                configuredTickMs);
    }

    int failThreshold = props.cbFailThreshold();
    int windowSeconds = props.cbWindowSeconds();
    int cooldownSeconds = props.cbCooldownSeconds();
    return new PerKindCircuitBreaker(failThreshold, windowSeconds, cooldownSeconds);
}
}

View File

@@ -0,0 +1,73 @@
package com.cameleer.server.app.alerting.config;
import org.springframework.boot.context.properties.ConfigurationProperties;
@ConfigurationProperties("cameleer.server.alerting")
public record AlertingProperties(
        Integer evaluatorTickIntervalMs,
        Integer evaluatorBatchSize,
        Integer claimTtlSeconds,
        Integer notificationTickIntervalMs,
        Integer notificationBatchSize,
        Boolean inTickCacheEnabled,
        Integer circuitBreakerFailThreshold,
        Integer circuitBreakerWindowSeconds,
        Integer circuitBreakerCooldownSeconds,
        Integer eventRetentionDays,
        Integer notificationRetentionDays,
        Integer webhookTimeoutMs,
        Integer webhookMaxAttempts) {

    // Every component is a nullable wrapper: null means "not configured". The accessors
    // below substitute the built-in default for a missing value.

    /** Lower bound, in milliseconds, applied to the evaluator tick interval. */
    private static final int MIN_EVALUATOR_TICK_INTERVAL_MS = 5000;

    /** Returns {@code configured} when it is set, otherwise {@code fallback}. */
    private static int orDefault(Integer configured, int fallback) {
        return configured != null ? configured : fallback;
    }

    /** Evaluator tick interval in ms; defaults to 5000 and is never lower than 5000. */
    public int effectiveEvaluatorTickIntervalMs() {
        int requested = orDefault(evaluatorTickIntervalMs, MIN_EVALUATOR_TICK_INTERVAL_MS);
        return Math.max(MIN_EVALUATOR_TICK_INTERVAL_MS, requested);
    }

    /** Evaluator batch size; defaults to 20. */
    public int effectiveEvaluatorBatchSize() {
        return orDefault(evaluatorBatchSize, 20);
    }

    /** Claim TTL in seconds; defaults to 30. */
    public int effectiveClaimTtlSeconds() {
        return orDefault(claimTtlSeconds, 30);
    }

    /** Notification tick interval in ms; defaults to 5000 (no floor is applied). */
    public int effectiveNotificationTickIntervalMs() {
        return orDefault(notificationTickIntervalMs, 5000);
    }

    /** Notification batch size; defaults to 50. */
    public int effectiveNotificationBatchSize() {
        return orDefault(notificationBatchSize, 50);
    }

    /** In-tick cache flag; enabled unless explicitly set to {@code false}. */
    public boolean effectiveInTickCacheEnabled() {
        return !Boolean.FALSE.equals(inTickCacheEnabled);
    }

    /** Event retention in days; defaults to 90. */
    public int effectiveEventRetentionDays() {
        return orDefault(eventRetentionDays, 90);
    }

    /** Notification retention in days; defaults to 30. */
    public int effectiveNotificationRetentionDays() {
        return orDefault(notificationRetentionDays, 30);
    }

    /** Webhook timeout in ms; defaults to 5000. */
    public int effectiveWebhookTimeoutMs() {
        return orDefault(webhookTimeoutMs, 5000);
    }

    /** Maximum webhook attempts; defaults to 3. */
    public int effectiveWebhookMaxAttempts() {
        return orDefault(webhookMaxAttempts, 3);
    }

    /** Circuit-breaker failure threshold; defaults to 5. */
    public int cbFailThreshold() {
        return orDefault(circuitBreakerFailThreshold, 5);
    }

    /** Circuit-breaker window in seconds; defaults to 30. */
    public int cbWindowSeconds() {
        return orDefault(circuitBreakerWindowSeconds, 30);
    }

    /** Circuit-breaker cooldown in seconds; defaults to 60. */
    public int cbCooldownSeconds() {
        return orDefault(circuitBreakerCooldownSeconds, 60);
    }
}

View File

@@ -0,0 +1,123 @@
package com.cameleer.server.app.alerting.eval;
import com.cameleer.server.core.alerting.AlertInstance;
import com.cameleer.server.core.alerting.AlertRule;
import com.cameleer.server.core.alerting.AlertSeverity;
import com.cameleer.server.core.alerting.AlertState;
import java.time.Instant;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
/**
 * Stateless helper that decides how an alert instance changes in response to a single
 * evaluation result.
 * <p>
 * {@link #apply} receives the currently open instance (may be {@code null}), the evaluator
 * outcome, the rule and the tick time. It returns the instance to persist, or
 * {@link Optional#empty()} when there is nothing to write.
 * <p>
 * Only {@code Clear} and {@code Firing} results lead to a transition. Any other result
 * yields empty; Batch results are expected to be handled by the calling job.
 */
public final class AlertStateTransitions {

    private AlertStateTransitions() {
        // static utility, not instantiable
    }

    /**
     * Applies one evaluation result to the rule's current open instance.
     *
     * @param current the open instance for this rule, or {@code null} when there is none
     * @param result  the evaluator outcome
     * @param rule    the rule that was evaluated
     * @param now     instant of the current tick
     * @return the new or updated instance, or empty when nothing should change
     */
    public static Optional<AlertInstance> apply(
            AlertInstance current, EvalResult result, AlertRule rule, Instant now) {
        if (result instanceof EvalResult.Clear) {
            return onClear(current, now);
        }
        if (result instanceof EvalResult.Firing firing) {
            return onFiring(current, firing, rule, now);
        }
        // Everything else (Error, Batch, ...) produces no transition here.
        return Optional.empty();
    }

    // -------------------------------------------------------------------------
    // Clear
    // -------------------------------------------------------------------------

    /**
     * Resolves the open instance, stamping {@code resolvedAt} with {@code now}.
     * Yields empty when there is no instance or it is already RESOLVED.
     */
    private static Optional<AlertInstance> onClear(AlertInstance current, Instant now) {
        boolean nothingToResolve = current == null || current.state() == AlertState.RESOLVED;
        if (nothingToResolve) {
            return Optional.empty();
        }
        AlertInstance resolved = current
                .withState(AlertState.RESOLVED)
                .withResolvedAt(now);
        return Optional.of(resolved);
    }

    // -------------------------------------------------------------------------
    // Firing
    // -------------------------------------------------------------------------

    /**
     * Handles a Firing result: creates an instance when none is open, promotes a PENDING
     * instance once its forDuration has elapsed, and otherwise changes nothing.
     */
    private static Optional<AlertInstance> onFiring(
            AlertInstance current, EvalResult.Firing f, AlertRule rule, Instant now) {
        if (current != null) {
            return switch (current.state()) {
                case PENDING -> promoteIfDue(current, rule, now);
                // FIRING / ACKNOWLEDGED: nothing to persist from here.
                // RESOLVED is not expected as the open instance; treated as a no-op.
                case FIRING, ACKNOWLEDGED, RESOLVED -> Optional.empty();
            };
        }

        // No open instance: start in PENDING when the rule has a positive forDuration,
        // otherwise go straight to FIRING.
        AlertState initial = AlertState.FIRING;
        if (rule.forDurationSeconds() > 0) {
            initial = AlertState.PENDING;
        }
        return Optional.of(newInstance(rule, f, initial, now));
    }

    /**
     * Promotes a PENDING instance to FIRING once {@code firedAt + forDurationSeconds} is not
     * after {@code now}; before that point it yields empty.
     * <p>
     * NOTE(review): promotion overwrites {@code firedAt} with {@code now}, so the instant at
     * which the instance was first created is not kept on the promoted instance. Confirm
     * this is the intended meaning of {@code firedAt}.
     */
    private static Optional<AlertInstance> promoteIfDue(
            AlertInstance pending, AlertRule rule, Instant now) {
        Instant dueAt = pending.firedAt().plusSeconds(rule.forDurationSeconds());
        if (now.isBefore(dueAt)) {
            // forDuration has not elapsed yet, stay PENDING
            return Optional.empty();
        }
        return Optional.of(pending
                .withState(AlertState.FIRING)
                .withFiredAt(now));
    }

    // -------------------------------------------------------------------------
    // Factory
    // -------------------------------------------------------------------------

    /**
     * Builds a fresh AlertInstance for {@code rule} from a Firing result.
     * <p>
     * The rule snapshot is an empty map and title/message are empty strings here; they are
     * expected to be filled in by the caller afterwards. Severity falls back to
     * {@link AlertSeverity#WARNING} and the context to an empty map when absent.
     */
    static AlertInstance newInstance(AlertRule rule, EvalResult.Firing f, AlertState state, Instant now) {
        AlertSeverity severity = rule.severity();
        if (severity == null) {
            severity = AlertSeverity.WARNING;
        }
        return new AlertInstance(
                UUID.randomUUID(),
                rule.id(),
                Map.of(),               // ruleSnapshot: left empty for the caller
                rule.environmentId(),
                state,
                severity,
                now,                    // firedAt
                null,                   // ackedAt
                null,                   // ackedBy
                null,                   // resolvedAt
                null,                   // lastNotifiedAt
                false,                  // silenced
                f.currentValue(),
                f.threshold(),
                f.context() != null ? f.context() : Map.of(),
                "",                     // title: left empty for the caller
                "",                     // message: left empty for the caller
                List.of(),
                List.of(),
                List.of());
    }
}