feat(alerting): observability metrics via micrometer

AlertingMetrics @Component wraps MeterRegistry:
- Counters: alerting_eval_errors_total{kind}, alerting_circuit_opened_total{kind},
  alerting_notifications_total{status}
- Timers: alerting_eval_duration_seconds{kind}, alerting_webhook_delivery_duration_seconds
- Gauges (DB-backed): alerting_rules_total{state}, alerting_instances_total{state}

AlertEvaluatorJob records evalError + evalDuration around each evaluator call.
PerKindCircuitBreaker detects open transitions and fires metrics.circuitOpened(kind).
AlertingBeanConfig wires AlertingMetrics into the circuit breaker post-construction.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-19 22:16:30 +02:00
parent 1ab21bc019
commit 840a71df94
4 changed files with 206 additions and 4 deletions

View File

@@ -1,6 +1,7 @@
package com.cameleer.server.app.alerting.config;
import com.cameleer.server.app.alerting.eval.PerKindCircuitBreaker;
import com.cameleer.server.app.alerting.metrics.AlertingMetrics;
import com.cameleer.server.app.alerting.storage.*;
import com.cameleer.server.core.alerting.*;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -62,15 +63,18 @@ public class AlertingBeanConfig {
}
@Bean
public PerKindCircuitBreaker perKindCircuitBreaker(AlertingProperties props) {
public PerKindCircuitBreaker perKindCircuitBreaker(AlertingProperties props,
AlertingMetrics alertingMetrics) {
if (props.evaluatorTickIntervalMs() != null
&& props.evaluatorTickIntervalMs() < 5000) {
log.warn("cameleer.server.alerting.evaluatorTickIntervalMs={} is below the 5000 ms floor; clamping to 5000 ms",
props.evaluatorTickIntervalMs());
}
return new PerKindCircuitBreaker(
PerKindCircuitBreaker breaker = new PerKindCircuitBreaker(
props.cbFailThreshold(),
props.cbWindowSeconds(),
props.cbCooldownSeconds());
breaker.setMetrics(alertingMetrics);
return breaker;
}
}

View File

@@ -1,6 +1,7 @@
package com.cameleer.server.app.alerting.eval;
import com.cameleer.server.app.alerting.config.AlertingProperties;
import com.cameleer.server.app.alerting.metrics.AlertingMetrics;
import com.cameleer.server.app.alerting.notify.MustacheRenderer;
import com.cameleer.server.app.alerting.notify.NotificationContextBuilder;
import com.cameleer.server.core.alerting.*;
@@ -49,6 +50,7 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
private final String instanceId;
private final String tenantId;
private final Clock clock;
private final AlertingMetrics metrics;
@SuppressWarnings("SpringJavaInjectionPointsAutowiringInspection")
public AlertEvaluatorJob(
@@ -64,7 +66,8 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
ObjectMapper objectMapper,
@Qualifier("alertingInstanceId") String instanceId,
@Value("${cameleer.server.tenant.id:default}") String tenantId,
Clock alertingClock) {
Clock alertingClock,
AlertingMetrics metrics) {
this.props = props;
this.ruleRepo = ruleRepo;
@@ -80,6 +83,7 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
this.instanceId = instanceId;
this.tenantId = tenantId;
this.clock = alertingClock;
this.metrics = metrics;
}
// -------------------------------------------------------------------------
@@ -113,10 +117,12 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
log.debug("Circuit breaker open for {}; skipping rule {}", rule.conditionKind(), rule.id());
continue;
}
EvalResult result = evaluateSafely(rule, ctx);
EvalResult result = metrics.evalDuration(rule.conditionKind())
.recordCallable(() -> evaluateSafely(rule, ctx));
applyResult(rule, result);
circuitBreaker.recordSuccess(rule.conditionKind());
} catch (Exception e) {
metrics.evalError(rule.conditionKind(), rule.id());
circuitBreaker.recordFailure(rule.conditionKind());
log.warn("Evaluator error for rule {} ({}): {}", rule.id(), rule.conditionKind(), e.toString());
} finally {

View File

@@ -1,5 +1,6 @@
package com.cameleer.server.app.alerting.eval;
import com.cameleer.server.app.alerting.metrics.AlertingMetrics;
import com.cameleer.server.core.alerting.ConditionKind;
import java.time.Clock;
@@ -19,6 +20,9 @@ public class PerKindCircuitBreaker {
private final Clock clock;
private final ConcurrentHashMap<ConditionKind, State> byKind = new ConcurrentHashMap<>();
/** Optional metrics — set via {@link #setMetrics} after construction (avoids circular bean deps). */
private volatile AlertingMetrics metrics;
/** Production constructor — uses system clock. */
public PerKindCircuitBreaker(int threshold, int windowSeconds, int cooldownSeconds) {
this(threshold, windowSeconds, cooldownSeconds, Clock.systemDefaultZone());
@@ -32,16 +36,29 @@ public class PerKindCircuitBreaker {
this.clock = clock;
}
/**
 * Wire metrics after construction to avoid a circular Spring dependency.
 * <p>
 * May be left unset: {@code recordFailure} null-checks {@code metrics} before
 * reporting. The backing field is {@code volatile}, so the write here is
 * visible to evaluator threads without further synchronization.
 *
 * @param metrics the alerting metrics facade, or {@code null} to disable reporting
 */
public void setMetrics(AlertingMetrics metrics) {
    this.metrics = metrics;
}
/**
 * Record one evaluation failure for {@code kind} and open the circuit when the
 * number of failures inside the sliding window reaches the threshold.
 * <p>
 * The whole read-modify-write runs inside {@link ConcurrentHashMap#compute},
 * which serializes concurrent callers for the same kind. The closed-to-open
 * transition is detected inside the compute (the single-element
 * {@code justOpened} array is a lambda-capture workaround for
 * "effectively final"), but the metrics callback fires only AFTER compute
 * returns — no alien code runs while the map holds its internal lock.
 */
public void recordFailure(ConditionKind kind) {
    final boolean[] justOpened = {false};
    byKind.compute(kind, (k, s) -> {
        // Defensive copy of the previous failure timestamps so the prior
        // State's deque is never mutated in place.
        Deque<Instant> deque = (s == null) ? new ArrayDeque<>() : new ArrayDeque<>(s.failures());
        Instant now = Instant.now(clock);
        Instant cutoff = now.minus(window);
        // Evict failures that aged out of the sliding window, then record this one.
        while (!deque.isEmpty() && deque.peekFirst().isBefore(cutoff)) deque.pollFirst();
        deque.addLast(now);
        // Open (or extend) the circuit whenever the windowed count hits the threshold.
        boolean wasOpen = s != null && s.openUntil() != null && now.isBefore(s.openUntil());
        Instant openUntil = (deque.size() >= threshold) ? now.plus(cooldown) : null;
        if (openUntil != null && !wasOpen) {
            // Count only the closed->open transition, not re-opens while already open.
            justOpened[0] = true;
        }
        return new State(deque, openUntil);
    });
    if (justOpened[0] && metrics != null) {
        // metrics is optional — wired post-construction via setMetrics.
        metrics.circuitOpened(kind);
    }
}
public boolean isOpen(ConditionKind kind) {

View File

@@ -0,0 +1,175 @@
package com.cameleer.server.app.alerting.metrics;
import com.cameleer.server.core.alerting.AlertState;
import com.cameleer.server.core.alerting.ConditionKind;
import com.cameleer.server.core.alerting.NotificationStatus;
import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.Gauge;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Timer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.stereotype.Component;
import java.util.Locale;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
/**
 * Micrometer-based metrics for the alerting subsystem.
 * <p>
 * Counters:
 * <ul>
 *   <li>{@code alerting_eval_errors_total{kind}} — evaluation errors by condition kind</li>
 *   <li>{@code alerting_circuit_opened_total{kind}} — circuit breaker open transitions by kind</li>
 *   <li>{@code alerting_notifications_total{status}} — notification outcomes by status</li>
 * </ul>
 * Timers:
 * <ul>
 *   <li>{@code alerting_eval_duration_seconds{kind}} — per-kind evaluation latency</li>
 *   <li>{@code alerting_webhook_delivery_duration_seconds} — webhook POST latency</li>
 * </ul>
 * Gauges (read from PostgreSQL on each scrape; low scrape frequency = low DB load):
 * <ul>
 *   <li>{@code alerting_rules_total{state=enabled|disabled}} — rule counts from {@code alert_rules}</li>
 *   <li>{@code alerting_instances_total{state}} — instance counts from {@code alert_instances},
 *       summed across severities (a per-severity breakdown would require a dynamic MultiGauge)</li>
 * </ul>
 */
@Component
public class AlertingMetrics {

    private static final Logger log = LoggerFactory.getLogger(AlertingMetrics.class);

    private final MeterRegistry registry;
    private final JdbcTemplate jdbc;

    // Meters cached per tag value so repeated calls reuse the registered instance
    // instead of hitting the registry lookup path every time.
    private final ConcurrentMap<String, Counter> evalErrorCounters = new ConcurrentHashMap<>();
    private final ConcurrentMap<String, Counter> circuitOpenCounters = new ConcurrentHashMap<>();
    private final ConcurrentMap<String, Timer> evalDurationTimers = new ConcurrentHashMap<>();
    private final ConcurrentMap<String, Counter> notificationCounters = new ConcurrentHashMap<>();

    // Single shared timer: webhook delivery carries no per-kind dimension.
    private final Timer webhookDeliveryTimer;

    public AlertingMetrics(MeterRegistry registry, JdbcTemplate jdbc) {
        this.registry = registry;
        this.jdbc = jdbc;

        // ── Static timers ───────────────────────────────────────────────
        this.webhookDeliveryTimer = Timer.builder("alerting_webhook_delivery_duration_seconds")
                .description("Latency of outbound webhook POST requests")
                .register(registry);

        // ── Gauge: rules by enabled/disabled ────────────────────────────
        registerRulesGauge(true, "enabled", "Number of enabled alert rules");
        registerRulesGauge(false, "disabled", "Number of disabled alert rules");

        // ── Gauges: alert instances by state ────────────────────────────
        // One gauge per state, summed across severities; a per-severity
        // breakdown would require a dynamic MultiGauge. The enhanced-for
        // variable is effectively final per iteration, so the lambda can
        // capture it directly — no extra local needed.
        for (AlertState state : AlertState.values()) {
            Gauge.builder("alerting_instances_total", this, m -> m.countInstances(state))
                    .tag("state", state.name().toLowerCase(Locale.ROOT))
                    .description("Number of alert instances by state")
                    .register(registry);
        }
    }

    /** Register one DB-backed rules gauge for the given enabled flag. */
    private void registerRulesGauge(boolean enabled, String stateTag, String description) {
        Gauge.builder("alerting_rules_total", this, m -> m.countRules(enabled))
                .tag("state", stateTag)
                .description(description)
                .register(registry);
    }

    // ── Public API ──────────────────────────────────────────────────────

    /**
     * Increment the evaluation error counter for the given condition kind.
     *
     * @param kind   condition kind, recorded as the {@code kind} tag
     * @param ruleId failing rule — logged for correlation only, NOT used as a
     *               tag (rule IDs are unbounded and would explode cardinality)
     */
    public void evalError(ConditionKind kind, UUID ruleId) {
        evalErrorCounters.computeIfAbsent(kind.name(), k ->
                        Counter.builder("alerting_eval_errors_total")
                                .tag("kind", k)
                                .description("Alerting evaluation errors by condition kind")
                                .register(registry))
                .increment();
        log.debug("Alerting eval error for kind={} ruleId={}", kind, ruleId);
    }

    /**
     * Increment the circuit-breaker opened counter for the given condition kind.
     */
    public void circuitOpened(ConditionKind kind) {
        circuitOpenCounters.computeIfAbsent(kind.name(), k ->
                        Counter.builder("alerting_circuit_opened_total")
                                .tag("kind", k)
                                .description("Circuit breaker open transitions by condition kind")
                                .register(registry))
                .increment();
    }

    /**
     * Return the eval duration timer for the given condition kind (created lazily
     * on first use, then cached).
     */
    public Timer evalDuration(ConditionKind kind) {
        return evalDurationTimers.computeIfAbsent(kind.name(), k ->
                Timer.builder("alerting_eval_duration_seconds")
                        .tag("kind", k)
                        .description("Alerting condition evaluation latency by kind")
                        .register(registry));
    }

    /**
     * The shared webhook delivery duration timer.
     */
    public Timer webhookDeliveryDuration() {
        return webhookDeliveryTimer;
    }

    /**
     * Increment the notification outcome counter for the given status.
     */
    public void notificationOutcome(NotificationStatus status) {
        notificationCounters.computeIfAbsent(status.name(), k ->
                        Counter.builder("alerting_notifications_total")
                                .tag("status", k.toLowerCase(Locale.ROOT))
                                .description("Alerting notification outcomes by status")
                                .register(registry))
                .increment();
    }

    // ── Gauge suppliers (called on each Prometheus scrape) ──────────────

    /** Count rows in {@code alert_rules} with the given enabled flag; 0.0 on query failure. */
    private double countRules(boolean enabled) {
        try {
            Long count = jdbc.queryForObject(
                    "SELECT COUNT(*) FROM alert_rules WHERE enabled = ?", Long.class, enabled);
            return count == null ? 0.0 : count.doubleValue();
        } catch (Exception e) {
            // A gauge supplier must never throw into the scrape path; report 0 and log quietly.
            log.debug("alerting_rules gauge query failed: {}", e.getMessage());
            return 0.0;
        }
    }

    /** Count rows in {@code alert_instances} in the given state; 0.0 on query failure. */
    private double countInstances(AlertState state) {
        try {
            Long count = jdbc.queryForObject(
                    "SELECT COUNT(*) FROM alert_instances WHERE state = ?::alert_state_enum",
                    Long.class, state.name());
            return count == null ? 0.0 : count.doubleValue();
        } catch (Exception e) {
            log.debug("alerting_instances gauge query failed: {}", e.getMessage());
            return 0.0;
        }
    }
}