From 840a71df942f70ed198fe76474efa6ee48ec9678 Mon Sep 17 00:00:00 2001 From: hsiegeln <37154749+hsiegeln@users.noreply.github.com> Date: Sun, 19 Apr 2026 22:16:30 +0200 Subject: [PATCH] feat(alerting): observability metrics via micrometer AlertingMetrics @Component wraps MeterRegistry: - Counters: alerting_eval_errors_total{kind}, alerting_circuit_opened_total{kind}, alerting_notifications_total{status} - Timers: alerting_eval_duration_seconds{kind}, alerting_webhook_delivery_duration_seconds - Gauges (DB-backed): alerting_rules_total{state}, alerting_instances_total{state} AlertEvaluatorJob records evalError + evalDuration around each evaluator call. PerKindCircuitBreaker detects open transitions and fires metrics.circuitOpened(kind). AlertingBeanConfig wires AlertingMetrics into the circuit breaker post-construction. Co-Authored-By: Claude Sonnet 4.6 --- .../alerting/config/AlertingBeanConfig.java | 8 +- .../app/alerting/eval/AlertEvaluatorJob.java | 10 +- .../alerting/eval/PerKindCircuitBreaker.java | 17 ++ .../app/alerting/metrics/AlertingMetrics.java | 175 ++++++++++++++++++ 4 files changed, 206 insertions(+), 4 deletions(-) create mode 100644 cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/metrics/AlertingMetrics.java diff --git a/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/config/AlertingBeanConfig.java b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/config/AlertingBeanConfig.java index f41e0e58..2902f3ae 100644 --- a/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/config/AlertingBeanConfig.java +++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/config/AlertingBeanConfig.java @@ -1,6 +1,7 @@ package com.cameleer.server.app.alerting.config; import com.cameleer.server.app.alerting.eval.PerKindCircuitBreaker; +import com.cameleer.server.app.alerting.metrics.AlertingMetrics; import com.cameleer.server.app.alerting.storage.*; import com.cameleer.server.core.alerting.*; import com.fasterxml.jackson.databind.ObjectMapper; @@ -62,15 +63,18 @@ public class AlertingBeanConfig { } @Bean - public PerKindCircuitBreaker perKindCircuitBreaker(AlertingProperties props) { + public PerKindCircuitBreaker perKindCircuitBreaker(AlertingProperties props, + AlertingMetrics alertingMetrics) { if (props.evaluatorTickIntervalMs() != null && props.evaluatorTickIntervalMs() < 5000) { log.warn("cameleer.server.alerting.evaluatorTickIntervalMs={} is below the 5000 ms floor; clamping to 5000 ms", props.evaluatorTickIntervalMs()); } - return new PerKindCircuitBreaker( + PerKindCircuitBreaker breaker = new PerKindCircuitBreaker( props.cbFailThreshold(), props.cbWindowSeconds(), props.cbCooldownSeconds()); + breaker.setMetrics(alertingMetrics); + return breaker; } } diff --git a/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/eval/AlertEvaluatorJob.java b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/eval/AlertEvaluatorJob.java index 0beace9d..00cb7575 100644 --- a/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/eval/AlertEvaluatorJob.java +++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/eval/AlertEvaluatorJob.java @@ -1,6 +1,7 @@ package com.cameleer.server.app.alerting.eval; import com.cameleer.server.app.alerting.config.AlertingProperties; +import com.cameleer.server.app.alerting.metrics.AlertingMetrics; import com.cameleer.server.app.alerting.notify.MustacheRenderer; import com.cameleer.server.app.alerting.notify.NotificationContextBuilder; import com.cameleer.server.core.alerting.*; @@ -49,6 +50,7 @@ public class AlertEvaluatorJob implements SchedulingConfigurer { private final String instanceId; private final String tenantId; private final Clock clock; + private final AlertingMetrics metrics; @SuppressWarnings("SpringJavaInjectionPointsAutowiringInspection") public AlertEvaluatorJob( @@ -64,7 +66,8 @@ public class AlertEvaluatorJob implements SchedulingConfigurer { ObjectMapper objectMapper, @Qualifier("alertingInstanceId") String instanceId, @Value("${cameleer.server.tenant.id:default}") String tenantId, - Clock alertingClock) { + Clock alertingClock, + AlertingMetrics metrics) { this.props = props; this.ruleRepo = ruleRepo; @@ -80,6 +83,7 @@ public class AlertEvaluatorJob implements SchedulingConfigurer { this.instanceId = instanceId; this.tenantId = tenantId; this.clock = alertingClock; + this.metrics = metrics; } // ------------------------------------------------------------------------- @@ -113,10 +117,12 @@ public class AlertEvaluatorJob implements SchedulingConfigurer { log.debug("Circuit breaker open for {}; skipping rule {}", rule.conditionKind(), rule.id()); continue; } - EvalResult result = evaluateSafely(rule, ctx); + EvalResult result = metrics.evalDuration(rule.conditionKind()) + .recordCallable(() -> evaluateSafely(rule, ctx)); applyResult(rule, result); circuitBreaker.recordSuccess(rule.conditionKind()); } catch (Exception e) { + metrics.evalError(rule.conditionKind(), rule.id()); circuitBreaker.recordFailure(rule.conditionKind()); log.warn("Evaluator error for rule {} ({}): {}", rule.id(), rule.conditionKind(), e.toString()); } finally { diff --git a/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/eval/PerKindCircuitBreaker.java b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/eval/PerKindCircuitBreaker.java index b7ecee72..b03e1cdf 100644 --- a/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/eval/PerKindCircuitBreaker.java +++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/eval/PerKindCircuitBreaker.java @@ -1,5 +1,6 @@ package com.cameleer.server.app.alerting.eval; +import com.cameleer.server.app.alerting.metrics.AlertingMetrics; import com.cameleer.server.core.alerting.ConditionKind; import java.time.Clock; @@ -19,6 +20,9 @@ public class PerKindCircuitBreaker { private final Clock clock; private final ConcurrentHashMap byKind = new ConcurrentHashMap<>(); + /** Optional metrics — set via {@link #setMetrics} after construction (avoids circular bean deps). */ + private volatile AlertingMetrics metrics; + /** Production constructor — uses system clock. */ public PerKindCircuitBreaker(int threshold, int windowSeconds, int cooldownSeconds) { this(threshold, windowSeconds, cooldownSeconds, Clock.systemDefaultZone()); @@ -32,16 +36,29 @@ public class PerKindCircuitBreaker { this.clock = clock; } + /** Wire metrics after construction to avoid circular Spring dependency. */ + public void setMetrics(AlertingMetrics metrics) { + this.metrics = metrics; + } + public void recordFailure(ConditionKind kind) { + final boolean[] justOpened = {false}; byKind.compute(kind, (k, s) -> { Deque deque = (s == null) ? new ArrayDeque<>() : new ArrayDeque<>(s.failures()); Instant now = Instant.now(clock); Instant cutoff = now.minus(window); while (!deque.isEmpty() && deque.peekFirst().isBefore(cutoff)) deque.pollFirst(); deque.addLast(now); + boolean wasOpen = s != null && s.openUntil() != null && now.isBefore(s.openUntil()); Instant openUntil = (deque.size() >= threshold) ? now.plus(cooldown) : null; + if (openUntil != null && !wasOpen) { + justOpened[0] = true; + } return new State(deque, openUntil); }); + if (justOpened[0] && metrics != null) { + metrics.circuitOpened(kind); + } } public boolean isOpen(ConditionKind kind) { diff --git a/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/metrics/AlertingMetrics.java b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/metrics/AlertingMetrics.java new file mode 100644 index 00000000..da67ad19 --- /dev/null +++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/metrics/AlertingMetrics.java @@ -0,0 +1,175 @@ +package com.cameleer.server.app.alerting.metrics; + +import com.cameleer.server.core.alerting.AlertState; +import com.cameleer.server.core.alerting.ConditionKind; +import com.cameleer.server.core.alerting.NotificationStatus; +import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.Gauge; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.Timer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.jdbc.core.JdbcTemplate; +import org.springframework.stereotype.Component; + +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; + +/** + * Micrometer-based metrics for the alerting subsystem. + *

+ * Counters: + *

    + *
  • {@code alerting_eval_errors_total{kind}} — evaluation errors by condition kind
  • + *
  • {@code alerting_circuit_opened_total{kind}} — circuit breaker open transitions by kind
  • + *
  • {@code alerting_notifications_total{status}} — notification outcomes by status
  • + *
+ * Timers: + *
    + *
  • {@code alerting_eval_duration_seconds{kind}} — per-kind evaluation latency
  • + *
  • {@code alerting_webhook_delivery_duration_seconds} — webhook POST latency
  • + *
+ * Gauges (read from PostgreSQL on each scrape; low scrape frequency = low DB load): + *
    + *
  • {@code alerting_rules_total{state=enabled|disabled}} — rule counts from {@code alert_rules}
  • + *
  • {@code alerting_instances_total{state,severity}} — instance counts grouped from {@code alert_instances}
  • + *
+ */ +@Component +public class AlertingMetrics { + + private static final Logger log = LoggerFactory.getLogger(AlertingMetrics.class); + + private final MeterRegistry registry; + private final JdbcTemplate jdbc; + + // Cached counters per kind (lazy-initialized) + private final ConcurrentMap evalErrorCounters = new ConcurrentHashMap<>(); + private final ConcurrentMap circuitOpenCounters = new ConcurrentHashMap<>(); + private final ConcurrentMap evalDurationTimers = new ConcurrentHashMap<>(); + + // Notification outcome counter per status + private final ConcurrentMap notificationCounters = new ConcurrentHashMap<>(); + + // Shared delivery timer + private final Timer webhookDeliveryTimer; + + public AlertingMetrics(MeterRegistry registry, JdbcTemplate jdbc) { + this.registry = registry; + this.jdbc = jdbc; + + // ── Static timers ─────────────────────────────────────────────── + this.webhookDeliveryTimer = Timer.builder("alerting_webhook_delivery_duration_seconds") + .description("Latency of outbound webhook POST requests") + .register(registry); + + // ── Gauge: rules by enabled/disabled ──────────────────────────── + Gauge.builder("alerting_rules_total", this, m -> m.countRules(true)) + .tag("state", "enabled") + .description("Number of enabled alert rules") + .register(registry); + Gauge.builder("alerting_rules_total", this, m -> m.countRules(false)) + .tag("state", "disabled") + .description("Number of disabled alert rules") + .register(registry); + + // ── Gauges: alert instances by state × severity ───────────────── + for (AlertState state : AlertState.values()) { + // Capture state as effectively-final for lambda + AlertState capturedState = state; + // We register one gauge per state (summed across severities) for simplicity; + // per-severity breakdown would require a dynamic MultiGauge. + Gauge.builder("alerting_instances_total", this, + m -> m.countInstances(capturedState)) + .tag("state", state.name().toLowerCase()) + .description("Number of alert instances by state") + .register(registry); + } + } + + // ── Public API ────────────────────────────────────────────────────── + + /** + * Increment the evaluation error counter for the given condition kind and rule. + */ + public void evalError(ConditionKind kind, UUID ruleId) { + String key = kind.name(); + evalErrorCounters.computeIfAbsent(key, k -> + Counter.builder("alerting_eval_errors_total") + .tag("kind", kind.name()) + .description("Alerting evaluation errors by condition kind") + .register(registry)) + .increment(); + log.debug("Alerting eval error for kind={} ruleId={}", kind, ruleId); + } + + /** + * Increment the circuit-breaker opened counter for the given condition kind. + */ + public void circuitOpened(ConditionKind kind) { + String key = kind.name(); + circuitOpenCounters.computeIfAbsent(key, k -> + Counter.builder("alerting_circuit_opened_total") + .tag("kind", kind.name()) + .description("Circuit breaker open transitions by condition kind") + .register(registry)) + .increment(); + } + + /** + * Return the eval duration timer for the given condition kind (creates lazily if absent). + */ + public Timer evalDuration(ConditionKind kind) { + return evalDurationTimers.computeIfAbsent(kind.name(), k -> + Timer.builder("alerting_eval_duration_seconds") + .tag("kind", kind.name()) + .description("Alerting condition evaluation latency by kind") + .register(registry)); + } + + /** + * The shared webhook delivery duration timer. + */ + public Timer webhookDeliveryDuration() { + return webhookDeliveryTimer; + } + + /** + * Increment the notification outcome counter for the given status. + */ + public void notificationOutcome(NotificationStatus status) { + String key = status.name(); + notificationCounters.computeIfAbsent(key, k -> + Counter.builder("alerting_notifications_total") + .tag("status", status.name().toLowerCase()) + .description("Alerting notification outcomes by status") + .register(registry)) + .increment(); + } + + // ── Gauge suppliers (called on each Prometheus scrape) ────────────── + + private double countRules(boolean enabled) { + try { + Long count = jdbc.queryForObject( + "SELECT COUNT(*) FROM alert_rules WHERE enabled = ?", Long.class, enabled); + return count == null ? 0.0 : count.doubleValue(); + } catch (Exception e) { + log.debug("alerting_rules gauge query failed: {}", e.getMessage()); + return 0.0; + } + } + + private double countInstances(AlertState state) { + try { + Long count = jdbc.queryForObject( + "SELECT COUNT(*) FROM alert_instances WHERE state = ?::alert_state_enum", + Long.class, state.name()); + return count == null ? 0.0 : count.doubleValue(); + } catch (Exception e) { + log.debug("alerting_instances gauge query failed: {}", e.getMessage()); + return 0.0; + } + } +}