feat(alerting): observability metrics via micrometer
AlertingMetrics @Component wraps MeterRegistry:
- Counters: alerting_eval_errors_total{kind}, alerting_circuit_opened_total{kind},
alerting_notifications_total{status}
- Timers: alerting_eval_duration_seconds{kind}, alerting_webhook_delivery_duration_seconds
- Gauges (DB-backed): alerting_rules_total{state}, alerting_instances_total{state}
AlertEvaluatorJob records evalError + evalDuration around each evaluator call.
PerKindCircuitBreaker detects open transitions and fires metrics.circuitOpened(kind).
AlertingBeanConfig wires AlertingMetrics into the circuit breaker post-construction.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package com.cameleer.server.app.alerting.config;
|
||||
|
||||
import com.cameleer.server.app.alerting.eval.PerKindCircuitBreaker;
|
||||
import com.cameleer.server.app.alerting.metrics.AlertingMetrics;
|
||||
import com.cameleer.server.app.alerting.storage.*;
|
||||
import com.cameleer.server.core.alerting.*;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
@@ -62,15 +63,18 @@ public class AlertingBeanConfig {
|
||||
}
|
||||
|
||||
@Bean
|
||||
public PerKindCircuitBreaker perKindCircuitBreaker(AlertingProperties props) {
|
||||
public PerKindCircuitBreaker perKindCircuitBreaker(AlertingProperties props,
|
||||
AlertingMetrics alertingMetrics) {
|
||||
if (props.evaluatorTickIntervalMs() != null
|
||||
&& props.evaluatorTickIntervalMs() < 5000) {
|
||||
log.warn("cameleer.server.alerting.evaluatorTickIntervalMs={} is below the 5000 ms floor; clamping to 5000 ms",
|
||||
props.evaluatorTickIntervalMs());
|
||||
}
|
||||
return new PerKindCircuitBreaker(
|
||||
PerKindCircuitBreaker breaker = new PerKindCircuitBreaker(
|
||||
props.cbFailThreshold(),
|
||||
props.cbWindowSeconds(),
|
||||
props.cbCooldownSeconds());
|
||||
breaker.setMetrics(alertingMetrics);
|
||||
return breaker;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package com.cameleer.server.app.alerting.eval;
|
||||
|
||||
import com.cameleer.server.app.alerting.config.AlertingProperties;
|
||||
import com.cameleer.server.app.alerting.metrics.AlertingMetrics;
|
||||
import com.cameleer.server.app.alerting.notify.MustacheRenderer;
|
||||
import com.cameleer.server.app.alerting.notify.NotificationContextBuilder;
|
||||
import com.cameleer.server.core.alerting.*;
|
||||
@@ -49,6 +50,7 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
|
||||
private final String instanceId;
|
||||
private final String tenantId;
|
||||
private final Clock clock;
|
||||
private final AlertingMetrics metrics;
|
||||
|
||||
@SuppressWarnings("SpringJavaInjectionPointsAutowiringInspection")
|
||||
public AlertEvaluatorJob(
|
||||
@@ -64,7 +66,8 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
|
||||
ObjectMapper objectMapper,
|
||||
@Qualifier("alertingInstanceId") String instanceId,
|
||||
@Value("${cameleer.server.tenant.id:default}") String tenantId,
|
||||
Clock alertingClock) {
|
||||
Clock alertingClock,
|
||||
AlertingMetrics metrics) {
|
||||
|
||||
this.props = props;
|
||||
this.ruleRepo = ruleRepo;
|
||||
@@ -80,6 +83,7 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
|
||||
this.instanceId = instanceId;
|
||||
this.tenantId = tenantId;
|
||||
this.clock = alertingClock;
|
||||
this.metrics = metrics;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
@@ -113,10 +117,12 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
|
||||
log.debug("Circuit breaker open for {}; skipping rule {}", rule.conditionKind(), rule.id());
|
||||
continue;
|
||||
}
|
||||
EvalResult result = evaluateSafely(rule, ctx);
|
||||
EvalResult result = metrics.evalDuration(rule.conditionKind())
|
||||
.recordCallable(() -> evaluateSafely(rule, ctx));
|
||||
applyResult(rule, result);
|
||||
circuitBreaker.recordSuccess(rule.conditionKind());
|
||||
} catch (Exception e) {
|
||||
metrics.evalError(rule.conditionKind(), rule.id());
|
||||
circuitBreaker.recordFailure(rule.conditionKind());
|
||||
log.warn("Evaluator error for rule {} ({}): {}", rule.id(), rule.conditionKind(), e.toString());
|
||||
} finally {
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
package com.cameleer.server.app.alerting.eval;
|
||||
|
||||
import com.cameleer.server.app.alerting.metrics.AlertingMetrics;
|
||||
import com.cameleer.server.core.alerting.ConditionKind;
|
||||
|
||||
import java.time.Clock;
|
||||
@@ -19,6 +20,9 @@ public class PerKindCircuitBreaker {
|
||||
private final Clock clock;
|
||||
private final ConcurrentHashMap<ConditionKind, State> byKind = new ConcurrentHashMap<>();
|
||||
|
||||
/** Optional metrics — set via {@link #setMetrics} after construction (avoids circular bean deps). */
|
||||
private volatile AlertingMetrics metrics;
|
||||
|
||||
/** Production constructor — uses system clock. */
|
||||
public PerKindCircuitBreaker(int threshold, int windowSeconds, int cooldownSeconds) {
|
||||
this(threshold, windowSeconds, cooldownSeconds, Clock.systemDefaultZone());
|
||||
@@ -32,16 +36,29 @@ public class PerKindCircuitBreaker {
|
||||
this.clock = clock;
|
||||
}
|
||||
|
||||
/** Wire metrics after construction to avoid circular Spring dependency. */
|
||||
public void setMetrics(AlertingMetrics metrics) {
|
||||
this.metrics = metrics;
|
||||
}
|
||||
|
||||
public void recordFailure(ConditionKind kind) {
|
||||
final boolean[] justOpened = {false};
|
||||
byKind.compute(kind, (k, s) -> {
|
||||
Deque<Instant> deque = (s == null) ? new ArrayDeque<>() : new ArrayDeque<>(s.failures());
|
||||
Instant now = Instant.now(clock);
|
||||
Instant cutoff = now.minus(window);
|
||||
while (!deque.isEmpty() && deque.peekFirst().isBefore(cutoff)) deque.pollFirst();
|
||||
deque.addLast(now);
|
||||
boolean wasOpen = s != null && s.openUntil() != null && now.isBefore(s.openUntil());
|
||||
Instant openUntil = (deque.size() >= threshold) ? now.plus(cooldown) : null;
|
||||
if (openUntil != null && !wasOpen) {
|
||||
justOpened[0] = true;
|
||||
}
|
||||
return new State(deque, openUntil);
|
||||
});
|
||||
if (justOpened[0] && metrics != null) {
|
||||
metrics.circuitOpened(kind);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isOpen(ConditionKind kind) {
|
||||
|
||||
@@ -0,0 +1,175 @@
|
||||
package com.cameleer.server.app.alerting.metrics;
|
||||
|
||||
import com.cameleer.server.core.alerting.AlertState;
|
||||
import com.cameleer.server.core.alerting.ConditionKind;
|
||||
import com.cameleer.server.core.alerting.NotificationStatus;
|
||||
import io.micrometer.core.instrument.Counter;
|
||||
import io.micrometer.core.instrument.Gauge;
|
||||
import io.micrometer.core.instrument.MeterRegistry;
|
||||
import io.micrometer.core.instrument.Timer;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.jdbc.core.JdbcTemplate;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
|
||||
/**
|
||||
* Micrometer-based metrics for the alerting subsystem.
|
||||
* <p>
|
||||
* Counters:
|
||||
* <ul>
|
||||
* <li>{@code alerting_eval_errors_total{kind}} — evaluation errors by condition kind</li>
|
||||
* <li>{@code alerting_circuit_opened_total{kind}} — circuit breaker open transitions by kind</li>
|
||||
* <li>{@code alerting_notifications_total{status}} — notification outcomes by status</li>
|
||||
* </ul>
|
||||
* Timers:
|
||||
* <ul>
|
||||
* <li>{@code alerting_eval_duration_seconds{kind}} — per-kind evaluation latency</li>
|
||||
* <li>{@code alerting_webhook_delivery_duration_seconds} — webhook POST latency</li>
|
||||
* </ul>
|
||||
* Gauges (read from PostgreSQL on each scrape; low scrape frequency = low DB load):
|
||||
* <ul>
|
||||
* <li>{@code alerting_rules_total{state=enabled|disabled}} — rule counts from {@code alert_rules}</li>
|
||||
* <li>{@code alerting_instances_total{state,severity}} — instance counts grouped from {@code alert_instances}</li>
|
||||
* </ul>
|
||||
*/
|
||||
@Component
|
||||
public class AlertingMetrics {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(AlertingMetrics.class);
|
||||
|
||||
private final MeterRegistry registry;
|
||||
private final JdbcTemplate jdbc;
|
||||
|
||||
// Cached counters per kind (lazy-initialized)
|
||||
private final ConcurrentMap<String, Counter> evalErrorCounters = new ConcurrentHashMap<>();
|
||||
private final ConcurrentMap<String, Counter> circuitOpenCounters = new ConcurrentHashMap<>();
|
||||
private final ConcurrentMap<String, Timer> evalDurationTimers = new ConcurrentHashMap<>();
|
||||
|
||||
// Notification outcome counter per status
|
||||
private final ConcurrentMap<String, Counter> notificationCounters = new ConcurrentHashMap<>();
|
||||
|
||||
// Shared delivery timer
|
||||
private final Timer webhookDeliveryTimer;
|
||||
|
||||
public AlertingMetrics(MeterRegistry registry, JdbcTemplate jdbc) {
|
||||
this.registry = registry;
|
||||
this.jdbc = jdbc;
|
||||
|
||||
// ── Static timers ───────────────────────────────────────────────
|
||||
this.webhookDeliveryTimer = Timer.builder("alerting_webhook_delivery_duration_seconds")
|
||||
.description("Latency of outbound webhook POST requests")
|
||||
.register(registry);
|
||||
|
||||
// ── Gauge: rules by enabled/disabled ────────────────────────────
|
||||
Gauge.builder("alerting_rules_total", this, m -> m.countRules(true))
|
||||
.tag("state", "enabled")
|
||||
.description("Number of enabled alert rules")
|
||||
.register(registry);
|
||||
Gauge.builder("alerting_rules_total", this, m -> m.countRules(false))
|
||||
.tag("state", "disabled")
|
||||
.description("Number of disabled alert rules")
|
||||
.register(registry);
|
||||
|
||||
// ── Gauges: alert instances by state × severity ─────────────────
|
||||
for (AlertState state : AlertState.values()) {
|
||||
// Capture state as effectively-final for lambda
|
||||
AlertState capturedState = state;
|
||||
// We register one gauge per state (summed across severities) for simplicity;
|
||||
// per-severity breakdown would require a dynamic MultiGauge.
|
||||
Gauge.builder("alerting_instances_total", this,
|
||||
m -> m.countInstances(capturedState))
|
||||
.tag("state", state.name().toLowerCase())
|
||||
.description("Number of alert instances by state")
|
||||
.register(registry);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Public API ──────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Increment the evaluation error counter for the given condition kind and rule.
|
||||
*/
|
||||
public void evalError(ConditionKind kind, UUID ruleId) {
|
||||
String key = kind.name();
|
||||
evalErrorCounters.computeIfAbsent(key, k ->
|
||||
Counter.builder("alerting_eval_errors_total")
|
||||
.tag("kind", kind.name())
|
||||
.description("Alerting evaluation errors by condition kind")
|
||||
.register(registry))
|
||||
.increment();
|
||||
log.debug("Alerting eval error for kind={} ruleId={}", kind, ruleId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Increment the circuit-breaker opened counter for the given condition kind.
|
||||
*/
|
||||
public void circuitOpened(ConditionKind kind) {
|
||||
String key = kind.name();
|
||||
circuitOpenCounters.computeIfAbsent(key, k ->
|
||||
Counter.builder("alerting_circuit_opened_total")
|
||||
.tag("kind", kind.name())
|
||||
.description("Circuit breaker open transitions by condition kind")
|
||||
.register(registry))
|
||||
.increment();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the eval duration timer for the given condition kind (creates lazily if absent).
|
||||
*/
|
||||
public Timer evalDuration(ConditionKind kind) {
|
||||
return evalDurationTimers.computeIfAbsent(kind.name(), k ->
|
||||
Timer.builder("alerting_eval_duration_seconds")
|
||||
.tag("kind", kind.name())
|
||||
.description("Alerting condition evaluation latency by kind")
|
||||
.register(registry));
|
||||
}
|
||||
|
||||
/**
|
||||
* The shared webhook delivery duration timer.
|
||||
*/
|
||||
public Timer webhookDeliveryDuration() {
|
||||
return webhookDeliveryTimer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Increment the notification outcome counter for the given status.
|
||||
*/
|
||||
public void notificationOutcome(NotificationStatus status) {
|
||||
String key = status.name();
|
||||
notificationCounters.computeIfAbsent(key, k ->
|
||||
Counter.builder("alerting_notifications_total")
|
||||
.tag("status", status.name().toLowerCase())
|
||||
.description("Alerting notification outcomes by status")
|
||||
.register(registry))
|
||||
.increment();
|
||||
}
|
||||
|
||||
// ── Gauge suppliers (called on each Prometheus scrape) ──────────────
|
||||
|
||||
private double countRules(boolean enabled) {
|
||||
try {
|
||||
Long count = jdbc.queryForObject(
|
||||
"SELECT COUNT(*) FROM alert_rules WHERE enabled = ?", Long.class, enabled);
|
||||
return count == null ? 0.0 : count.doubleValue();
|
||||
} catch (Exception e) {
|
||||
log.debug("alerting_rules gauge query failed: {}", e.getMessage());
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
private double countInstances(AlertState state) {
|
||||
try {
|
||||
Long count = jdbc.queryForObject(
|
||||
"SELECT COUNT(*) FROM alert_instances WHERE state = ?::alert_state_enum",
|
||||
Long.class, state.name());
|
||||
return count == null ? 0.0 : count.doubleValue();
|
||||
} catch (Exception e) {
|
||||
log.debug("alerting_instances gauge query failed: {}", e.getMessage());
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user