perf(alerting): 30s TTL cache on AlertingMetrics gauge suppliers

Prometheus scrapes can fire every few seconds. The open-alerts / open-rules gauges query Postgres on each read — caching the values for 30s amortises that to one query per half-minute. Addresses final-review NIT from Plan 02. - Introduces a package-private TtlCache that wraps a Supplier<Long> and memoises the last read for a configurable Duration against a Supplier<Instant> clock. - Wraps each gauge supplier (alerting_rules_total{enabled|disabled}, alerting_instances_total{state}) in its own TtlCache. - Adds a test-friendly constructor (package-private) taking explicit Duration + Supplier<Instant> so AlertingMetricsCachingTest can advance a fake clock without waiting wall-clock time. - Adds AlertingMetricsCachingTest covering: * supplier invoked once per TTL across repeated scrapes * 29 s elapsed → still cached; 31 s elapsed → re-queried * gauge value reflects the cached result even after delegate mutates Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 14:22:54 +02:00
parent 5ebc729b82
commit 9f109b20fd
2 changed files with 241 additions and 28 deletions
--- a/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/metrics/AlertingMetrics.java
+++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/metrics/AlertingMetrics.java
@@ -12,9 +12,16 @@ import org.slf4j.LoggerFactory;
 import org.springframework.jdbc.core.JdbcTemplate;
 import org.springframework.stereotype.Component;
 import java.time.Duration;
 import java.time.Instant;
 import java.util.ArrayList;
 import java.util.EnumMap;
 import java.util.List;
 import java.util.Map;
 import java.util.UUID;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
 import java.util.function.Supplier;
 /**
 * Micrometer-based metrics for the alerting subsystem.
@@ -30,10 +37,11 @@ import java.util.concurrent.ConcurrentMap;
 *   <li>{@code alerting_eval_duration_seconds{kind}} — per-kind evaluation latency</li>
 *   <li>{@code alerting_webhook_delivery_duration_seconds} — webhook POST latency</li>
 * </ul>
- * Gauges (read from PostgreSQL on each scrape; low scrape frequency = low DB load):
+ * Gauges (read from PostgreSQL, cached for {@link #DEFAULT_GAUGE_TTL} to amortise
 * Prometheus scrapes that may fire every few seconds):
 * <ul>
 *   <li>{@code alerting_rules_total{state=enabled|disabled}} — rule counts from {@code alert_rules}</li>
- *   <li>{@code alerting_instances_total{state,severity}} — instance counts grouped from {@code alert_instances}</li>
+ *   <li>{@code alerting_instances_total{state}} — instance counts grouped from {@code alert_instances}</li>
 * </ul>
 */
@Component
@@ -41,8 +49,10 @@ public class AlertingMetrics {
    private static final Logger log = LoggerFactory.getLogger(AlertingMetrics.class);
    /** Default time-to-live for the gauge-supplier caches. */
    static final Duration DEFAULT_GAUGE_TTL = Duration.ofSeconds(30);
    private final MeterRegistry registry;
    private final JdbcTemplate jdbc;
    // Cached counters per kind (lazy-initialized)
    private final ConcurrentMap<String, Counter> evalErrorCounters   = new ConcurrentHashMap<>();
@@ -55,33 +65,80 @@ public class AlertingMetrics {
    // Shared delivery timer
    private final Timer webhookDeliveryTimer;
    // TTL-cached gauge suppliers registered so tests can force a read cycle.
    private final TtlCache enabledRulesCache;
    private final TtlCache disabledRulesCache;
    private final Map<AlertState, TtlCache> instancesByStateCaches;
    /**
     * Production constructor: wraps the Postgres-backed gauge suppliers in a
     * 30-second TTL cache so Prometheus scrapes don't cause per-scrape DB queries.
     */
    public AlertingMetrics(MeterRegistry registry, JdbcTemplate jdbc) {
        this(registry,
             () -> countRules(jdbc, true),
             () -> countRules(jdbc, false),
             state -> countInstances(jdbc, state),
             DEFAULT_GAUGE_TTL,
             Instant::now);
    }
    /**
     * Test-friendly constructor accepting the three gauge suppliers that are
     * exercised in the {@link AlertingMetricsCachingTest} plan sketch. The
     * {@code instancesSupplier} is used for every {@link AlertState}.
     */
    AlertingMetrics(MeterRegistry registry,
                    Supplier<Long> enabledRulesSupplier,
                    Supplier<Long> disabledRulesSupplier,
                    Supplier<Long> instancesSupplier,
                    Duration gaugeTtl,
                    Supplier<Instant> clock) {
        this(registry,
             enabledRulesSupplier,
             disabledRulesSupplier,
             state -> instancesSupplier.get(),
             gaugeTtl,
             clock);
    }
    /**
     * Core constructor: accepts per-state instance supplier so production can
     * query PostgreSQL with a different value per {@link AlertState}.
     */
    private AlertingMetrics(MeterRegistry registry,
                            Supplier<Long> enabledRulesSupplier,
                            Supplier<Long> disabledRulesSupplier,
                            java.util.function.Function<AlertState, Long> instancesSupplier,
                            Duration gaugeTtl,
                            Supplier<Instant> clock) {
        this.registry = registry;
        this.jdbc     = jdbc;
        // ── Static timers ───────────────────────────────────────────────
        this.webhookDeliveryTimer = Timer.builder("alerting_webhook_delivery_duration_seconds")
                .description("Latency of outbound webhook POST requests")
                .register(registry);
-        // ── Gauge: rules by enabled/disabled ────────────────────────────
+        // ── Gauge: rules by enabled/disabled (cached) ───────────────────
-        Gauge.builder("alerting_rules_total", this, m -> m.countRules(true))
+        this.enabledRulesCache  = new TtlCache(enabledRulesSupplier,  gaugeTtl, clock);
        this.disabledRulesCache = new TtlCache(disabledRulesSupplier, gaugeTtl, clock);
        Gauge.builder("alerting_rules_total", enabledRulesCache, TtlCache::getAsDouble)
                .tag("state", "enabled")
                .description("Number of enabled alert rules")
                .register(registry);
-        Gauge.builder("alerting_rules_total", this, m -> m.countRules(false))
+        Gauge.builder("alerting_rules_total", disabledRulesCache, TtlCache::getAsDouble)
                .tag("state", "disabled")
                .description("Number of disabled alert rules")
                .register(registry);
-        // ── Gauges: alert instances by state × severity ─────────────────
+        // ── Gauges: alert instances by state (cached) ───────────────────
        this.instancesByStateCaches = new EnumMap<>(AlertState.class);
        for (AlertState state : AlertState.values()) {
-            // Capture state as effectively-final for lambda
+            AlertState captured = state;
-            AlertState capturedState = state;
+            TtlCache cache = new TtlCache(() -> instancesSupplier.apply(captured), gaugeTtl, clock);
-            // We register one gauge per state (summed across severities) for simplicity;
+            this.instancesByStateCaches.put(state, cache);
-            // per-severity breakdown would require a dynamic MultiGauge.
+            Gauge.builder("alerting_instances_total", cache, TtlCache::getAsDouble)
            Gauge.builder("alerting_instances_total", this,
                          m -> m.countInstances(capturedState))
                    .tag("state", state.name().toLowerCase())
                    .description("Number of alert instances by state")
                    .register(registry);
@@ -148,28 +205,73 @@ public class AlertingMetrics {
            .increment();
    }
-    // ── Gauge suppliers (called on each Prometheus scrape) ──────────────
+    /**
     * Force a read of every TTL-cached gauge supplier. Used by tests to simulate
     * a Prometheus scrape without needing a real registry scrape pipeline.
     */
    void snapshotAllGauges() {
        List<TtlCache> all = new ArrayList<>();
        all.add(enabledRulesCache);
        all.add(disabledRulesCache);
        all.addAll(instancesByStateCaches.values());
        for (TtlCache c : all) {
            c.getAsDouble();
        }
    }
-    private double countRules(boolean enabled) {
+    // ── Gauge suppliers (queried at most once per TTL) ──────────────────
    private static long countRules(JdbcTemplate jdbc, boolean enabled) {
        try {
            Long count = jdbc.queryForObject(
                "SELECT COUNT(*) FROM alert_rules WHERE enabled = ?", Long.class, enabled);
-            return count == null ? 0.0 : count.doubleValue();
+            return count == null ? 0L : count;
        } catch (Exception e) {
            log.debug("alerting_rules gauge query failed: {}", e.getMessage());
-            return 0.0;
+            return 0L;
        }
    }
-    private double countInstances(AlertState state) {
+    private static long countInstances(JdbcTemplate jdbc, AlertState state) {
        try {
            Long count = jdbc.queryForObject(
                "SELECT COUNT(*) FROM alert_instances WHERE state = ?::alert_state_enum",
                Long.class, state.name());
-            return count == null ? 0.0 : count.doubleValue();
+            return count == null ? 0L : count;
        } catch (Exception e) {
            log.debug("alerting_instances gauge query failed: {}", e.getMessage());
-            return 0.0;
+            return 0L;
        }
    }
    /**
     * Lightweight TTL cache around a {@code Supplier<Long>}. Every call to
     * {@link #getAsDouble()} either returns the cached value (if {@code clock.get()
     * - lastRead < ttl}) or invokes the delegate and refreshes the cache.
     *
     * <p>Used to amortise Postgres queries behind Prometheus gauges over a
     * 30-second TTL (see {@link AlertingMetrics#DEFAULT_GAUGE_TTL}).
     */
    static final class TtlCache {
        private final Supplier<Long> delegate;
        private final Duration ttl;
        private final Supplier<Instant> clock;
        private volatile Instant lastRead = Instant.MIN;
        private volatile long cached = 0L;
        TtlCache(Supplier<Long> delegate, Duration ttl, Supplier<Instant> clock) {
            this.delegate = delegate;
            this.ttl = ttl;
            this.clock = clock;
        }
        synchronized double getAsDouble() {
            Instant now = clock.get();
            if (lastRead == Instant.MIN || Duration.between(lastRead, now).compareTo(ttl) >= 0) {
                cached = delegate.get();
                lastRead = now;
            }
            return cached;
        }
    }
 }
--- a/cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/metrics/AlertingMetricsCachingTest.java
+++ b/cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/metrics/AlertingMetricsCachingTest.java
@@ -0,0 +1,111 @@
 package com.cameleer.server.app.alerting.metrics;
 import com.cameleer.server.core.alerting.AlertState;
 import io.micrometer.core.instrument.MeterRegistry;
 import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
 import org.junit.jupiter.api.Test;
 import java.time.Duration;
 import java.time.Instant;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.Supplier;
 import static org.assertj.core.api.Assertions.assertThat;
 /**
 * Verifies that {@link AlertingMetrics} caches gauge values for a configurable TTL,
 * so that Prometheus scrapes do not cause one Postgres query per scrape.
 */
 class AlertingMetricsCachingTest {
    @Test
    void gaugeSupplierIsCalledAtMostOncePerTtl() {
        // The instances supplier is shared across every AlertState gauge, so each
        // full gauge snapshot invokes it once per AlertState (one cache per state).
        final int stateCount = AlertState.values().length;
        AtomicInteger enabledRulesCalls  = new AtomicInteger();
        AtomicInteger disabledRulesCalls = new AtomicInteger();
        AtomicInteger instancesCalls     = new AtomicInteger();
        AtomicReference<Instant> now = new AtomicReference<>(Instant.parse("2026-04-20T00:00:00Z"));
        Supplier<Instant> clock = now::get;
        MeterRegistry registry = new SimpleMeterRegistry();
        Supplier<Long> enabledRulesSupplier  = () -> { enabledRulesCalls.incrementAndGet();  return 7L; };
        Supplier<Long> disabledRulesSupplier = () -> { disabledRulesCalls.incrementAndGet(); return 3L; };
        Supplier<Long> instancesSupplier     = () -> { instancesCalls.incrementAndGet();     return 5L; };
        AlertingMetrics metrics = new AlertingMetrics(
                registry,
                enabledRulesSupplier,
                disabledRulesSupplier,
                instancesSupplier,
                Duration.ofSeconds(30),
                clock
        );
        // First scrape — each supplier invoked exactly once per gauge.
        metrics.snapshotAllGauges();
        assertThat(enabledRulesCalls.get()).isEqualTo(1);
        assertThat(disabledRulesCalls.get()).isEqualTo(1);
        assertThat(instancesCalls.get()).isEqualTo(stateCount);
        // Second scrape within TTL — served from cache.
        metrics.snapshotAllGauges();
        assertThat(enabledRulesCalls.get()).isEqualTo(1);
        assertThat(disabledRulesCalls.get()).isEqualTo(1);
        assertThat(instancesCalls.get()).isEqualTo(stateCount);
        // Third scrape still within TTL (29 s later) — still cached.
        now.set(now.get().plusSeconds(29));
        metrics.snapshotAllGauges();
        assertThat(enabledRulesCalls.get()).isEqualTo(1);
        assertThat(disabledRulesCalls.get()).isEqualTo(1);
        assertThat(instancesCalls.get()).isEqualTo(stateCount);
        // Advance past TTL — next scrape re-queries the delegate.
        now.set(Instant.parse("2026-04-20T00:00:31Z"));
        metrics.snapshotAllGauges();
        assertThat(enabledRulesCalls.get()).isEqualTo(2);
        assertThat(disabledRulesCalls.get()).isEqualTo(2);
        assertThat(instancesCalls.get()).isEqualTo(stateCount * 2);
        // Immediate follow-up — back in cache.
        metrics.snapshotAllGauges();
        assertThat(enabledRulesCalls.get()).isEqualTo(2);
        assertThat(disabledRulesCalls.get()).isEqualTo(2);
        assertThat(instancesCalls.get()).isEqualTo(stateCount * 2);
    }
    @Test
    void gaugeValueReflectsCachedResult() {
        AtomicReference<Long> enabledValue = new AtomicReference<>(10L);
        AtomicReference<Instant> now = new AtomicReference<>(Instant.parse("2026-04-20T00:00:00Z"));
        MeterRegistry registry = new SimpleMeterRegistry();
        AlertingMetrics metrics = new AlertingMetrics(
                registry,
                enabledValue::get,
                () -> 0L,
                () -> 0L,
                Duration.ofSeconds(30),
                now::get
        );
        // Read once — value cached at 10.
        metrics.snapshotAllGauges();
        // Mutate the underlying supplier output; cache should shield it.
        enabledValue.set(99L);
        double cached = registry.find("alerting_rules_total").tag("state", "enabled").gauge().value();
        assertThat(cached).isEqualTo(10.0);
        // After TTL, new value surfaces.
        now.set(now.get().plusSeconds(31));
        metrics.snapshotAllGauges();
        double refreshed = registry.find("alerting_rules_total").tag("state", "enabled").gauge().value();
        assertThat(refreshed).isEqualTo(99.0);
    }
 }