perf(alerting): 30s TTL cache on AlertingMetrics gauge suppliers

Prometheus scrapes can fire every few seconds. The open-alerts / open-rules
gauges query Postgres on each read — caching the values for 30s amortises
that to one query per half-minute. Addresses final-review NIT from Plan 02.

- Introduces a package-private TtlCache that wraps a Supplier<Long> and
  memoises its most recent result for a configurable Duration, consulting an
  injectable Supplier<Instant> clock to decide when the cached entry expires.
- Wraps each gauge supplier (alerting_rules_total{enabled|disabled},
  alerting_instances_total{state}) in its own TtlCache.
- Adds a test-friendly constructor (package-private) taking explicit
  Duration + Supplier<Instant> so AlertingMetricsCachingTest can advance
  a fake clock without waiting wall-clock time.
- Adds AlertingMetricsCachingTest covering:
  * supplier invoked once per TTL across repeated scrapes
  * 29 s elapsed → still cached; 31 s elapsed → re-queried
  * gauge value reflects the cached result even after delegate mutates

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-20 14:22:54 +02:00
parent 5ebc729b82
commit 9f109b20fd
2 changed files with 241 additions and 28 deletions

View File

@@ -12,9 +12,16 @@ import org.slf4j.LoggerFactory;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.stereotype.Component;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.function.Supplier;
/**
* Micrometer-based metrics for the alerting subsystem.
@@ -30,10 +37,11 @@ import java.util.concurrent.ConcurrentMap;
* <li>{@code alerting_eval_duration_seconds{kind}} — per-kind evaluation latency</li>
* <li>{@code alerting_webhook_delivery_duration_seconds} — webhook POST latency</li>
* </ul>
* Gauges (read from PostgreSQL on each scrape; low scrape frequency = low DB load):
* Gauges (read from PostgreSQL, cached for {@link #DEFAULT_GAUGE_TTL} to amortise
* Prometheus scrapes that may fire every few seconds):
* <ul>
* <li>{@code alerting_rules_total{state=enabled|disabled}} — rule counts from {@code alert_rules}</li>
* <li>{@code alerting_instances_total{state,severity}} — instance counts grouped from {@code alert_instances}</li>
* <li>{@code alerting_instances_total{state}} — instance counts grouped from {@code alert_instances}</li>
* </ul>
*/
@Component
@@ -41,8 +49,10 @@ public class AlertingMetrics {
private static final Logger log = LoggerFactory.getLogger(AlertingMetrics.class);
/** Default time-to-live for the gauge-supplier caches. */
static final Duration DEFAULT_GAUGE_TTL = Duration.ofSeconds(30);
private final MeterRegistry registry;
private final JdbcTemplate jdbc;
// Cached counters per kind (lazy-initialized)
private final ConcurrentMap<String, Counter> evalErrorCounters = new ConcurrentHashMap<>();
@@ -55,33 +65,80 @@ public class AlertingMetrics {
// Shared delivery timer
private final Timer webhookDeliveryTimer;
// TTL-cached gauge suppliers registered so tests can force a read cycle.
private final TtlCache enabledRulesCache;
private final TtlCache disabledRulesCache;
private final Map<AlertState, TtlCache> instancesByStateCaches;
/**
 * Production constructor: wraps the Postgres-backed gauge suppliers in a
 * 30-second TTL cache so Prometheus scrapes don't cause per-scrape DB queries.
 *
 * @param registry Micrometer registry all timers/counters/gauges register with
 * @param jdbc     template used by the static count queries behind each gauge
 */
public AlertingMetrics(MeterRegistry registry, JdbcTemplate jdbc) {
// Delegate to the core constructor with real Postgres-backed suppliers,
// the production TTL and the system clock.
this(registry,
() -> countRules(jdbc, true),
() -> countRules(jdbc, false),
state -> countInstances(jdbc, state),
DEFAULT_GAUGE_TTL,
Instant::now);
}
/**
 * Test-friendly constructor accepting the three gauge suppliers that are
 * exercised in the {@link AlertingMetricsCachingTest} plan sketch. The
 * {@code instancesSupplier} is used for every {@link AlertState}.
 *
 * @param registry              registry the gauges are registered with
 * @param enabledRulesSupplier  delegate behind alerting_rules_total{state="enabled"}
 * @param disabledRulesSupplier delegate behind alerting_rules_total{state="disabled"}
 * @param instancesSupplier     shared delegate behind every per-state instances gauge
 * @param gaugeTtl              cache TTL applied to every gauge supplier
 * @param clock                 time source used for TTL expiry (fake clock in tests)
 */
AlertingMetrics(MeterRegistry registry,
Supplier<Long> enabledRulesSupplier,
Supplier<Long> disabledRulesSupplier,
Supplier<Long> instancesSupplier,
Duration gaugeTtl,
Supplier<Instant> clock) {
// Adapt the single instances supplier to the per-state Function the core
// constructor expects: every state reads the same delegate.
this(registry,
enabledRulesSupplier,
disabledRulesSupplier,
state -> instancesSupplier.get(),
gaugeTtl,
clock);
}
/**
 * Core constructor: accepts per-state instance supplier so production can
 * query PostgreSQL with a different value per {@link AlertState}.
 *
 * @param registry              registry the timer and gauges are registered with
 * @param enabledRulesSupplier  delegate for the enabled-rules gauge
 * @param disabledRulesSupplier delegate for the disabled-rules gauge
 * @param instancesSupplier     per-state delegate for the instances gauges
 * @param gaugeTtl              TTL used by every {@link TtlCache} built here
 * @param clock                 time source the caches consult for expiry
 */
private AlertingMetrics(MeterRegistry registry,
Supplier<Long> enabledRulesSupplier,
Supplier<Long> disabledRulesSupplier,
java.util.function.Function<AlertState, Long> instancesSupplier,
Duration gaugeTtl,
Supplier<Instant> clock) {
this.registry = registry;
// NOTE(review): this assignment appears to be the pre-change (removed-side) line of the
// diff render — the core constructor has no `jdbc` parameter, so it cannot compile here.
// Confirm it was deleted in the actual commit.
this.jdbc = jdbc;
// ── Static timers ───────────────────────────────────────────────
this.webhookDeliveryTimer = Timer.builder("alerting_webhook_delivery_duration_seconds")
.description("Latency of outbound webhook POST requests")
.register(registry);
// ── Gauge: rules by enabled/disabled ────────────────────────────
// NOTE(review): the next Gauge.builder line (binding `this` / countRules(true)) looks
// like the removed-side version superseded by the cached registration just below.
Gauge.builder("alerting_rules_total", this, m -> m.countRules(true))
// ── Gauge: rules by enabled/disabled (cached) ───────────────────
this.enabledRulesCache = new TtlCache(enabledRulesSupplier, gaugeTtl, clock);
this.disabledRulesCache = new TtlCache(disabledRulesSupplier, gaugeTtl, clock);
Gauge.builder("alerting_rules_total", enabledRulesCache, TtlCache::getAsDouble)
.tag("state", "enabled")
.description("Number of enabled alert rules")
.register(registry);
// NOTE(review): removed-side line — replaced by the cached registration below it.
Gauge.builder("alerting_rules_total", this, m -> m.countRules(false))
Gauge.builder("alerting_rules_total", disabledRulesCache, TtlCache::getAsDouble)
.tag("state", "disabled")
.description("Number of disabled alert rules")
.register(registry);
// ── Gauges: alert instances by state × severity ─────────────────
// ── Gauges: alert instances by state (cached) ───────────────────
this.instancesByStateCaches = new EnumMap<>(AlertState.class);
for (AlertState state : AlertState.values()) {
// NOTE(review): the following capturedState block (through the two-line Gauge.builder
// call binding `this`) appears to be removed-side residue; the cached version follows.
// Capture state as effectively-final for lambda
AlertState capturedState = state;
// We register one gauge per state (summed across severities) for simplicity;
// per-severity breakdown would require a dynamic MultiGauge.
Gauge.builder("alerting_instances_total", this,
m -> m.countInstances(capturedState))
AlertState captured = state;
TtlCache cache = new TtlCache(() -> instancesSupplier.apply(captured), gaugeTtl, clock);
this.instancesByStateCaches.put(state, cache);
Gauge.builder("alerting_instances_total", cache, TtlCache::getAsDouble)
.tag("state", state.name().toLowerCase())
.description("Number of alert instances by state")
.register(registry);
@@ -148,28 +205,73 @@ public class AlertingMetrics {
.increment();
}
// ── Gauge suppliers (called on each Prometheus scrape) ──────────────
/**
 * Forces a read of every TTL-cached gauge supplier. Tests call this to mimic
 * a Prometheus scrape without wiring up a real registry scrape pipeline.
 */
void snapshotAllGauges() {
// Touch each cache directly — order matches registration order: rules first,
// then the per-state instance caches in enum order (EnumMap iteration).
enabledRulesCache.getAsDouble();
disabledRulesCache.getAsDouble();
instancesByStateCaches.values().forEach(TtlCache::getAsDouble);
}
private double countRules(boolean enabled) {
// ── Gauge suppliers (queried at most once per TTL) ──────────────────
private static long countRules(JdbcTemplate jdbc, boolean enabled) {
try {
Long count = jdbc.queryForObject(
"SELECT COUNT(*) FROM alert_rules WHERE enabled = ?", Long.class, enabled);
return count == null ? 0.0 : count.doubleValue();
return count == null ? 0L : count;
} catch (Exception e) {
log.debug("alerting_rules gauge query failed: {}", e.getMessage());
return 0.0;
return 0L;
}
}
private double countInstances(AlertState state) {
private static long countInstances(JdbcTemplate jdbc, AlertState state) {
try {
Long count = jdbc.queryForObject(
"SELECT COUNT(*) FROM alert_instances WHERE state = ?::alert_state_enum",
Long.class, state.name());
return count == null ? 0.0 : count.doubleValue();
return count == null ? 0L : count;
} catch (Exception e) {
log.debug("alerting_instances gauge query failed: {}", e.getMessage());
return 0.0;
return 0L;
}
}
/**
 * Lightweight TTL cache around a {@code Supplier<Long>}. Every call to
 * {@link #getAsDouble()} either returns the memoised value (while the time
 * elapsed since the last delegate read is still below the TTL) or re-invokes
 * the delegate and stamps the cache with the current clock reading.
 *
 * <p>Used to amortise Postgres queries behind Prometheus gauges over a
 * 30-second TTL (see {@link AlertingMetrics#DEFAULT_GAUGE_TTL}).
 */
static final class TtlCache {
private final Supplier<Long> source;
private final Duration maxAge;
private final Supplier<Instant> timeSource;
// Instant.MIN doubles as the "never read" sentinel (identity-compared below).
private volatile Instant stampedAt = Instant.MIN;
private volatile long value = 0L;

TtlCache(Supplier<Long> delegate, Duration ttl, Supplier<Instant> clock) {
this.source = delegate;
this.maxAge = ttl;
this.timeSource = clock;
}

/** Returns the cached value, refreshing from the delegate once the TTL has elapsed. */
synchronized double getAsDouble() {
Instant now = timeSource.get();
// Serve from cache only when we have read at least once and the entry is fresh.
if (stampedAt != Instant.MIN && Duration.between(stampedAt, now).compareTo(maxAge) < 0) {
return value;
}
value = source.get();
stampedAt = now;
return value;
}
}
}

View File

@@ -0,0 +1,111 @@
package com.cameleer.server.app.alerting.metrics;
import com.cameleer.server.core.alerting.AlertState;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
import org.junit.jupiter.api.Test;
import java.time.Duration;
import java.time.Instant;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Supplier;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Verifies that {@link AlertingMetrics} caches gauge values for a configurable TTL,
 * so that Prometheus scrapes do not cause one Postgres query per scrape.
 */
class AlertingMetricsCachingTest {

// Fixed origin for the fake clock used by both tests.
private static final Instant T0 = Instant.parse("2026-04-20T00:00:00Z");

@Test
void gaugeSupplierIsCalledAtMostOncePerTtl() {
// One TtlCache exists per AlertState, so a full snapshot hits the shared
// instances supplier once for every state.
int stateCount = AlertState.values().length;
AtomicInteger enabledHits = new AtomicInteger();
AtomicInteger disabledHits = new AtomicInteger();
AtomicInteger instanceHits = new AtomicInteger();
AtomicReference<Instant> clock = new AtomicReference<>(T0);

AlertingMetrics metrics = new AlertingMetrics(
new SimpleMeterRegistry(),
() -> { enabledHits.incrementAndGet(); return 7L; },
() -> { disabledHits.incrementAndGet(); return 3L; },
() -> { instanceHits.incrementAndGet(); return 5L; },
Duration.ofSeconds(30),
clock::get);

// Scrape #1: every delegate is consulted exactly once.
metrics.snapshotAllGauges();
expectHits(enabledHits, disabledHits, instanceHits, 1, stateCount);

// Scrape #2 at the same instant: served entirely from cache.
metrics.snapshotAllGauges();
expectHits(enabledHits, disabledHits, instanceHits, 1, stateCount);

// Scrape #3 at +29 s: still inside the 30 s TTL.
clock.set(clock.get().plusSeconds(29));
metrics.snapshotAllGauges();
expectHits(enabledHits, disabledHits, instanceHits, 1, stateCount);

// +31 s from the first read: every cache has expired and re-queries.
clock.set(T0.plusSeconds(31));
metrics.snapshotAllGauges();
expectHits(enabledHits, disabledHits, instanceHits, 2, stateCount * 2);

// Immediate follow-up scrape: the refreshed values are cached again.
metrics.snapshotAllGauges();
expectHits(enabledHits, disabledHits, instanceHits, 2, stateCount * 2);
}

@Test
void gaugeValueReflectsCachedResult() {
AtomicReference<Long> enabledCount = new AtomicReference<>(10L);
AtomicReference<Instant> clock = new AtomicReference<>(T0);
MeterRegistry registry = new SimpleMeterRegistry();

AlertingMetrics metrics = new AlertingMetrics(
registry,
enabledCount::get,
() -> 0L,
() -> 0L,
Duration.ofSeconds(30),
clock::get);

// Prime the cache at 10, then change what the delegate would return.
metrics.snapshotAllGauges();
enabledCount.set(99L);

// Within the TTL the gauge still reports the value captured at priming time.
assertThat(enabledRulesGauge(registry)).isEqualTo(10.0);

// Once the TTL has elapsed, the next snapshot surfaces the new value.
clock.set(clock.get().plusSeconds(31));
metrics.snapshotAllGauges();
assertThat(enabledRulesGauge(registry)).isEqualTo(99.0);
}

/** Reads the current value of {@code alerting_rules_total{state="enabled"}}. */
private static double enabledRulesGauge(MeterRegistry registry) {
return registry.find("alerting_rules_total").tag("state", "enabled").gauge().value();
}

/** Asserts how often each gauge delegate has been invoked so far. */
private static void expectHits(AtomicInteger enabled, AtomicInteger disabled,
AtomicInteger instances, int rulesCalls, int instanceCalls) {
assertThat(enabled.get()).isEqualTo(rulesCalls);
assertThat(disabled.get()).isEqualTo(rulesCalls);
assertThat(instances.get()).isEqualTo(instanceCalls);
}
}