From 9f109b20fd3fc3f8b49f2d9ce60f2b06d4c58f66 Mon Sep 17 00:00:00 2001 From: hsiegeln <37154749+hsiegeln@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:22:54 +0200 Subject: [PATCH] perf(alerting): 30s TTL cache on AlertingMetrics gauge suppliers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prometheus scrapes can fire every few seconds. The open-alerts / open-rules gauges query Postgres on each read — caching the values for 30s amortises that to one query per half-minute. Addresses final-review NIT from Plan 02. - Introduces a package-private TtlCache that wraps a Supplier and memoises the last read for a configurable Duration against a Supplier clock. - Wraps each gauge supplier (alerting_rules_total{enabled|disabled}, alerting_instances_total{state}) in its own TtlCache. - Adds a test-friendly constructor (package-private) taking explicit Duration + Supplier so AlertingMetricsCachingTest can advance a fake clock without waiting wall-clock time. 
- Adds AlertingMetricsCachingTest covering: * supplier invoked once per TTL across repeated scrapes * 29 s elapsed → still cached; 31 s elapsed → re-queried * gauge value reflects the cached result even after delegate mutates Co-Authored-By: Claude Opus 4.7 (1M context) --- .../app/alerting/metrics/AlertingMetrics.java | 158 ++++++++++++++---- .../metrics/AlertingMetricsCachingTest.java | 111 ++++++++++++ 2 files changed, 241 insertions(+), 28 deletions(-) create mode 100644 cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/metrics/AlertingMetricsCachingTest.java diff --git a/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/metrics/AlertingMetrics.java b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/metrics/AlertingMetrics.java index da67ad19..3da431bb 100644 --- a/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/metrics/AlertingMetrics.java +++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/metrics/AlertingMetrics.java @@ -12,9 +12,16 @@ import org.slf4j.LoggerFactory; import org.springframework.jdbc.core.JdbcTemplate; import org.springframework.stereotype.Component; +import java.time.Duration; +import java.time.Instant; +import java.util.ArrayList; +import java.util.EnumMap; +import java.util.List; +import java.util.Map; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; +import java.util.function.Supplier; /** * Micrometer-based metrics for the alerting subsystem. @@ -30,10 +37,11 @@ import java.util.concurrent.ConcurrentMap; *
 *   <li>{@code alerting_eval_duration_seconds{kind}} — per-kind evaluation latency</li>
 *   <li>{@code alerting_webhook_delivery_duration_seconds} — webhook POST latency</li>
 * </ul>
 *
- * Gauges (read from PostgreSQL on each scrape; low scrape frequency = low DB load):
+ * Gauges (read from PostgreSQL, cached for {@link #DEFAULT_GAUGE_TTL} to amortise
+ * Prometheus scrapes that may fire every few seconds):
 * <ul>
 *   <li>{@code alerting_rules_total{state=enabled|disabled}} — rule counts from {@code alert_rules}</li>
- *   <li>{@code alerting_instances_total{state,severity}} — instance counts grouped from {@code alert_instances}</li>
+ *   <li>{@code alerting_instances_total{state}} — instance counts grouped from {@code alert_instances}</li>
 * </ul>
    */ @Component @@ -41,11 +49,13 @@ public class AlertingMetrics { private static final Logger log = LoggerFactory.getLogger(AlertingMetrics.class); + /** Default time-to-live for the gauge-supplier caches. */ + static final Duration DEFAULT_GAUGE_TTL = Duration.ofSeconds(30); + private final MeterRegistry registry; - private final JdbcTemplate jdbc; // Cached counters per kind (lazy-initialized) - private final ConcurrentMap evalErrorCounters = new ConcurrentHashMap<>(); + private final ConcurrentMap evalErrorCounters = new ConcurrentHashMap<>(); private final ConcurrentMap circuitOpenCounters = new ConcurrentHashMap<>(); private final ConcurrentMap evalDurationTimers = new ConcurrentHashMap<>(); @@ -55,33 +65,80 @@ public class AlertingMetrics { // Shared delivery timer private final Timer webhookDeliveryTimer; + // TTL-cached gauge suppliers registered so tests can force a read cycle. + private final TtlCache enabledRulesCache; + private final TtlCache disabledRulesCache; + private final Map instancesByStateCaches; + + /** + * Production constructor: wraps the Postgres-backed gauge suppliers in a + * 30-second TTL cache so Prometheus scrapes don't cause per-scrape DB queries. + */ public AlertingMetrics(MeterRegistry registry, JdbcTemplate jdbc) { + this(registry, + () -> countRules(jdbc, true), + () -> countRules(jdbc, false), + state -> countInstances(jdbc, state), + DEFAULT_GAUGE_TTL, + Instant::now); + } + + /** + * Test-friendly constructor accepting the three gauge suppliers that are + * exercised in the {@link AlertingMetricsCachingTest} plan sketch. The + * {@code instancesSupplier} is used for every {@link AlertState}. 
+ */ + AlertingMetrics(MeterRegistry registry, + Supplier enabledRulesSupplier, + Supplier disabledRulesSupplier, + Supplier instancesSupplier, + Duration gaugeTtl, + Supplier clock) { + this(registry, + enabledRulesSupplier, + disabledRulesSupplier, + state -> instancesSupplier.get(), + gaugeTtl, + clock); + } + + /** + * Core constructor: accepts per-state instance supplier so production can + * query PostgreSQL with a different value per {@link AlertState}. + */ + private AlertingMetrics(MeterRegistry registry, + Supplier enabledRulesSupplier, + Supplier disabledRulesSupplier, + java.util.function.Function instancesSupplier, + Duration gaugeTtl, + Supplier clock) { this.registry = registry; - this.jdbc = jdbc; // ── Static timers ─────────────────────────────────────────────── this.webhookDeliveryTimer = Timer.builder("alerting_webhook_delivery_duration_seconds") .description("Latency of outbound webhook POST requests") .register(registry); - // ── Gauge: rules by enabled/disabled ──────────────────────────── - Gauge.builder("alerting_rules_total", this, m -> m.countRules(true)) + // ── Gauge: rules by enabled/disabled (cached) ─────────────────── + this.enabledRulesCache = new TtlCache(enabledRulesSupplier, gaugeTtl, clock); + this.disabledRulesCache = new TtlCache(disabledRulesSupplier, gaugeTtl, clock); + + Gauge.builder("alerting_rules_total", enabledRulesCache, TtlCache::getAsDouble) .tag("state", "enabled") .description("Number of enabled alert rules") .register(registry); - Gauge.builder("alerting_rules_total", this, m -> m.countRules(false)) + Gauge.builder("alerting_rules_total", disabledRulesCache, TtlCache::getAsDouble) .tag("state", "disabled") .description("Number of disabled alert rules") .register(registry); - // ── Gauges: alert instances by state × severity ───────────────── + // ── Gauges: alert instances by state (cached) ─────────────────── + this.instancesByStateCaches = new EnumMap<>(AlertState.class); for (AlertState state : 
AlertState.values()) { - // Capture state as effectively-final for lambda - AlertState capturedState = state; - // We register one gauge per state (summed across severities) for simplicity; - // per-severity breakdown would require a dynamic MultiGauge. - Gauge.builder("alerting_instances_total", this, - m -> m.countInstances(capturedState)) + AlertState captured = state; + TtlCache cache = new TtlCache(() -> instancesSupplier.apply(captured), gaugeTtl, clock); + this.instancesByStateCaches.put(state, cache); + Gauge.builder("alerting_instances_total", cache, TtlCache::getAsDouble) .tag("state", state.name().toLowerCase()) .description("Number of alert instances by state") .register(registry); @@ -148,28 +205,73 @@ public class AlertingMetrics { .increment(); } - // ── Gauge suppliers (called on each Prometheus scrape) ────────────── - - private double countRules(boolean enabled) { - try { - Long count = jdbc.queryForObject( - "SELECT COUNT(*) FROM alert_rules WHERE enabled = ?", Long.class, enabled); - return count == null ? 0.0 : count.doubleValue(); - } catch (Exception e) { - log.debug("alerting_rules gauge query failed: {}", e.getMessage()); - return 0.0; + /** + * Force a read of every TTL-cached gauge supplier. Used by tests to simulate + * a Prometheus scrape without needing a real registry scrape pipeline. + */ + void snapshotAllGauges() { + List all = new ArrayList<>(); + all.add(enabledRulesCache); + all.add(disabledRulesCache); + all.addAll(instancesByStateCaches.values()); + for (TtlCache c : all) { + c.getAsDouble(); } } - private double countInstances(AlertState state) { + // ── Gauge suppliers (queried at most once per TTL) ────────────────── + + private static long countRules(JdbcTemplate jdbc, boolean enabled) { + try { + Long count = jdbc.queryForObject( + "SELECT COUNT(*) FROM alert_rules WHERE enabled = ?", Long.class, enabled); + return count == null ? 
0L : count; + } catch (Exception e) { + log.debug("alerting_rules gauge query failed: {}", e.getMessage()); + return 0L; + } + } + + private static long countInstances(JdbcTemplate jdbc, AlertState state) { try { Long count = jdbc.queryForObject( "SELECT COUNT(*) FROM alert_instances WHERE state = ?::alert_state_enum", Long.class, state.name()); - return count == null ? 0.0 : count.doubleValue(); + return count == null ? 0L : count; } catch (Exception e) { log.debug("alerting_instances gauge query failed: {}", e.getMessage()); - return 0.0; + return 0L; + } + } + + /** + * Lightweight TTL cache around a {@code Supplier}. Every call to + * {@link #getAsDouble()} either returns the cached value (if {@code clock.get() + * - lastRead < ttl}) or invokes the delegate and refreshes the cache. + * + *

    Used to amortise Postgres queries behind Prometheus gauges over a + * 30-second TTL (see {@link AlertingMetrics#DEFAULT_GAUGE_TTL}). + */ + static final class TtlCache { + private final Supplier delegate; + private final Duration ttl; + private final Supplier clock; + private volatile Instant lastRead = Instant.MIN; + private volatile long cached = 0L; + + TtlCache(Supplier delegate, Duration ttl, Supplier clock) { + this.delegate = delegate; + this.ttl = ttl; + this.clock = clock; + } + + synchronized double getAsDouble() { + Instant now = clock.get(); + if (lastRead == Instant.MIN || Duration.between(lastRead, now).compareTo(ttl) >= 0) { + cached = delegate.get(); + lastRead = now; + } + return cached; } } } diff --git a/cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/metrics/AlertingMetricsCachingTest.java b/cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/metrics/AlertingMetricsCachingTest.java new file mode 100644 index 00000000..194bc982 --- /dev/null +++ b/cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/metrics/AlertingMetricsCachingTest.java @@ -0,0 +1,111 @@ +package com.cameleer.server.app.alerting.metrics; + +import com.cameleer.server.core.alerting.AlertState; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.simple.SimpleMeterRegistry; +import org.junit.jupiter.api.Test; + +import java.time.Duration; +import java.time.Instant; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Supplier; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Verifies that {@link AlertingMetrics} caches gauge values for a configurable TTL, + * so that Prometheus scrapes do not cause one Postgres query per scrape. 
+ */ +class AlertingMetricsCachingTest { + + @Test + void gaugeSupplierIsCalledAtMostOncePerTtl() { + // The instances supplier is shared across every AlertState gauge, so each + // full gauge snapshot invokes it once per AlertState (one cache per state). + final int stateCount = AlertState.values().length; + + AtomicInteger enabledRulesCalls = new AtomicInteger(); + AtomicInteger disabledRulesCalls = new AtomicInteger(); + AtomicInteger instancesCalls = new AtomicInteger(); + AtomicReference now = new AtomicReference<>(Instant.parse("2026-04-20T00:00:00Z")); + Supplier clock = now::get; + + MeterRegistry registry = new SimpleMeterRegistry(); + + Supplier enabledRulesSupplier = () -> { enabledRulesCalls.incrementAndGet(); return 7L; }; + Supplier disabledRulesSupplier = () -> { disabledRulesCalls.incrementAndGet(); return 3L; }; + Supplier instancesSupplier = () -> { instancesCalls.incrementAndGet(); return 5L; }; + + AlertingMetrics metrics = new AlertingMetrics( + registry, + enabledRulesSupplier, + disabledRulesSupplier, + instancesSupplier, + Duration.ofSeconds(30), + clock + ); + + // First scrape — each supplier invoked exactly once per gauge. + metrics.snapshotAllGauges(); + assertThat(enabledRulesCalls.get()).isEqualTo(1); + assertThat(disabledRulesCalls.get()).isEqualTo(1); + assertThat(instancesCalls.get()).isEqualTo(stateCount); + + // Second scrape within TTL — served from cache. + metrics.snapshotAllGauges(); + assertThat(enabledRulesCalls.get()).isEqualTo(1); + assertThat(disabledRulesCalls.get()).isEqualTo(1); + assertThat(instancesCalls.get()).isEqualTo(stateCount); + + // Third scrape still within TTL (29 s later) — still cached. + now.set(now.get().plusSeconds(29)); + metrics.snapshotAllGauges(); + assertThat(enabledRulesCalls.get()).isEqualTo(1); + assertThat(disabledRulesCalls.get()).isEqualTo(1); + assertThat(instancesCalls.get()).isEqualTo(stateCount); + + // Advance past TTL — next scrape re-queries the delegate. 
+ now.set(Instant.parse("2026-04-20T00:00:31Z")); + metrics.snapshotAllGauges(); + assertThat(enabledRulesCalls.get()).isEqualTo(2); + assertThat(disabledRulesCalls.get()).isEqualTo(2); + assertThat(instancesCalls.get()).isEqualTo(stateCount * 2); + + // Immediate follow-up — back in cache. + metrics.snapshotAllGauges(); + assertThat(enabledRulesCalls.get()).isEqualTo(2); + assertThat(disabledRulesCalls.get()).isEqualTo(2); + assertThat(instancesCalls.get()).isEqualTo(stateCount * 2); + } + + @Test + void gaugeValueReflectsCachedResult() { + AtomicReference enabledValue = new AtomicReference<>(10L); + AtomicReference now = new AtomicReference<>(Instant.parse("2026-04-20T00:00:00Z")); + + MeterRegistry registry = new SimpleMeterRegistry(); + AlertingMetrics metrics = new AlertingMetrics( + registry, + enabledValue::get, + () -> 0L, + () -> 0L, + Duration.ofSeconds(30), + now::get + ); + + // Read once — value cached at 10. + metrics.snapshotAllGauges(); + + // Mutate the underlying supplier output; cache should shield it. + enabledValue.set(99L); + double cached = registry.find("alerting_rules_total").tag("state", "enabled").gauge().value(); + assertThat(cached).isEqualTo(10.0); + + // After TTL, new value surfaces. + now.set(now.get().plusSeconds(31)); + metrics.snapshotAllGauges(); + double refreshed = registry.find("alerting_rules_total").tag("state", "enabled").gauge().value(); + assertThat(refreshed).isEqualTo(99.0); + } +}