From 9f109b20fd3fc3f8b49f2d9ce60f2b06d4c58f66 Mon Sep 17 00:00:00 2001 From: hsiegeln <37154749+hsiegeln@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:22:54 +0200 Subject: [PATCH] perf(alerting): 30s TTL cache on AlertingMetrics gauge suppliers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prometheus scrapes can fire every few seconds. The open-alerts / open-rules gauges query Postgres on each read — caching the values for 30s amortises that to one query per half-minute. Addresses final-review NIT from Plan 02. - Introduces a package-private TtlCache that wraps a Supplier and memoises the last read for a configurable Duration against a Supplier clock. - Wraps each gauge supplier (alerting_rules_total{enabled|disabled}, alerting_instances_total{state}) in its own TtlCache. - Adds a test-friendly constructor (package-private) taking explicit Duration + Supplier so AlertingMetricsCachingTest can advance a fake clock without waiting wall-clock time. 
- Adds AlertingMetricsCachingTest covering: * supplier invoked once per TTL across repeated scrapes * 29 s elapsed → still cached; 31 s elapsed → re-queried * gauge value reflects the cached result even after delegate mutates Co-Authored-By: Claude Opus 4.7 (1M context) --- .../app/alerting/metrics/AlertingMetrics.java | 158 ++++++++++++++---- .../metrics/AlertingMetricsCachingTest.java | 111 ++++++++++++ 2 files changed, 241 insertions(+), 28 deletions(-) create mode 100644 cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/metrics/AlertingMetricsCachingTest.java diff --git a/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/metrics/AlertingMetrics.java b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/metrics/AlertingMetrics.java index da67ad19..3da431bb 100644 --- a/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/metrics/AlertingMetrics.java +++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/metrics/AlertingMetrics.java @@ -12,9 +12,16 @@ import org.slf4j.LoggerFactory; import org.springframework.jdbc.core.JdbcTemplate; import org.springframework.stereotype.Component; +import java.time.Duration; +import java.time.Instant; +import java.util.ArrayList; +import java.util.EnumMap; +import java.util.List; +import java.util.Map; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; +import java.util.function.Supplier; /** * Micrometer-based metrics for the alerting subsystem. @@ -30,10 +37,11 @@ import java.util.concurrent.ConcurrentMap; *
 *   <li>{@code alerting_eval_duration_seconds{kind}} — per-kind evaluation latency</li>
 *   <li>{@code alerting_webhook_delivery_duration_seconds} — webhook POST latency</li>
 * </ul>
 *
- * Gauges (read from PostgreSQL on each scrape; low scrape frequency = low DB load):
+ * Gauges (read from PostgreSQL, cached for {@link #DEFAULT_GAUGE_TTL} to amortise
+ * Prometheus scrapes that may fire every few seconds):
 * <ul>
 *   <li>{@code alerting_rules_total{state=enabled|disabled}} — rule counts from {@code alert_rules}</li>
- *   <li>{@code alerting_instances_total{state,severity}} — instance counts grouped from {@code alert_instances}</li>
+ *   <li>{@code alerting_instances_total{state}} — instance counts grouped from {@code alert_instances}</li>
 * </ul>
    */ @Component @@ -41,11 +49,13 @@ public class AlertingMetrics { private static final Logger log = LoggerFactory.getLogger(AlertingMetrics.class); + /** Default time-to-live for the gauge-supplier caches. */ + static final Duration DEFAULT_GAUGE_TTL = Duration.ofSeconds(30); + private final MeterRegistry registry; - private final JdbcTemplate jdbc; // Cached counters per kind (lazy-initialized) - private final ConcurrentMap evalErrorCounters = new ConcurrentHashMap<>(); + private final ConcurrentMap evalErrorCounters = new ConcurrentHashMap<>(); private final ConcurrentMap circuitOpenCounters = new ConcurrentHashMap<>(); private final ConcurrentMap evalDurationTimers = new ConcurrentHashMap<>(); @@ -55,33 +65,80 @@ public class AlertingMetrics { // Shared delivery timer private final Timer webhookDeliveryTimer; + // TTL-cached gauge suppliers registered so tests can force a read cycle. + private final TtlCache enabledRulesCache; + private final TtlCache disabledRulesCache; + private final Map instancesByStateCaches; + + /** + * Production constructor: wraps the Postgres-backed gauge suppliers in a + * 30-second TTL cache so Prometheus scrapes don't cause per-scrape DB queries. + */ public AlertingMetrics(MeterRegistry registry, JdbcTemplate jdbc) { + this(registry, + () -> countRules(jdbc, true), + () -> countRules(jdbc, false), + state -> countInstances(jdbc, state), + DEFAULT_GAUGE_TTL, + Instant::now); + } + + /** + * Test-friendly constructor accepting the three gauge suppliers that are + * exercised in the {@link AlertingMetricsCachingTest} plan sketch. The + * {@code instancesSupplier} is used for every {@link AlertState}. 
+ */ + AlertingMetrics(MeterRegistry registry, + Supplier enabledRulesSupplier, + Supplier disabledRulesSupplier, + Supplier instancesSupplier, + Duration gaugeTtl, + Supplier clock) { + this(registry, + enabledRulesSupplier, + disabledRulesSupplier, + state -> instancesSupplier.get(), + gaugeTtl, + clock); + } + + /** + * Core constructor: accepts per-state instance supplier so production can + * query PostgreSQL with a different value per {@link AlertState}. + */ + private AlertingMetrics(MeterRegistry registry, + Supplier enabledRulesSupplier, + Supplier disabledRulesSupplier, + java.util.function.Function instancesSupplier, + Duration gaugeTtl, + Supplier clock) { this.registry = registry; - this.jdbc = jdbc; // ── Static timers ─────────────────────────────────────────────── this.webhookDeliveryTimer = Timer.builder("alerting_webhook_delivery_duration_seconds") .description("Latency of outbound webhook POST requests") .register(registry); - // ── Gauge: rules by enabled/disabled ──────────────────────────── - Gauge.builder("alerting_rules_total", this, m -> m.countRules(true)) + // ── Gauge: rules by enabled/disabled (cached) ─────────────────── + this.enabledRulesCache = new TtlCache(enabledRulesSupplier, gaugeTtl, clock); + this.disabledRulesCache = new TtlCache(disabledRulesSupplier, gaugeTtl, clock); + + Gauge.builder("alerting_rules_total", enabledRulesCache, TtlCache::getAsDouble) .tag("state", "enabled") .description("Number of enabled alert rules") .register(registry); - Gauge.builder("alerting_rules_total", this, m -> m.countRules(false)) + Gauge.builder("alerting_rules_total", disabledRulesCache, TtlCache::getAsDouble) .tag("state", "disabled") .description("Number of disabled alert rules") .register(registry); - // ── Gauges: alert instances by state × severity ───────────────── + // ── Gauges: alert instances by state (cached) ─────────────────── + this.instancesByStateCaches = new EnumMap<>(AlertState.class); for (AlertState state : 
AlertState.values()) { - // Capture state as effectively-final for lambda - AlertState capturedState = state; - // We register one gauge per state (summed across severities) for simplicity; - // per-severity breakdown would require a dynamic MultiGauge. - Gauge.builder("alerting_instances_total", this, - m -> m.countInstances(capturedState)) + AlertState captured = state; + TtlCache cache = new TtlCache(() -> instancesSupplier.apply(captured), gaugeTtl, clock); + this.instancesByStateCaches.put(state, cache); + Gauge.builder("alerting_instances_total", cache, TtlCache::getAsDouble) .tag("state", state.name().toLowerCase()) .description("Number of alert instances by state") .register(registry); @@ -148,28 +205,73 @@ public class AlertingMetrics { .increment(); } - // ── Gauge suppliers (called on each Prometheus scrape) ────────────── - - private double countRules(boolean enabled) { - try { - Long count = jdbc.queryForObject( - "SELECT COUNT(*) FROM alert_rules WHERE enabled = ?", Long.class, enabled); - return count == null ? 0.0 : count.doubleValue(); - } catch (Exception e) { - log.debug("alerting_rules gauge query failed: {}", e.getMessage()); - return 0.0; + /** + * Force a read of every TTL-cached gauge supplier. Used by tests to simulate + * a Prometheus scrape without needing a real registry scrape pipeline. + */ + void snapshotAllGauges() { + List all = new ArrayList<>(); + all.add(enabledRulesCache); + all.add(disabledRulesCache); + all.addAll(instancesByStateCaches.values()); + for (TtlCache c : all) { + c.getAsDouble(); } } - private double countInstances(AlertState state) { + // ── Gauge suppliers (queried at most once per TTL) ────────────────── + + private static long countRules(JdbcTemplate jdbc, boolean enabled) { + try { + Long count = jdbc.queryForObject( + "SELECT COUNT(*) FROM alert_rules WHERE enabled = ?", Long.class, enabled); + return count == null ? 
0L : count; + } catch (Exception e) { + log.debug("alerting_rules gauge query failed: {}", e.getMessage()); + return 0L; + } + } + + private static long countInstances(JdbcTemplate jdbc, AlertState state) { try { Long count = jdbc.queryForObject( "SELECT COUNT(*) FROM alert_instances WHERE state = ?::alert_state_enum", Long.class, state.name()); - return count == null ? 0.0 : count.doubleValue(); + return count == null ? 0L : count; } catch (Exception e) { log.debug("alerting_instances gauge query failed: {}", e.getMessage()); - return 0.0; + return 0L; + } + } + + /** + * Lightweight TTL cache around a {@code Supplier}. Every call to + * {@link #getAsDouble()} either returns the cached value (if {@code clock.get() + * - lastRead < ttl}) or invokes the delegate and refreshes the cache. + * + *

    Used to amortise Postgres queries behind Prometheus gauges over a + * 30-second TTL (see {@link AlertingMetrics#DEFAULT_GAUGE_TTL}). + */ + static final class TtlCache { + private final Supplier delegate; + private final Duration ttl; + private final Supplier clock; + private volatile Instant lastRead = Instant.MIN; + private volatile long cached = 0L; + + TtlCache(Supplier delegate, Duration ttl, Supplier clock) { + this.delegate = delegate; + this.ttl = ttl; + this.clock = clock; + } + + synchronized double getAsDouble() { + Instant now = clock.get(); + if (lastRead == Instant.MIN || Duration.between(lastRead, now).compareTo(ttl) >= 0) { + cached = delegate.get(); + lastRead = now; + } + return cached; } } } diff --git a/cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/metrics/AlertingMetricsCachingTest.java b/cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/metrics/AlertingMetricsCachingTest.java new file mode 100644 index 00000000..194bc982 --- /dev/null +++ b/cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/metrics/AlertingMetricsCachingTest.java @@ -0,0 +1,111 @@ +package com.cameleer.server.app.alerting.metrics; + +import com.cameleer.server.core.alerting.AlertState; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.simple.SimpleMeterRegistry; +import org.junit.jupiter.api.Test; + +import java.time.Duration; +import java.time.Instant; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Supplier; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Verifies that {@link AlertingMetrics} caches gauge values for a configurable TTL, + * so that Prometheus scrapes do not cause one Postgres query per scrape. 
+ */ +class AlertingMetricsCachingTest { + + @Test + void gaugeSupplierIsCalledAtMostOncePerTtl() { + // The instances supplier is shared across every AlertState gauge, so each + // full gauge snapshot invokes it once per AlertState (one cache per state). + final int stateCount = AlertState.values().length; + + AtomicInteger enabledRulesCalls = new AtomicInteger(); + AtomicInteger disabledRulesCalls = new AtomicInteger(); + AtomicInteger instancesCalls = new AtomicInteger(); + AtomicReference now = new AtomicReference<>(Instant.parse("2026-04-20T00:00:00Z")); + Supplier clock = now::get; + + MeterRegistry registry = new SimpleMeterRegistry(); + + Supplier enabledRulesSupplier = () -> { enabledRulesCalls.incrementAndGet(); return 7L; }; + Supplier disabledRulesSupplier = () -> { disabledRulesCalls.incrementAndGet(); return 3L; }; + Supplier instancesSupplier = () -> { instancesCalls.incrementAndGet(); return 5L; }; + + AlertingMetrics metrics = new AlertingMetrics( + registry, + enabledRulesSupplier, + disabledRulesSupplier, + instancesSupplier, + Duration.ofSeconds(30), + clock + ); + + // First scrape — each supplier invoked exactly once per gauge. + metrics.snapshotAllGauges(); + assertThat(enabledRulesCalls.get()).isEqualTo(1); + assertThat(disabledRulesCalls.get()).isEqualTo(1); + assertThat(instancesCalls.get()).isEqualTo(stateCount); + + // Second scrape within TTL — served from cache. + metrics.snapshotAllGauges(); + assertThat(enabledRulesCalls.get()).isEqualTo(1); + assertThat(disabledRulesCalls.get()).isEqualTo(1); + assertThat(instancesCalls.get()).isEqualTo(stateCount); + + // Third scrape still within TTL (29 s later) — still cached. + now.set(now.get().plusSeconds(29)); + metrics.snapshotAllGauges(); + assertThat(enabledRulesCalls.get()).isEqualTo(1); + assertThat(disabledRulesCalls.get()).isEqualTo(1); + assertThat(instancesCalls.get()).isEqualTo(stateCount); + + // Advance past TTL — next scrape re-queries the delegate. 
+ now.set(Instant.parse("2026-04-20T00:00:31Z")); + metrics.snapshotAllGauges(); + assertThat(enabledRulesCalls.get()).isEqualTo(2); + assertThat(disabledRulesCalls.get()).isEqualTo(2); + assertThat(instancesCalls.get()).isEqualTo(stateCount * 2); + + // Immediate follow-up — back in cache. + metrics.snapshotAllGauges(); + assertThat(enabledRulesCalls.get()).isEqualTo(2); + assertThat(disabledRulesCalls.get()).isEqualTo(2); + assertThat(instancesCalls.get()).isEqualTo(stateCount * 2); + } + + @Test + void gaugeValueReflectsCachedResult() { + AtomicReference enabledValue = new AtomicReference<>(10L); + AtomicReference now = new AtomicReference<>(Instant.parse("2026-04-20T00:00:00Z")); + + MeterRegistry registry = new SimpleMeterRegistry(); + AlertingMetrics metrics = new AlertingMetrics( + registry, + enabledValue::get, + () -> 0L, + () -> 0L, + Duration.ofSeconds(30), + now::get + ); + + // Read once — value cached at 10. + metrics.snapshotAllGauges(); + + // Mutate the underlying supplier output; cache should shield it. + enabledValue.set(99L); + double cached = registry.find("alerting_rules_total").tag("state", "enabled").gauge().value(); + assertThat(cached).isEqualTo(10.0); + + // After TTL, new value surfaces. + now.set(now.get().plusSeconds(31)); + metrics.snapshotAllGauges(); + double refreshed = registry.find("alerting_rules_total").tag("state", "enabled").gauge().value(); + assertThat(refreshed).isEqualTo(99.0); + } +}