perf(alerting): 30s TTL cache on AlertingMetrics gauge suppliers

Prometheus scrapes can fire every few seconds. The open-alerts / open-rules
gauges query Postgres on each read — caching the values for 30s amortises
that to one query per half-minute. Addresses final-review NIT from Plan 02.

- Introduces a package-private TtlCache that wraps a Supplier<Long> and
  memoises its most recent result for a configurable Duration, consulting an
  injectable Supplier<Instant> clock to decide when the cached entry expires.
- Wraps each gauge supplier (alerting_rules_total{enabled|disabled},
  alerting_instances_total{state}) in its own TtlCache.
- Adds a test-friendly constructor (package-private) taking explicit
  Duration + Supplier<Instant> so AlertingMetricsCachingTest can advance
  a fake clock without waiting wall-clock time.
- Adds AlertingMetricsCachingTest covering:
  * supplier invoked once per TTL across repeated scrapes
  * 29 s elapsed → still cached; 31 s elapsed → re-queried
  * gauge value reflects the cached result even after delegate mutates

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-20 14:22:54 +02:00
parent 5ebc729b82
commit 9f109b20fd
2 changed files with 241 additions and 28 deletions

View File

@@ -12,9 +12,16 @@ import org.slf4j.LoggerFactory;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.stereotype.Component;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.function.Supplier;
/**
* Micrometer-based metrics for the alerting subsystem.
@@ -30,10 +37,11 @@ import java.util.concurrent.ConcurrentMap;
* <li>{@code alerting_eval_duration_seconds{kind}} — per-kind evaluation latency</li>
* <li>{@code alerting_webhook_delivery_duration_seconds} — webhook POST latency</li>
* </ul>
* Gauges (read from PostgreSQL on each scrape; low scrape frequency = low DB load):
* Gauges (read from PostgreSQL, cached for {@link #DEFAULT_GAUGE_TTL} to amortise
* Prometheus scrapes that may fire every few seconds):
* <ul>
* <li>{@code alerting_rules_total{state=enabled|disabled}} — rule counts from {@code alert_rules}</li>
* <li>{@code alerting_instances_total{state,severity}} — instance counts grouped from {@code alert_instances}</li>
* <li>{@code alerting_instances_total{state}} — instance counts grouped from {@code alert_instances}</li>
* </ul>
*/
@Component
@@ -41,8 +49,10 @@ public class AlertingMetrics {
private static final Logger log = LoggerFactory.getLogger(AlertingMetrics.class);
/** Default time-to-live for the gauge-supplier caches. */
static final Duration DEFAULT_GAUGE_TTL = Duration.ofSeconds(30);
private final MeterRegistry registry;
private final JdbcTemplate jdbc;
// Cached counters per kind (lazy-initialized)
private final ConcurrentMap<String, Counter> evalErrorCounters = new ConcurrentHashMap<>();
@@ -55,33 +65,80 @@ public class AlertingMetrics {
// Shared delivery timer
private final Timer webhookDeliveryTimer;
// TTL-cached gauge suppliers registered so tests can force a read cycle.
private final TtlCache enabledRulesCache;
private final TtlCache disabledRulesCache;
private final Map<AlertState, TtlCache> instancesByStateCaches;
/**
 * Production constructor: wraps the Postgres-backed gauge suppliers in a
 * 30-second TTL cache so Prometheus scrapes don't cause per-scrape DB queries.
 *
 * @param registry Micrometer registry all timers/counters/gauges register with
 * @param jdbc     template used by the static count queries behind each gauge
 */
public AlertingMetrics(MeterRegistry registry, JdbcTemplate jdbc) {
// Delegate to the core constructor with real Postgres-backed suppliers,
// the production TTL and the system clock.
this(registry,
() -> countRules(jdbc, true),
() -> countRules(jdbc, false),
state -> countInstances(jdbc, state),
DEFAULT_GAUGE_TTL,
Instant::now);
}
/**
 * Test-friendly constructor accepting the three gauge suppliers that are
 * exercised in the {@link AlertingMetricsCachingTest} plan sketch. The
 * {@code instancesSupplier} is used for every {@link AlertState}.
 *
 * @param registry              registry the gauges are registered with
 * @param enabledRulesSupplier  delegate behind alerting_rules_total{state="enabled"}
 * @param disabledRulesSupplier delegate behind alerting_rules_total{state="disabled"}
 * @param instancesSupplier     shared delegate behind every per-state instances gauge
 * @param gaugeTtl              cache TTL applied to every gauge supplier
 * @param clock                 time source used for TTL expiry (fake clock in tests)
 */
AlertingMetrics(MeterRegistry registry,
Supplier<Long> enabledRulesSupplier,
Supplier<Long> disabledRulesSupplier,
Supplier<Long> instancesSupplier,
Duration gaugeTtl,
Supplier<Instant> clock) {
// Adapt the single instances supplier to the per-state Function the core
// constructor expects: every state reads the same delegate.
this(registry,
enabledRulesSupplier,
disabledRulesSupplier,
state -> instancesSupplier.get(),
gaugeTtl,
clock);
}
/**
 * Core constructor: accepts per-state instance supplier so production can
 * query PostgreSQL with a different value per {@link AlertState}.
 *
 * @param registry              registry the timer and gauges are registered with
 * @param enabledRulesSupplier  delegate for the enabled-rules gauge
 * @param disabledRulesSupplier delegate for the disabled-rules gauge
 * @param instancesSupplier     per-state delegate for the instances gauges
 * @param gaugeTtl              TTL used by every {@link TtlCache} built here
 * @param clock                 time source the caches consult for expiry
 */
private AlertingMetrics(MeterRegistry registry,
Supplier<Long> enabledRulesSupplier,
Supplier<Long> disabledRulesSupplier,
java.util.function.Function<AlertState, Long> instancesSupplier,
Duration gaugeTtl,
Supplier<Instant> clock) {
this.registry = registry;
// NOTE(review): this assignment appears to be the pre-change (removed-side) line of the
// diff render — the core constructor has no `jdbc` parameter, so it cannot compile here.
// Confirm it was deleted in the actual commit.
this.jdbc = jdbc;
// ── Static timers ───────────────────────────────────────────────
this.webhookDeliveryTimer = Timer.builder("alerting_webhook_delivery_duration_seconds")
.description("Latency of outbound webhook POST requests")
.register(registry);
// ── Gauge: rules by enabled/disabled ────────────────────────────
// NOTE(review): the next Gauge.builder line (binding `this` / countRules(true)) looks
// like the removed-side version superseded by the cached registration just below.
Gauge.builder("alerting_rules_total", this, m -> m.countRules(true))
// ── Gauge: rules by enabled/disabled (cached) ───────────────────
this.enabledRulesCache = new TtlCache(enabledRulesSupplier, gaugeTtl, clock);
this.disabledRulesCache = new TtlCache(disabledRulesSupplier, gaugeTtl, clock);
Gauge.builder("alerting_rules_total", enabledRulesCache, TtlCache::getAsDouble)
.tag("state", "enabled")
.description("Number of enabled alert rules")
.register(registry);
// NOTE(review): removed-side line — replaced by the cached registration below it.
Gauge.builder("alerting_rules_total", this, m -> m.countRules(false))
Gauge.builder("alerting_rules_total", disabledRulesCache, TtlCache::getAsDouble)
.tag("state", "disabled")
.description("Number of disabled alert rules")
.register(registry);
// ── Gauges: alert instances by state × severity ─────────────────
// ── Gauges: alert instances by state (cached) ───────────────────
this.instancesByStateCaches = new EnumMap<>(AlertState.class);
for (AlertState state : AlertState.values()) {
// NOTE(review): the following capturedState block (through the two-line Gauge.builder
// call binding `this`) appears to be removed-side residue; the cached version follows.
// Capture state as effectively-final for lambda
AlertState capturedState = state;
// We register one gauge per state (summed across severities) for simplicity;
// per-severity breakdown would require a dynamic MultiGauge.
Gauge.builder("alerting_instances_total", this,
m -> m.countInstances(capturedState))
AlertState captured = state;
TtlCache cache = new TtlCache(() -> instancesSupplier.apply(captured), gaugeTtl, clock);
this.instancesByStateCaches.put(state, cache);
Gauge.builder("alerting_instances_total", cache, TtlCache::getAsDouble)
.tag("state", state.name().toLowerCase())
.description("Number of alert instances by state")
.register(registry);
@@ -148,28 +205,73 @@ public class AlertingMetrics {
.increment();
}
// ── Gauge suppliers (called on each Prometheus scrape) ──────────────
/**
 * Forces a read of every TTL-cached gauge supplier. Tests call this to mimic
 * a Prometheus scrape without wiring up a real registry scrape pipeline.
 */
void snapshotAllGauges() {
// Touch each cache directly — order matches registration order: rules first,
// then the per-state instance caches in enum order (EnumMap iteration).
enabledRulesCache.getAsDouble();
disabledRulesCache.getAsDouble();
instancesByStateCaches.values().forEach(TtlCache::getAsDouble);
}
private double countRules(boolean enabled) {
// ── Gauge suppliers (queried at most once per TTL) ──────────────────
private static long countRules(JdbcTemplate jdbc, boolean enabled) {
try {
Long count = jdbc.queryForObject(
"SELECT COUNT(*) FROM alert_rules WHERE enabled = ?", Long.class, enabled);
return count == null ? 0.0 : count.doubleValue();
return count == null ? 0L : count;
} catch (Exception e) {
log.debug("alerting_rules gauge query failed: {}", e.getMessage());
return 0.0;
return 0L;
}
}
private double countInstances(AlertState state) {
private static long countInstances(JdbcTemplate jdbc, AlertState state) {
try {
Long count = jdbc.queryForObject(
"SELECT COUNT(*) FROM alert_instances WHERE state = ?::alert_state_enum",
Long.class, state.name());
return count == null ? 0.0 : count.doubleValue();
return count == null ? 0L : count;
} catch (Exception e) {
log.debug("alerting_instances gauge query failed: {}", e.getMessage());
return 0.0;
return 0L;
}
}
/**
 * Lightweight TTL cache around a {@code Supplier<Long>}. Every call to
 * {@link #getAsDouble()} either returns the memoised value (while the time
 * elapsed since the last delegate read is still below the TTL) or re-invokes
 * the delegate and stamps the cache with the current clock reading.
 *
 * <p>Used to amortise Postgres queries behind Prometheus gauges over a
 * 30-second TTL (see {@link AlertingMetrics#DEFAULT_GAUGE_TTL}).
 */
static final class TtlCache {
private final Supplier<Long> source;
private final Duration maxAge;
private final Supplier<Instant> timeSource;
// Instant.MIN doubles as the "never read" sentinel (identity-compared below).
private volatile Instant stampedAt = Instant.MIN;
private volatile long value = 0L;

TtlCache(Supplier<Long> delegate, Duration ttl, Supplier<Instant> clock) {
this.source = delegate;
this.maxAge = ttl;
this.timeSource = clock;
}

/** Returns the cached value, refreshing from the delegate once the TTL has elapsed. */
synchronized double getAsDouble() {
Instant now = timeSource.get();
// Serve from cache only when we have read at least once and the entry is fresh.
if (stampedAt != Instant.MIN && Duration.between(stampedAt, now).compareTo(maxAge) < 0) {
return value;
}
value = source.get();
stampedAt = now;
return value;
}
}
}

View File

@@ -0,0 +1,111 @@
package com.cameleer.server.app.alerting.metrics;
import com.cameleer.server.core.alerting.AlertState;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
import org.junit.jupiter.api.Test;
import java.time.Duration;
import java.time.Instant;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Supplier;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Verifies that {@link AlertingMetrics} caches gauge values for a configurable TTL,
 * so that Prometheus scrapes do not cause one Postgres query per scrape.
 */
class AlertingMetricsCachingTest {

// Fixed origin for the fake clock used by both tests.
private static final Instant T0 = Instant.parse("2026-04-20T00:00:00Z");

@Test
void gaugeSupplierIsCalledAtMostOncePerTtl() {
// One TtlCache exists per AlertState, so a full snapshot hits the shared
// instances supplier once for every state.
int stateCount = AlertState.values().length;
AtomicInteger enabledHits = new AtomicInteger();
AtomicInteger disabledHits = new AtomicInteger();
AtomicInteger instanceHits = new AtomicInteger();
AtomicReference<Instant> clock = new AtomicReference<>(T0);

AlertingMetrics metrics = new AlertingMetrics(
new SimpleMeterRegistry(),
() -> { enabledHits.incrementAndGet(); return 7L; },
() -> { disabledHits.incrementAndGet(); return 3L; },
() -> { instanceHits.incrementAndGet(); return 5L; },
Duration.ofSeconds(30),
clock::get);

// Scrape #1: every delegate is consulted exactly once.
metrics.snapshotAllGauges();
expectHits(enabledHits, disabledHits, instanceHits, 1, stateCount);

// Scrape #2 at the same instant: served entirely from cache.
metrics.snapshotAllGauges();
expectHits(enabledHits, disabledHits, instanceHits, 1, stateCount);

// Scrape #3 at +29 s: still inside the 30 s TTL.
clock.set(clock.get().plusSeconds(29));
metrics.snapshotAllGauges();
expectHits(enabledHits, disabledHits, instanceHits, 1, stateCount);

// +31 s from the first read: every cache has expired and re-queries.
clock.set(T0.plusSeconds(31));
metrics.snapshotAllGauges();
expectHits(enabledHits, disabledHits, instanceHits, 2, stateCount * 2);

// Immediate follow-up scrape: the refreshed values are cached again.
metrics.snapshotAllGauges();
expectHits(enabledHits, disabledHits, instanceHits, 2, stateCount * 2);
}

@Test
void gaugeValueReflectsCachedResult() {
AtomicReference<Long> enabledCount = new AtomicReference<>(10L);
AtomicReference<Instant> clock = new AtomicReference<>(T0);
MeterRegistry registry = new SimpleMeterRegistry();

AlertingMetrics metrics = new AlertingMetrics(
registry,
enabledCount::get,
() -> 0L,
() -> 0L,
Duration.ofSeconds(30),
clock::get);

// Prime the cache at 10, then change what the delegate would return.
metrics.snapshotAllGauges();
enabledCount.set(99L);

// Within the TTL the gauge still reports the value captured at priming time.
assertThat(enabledRulesGauge(registry)).isEqualTo(10.0);

// Once the TTL has elapsed, the next snapshot surfaces the new value.
clock.set(clock.get().plusSeconds(31));
metrics.snapshotAllGauges();
assertThat(enabledRulesGauge(registry)).isEqualTo(99.0);
}

/** Reads the current value of {@code alerting_rules_total{state="enabled"}}. */
private static double enabledRulesGauge(MeterRegistry registry) {
return registry.find("alerting_rules_total").tag("state", "enabled").gauge().value();
}

/** Asserts how often each gauge delegate has been invoked so far. */
private static void expectHits(AtomicInteger enabled, AtomicInteger disabled,
AtomicInteger instances, int rulesCalls, int instanceCalls) {
assertThat(enabled.get()).isEqualTo(rulesCalls);
assertThat(disabled.get()).isEqualTo(rulesCalls);
assertThat(instances.get()).isEqualTo(instanceCalls);
}
}