perf(alerting): 30s TTL cache on AlertingMetrics gauge suppliers
Prometheus scrapes can fire every few seconds. The open-alerts / open-rules
gauges query Postgres on each read — caching the values for 30s amortises
that to one query per half-minute. Addresses final-review NIT from Plan 02.
- Introduces a package-private TtlCache that wraps a Supplier<Long> and
memoises the last read for a configurable Duration against a Supplier<Instant>
clock.
- Wraps each gauge supplier (alerting_rules_total{enabled|disabled},
alerting_instances_total{state}) in its own TtlCache.
- Adds a test-friendly constructor (package-private) taking explicit
Duration + Supplier<Instant> so AlertingMetricsCachingTest can advance
a fake clock without waiting wall-clock time.
- Adds AlertingMetricsCachingTest covering:
* supplier invoked once per TTL across repeated scrapes
* 29 s elapsed → still cached; 31 s elapsed → re-queried
* gauge value reflects the cached result even after delegate mutates
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -12,9 +12,16 @@ import org.slf4j.LoggerFactory;
|
|||||||
import org.springframework.jdbc.core.JdbcTemplate;
|
import org.springframework.jdbc.core.JdbcTemplate;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.time.Instant;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.EnumMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.concurrent.ConcurrentMap;
|
import java.util.concurrent.ConcurrentMap;
|
||||||
|
import java.util.function.Supplier;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Micrometer-based metrics for the alerting subsystem.
|
* Micrometer-based metrics for the alerting subsystem.
|
||||||
@@ -30,10 +37,11 @@ import java.util.concurrent.ConcurrentMap;
|
|||||||
* <li>{@code alerting_eval_duration_seconds{kind}} — per-kind evaluation latency</li>
|
* <li>{@code alerting_eval_duration_seconds{kind}} — per-kind evaluation latency</li>
|
||||||
* <li>{@code alerting_webhook_delivery_duration_seconds} — webhook POST latency</li>
|
* <li>{@code alerting_webhook_delivery_duration_seconds} — webhook POST latency</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
* Gauges (read from PostgreSQL on each scrape; low scrape frequency = low DB load):
|
* Gauges (read from PostgreSQL, cached for {@link #DEFAULT_GAUGE_TTL} to amortise
|
||||||
|
* Prometheus scrapes that may fire every few seconds):
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>{@code alerting_rules_total{state=enabled|disabled}} — rule counts from {@code alert_rules}</li>
|
* <li>{@code alerting_rules_total{state=enabled|disabled}} — rule counts from {@code alert_rules}</li>
|
||||||
* <li>{@code alerting_instances_total{state,severity}} — instance counts grouped from {@code alert_instances}</li>
|
* <li>{@code alerting_instances_total{state}} — instance counts grouped from {@code alert_instances}</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
@Component
|
@Component
|
||||||
@@ -41,8 +49,10 @@ public class AlertingMetrics {
|
|||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(AlertingMetrics.class);
|
private static final Logger log = LoggerFactory.getLogger(AlertingMetrics.class);
|
||||||
|
|
||||||
|
/** Default time-to-live for the gauge-supplier caches. */
|
||||||
|
static final Duration DEFAULT_GAUGE_TTL = Duration.ofSeconds(30);
|
||||||
|
|
||||||
private final MeterRegistry registry;
|
private final MeterRegistry registry;
|
||||||
private final JdbcTemplate jdbc;
|
|
||||||
|
|
||||||
// Cached counters per kind (lazy-initialized)
|
// Cached counters per kind (lazy-initialized)
|
||||||
private final ConcurrentMap<String, Counter> evalErrorCounters = new ConcurrentHashMap<>();
|
private final ConcurrentMap<String, Counter> evalErrorCounters = new ConcurrentHashMap<>();
|
||||||
@@ -55,33 +65,80 @@ public class AlertingMetrics {
|
|||||||
// Shared delivery timer
|
// Shared delivery timer
|
||||||
private final Timer webhookDeliveryTimer;
|
private final Timer webhookDeliveryTimer;
|
||||||
|
|
||||||
|
// TTL-cached gauge suppliers registered so tests can force a read cycle.
|
||||||
|
private final TtlCache enabledRulesCache;
|
||||||
|
private final TtlCache disabledRulesCache;
|
||||||
|
private final Map<AlertState, TtlCache> instancesByStateCaches;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Production constructor: wraps the Postgres-backed gauge suppliers in a
|
||||||
|
* 30-second TTL cache so Prometheus scrapes don't cause per-scrape DB queries.
|
||||||
|
*/
|
||||||
public AlertingMetrics(MeterRegistry registry, JdbcTemplate jdbc) {
|
public AlertingMetrics(MeterRegistry registry, JdbcTemplate jdbc) {
|
||||||
|
this(registry,
|
||||||
|
() -> countRules(jdbc, true),
|
||||||
|
() -> countRules(jdbc, false),
|
||||||
|
state -> countInstances(jdbc, state),
|
||||||
|
DEFAULT_GAUGE_TTL,
|
||||||
|
Instant::now);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test-friendly constructor accepting the three gauge suppliers that are
|
||||||
|
* exercised in the {@link AlertingMetricsCachingTest} plan sketch. The
|
||||||
|
* {@code instancesSupplier} is used for every {@link AlertState}.
|
||||||
|
*/
|
||||||
|
AlertingMetrics(MeterRegistry registry,
|
||||||
|
Supplier<Long> enabledRulesSupplier,
|
||||||
|
Supplier<Long> disabledRulesSupplier,
|
||||||
|
Supplier<Long> instancesSupplier,
|
||||||
|
Duration gaugeTtl,
|
||||||
|
Supplier<Instant> clock) {
|
||||||
|
this(registry,
|
||||||
|
enabledRulesSupplier,
|
||||||
|
disabledRulesSupplier,
|
||||||
|
state -> instancesSupplier.get(),
|
||||||
|
gaugeTtl,
|
||||||
|
clock);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Core constructor: accepts per-state instance supplier so production can
|
||||||
|
* query PostgreSQL with a different value per {@link AlertState}.
|
||||||
|
*/
|
||||||
|
private AlertingMetrics(MeterRegistry registry,
|
||||||
|
Supplier<Long> enabledRulesSupplier,
|
||||||
|
Supplier<Long> disabledRulesSupplier,
|
||||||
|
java.util.function.Function<AlertState, Long> instancesSupplier,
|
||||||
|
Duration gaugeTtl,
|
||||||
|
Supplier<Instant> clock) {
|
||||||
this.registry = registry;
|
this.registry = registry;
|
||||||
this.jdbc = jdbc;
|
|
||||||
|
|
||||||
// ── Static timers ───────────────────────────────────────────────
|
// ── Static timers ───────────────────────────────────────────────
|
||||||
this.webhookDeliveryTimer = Timer.builder("alerting_webhook_delivery_duration_seconds")
|
this.webhookDeliveryTimer = Timer.builder("alerting_webhook_delivery_duration_seconds")
|
||||||
.description("Latency of outbound webhook POST requests")
|
.description("Latency of outbound webhook POST requests")
|
||||||
.register(registry);
|
.register(registry);
|
||||||
|
|
||||||
// ── Gauge: rules by enabled/disabled ────────────────────────────
|
// ── Gauge: rules by enabled/disabled (cached) ───────────────────
|
||||||
Gauge.builder("alerting_rules_total", this, m -> m.countRules(true))
|
this.enabledRulesCache = new TtlCache(enabledRulesSupplier, gaugeTtl, clock);
|
||||||
|
this.disabledRulesCache = new TtlCache(disabledRulesSupplier, gaugeTtl, clock);
|
||||||
|
|
||||||
|
Gauge.builder("alerting_rules_total", enabledRulesCache, TtlCache::getAsDouble)
|
||||||
.tag("state", "enabled")
|
.tag("state", "enabled")
|
||||||
.description("Number of enabled alert rules")
|
.description("Number of enabled alert rules")
|
||||||
.register(registry);
|
.register(registry);
|
||||||
Gauge.builder("alerting_rules_total", this, m -> m.countRules(false))
|
Gauge.builder("alerting_rules_total", disabledRulesCache, TtlCache::getAsDouble)
|
||||||
.tag("state", "disabled")
|
.tag("state", "disabled")
|
||||||
.description("Number of disabled alert rules")
|
.description("Number of disabled alert rules")
|
||||||
.register(registry);
|
.register(registry);
|
||||||
|
|
||||||
// ── Gauges: alert instances by state × severity ─────────────────
|
// ── Gauges: alert instances by state (cached) ───────────────────
|
||||||
|
this.instancesByStateCaches = new EnumMap<>(AlertState.class);
|
||||||
for (AlertState state : AlertState.values()) {
|
for (AlertState state : AlertState.values()) {
|
||||||
// Capture state as effectively-final for lambda
|
AlertState captured = state;
|
||||||
AlertState capturedState = state;
|
TtlCache cache = new TtlCache(() -> instancesSupplier.apply(captured), gaugeTtl, clock);
|
||||||
// We register one gauge per state (summed across severities) for simplicity;
|
this.instancesByStateCaches.put(state, cache);
|
||||||
// per-severity breakdown would require a dynamic MultiGauge.
|
Gauge.builder("alerting_instances_total", cache, TtlCache::getAsDouble)
|
||||||
Gauge.builder("alerting_instances_total", this,
|
|
||||||
m -> m.countInstances(capturedState))
|
|
||||||
.tag("state", state.name().toLowerCase())
|
.tag("state", state.name().toLowerCase())
|
||||||
.description("Number of alert instances by state")
|
.description("Number of alert instances by state")
|
||||||
.register(registry);
|
.register(registry);
|
||||||
@@ -148,28 +205,73 @@ public class AlertingMetrics {
|
|||||||
.increment();
|
.increment();
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Gauge suppliers (called on each Prometheus scrape) ──────────────
|
/**
|
||||||
|
* Force a read of every TTL-cached gauge supplier. Used by tests to simulate
|
||||||
|
* a Prometheus scrape without needing a real registry scrape pipeline.
|
||||||
|
*/
|
||||||
|
void snapshotAllGauges() {
|
||||||
|
List<TtlCache> all = new ArrayList<>();
|
||||||
|
all.add(enabledRulesCache);
|
||||||
|
all.add(disabledRulesCache);
|
||||||
|
all.addAll(instancesByStateCaches.values());
|
||||||
|
for (TtlCache c : all) {
|
||||||
|
c.getAsDouble();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private double countRules(boolean enabled) {
|
// ── Gauge suppliers (queried at most once per TTL) ──────────────────
|
||||||
|
|
||||||
|
private static long countRules(JdbcTemplate jdbc, boolean enabled) {
|
||||||
try {
|
try {
|
||||||
Long count = jdbc.queryForObject(
|
Long count = jdbc.queryForObject(
|
||||||
"SELECT COUNT(*) FROM alert_rules WHERE enabled = ?", Long.class, enabled);
|
"SELECT COUNT(*) FROM alert_rules WHERE enabled = ?", Long.class, enabled);
|
||||||
return count == null ? 0.0 : count.doubleValue();
|
return count == null ? 0L : count;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.debug("alerting_rules gauge query failed: {}", e.getMessage());
|
log.debug("alerting_rules gauge query failed: {}", e.getMessage());
|
||||||
return 0.0;
|
return 0L;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private double countInstances(AlertState state) {
|
private static long countInstances(JdbcTemplate jdbc, AlertState state) {
|
||||||
try {
|
try {
|
||||||
Long count = jdbc.queryForObject(
|
Long count = jdbc.queryForObject(
|
||||||
"SELECT COUNT(*) FROM alert_instances WHERE state = ?::alert_state_enum",
|
"SELECT COUNT(*) FROM alert_instances WHERE state = ?::alert_state_enum",
|
||||||
Long.class, state.name());
|
Long.class, state.name());
|
||||||
return count == null ? 0.0 : count.doubleValue();
|
return count == null ? 0L : count;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.debug("alerting_instances gauge query failed: {}", e.getMessage());
|
log.debug("alerting_instances gauge query failed: {}", e.getMessage());
|
||||||
return 0.0;
|
return 0L;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lightweight TTL cache around a {@code Supplier<Long>}. Every call to
|
||||||
|
* {@link #getAsDouble()} either returns the cached value (if {@code clock.get()
|
||||||
|
* - lastRead < ttl}) or invokes the delegate and refreshes the cache.
|
||||||
|
*
|
||||||
|
* <p>Used to amortise Postgres queries behind Prometheus gauges over a
|
||||||
|
* 30-second TTL (see {@link AlertingMetrics#DEFAULT_GAUGE_TTL}).
|
||||||
|
*/
|
||||||
|
static final class TtlCache {
|
||||||
|
private final Supplier<Long> delegate;
|
||||||
|
private final Duration ttl;
|
||||||
|
private final Supplier<Instant> clock;
|
||||||
|
private volatile Instant lastRead = Instant.MIN;
|
||||||
|
private volatile long cached = 0L;
|
||||||
|
|
||||||
|
TtlCache(Supplier<Long> delegate, Duration ttl, Supplier<Instant> clock) {
|
||||||
|
this.delegate = delegate;
|
||||||
|
this.ttl = ttl;
|
||||||
|
this.clock = clock;
|
||||||
|
}
|
||||||
|
|
||||||
|
synchronized double getAsDouble() {
|
||||||
|
Instant now = clock.get();
|
||||||
|
if (lastRead == Instant.MIN || Duration.between(lastRead, now).compareTo(ttl) >= 0) {
|
||||||
|
cached = delegate.get();
|
||||||
|
lastRead = now;
|
||||||
|
}
|
||||||
|
return cached;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,111 @@
|
|||||||
|
package com.cameleer.server.app.alerting.metrics;
|
||||||
|
|
||||||
|
import com.cameleer.server.core.alerting.AlertState;
|
||||||
|
import io.micrometer.core.instrument.MeterRegistry;
|
||||||
|
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.time.Instant;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
|
import java.util.function.Supplier;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that {@link AlertingMetrics} caches gauge values for a configurable TTL,
|
||||||
|
* so that Prometheus scrapes do not cause one Postgres query per scrape.
|
||||||
|
*/
|
||||||
|
class AlertingMetricsCachingTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void gaugeSupplierIsCalledAtMostOncePerTtl() {
|
||||||
|
// The instances supplier is shared across every AlertState gauge, so each
|
||||||
|
// full gauge snapshot invokes it once per AlertState (one cache per state).
|
||||||
|
final int stateCount = AlertState.values().length;
|
||||||
|
|
||||||
|
AtomicInteger enabledRulesCalls = new AtomicInteger();
|
||||||
|
AtomicInteger disabledRulesCalls = new AtomicInteger();
|
||||||
|
AtomicInteger instancesCalls = new AtomicInteger();
|
||||||
|
AtomicReference<Instant> now = new AtomicReference<>(Instant.parse("2026-04-20T00:00:00Z"));
|
||||||
|
Supplier<Instant> clock = now::get;
|
||||||
|
|
||||||
|
MeterRegistry registry = new SimpleMeterRegistry();
|
||||||
|
|
||||||
|
Supplier<Long> enabledRulesSupplier = () -> { enabledRulesCalls.incrementAndGet(); return 7L; };
|
||||||
|
Supplier<Long> disabledRulesSupplier = () -> { disabledRulesCalls.incrementAndGet(); return 3L; };
|
||||||
|
Supplier<Long> instancesSupplier = () -> { instancesCalls.incrementAndGet(); return 5L; };
|
||||||
|
|
||||||
|
AlertingMetrics metrics = new AlertingMetrics(
|
||||||
|
registry,
|
||||||
|
enabledRulesSupplier,
|
||||||
|
disabledRulesSupplier,
|
||||||
|
instancesSupplier,
|
||||||
|
Duration.ofSeconds(30),
|
||||||
|
clock
|
||||||
|
);
|
||||||
|
|
||||||
|
// First scrape — each supplier invoked exactly once per gauge.
|
||||||
|
metrics.snapshotAllGauges();
|
||||||
|
assertThat(enabledRulesCalls.get()).isEqualTo(1);
|
||||||
|
assertThat(disabledRulesCalls.get()).isEqualTo(1);
|
||||||
|
assertThat(instancesCalls.get()).isEqualTo(stateCount);
|
||||||
|
|
||||||
|
// Second scrape within TTL — served from cache.
|
||||||
|
metrics.snapshotAllGauges();
|
||||||
|
assertThat(enabledRulesCalls.get()).isEqualTo(1);
|
||||||
|
assertThat(disabledRulesCalls.get()).isEqualTo(1);
|
||||||
|
assertThat(instancesCalls.get()).isEqualTo(stateCount);
|
||||||
|
|
||||||
|
// Third scrape still within TTL (29 s later) — still cached.
|
||||||
|
now.set(now.get().plusSeconds(29));
|
||||||
|
metrics.snapshotAllGauges();
|
||||||
|
assertThat(enabledRulesCalls.get()).isEqualTo(1);
|
||||||
|
assertThat(disabledRulesCalls.get()).isEqualTo(1);
|
||||||
|
assertThat(instancesCalls.get()).isEqualTo(stateCount);
|
||||||
|
|
||||||
|
// Advance past TTL — next scrape re-queries the delegate.
|
||||||
|
now.set(Instant.parse("2026-04-20T00:00:31Z"));
|
||||||
|
metrics.snapshotAllGauges();
|
||||||
|
assertThat(enabledRulesCalls.get()).isEqualTo(2);
|
||||||
|
assertThat(disabledRulesCalls.get()).isEqualTo(2);
|
||||||
|
assertThat(instancesCalls.get()).isEqualTo(stateCount * 2);
|
||||||
|
|
||||||
|
// Immediate follow-up — back in cache.
|
||||||
|
metrics.snapshotAllGauges();
|
||||||
|
assertThat(enabledRulesCalls.get()).isEqualTo(2);
|
||||||
|
assertThat(disabledRulesCalls.get()).isEqualTo(2);
|
||||||
|
assertThat(instancesCalls.get()).isEqualTo(stateCount * 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void gaugeValueReflectsCachedResult() {
|
||||||
|
AtomicReference<Long> enabledValue = new AtomicReference<>(10L);
|
||||||
|
AtomicReference<Instant> now = new AtomicReference<>(Instant.parse("2026-04-20T00:00:00Z"));
|
||||||
|
|
||||||
|
MeterRegistry registry = new SimpleMeterRegistry();
|
||||||
|
AlertingMetrics metrics = new AlertingMetrics(
|
||||||
|
registry,
|
||||||
|
enabledValue::get,
|
||||||
|
() -> 0L,
|
||||||
|
() -> 0L,
|
||||||
|
Duration.ofSeconds(30),
|
||||||
|
now::get
|
||||||
|
);
|
||||||
|
|
||||||
|
// Read once — value cached at 10.
|
||||||
|
metrics.snapshotAllGauges();
|
||||||
|
|
||||||
|
// Mutate the underlying supplier output; cache should shield it.
|
||||||
|
enabledValue.set(99L);
|
||||||
|
double cached = registry.find("alerting_rules_total").tag("state", "enabled").gauge().value();
|
||||||
|
assertThat(cached).isEqualTo(10.0);
|
||||||
|
|
||||||
|
// After TTL, new value surfaces.
|
||||||
|
now.set(now.get().plusSeconds(31));
|
||||||
|
metrics.snapshotAllGauges();
|
||||||
|
double refreshed = registry.find("alerting_rules_total").tag("state", "enabled").gauge().value();
|
||||||
|
assertThat(refreshed).isEqualTo(99.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user