perf(alerting): 30s TTL cache on AlertingMetrics gauge suppliers
Prometheus scrapes can fire every few seconds. The open-alerts / open-rules
gauges query Postgres on each read — caching each gauge's value for 30 s amortises
that to at most one query per gauge per half-minute. Addresses final-review NIT from Plan 02.
- Introduces a package-private TtlCache that wraps a Supplier<Long> and
memoises the last read for a configurable Duration against a Supplier<Instant>
clock.
- Wraps each gauge supplier (alerting_rules_total{enabled|disabled},
alerting_instances_total{state}) in its own TtlCache.
- Adds a test-friendly constructor (package-private) taking explicit
Duration + Supplier<Instant> so AlertingMetricsCachingTest can advance
a fake clock without waiting wall-clock time.
- Adds AlertingMetricsCachingTest covering:
* supplier invoked once per TTL across repeated scrapes
* 29 s elapsed → still cached; 31 s elapsed → re-queried
* gauge value reflects the cached result even after delegate mutates
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,111 @@
|
||||
package com.cameleer.server.app.alerting.metrics;
|
||||
|
||||
import com.cameleer.server.core.alerting.AlertState;
|
||||
import io.micrometer.core.instrument.MeterRegistry;
|
||||
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
/**
|
||||
* Verifies that {@link AlertingMetrics} caches gauge values for a configurable TTL,
|
||||
* so that Prometheus scrapes do not cause one Postgres query per scrape.
|
||||
*/
|
||||
class AlertingMetricsCachingTest {
|
||||
|
||||
@Test
|
||||
void gaugeSupplierIsCalledAtMostOncePerTtl() {
|
||||
// The instances supplier is shared across every AlertState gauge, so each
|
||||
// full gauge snapshot invokes it once per AlertState (one cache per state).
|
||||
final int stateCount = AlertState.values().length;
|
||||
|
||||
AtomicInteger enabledRulesCalls = new AtomicInteger();
|
||||
AtomicInteger disabledRulesCalls = new AtomicInteger();
|
||||
AtomicInteger instancesCalls = new AtomicInteger();
|
||||
AtomicReference<Instant> now = new AtomicReference<>(Instant.parse("2026-04-20T00:00:00Z"));
|
||||
Supplier<Instant> clock = now::get;
|
||||
|
||||
MeterRegistry registry = new SimpleMeterRegistry();
|
||||
|
||||
Supplier<Long> enabledRulesSupplier = () -> { enabledRulesCalls.incrementAndGet(); return 7L; };
|
||||
Supplier<Long> disabledRulesSupplier = () -> { disabledRulesCalls.incrementAndGet(); return 3L; };
|
||||
Supplier<Long> instancesSupplier = () -> { instancesCalls.incrementAndGet(); return 5L; };
|
||||
|
||||
AlertingMetrics metrics = new AlertingMetrics(
|
||||
registry,
|
||||
enabledRulesSupplier,
|
||||
disabledRulesSupplier,
|
||||
instancesSupplier,
|
||||
Duration.ofSeconds(30),
|
||||
clock
|
||||
);
|
||||
|
||||
// First scrape — each supplier invoked exactly once per gauge.
|
||||
metrics.snapshotAllGauges();
|
||||
assertThat(enabledRulesCalls.get()).isEqualTo(1);
|
||||
assertThat(disabledRulesCalls.get()).isEqualTo(1);
|
||||
assertThat(instancesCalls.get()).isEqualTo(stateCount);
|
||||
|
||||
// Second scrape within TTL — served from cache.
|
||||
metrics.snapshotAllGauges();
|
||||
assertThat(enabledRulesCalls.get()).isEqualTo(1);
|
||||
assertThat(disabledRulesCalls.get()).isEqualTo(1);
|
||||
assertThat(instancesCalls.get()).isEqualTo(stateCount);
|
||||
|
||||
// Third scrape still within TTL (29 s later) — still cached.
|
||||
now.set(now.get().plusSeconds(29));
|
||||
metrics.snapshotAllGauges();
|
||||
assertThat(enabledRulesCalls.get()).isEqualTo(1);
|
||||
assertThat(disabledRulesCalls.get()).isEqualTo(1);
|
||||
assertThat(instancesCalls.get()).isEqualTo(stateCount);
|
||||
|
||||
// Advance past TTL — next scrape re-queries the delegate.
|
||||
now.set(Instant.parse("2026-04-20T00:00:31Z"));
|
||||
metrics.snapshotAllGauges();
|
||||
assertThat(enabledRulesCalls.get()).isEqualTo(2);
|
||||
assertThat(disabledRulesCalls.get()).isEqualTo(2);
|
||||
assertThat(instancesCalls.get()).isEqualTo(stateCount * 2);
|
||||
|
||||
// Immediate follow-up — back in cache.
|
||||
metrics.snapshotAllGauges();
|
||||
assertThat(enabledRulesCalls.get()).isEqualTo(2);
|
||||
assertThat(disabledRulesCalls.get()).isEqualTo(2);
|
||||
assertThat(instancesCalls.get()).isEqualTo(stateCount * 2);
|
||||
}
|
||||
|
||||
@Test
|
||||
void gaugeValueReflectsCachedResult() {
|
||||
AtomicReference<Long> enabledValue = new AtomicReference<>(10L);
|
||||
AtomicReference<Instant> now = new AtomicReference<>(Instant.parse("2026-04-20T00:00:00Z"));
|
||||
|
||||
MeterRegistry registry = new SimpleMeterRegistry();
|
||||
AlertingMetrics metrics = new AlertingMetrics(
|
||||
registry,
|
||||
enabledValue::get,
|
||||
() -> 0L,
|
||||
() -> 0L,
|
||||
Duration.ofSeconds(30),
|
||||
now::get
|
||||
);
|
||||
|
||||
// Read once — value cached at 10.
|
||||
metrics.snapshotAllGauges();
|
||||
|
||||
// Mutate the underlying supplier output; cache should shield it.
|
||||
enabledValue.set(99L);
|
||||
double cached = registry.find("alerting_rules_total").tag("state", "enabled").gauge().value();
|
||||
assertThat(cached).isEqualTo(10.0);
|
||||
|
||||
// After TTL, new value surfaces.
|
||||
now.set(now.get().plusSeconds(31));
|
||||
metrics.snapshotAllGauges();
|
||||
double refreshed = registry.find("alerting_rules_total").tag("state", "enabled").gauge().value();
|
||||
assertThat(refreshed).isEqualTo(99.0);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user