perf(alerting): 30s TTL cache on AlertingMetrics gauge suppliers

Prometheus scrapes can fire every few seconds. The open-alerts / open-rules
gauges query Postgres on each read — caching the values for 30s amortises
that to one query per half-minute. Addresses final-review NIT from Plan 02.

- Introduces a package-private TtlCache that wraps a Supplier<Long> and
  memoises the last read for a configurable Duration against a Supplier<Instant>
  clock.
- Wraps each gauge supplier (alerting_rules_total{enabled|disabled},
  alerting_instances_total{state}) in its own TtlCache.
- Adds a test-friendly constructor (package-private) taking explicit
  Duration + Supplier<Instant> so AlertingMetricsCachingTest can advance
  a fake clock without waiting wall-clock time.
- Adds AlertingMetricsCachingTest covering:
  * supplier invoked once per TTL across repeated scrapes
  * 29 s elapsed → still cached; 31 s elapsed → re-queried
  * gauge value reflects the cached result even after delegate mutates

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-20 14:22:54 +02:00
parent 5ebc729b82
commit 9f109b20fd
2 changed files with 241 additions and 28 deletions

View File

@@ -0,0 +1,111 @@
package com.cameleer.server.app.alerting.metrics;
import com.cameleer.server.core.alerting.AlertState;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
import org.junit.jupiter.api.Test;
import java.time.Duration;
import java.time.Instant;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Supplier;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Verifies that {@link AlertingMetrics} caches gauge values for a configurable TTL,
 * so that Prometheus scrapes do not cause one Postgres query per scrape.
 *
 * <p>Time is controlled through a mutable fake clock handed to the test-friendly
 * constructor, so no test waits on wall-clock time.
 */
class AlertingMetricsCachingTest {

    /** Fixed starting instant of the fake clock used by every test. */
    private static final Instant START = Instant.parse("2026-04-20T00:00:00Z");

    /** Cache TTL passed to {@link AlertingMetrics} in every test. */
    private static final Duration TTL = Duration.ofSeconds(30);

    @Test
    void gaugeSupplierIsCalledAtMostOncePerTtl() {
        // The instances supplier is shared across every AlertState gauge, so each
        // full gauge snapshot invokes it once per AlertState (one cache per state).
        final int stateCount = AlertState.values().length;

        AtomicInteger enabledRulesCalls = new AtomicInteger();
        AtomicInteger disabledRulesCalls = new AtomicInteger();
        AtomicInteger instancesCalls = new AtomicInteger();

        AtomicReference<Instant> now = new AtomicReference<>(START);
        Supplier<Instant> clock = now::get;
        MeterRegistry registry = new SimpleMeterRegistry();

        // Each delegate counts its own invocations and returns a fixed value.
        Supplier<Long> enabledRulesSupplier = () -> {
            enabledRulesCalls.incrementAndGet();
            return 7L;
        };
        Supplier<Long> disabledRulesSupplier = () -> {
            disabledRulesCalls.incrementAndGet();
            return 3L;
        };
        Supplier<Long> instancesSupplier = () -> {
            instancesCalls.incrementAndGet();
            return 5L;
        };

        AlertingMetrics metrics = new AlertingMetrics(
                registry,
                enabledRulesSupplier,
                disabledRulesSupplier,
                instancesSupplier,
                TTL,
                clock
        );

        // First scrape — each supplier invoked exactly once per gauge.
        metrics.snapshotAllGauges();
        assertCallCounts(enabledRulesCalls, disabledRulesCalls, instancesCalls, 1, stateCount);

        // Second scrape within TTL — served from cache.
        metrics.snapshotAllGauges();
        assertCallCounts(enabledRulesCalls, disabledRulesCalls, instancesCalls, 1, stateCount);

        // Third scrape still within TTL (29 s after the first read) — still cached.
        now.set(START.plusSeconds(29));
        metrics.snapshotAllGauges();
        assertCallCounts(enabledRulesCalls, disabledRulesCalls, instancesCalls, 1, stateCount);

        // Advance past TTL (31 s after the first read) — next scrape re-queries the delegate.
        now.set(START.plusSeconds(31));
        metrics.snapshotAllGauges();
        assertCallCounts(enabledRulesCalls, disabledRulesCalls, instancesCalls, 2, stateCount * 2);

        // Immediate follow-up — back in cache.
        metrics.snapshotAllGauges();
        assertCallCounts(enabledRulesCalls, disabledRulesCalls, instancesCalls, 2, stateCount * 2);
    }

    @Test
    void gaugeValueReflectsCachedResult() {
        AtomicReference<Long> enabledValue = new AtomicReference<>(10L);
        AtomicReference<Instant> now = new AtomicReference<>(START);
        MeterRegistry registry = new SimpleMeterRegistry();

        AlertingMetrics metrics = new AlertingMetrics(
                registry,
                enabledValue::get,
                () -> 0L,
                () -> 0L,
                TTL,
                now::get
        );

        // Read once — value cached at 10.
        metrics.snapshotAllGauges();

        // Mutate the underlying supplier output; cache should shield it.
        enabledValue.set(99L);
        double cached = enabledRulesGaugeValue(registry);
        assertThat(cached).isEqualTo(10.0);

        // After TTL, new value surfaces.
        now.set(now.get().plusSeconds(31));
        metrics.snapshotAllGauges();
        double refreshed = enabledRulesGaugeValue(registry);
        assertThat(refreshed).isEqualTo(99.0);
    }

    /**
     * Asserts how many times each delegate supplier has been invoked so far.
     *
     * @param enabledRulesCalls      invocation counter of the enabled-rules supplier
     * @param disabledRulesCalls     invocation counter of the disabled-rules supplier
     * @param instancesCalls         invocation counter of the instances supplier
     * @param expectedRulesCalls     expected count for each of the two rules suppliers
     * @param expectedInstancesCalls expected count for the instances supplier
     */
    private static void assertCallCounts(
            AtomicInteger enabledRulesCalls,
            AtomicInteger disabledRulesCalls,
            AtomicInteger instancesCalls,
            int expectedRulesCalls,
            int expectedInstancesCalls) {
        assertThat(enabledRulesCalls.get())
                .as("enabled-rules supplier invocations")
                .isEqualTo(expectedRulesCalls);
        assertThat(disabledRulesCalls.get())
                .as("disabled-rules supplier invocations")
                .isEqualTo(expectedRulesCalls);
        assertThat(instancesCalls.get())
                .as("instances supplier invocations")
                .isEqualTo(expectedInstancesCalls);
    }

    /**
     * Reads the current value of the {@code alerting_rules_total{state="enabled"}} gauge.
     *
     * <p>Uses {@code registry.get(...)} rather than {@code registry.find(...)} so that a
     * missing gauge fails with a descriptive {@code MeterNotFoundException} instead of a
     * {@code NullPointerException} on the lookup result.
     *
     * @param registry the registry the gauge was registered with
     * @return the gauge's current value
     */
    private static double enabledRulesGaugeValue(MeterRegistry registry) {
        return registry.get("alerting_rules_total").tag("state", "enabled").gauge().value();
    }
}