feat(server): persist server self-metrics into ClickHouse

Snapshot the full Micrometer registry (cameleer business metrics, alerting
metrics, and Spring Boot Actuator defaults) every 60s into a new
server_metrics table so server health survives restarts without an external
Prometheus. Includes a dashboard-builder reference for the SaaS team.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-23 23:20:45 +02:00
parent 0bbe5d6623
commit 48ce75bf38
14 changed files with 913 additions and 1 deletions

View File

@@ -0,0 +1,130 @@
package com.cameleer.server.app.metrics;
import com.cameleer.server.core.storage.ServerMetricsStore;
import com.cameleer.server.core.storage.model.ServerMetricSample;
import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.Gauge;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Timer;
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
import org.junit.jupiter.api.Test;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Unit tests for {@code ServerMetricsSnapshotScheduler.snapshot()}.
 *
 * <p>Each test drives the scheduler against an in-memory {@link SimpleMeterRegistry}
 * and a hand-written {@link ServerMetricsStore}, so no Spring context or database
 * is involved.
 */
class ServerMetricsSnapshotSchedulerTest {

    /** One counter, one gauge and one timer must each show up in the single written batch. */
    @Test
    void snapshot_capturesCounterGaugeAndTimerMeasurements() {
        MeterRegistry meters = new SimpleMeterRegistry();

        Counter hits = Counter.builder("cameleer.test.counter")
                .tag("env", "dev")
                .register(meters);
        hits.increment(3);

        // Kept as a named local (rather than inlined into the builder call) on purpose.
        AtomicInteger gaugeBacking = new AtomicInteger(42);
        Gauge.builder("cameleer.test.gauge", gaugeBacking, AtomicInteger::doubleValue)
                .register(meters);

        Timer latency = Timer.builder("cameleer.test.timer").register(meters);
        latency.record(Duration.ofMillis(5));
        latency.record(Duration.ofMillis(15));

        RecordingStore sink = new RecordingStore();
        new ServerMetricsSnapshotScheduler(meters, sink, "tenant-7", "server-A").snapshot();

        assertThat(sink.batches).hasSize(1);
        List<ServerMetricSample> batch = sink.batches.get(0);

        // Tenant, instance, timestamp and a finite value are expected on every row.
        assertThat(batch).allSatisfy(sample -> assertStampedAndFinite(sample));

        // Counter: exactly one row, statistic "count", value 3, tag carried over.
        ServerMetricSample counterRow = singleRow(batch, "cameleer.test.counter");
        assertThat(counterRow.statistic()).isEqualTo("count");
        assertThat(counterRow.metricType()).isEqualTo("counter");
        assertThat(counterRow.value()).isEqualTo(3.0);
        assertThat(counterRow.tags()).containsEntry("env", "dev");

        // Gauge: exactly one row, statistic "value", current reading 42.
        ServerMetricSample gaugeRow = singleRow(batch, "cameleer.test.gauge");
        assertThat(gaugeRow.statistic()).isEqualTo("value");
        assertThat(gaugeRow.metricType()).isEqualTo("gauge");
        assertThat(gaugeRow.value()).isEqualTo(42.0);

        // Timer: several rows, one per statistic.
        List<ServerMetricSample> timerRows = rowsNamed(batch, "cameleer.test.timer");
        assertThat(timerRows).isNotEmpty();

        List<String> timerStatistics = timerRows.stream()
                .map(ServerMetricSample::statistic)
                .toList();
        assertThat(timerStatistics).contains("count", "max");
        // The total-time statistic is accepted under either spelling ("total" or
        // "total_time") so the test is not tied to one registry implementation.
        assertThat(timerStatistics).containsAnyOf("total_time", "total");

        assertThat(timerRows).allSatisfy(row ->
                assertThat(row.metricType()).isEqualTo("timer"));

        // Two recordings were made above, so the timer's count statistic must be 2.
        ServerMetricSample countRow = timerRows.stream()
                .filter(row -> row.statistic().equals("count"))
                .findFirst()
                .orElseThrow();
        assertThat(countRow.value()).isEqualTo(2.0);
    }

    /** With nothing registered, no batch may reach the store. */
    @Test
    void snapshot_withEmptyRegistry_doesNotWriteBatch() {
        // No meters are registered on this registry, so there is nothing to snapshot.
        MeterRegistry meters = new SimpleMeterRegistry();
        RecordingStore sink = new RecordingStore();

        new ServerMetricsSnapshotScheduler(meters, sink, "t", "s").snapshot();

        assertThat(sink.batches).isEmpty();
    }

    /** A failing store must not make {@code snapshot()} throw. */
    @Test
    void snapshot_swallowsStoreFailures() {
        MeterRegistry meters = new SimpleMeterRegistry();
        Counter.builder("cameleer.test").register(meters).increment();

        ServerMetricsStore failingStore = ignoredBatch -> {
            throw new RuntimeException("clickhouse down");
        };

        ServerMetricsSnapshotScheduler scheduler =
                new ServerMetricsSnapshotScheduler(meters, failingStore, "t", "s");

        // The test passes only if this call returns normally; a propagated exception
        // would otherwise kill the scheduler thread.
        scheduler.snapshot();
    }

    /** Checks the fields the first test expects the scheduler to stamp on every sample. */
    private static void assertStampedAndFinite(ServerMetricSample sample) {
        assertThat(sample.tenantId()).isEqualTo("tenant-7");
        assertThat(sample.serverInstanceId()).isEqualTo("server-A");
        assertThat(Double.isFinite(sample.value())).isTrue();
        assertThat(sample.collectedAt()).isNotNull();
    }

    /** Returns the samples whose metric name equals {@code metricName}, in batch order. */
    private static List<ServerMetricSample> rowsNamed(List<ServerMetricSample> batch,
                                                       String metricName) {
        return batch.stream()
                .filter(sample -> sample.metricName().equals(metricName))
                .toList();
    }

    /** Asserts that exactly one sample carries {@code metricName} and returns it. */
    private static ServerMetricSample singleRow(List<ServerMetricSample> batch,
                                                String metricName) {
        List<ServerMetricSample> matches = rowsNamed(batch, metricName);
        assertThat(matches).hasSize(1);
        return matches.get(0);
    }

    /** Store double that keeps an immutable copy of every batch it is handed. */
    private static final class RecordingStore implements ServerMetricsStore {

        final List<List<ServerMetricSample>> batches = new ArrayList<>();

        @Override
        public void insertBatch(List<ServerMetricSample> samples) {
            List<ServerMetricSample> frozen = List.copyOf(samples);
            batches.add(frozen);
        }
    }
}

View File

@@ -0,0 +1,117 @@
package com.cameleer.server.app.storage;
import com.cameleer.server.core.storage.model.ServerMetricSample;
import com.zaxxer.hikari.HikariDataSource;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.jdbc.core.JdbcTemplate;
import org.testcontainers.clickhouse.ClickHouseContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import java.time.Instant;
import java.util.List;
import java.util.Map;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Integration test for {@link ClickHouseServerMetricsStore} against a ClickHouse
 * server provided by Testcontainers.
 *
 * <p>Before each test a connection pool is opened, the {@code server_metrics} table
 * is created if missing and truncated; after each test the pool is closed again so
 * that no pool outlives the test method that created it.
 */
@Testcontainers
class ClickHouseServerMetricsStoreIT {

    @Container
    static final ClickHouseContainer clickhouse =
            new ClickHouseContainer("clickhouse/clickhouse-server:24.12");

    /** Pool backing {@link #jdbc}; held in a field so {@link #tearDown()} can close it. */
    private HikariDataSource dataSource;
    private JdbcTemplate jdbc;
    private ClickHouseServerMetricsStore store;

    /** Opens a pool to the container and resets {@code server_metrics} to an empty table. */
    @BeforeEach
    void setUp() {
        dataSource = new HikariDataSource();
        dataSource.setJdbcUrl(clickhouse.getJdbcUrl());
        dataSource.setUsername(clickhouse.getUsername());
        dataSource.setPassword(clickhouse.getPassword());
        jdbc = new JdbcTemplate(dataSource);
        jdbc.execute("""
                CREATE TABLE IF NOT EXISTS server_metrics (
                tenant_id LowCardinality(String) DEFAULT 'default',
                collected_at DateTime64(3),
                server_instance_id LowCardinality(String),
                metric_name LowCardinality(String),
                metric_type LowCardinality(String),
                statistic LowCardinality(String) DEFAULT 'value',
                metric_value Float64,
                tags Map(String, String) DEFAULT map(),
                server_received_at DateTime64(3) DEFAULT now64(3)
                )
                ENGINE = MergeTree()
                ORDER BY (tenant_id, collected_at, server_instance_id, metric_name, statistic)
                """);
        // The container is shared by all tests in this class, so start each one empty.
        jdbc.execute("TRUNCATE TABLE server_metrics");
        store = new ClickHouseServerMetricsStore(jdbc);
    }

    /** Closes the pool opened in {@link #setUp()}; a new one is created per test. */
    @AfterEach
    void tearDown() {
        // Null when setUp() failed before the pool was assigned.
        if (dataSource != null) {
            dataSource.close();
        }
    }

    /** Inserts two samples and reads back the row count, a value and a tag entry. */
    @Test
    void insertBatch_roundTripsAllColumns() {
        Instant ts = Instant.parse("2026-04-23T12:00:00Z");
        store.insertBatch(List.of(
                new ServerMetricSample("tenant-a", ts, "srv-1",
                        "cameleer.ingestion.drops", "counter", "count", 17.0,
                        Map.of("reason", "buffer_full")),
                new ServerMetricSample("tenant-a", ts, "srv-1",
                        "jvm.memory.used", "gauge", "value", 1_048_576.0,
                        Map.of("area", "heap", "id", "G1 Eden Space"))
        ));

        Integer count = jdbc.queryForObject(
                "SELECT count() FROM server_metrics WHERE tenant_id = 'tenant-a'",
                Integer.class);
        assertThat(count).isEqualTo(2);

        Double dropsValue = jdbc.queryForObject(
                """
                SELECT metric_value FROM server_metrics
                WHERE tenant_id = 'tenant-a'
                AND server_instance_id = 'srv-1'
                AND metric_name = 'cameleer.ingestion.drops'
                AND statistic = 'count'
                """,
                Double.class);
        assertThat(dropsValue).isEqualTo(17.0);

        String heapArea = jdbc.queryForObject(
                """
                SELECT tags['area'] FROM server_metrics
                WHERE tenant_id = 'tenant-a'
                AND metric_name = 'jvm.memory.used'
                """,
                String.class);
        assertThat(heapArea).isEqualTo("heap");
    }

    /** An empty batch must leave the table empty. */
    @Test
    void insertBatch_emptyList_doesNothing() {
        store.insertBatch(List.of());

        Integer count = jdbc.queryForObject(
                "SELECT count() FROM server_metrics", Integer.class);
        assertThat(count).isEqualTo(0);
    }

    /** A sample with {@code null} tags must be stored, and stored with an empty map. */
    @Test
    void insertBatch_nullTags_storesEmptyMap() {
        store.insertBatch(List.of(
                new ServerMetricSample("default", Instant.parse("2026-04-23T12:00:00Z"),
                        "srv-2", "process.cpu.usage", "gauge", "value", 0.12, null)
        ));

        Integer count = jdbc.queryForObject(
                "SELECT count() FROM server_metrics WHERE server_instance_id = 'srv-2'",
                Integer.class);
        assertThat(count).isEqualTo(1);

        // Verify the "empty map" half of the test name, not just that a row exists.
        Integer tagCount = jdbc.queryForObject(
                "SELECT length(mapKeys(tags)) FROM server_metrics WHERE server_instance_id = 'srv-2'",
                Integer.class);
        assertThat(tagCount).isEqualTo(0);
    }
}