feat(server): persist server self-metrics into ClickHouse
Snapshot the full Micrometer registry (cameleer business metrics, alerting metrics, and Spring Boot Actuator defaults) every 60s into a new server_metrics table so server health survives restarts without an external Prometheus. Includes a dashboard-builder reference for the SaaS team.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,130 @@
|
||||
package com.cameleer.server.app.metrics;
|
||||
|
||||
import com.cameleer.server.core.storage.ServerMetricsStore;
|
||||
import com.cameleer.server.core.storage.model.ServerMetricSample;
|
||||
import io.micrometer.core.instrument.Counter;
|
||||
import io.micrometer.core.instrument.Gauge;
|
||||
import io.micrometer.core.instrument.MeterRegistry;
|
||||
import io.micrometer.core.instrument.Timer;
|
||||
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
class ServerMetricsSnapshotSchedulerTest {
|
||||
|
||||
@Test
|
||||
void snapshot_capturesCounterGaugeAndTimerMeasurements() {
|
||||
MeterRegistry registry = new SimpleMeterRegistry();
|
||||
|
||||
Counter counter = Counter.builder("cameleer.test.counter")
|
||||
.tag("env", "dev")
|
||||
.register(registry);
|
||||
counter.increment(3);
|
||||
|
||||
AtomicInteger gaugeSource = new AtomicInteger(42);
|
||||
Gauge.builder("cameleer.test.gauge", gaugeSource, AtomicInteger::doubleValue)
|
||||
.register(registry);
|
||||
|
||||
Timer timer = Timer.builder("cameleer.test.timer").register(registry);
|
||||
timer.record(Duration.ofMillis(5));
|
||||
timer.record(Duration.ofMillis(15));
|
||||
|
||||
RecordingStore store = new RecordingStore();
|
||||
ServerMetricsSnapshotScheduler scheduler =
|
||||
new ServerMetricsSnapshotScheduler(registry, store, "tenant-7", "server-A");
|
||||
|
||||
scheduler.snapshot();
|
||||
|
||||
assertThat(store.batches).hasSize(1);
|
||||
List<ServerMetricSample> samples = store.batches.get(0);
|
||||
|
||||
// Every sample is stamped with tenant + instance + finite value
|
||||
assertThat(samples).allSatisfy(s -> {
|
||||
assertThat(s.tenantId()).isEqualTo("tenant-7");
|
||||
assertThat(s.serverInstanceId()).isEqualTo("server-A");
|
||||
assertThat(Double.isFinite(s.value())).isTrue();
|
||||
assertThat(s.collectedAt()).isNotNull();
|
||||
});
|
||||
|
||||
// Counter -> 1 row with statistic=count, value=3, tag propagated
|
||||
List<ServerMetricSample> counterRows = samples.stream()
|
||||
.filter(s -> s.metricName().equals("cameleer.test.counter"))
|
||||
.toList();
|
||||
assertThat(counterRows).hasSize(1);
|
||||
assertThat(counterRows.get(0).statistic()).isEqualTo("count");
|
||||
assertThat(counterRows.get(0).metricType()).isEqualTo("counter");
|
||||
assertThat(counterRows.get(0).value()).isEqualTo(3.0);
|
||||
assertThat(counterRows.get(0).tags()).containsEntry("env", "dev");
|
||||
|
||||
// Gauge -> 1 row with statistic=value
|
||||
List<ServerMetricSample> gaugeRows = samples.stream()
|
||||
.filter(s -> s.metricName().equals("cameleer.test.gauge"))
|
||||
.toList();
|
||||
assertThat(gaugeRows).hasSize(1);
|
||||
assertThat(gaugeRows.get(0).statistic()).isEqualTo("value");
|
||||
assertThat(gaugeRows.get(0).metricType()).isEqualTo("gauge");
|
||||
assertThat(gaugeRows.get(0).value()).isEqualTo(42.0);
|
||||
|
||||
// Timer -> emits multiple statistics (count, total_time, max)
|
||||
List<ServerMetricSample> timerRows = samples.stream()
|
||||
.filter(s -> s.metricName().equals("cameleer.test.timer"))
|
||||
.toList();
|
||||
assertThat(timerRows).isNotEmpty();
|
||||
// SimpleMeterRegistry emits Statistic.TOTAL ("total"); other registries (Prometheus)
|
||||
// emit TOTAL_TIME ("total_time"). Accept either so the test isn't registry-coupled.
|
||||
assertThat(timerRows).extracting(ServerMetricSample::statistic)
|
||||
.contains("count", "max");
|
||||
assertThat(timerRows).extracting(ServerMetricSample::statistic)
|
||||
.containsAnyOf("total_time", "total");
|
||||
assertThat(timerRows).allSatisfy(s ->
|
||||
assertThat(s.metricType()).isEqualTo("timer"));
|
||||
ServerMetricSample count = timerRows.stream()
|
||||
.filter(s -> s.statistic().equals("count"))
|
||||
.findFirst().orElseThrow();
|
||||
assertThat(count.value()).isEqualTo(2.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
void snapshot_withEmptyRegistry_doesNotWriteBatch() {
|
||||
MeterRegistry registry = new SimpleMeterRegistry();
|
||||
// Force removal of any auto-registered meters (SimpleMeterRegistry has none by default).
|
||||
RecordingStore store = new RecordingStore();
|
||||
ServerMetricsSnapshotScheduler scheduler =
|
||||
new ServerMetricsSnapshotScheduler(registry, store, "t", "s");
|
||||
|
||||
scheduler.snapshot();
|
||||
|
||||
assertThat(store.batches).isEmpty();
|
||||
}
|
||||
|
||||
@Test
|
||||
void snapshot_swallowsStoreFailures() {
|
||||
MeterRegistry registry = new SimpleMeterRegistry();
|
||||
Counter.builder("cameleer.test").register(registry).increment();
|
||||
|
||||
ServerMetricsStore throwingStore = batch -> {
|
||||
throw new RuntimeException("clickhouse down");
|
||||
};
|
||||
|
||||
ServerMetricsSnapshotScheduler scheduler =
|
||||
new ServerMetricsSnapshotScheduler(registry, throwingStore, "t", "s");
|
||||
|
||||
// Must not propagate — the scheduler thread would otherwise die.
|
||||
scheduler.snapshot();
|
||||
}
|
||||
|
||||
private static final class RecordingStore implements ServerMetricsStore {
|
||||
final List<List<ServerMetricSample>> batches = new ArrayList<>();
|
||||
|
||||
@Override
|
||||
public void insertBatch(List<ServerMetricSample> samples) {
|
||||
batches.add(List.copyOf(samples));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,117 @@
|
||||
package com.cameleer.server.app.storage;
|
||||
|
||||
import com.cameleer.server.core.storage.model.ServerMetricSample;
import com.zaxxer.hikari.HikariDataSource;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.jdbc.core.JdbcTemplate;
import org.testcontainers.clickhouse.ClickHouseContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;

import java.time.Instant;
import java.util.List;
import java.util.Map;

import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
@Testcontainers
|
||||
class ClickHouseServerMetricsStoreIT {
|
||||
|
||||
@Container
|
||||
static final ClickHouseContainer clickhouse =
|
||||
new ClickHouseContainer("clickhouse/clickhouse-server:24.12");
|
||||
|
||||
private JdbcTemplate jdbc;
|
||||
private ClickHouseServerMetricsStore store;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() {
|
||||
HikariDataSource ds = new HikariDataSource();
|
||||
ds.setJdbcUrl(clickhouse.getJdbcUrl());
|
||||
ds.setUsername(clickhouse.getUsername());
|
||||
ds.setPassword(clickhouse.getPassword());
|
||||
|
||||
jdbc = new JdbcTemplate(ds);
|
||||
|
||||
jdbc.execute("""
|
||||
CREATE TABLE IF NOT EXISTS server_metrics (
|
||||
tenant_id LowCardinality(String) DEFAULT 'default',
|
||||
collected_at DateTime64(3),
|
||||
server_instance_id LowCardinality(String),
|
||||
metric_name LowCardinality(String),
|
||||
metric_type LowCardinality(String),
|
||||
statistic LowCardinality(String) DEFAULT 'value',
|
||||
metric_value Float64,
|
||||
tags Map(String, String) DEFAULT map(),
|
||||
server_received_at DateTime64(3) DEFAULT now64(3)
|
||||
)
|
||||
ENGINE = MergeTree()
|
||||
ORDER BY (tenant_id, collected_at, server_instance_id, metric_name, statistic)
|
||||
""");
|
||||
|
||||
jdbc.execute("TRUNCATE TABLE server_metrics");
|
||||
|
||||
store = new ClickHouseServerMetricsStore(jdbc);
|
||||
}
|
||||
|
||||
@Test
|
||||
void insertBatch_roundTripsAllColumns() {
|
||||
Instant ts = Instant.parse("2026-04-23T12:00:00Z");
|
||||
store.insertBatch(List.of(
|
||||
new ServerMetricSample("tenant-a", ts, "srv-1",
|
||||
"cameleer.ingestion.drops", "counter", "count", 17.0,
|
||||
Map.of("reason", "buffer_full")),
|
||||
new ServerMetricSample("tenant-a", ts, "srv-1",
|
||||
"jvm.memory.used", "gauge", "value", 1_048_576.0,
|
||||
Map.of("area", "heap", "id", "G1 Eden Space"))
|
||||
));
|
||||
|
||||
Integer count = jdbc.queryForObject(
|
||||
"SELECT count() FROM server_metrics WHERE tenant_id = 'tenant-a'",
|
||||
Integer.class);
|
||||
assertThat(count).isEqualTo(2);
|
||||
|
||||
Double dropsValue = jdbc.queryForObject(
|
||||
"""
|
||||
SELECT metric_value FROM server_metrics
|
||||
WHERE tenant_id = 'tenant-a'
|
||||
AND server_instance_id = 'srv-1'
|
||||
AND metric_name = 'cameleer.ingestion.drops'
|
||||
AND statistic = 'count'
|
||||
""",
|
||||
Double.class);
|
||||
assertThat(dropsValue).isEqualTo(17.0);
|
||||
|
||||
String heapArea = jdbc.queryForObject(
|
||||
"""
|
||||
SELECT tags['area'] FROM server_metrics
|
||||
WHERE tenant_id = 'tenant-a'
|
||||
AND metric_name = 'jvm.memory.used'
|
||||
""",
|
||||
String.class);
|
||||
assertThat(heapArea).isEqualTo("heap");
|
||||
}
|
||||
|
||||
@Test
|
||||
void insertBatch_emptyList_doesNothing() {
|
||||
store.insertBatch(List.of());
|
||||
|
||||
Integer count = jdbc.queryForObject(
|
||||
"SELECT count() FROM server_metrics", Integer.class);
|
||||
assertThat(count).isEqualTo(0);
|
||||
}
|
||||
|
||||
@Test
|
||||
void insertBatch_nullTags_storesEmptyMap() {
|
||||
store.insertBatch(List.of(
|
||||
new ServerMetricSample("default", Instant.parse("2026-04-23T12:00:00Z"),
|
||||
"srv-2", "process.cpu.usage", "gauge", "value", 0.12, null)
|
||||
));
|
||||
|
||||
Integer count = jdbc.queryForObject(
|
||||
"SELECT count() FROM server_metrics WHERE server_instance_id = 'srv-2'",
|
||||
Integer.class);
|
||||
assertThat(count).isEqualTo(1);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user