feat(server): persist server self-metrics into ClickHouse

Snapshot the full Micrometer registry (cameleer business metrics, alerting
metrics, and Spring Boot Actuator defaults) every 60s into a new
server_metrics table so server health survives restarts without an external
Prometheus. Includes a dashboard-builder reference for the SaaS team.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-23 23:20:45 +02:00
parent 0bbe5d6623
commit 48ce75bf38
14 changed files with 913 additions and 1 deletions

View File

@@ -9,6 +9,7 @@ import com.cameleer.server.app.storage.ClickHouseRouteCatalogStore;
import com.cameleer.server.core.storage.RouteCatalogStore;
import com.cameleer.server.app.storage.ClickHouseMetricsQueryStore;
import com.cameleer.server.app.storage.ClickHouseMetricsStore;
import com.cameleer.server.app.storage.ClickHouseServerMetricsStore;
import com.cameleer.server.app.storage.ClickHouseStatsStore;
import com.cameleer.server.core.admin.AuditRepository;
import com.cameleer.server.core.admin.AuditService;
@@ -67,6 +68,12 @@ public class StorageBeanConfig {
return new ClickHouseMetricsQueryStore(tenantProperties.getId(), clickHouseJdbc);
}
@Bean
public ServerMetricsStore clickHouseServerMetricsStore(
@Qualifier("clickHouseJdbcTemplate") JdbcTemplate clickHouseJdbc) {
return new ClickHouseServerMetricsStore(clickHouseJdbc);
}
// ── Execution Store ──────────────────────────────────────────────────
@Bean

View File

@@ -0,0 +1,63 @@
package com.cameleer.server.app.metrics;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.UUID;
/**
* Resolves a stable identifier for this server process, used as the
* {@code server_instance_id} on every server_metrics sample. The value is
* fixed at boot, so counters restart cleanly whenever the id rotates.
*
* <p>Precedence:
* <ol>
* <li>{@code cameleer.server.instance-id} property / {@code CAMELEER_SERVER_INSTANCE_ID} env
* <li>{@code HOSTNAME} env (populated by Docker/Kubernetes)
* <li>{@link InetAddress#getLocalHost()} hostname
* <li>Random UUID (fallback — only hit when DNS and env are both silent)
* </ol>
*/
@Configuration
public class ServerInstanceIdConfig {
private static final Logger log = LoggerFactory.getLogger(ServerInstanceIdConfig.class);
@Bean("serverInstanceId")
public String serverInstanceId(
@Value("${cameleer.server.instance-id:}") String configuredId) {
if (!isBlank(configuredId)) {
log.info("Server instance id resolved from configuration: {}", configuredId);
return configuredId;
}
String hostnameEnv = System.getenv("HOSTNAME");
if (!isBlank(hostnameEnv)) {
log.info("Server instance id resolved from HOSTNAME env: {}", hostnameEnv);
return hostnameEnv;
}
try {
String localHost = InetAddress.getLocalHost().getHostName();
if (!isBlank(localHost)) {
log.info("Server instance id resolved from localhost lookup: {}", localHost);
return localHost;
}
} catch (UnknownHostException e) {
log.debug("InetAddress.getLocalHost() failed, falling back to UUID: {}", e.getMessage());
}
String fallback = UUID.randomUUID().toString();
log.warn("Server instance id could not be resolved; using random UUID {}", fallback);
return fallback;
}
private static boolean isBlank(String s) {
return s == null || s.isBlank();
}
}

View File

@@ -0,0 +1,106 @@
package com.cameleer.server.app.metrics;
import com.cameleer.server.core.storage.ServerMetricsStore;
import com.cameleer.server.core.storage.model.ServerMetricSample;
import io.micrometer.core.instrument.Measurement;
import io.micrometer.core.instrument.Meter;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Tag;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.time.Instant;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
* Periodically snapshots every meter in the server's {@link MeterRegistry}
* and writes the result to ClickHouse via {@link ServerMetricsStore}. This
* gives us historical server-health data (buffer depths, agent transitions,
* flush latency, JVM memory, HTTP response counts, etc.) without requiring
* an external Prometheus.
*
* <p>Each Micrometer {@link Meter#measure() measurement} becomes one row, so
* a single Timer produces rows for {@code count}, {@code total_time}, and
* {@code max} each tick. Counter values are cumulative since meter
* registration (Prometheus convention) — callers compute rate() themselves.
*
* <p>Disabled via {@code cameleer.server.self-metrics.enabled=false}.
*/
@Component
@ConditionalOnProperty(
prefix = "cameleer.server.self-metrics",
name = "enabled",
havingValue = "true",
matchIfMissing = true)
public class ServerMetricsSnapshotScheduler {
private static final Logger log = LoggerFactory.getLogger(ServerMetricsSnapshotScheduler.class);
private final MeterRegistry registry;
private final ServerMetricsStore store;
private final String tenantId;
private final String serverInstanceId;
public ServerMetricsSnapshotScheduler(
MeterRegistry registry,
ServerMetricsStore store,
@Value("${cameleer.server.tenant.id:default}") String tenantId,
@Qualifier("serverInstanceId") String serverInstanceId) {
this.registry = registry;
this.store = store;
this.tenantId = tenantId;
this.serverInstanceId = serverInstanceId;
}
@Scheduled(fixedDelayString = "${cameleer.server.self-metrics.interval-ms:60000}",
initialDelayString = "${cameleer.server.self-metrics.interval-ms:60000}")
public void snapshot() {
try {
Instant now = Instant.now();
List<ServerMetricSample> batch = new ArrayList<>();
for (Meter meter : registry.getMeters()) {
Meter.Id id = meter.getId();
Map<String, String> tags = flattenTags(id.getTagsAsIterable());
String type = id.getType().name().toLowerCase();
for (Measurement m : meter.measure()) {
double v = m.getValue();
if (!Double.isFinite(v)) continue;
batch.add(new ServerMetricSample(
tenantId,
now,
serverInstanceId,
id.getName(),
type,
m.getStatistic().getTagValueRepresentation(),
v,
tags));
}
}
if (!batch.isEmpty()) {
store.insertBatch(batch);
log.debug("Persisted {} server self-metric samples", batch.size());
}
} catch (Exception e) {
log.warn("Server self-metrics snapshot failed: {}", e.getMessage());
}
}
private static Map<String, String> flattenTags(Iterable<Tag> tags) {
Map<String, String> out = new LinkedHashMap<>();
for (Tag t : tags) {
out.put(t.getKey(), t.getValue());
}
return out;
}
}

View File

@@ -0,0 +1,46 @@
package com.cameleer.server.app.storage;
import com.cameleer.server.core.storage.ServerMetricsStore;
import com.cameleer.server.core.storage.model.ServerMetricSample;
import org.springframework.jdbc.core.JdbcTemplate;
import java.sql.Timestamp;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class ClickHouseServerMetricsStore implements ServerMetricsStore {
private final JdbcTemplate jdbc;
public ClickHouseServerMetricsStore(JdbcTemplate jdbc) {
this.jdbc = jdbc;
}
@Override
public void insertBatch(List<ServerMetricSample> samples) {
if (samples.isEmpty()) return;
jdbc.batchUpdate("""
INSERT INTO server_metrics
(tenant_id, collected_at, server_instance_id, metric_name,
metric_type, statistic, metric_value, tags)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""",
samples.stream().map(s -> new Object[]{
s.tenantId(),
Timestamp.from(s.collectedAt()),
s.serverInstanceId(),
s.metricName(),
s.metricType(),
s.statistic(),
s.value(),
tagsToClickHouseMap(s.tags())
}).toList());
}
private Map<String, String> tagsToClickHouseMap(Map<String, String> tags) {
if (tags == null || tags.isEmpty()) return new HashMap<>();
return new HashMap<>(tags);
}
}

View File

@@ -112,6 +112,10 @@ cameleer:
url: ${CAMELEER_SERVER_CLICKHOUSE_URL:jdbc:clickhouse://localhost:8123/cameleer}
username: ${CAMELEER_SERVER_CLICKHOUSE_USERNAME:default}
password: ${CAMELEER_SERVER_CLICKHOUSE_PASSWORD:}
self-metrics:
enabled: ${CAMELEER_SERVER_SELFMETRICS_ENABLED:true}
interval-ms: ${CAMELEER_SERVER_SELFMETRICS_INTERVALMS:60000}
instance-id: ${CAMELEER_SERVER_INSTANCE_ID:}
springdoc:
api-docs:

View File

@@ -401,6 +401,29 @@ CREATE TABLE IF NOT EXISTS route_catalog (
ENGINE = ReplacingMergeTree(last_seen)
ORDER BY (tenant_id, environment, application_id, route_id);
-- ── Server Self-Metrics ────────────────────────────────────────────────
-- Periodic snapshot of the server's own Micrometer registry (written by
-- ServerMetricsSnapshotScheduler). No `environment` column — the server
-- straddles environments. `statistic` distinguishes Timer/DistributionSummary
-- sub-measurements (count, total_time, max, mean) from plain counter/gauge values.
CREATE TABLE IF NOT EXISTS server_metrics (
tenant_id LowCardinality(String) DEFAULT 'default',
collected_at DateTime64(3),
server_instance_id LowCardinality(String),
metric_name LowCardinality(String),
metric_type LowCardinality(String),
statistic LowCardinality(String) DEFAULT 'value',
metric_value Float64,
tags Map(String, String) DEFAULT map(),
server_received_at DateTime64(3) DEFAULT now64(3)
)
ENGINE = MergeTree()
PARTITION BY (tenant_id, toYYYYMM(collected_at))
ORDER BY (tenant_id, collected_at, server_instance_id, metric_name, statistic)
TTL toDateTime(collected_at) + INTERVAL 90 DAY DELETE
SETTINGS index_granularity = 8192;
-- insert_id tiebreak for keyset pagination (fixes same-millisecond cursor collision).
-- IF NOT EXISTS on ADD COLUMN is idempotent. MATERIALIZE COLUMN is a background mutation,
-- effectively a no-op once all parts are already materialized.

View File

@@ -0,0 +1,130 @@
package com.cameleer.server.app.metrics;
import com.cameleer.server.core.storage.ServerMetricsStore;
import com.cameleer.server.core.storage.model.ServerMetricSample;
import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.Gauge;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Timer;
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
import org.junit.jupiter.api.Test;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import static org.assertj.core.api.Assertions.assertThat;
class ServerMetricsSnapshotSchedulerTest {
@Test
void snapshot_capturesCounterGaugeAndTimerMeasurements() {
MeterRegistry registry = new SimpleMeterRegistry();
Counter counter = Counter.builder("cameleer.test.counter")
.tag("env", "dev")
.register(registry);
counter.increment(3);
AtomicInteger gaugeSource = new AtomicInteger(42);
Gauge.builder("cameleer.test.gauge", gaugeSource, AtomicInteger::doubleValue)
.register(registry);
Timer timer = Timer.builder("cameleer.test.timer").register(registry);
timer.record(Duration.ofMillis(5));
timer.record(Duration.ofMillis(15));
RecordingStore store = new RecordingStore();
ServerMetricsSnapshotScheduler scheduler =
new ServerMetricsSnapshotScheduler(registry, store, "tenant-7", "server-A");
scheduler.snapshot();
assertThat(store.batches).hasSize(1);
List<ServerMetricSample> samples = store.batches.get(0);
// Every sample is stamped with tenant + instance + finite value
assertThat(samples).allSatisfy(s -> {
assertThat(s.tenantId()).isEqualTo("tenant-7");
assertThat(s.serverInstanceId()).isEqualTo("server-A");
assertThat(Double.isFinite(s.value())).isTrue();
assertThat(s.collectedAt()).isNotNull();
});
// Counter -> 1 row with statistic=count, value=3, tag propagated
List<ServerMetricSample> counterRows = samples.stream()
.filter(s -> s.metricName().equals("cameleer.test.counter"))
.toList();
assertThat(counterRows).hasSize(1);
assertThat(counterRows.get(0).statistic()).isEqualTo("count");
assertThat(counterRows.get(0).metricType()).isEqualTo("counter");
assertThat(counterRows.get(0).value()).isEqualTo(3.0);
assertThat(counterRows.get(0).tags()).containsEntry("env", "dev");
// Gauge -> 1 row with statistic=value
List<ServerMetricSample> gaugeRows = samples.stream()
.filter(s -> s.metricName().equals("cameleer.test.gauge"))
.toList();
assertThat(gaugeRows).hasSize(1);
assertThat(gaugeRows.get(0).statistic()).isEqualTo("value");
assertThat(gaugeRows.get(0).metricType()).isEqualTo("gauge");
assertThat(gaugeRows.get(0).value()).isEqualTo(42.0);
// Timer -> emits multiple statistics (count, total_time, max)
List<ServerMetricSample> timerRows = samples.stream()
.filter(s -> s.metricName().equals("cameleer.test.timer"))
.toList();
assertThat(timerRows).isNotEmpty();
// SimpleMeterRegistry emits Statistic.TOTAL ("total"); other registries (Prometheus)
// emit TOTAL_TIME ("total_time"). Accept either so the test isn't registry-coupled.
assertThat(timerRows).extracting(ServerMetricSample::statistic)
.contains("count", "max");
assertThat(timerRows).extracting(ServerMetricSample::statistic)
.containsAnyOf("total_time", "total");
assertThat(timerRows).allSatisfy(s ->
assertThat(s.metricType()).isEqualTo("timer"));
ServerMetricSample count = timerRows.stream()
.filter(s -> s.statistic().equals("count"))
.findFirst().orElseThrow();
assertThat(count.value()).isEqualTo(2.0);
}
@Test
void snapshot_withEmptyRegistry_doesNotWriteBatch() {
MeterRegistry registry = new SimpleMeterRegistry();
// Force removal of any auto-registered meters (SimpleMeterRegistry has none by default).
RecordingStore store = new RecordingStore();
ServerMetricsSnapshotScheduler scheduler =
new ServerMetricsSnapshotScheduler(registry, store, "t", "s");
scheduler.snapshot();
assertThat(store.batches).isEmpty();
}
@Test
void snapshot_swallowsStoreFailures() {
MeterRegistry registry = new SimpleMeterRegistry();
Counter.builder("cameleer.test").register(registry).increment();
ServerMetricsStore throwingStore = batch -> {
throw new RuntimeException("clickhouse down");
};
ServerMetricsSnapshotScheduler scheduler =
new ServerMetricsSnapshotScheduler(registry, throwingStore, "t", "s");
// Must not propagate — the scheduler thread would otherwise die.
scheduler.snapshot();
}
private static final class RecordingStore implements ServerMetricsStore {
final List<List<ServerMetricSample>> batches = new ArrayList<>();
@Override
public void insertBatch(List<ServerMetricSample> samples) {
batches.add(List.copyOf(samples));
}
}
}

View File

@@ -0,0 +1,117 @@
package com.cameleer.server.app.storage;
import com.cameleer.server.core.storage.model.ServerMetricSample;
import com.zaxxer.hikari.HikariDataSource;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.jdbc.core.JdbcTemplate;
import org.testcontainers.clickhouse.ClickHouseContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import java.time.Instant;
import java.util.List;
import java.util.Map;
import static org.assertj.core.api.Assertions.assertThat;
@Testcontainers
class ClickHouseServerMetricsStoreIT {
@Container
static final ClickHouseContainer clickhouse =
new ClickHouseContainer("clickhouse/clickhouse-server:24.12");
private JdbcTemplate jdbc;
private ClickHouseServerMetricsStore store;
@BeforeEach
void setUp() {
HikariDataSource ds = new HikariDataSource();
ds.setJdbcUrl(clickhouse.getJdbcUrl());
ds.setUsername(clickhouse.getUsername());
ds.setPassword(clickhouse.getPassword());
jdbc = new JdbcTemplate(ds);
jdbc.execute("""
CREATE TABLE IF NOT EXISTS server_metrics (
tenant_id LowCardinality(String) DEFAULT 'default',
collected_at DateTime64(3),
server_instance_id LowCardinality(String),
metric_name LowCardinality(String),
metric_type LowCardinality(String),
statistic LowCardinality(String) DEFAULT 'value',
metric_value Float64,
tags Map(String, String) DEFAULT map(),
server_received_at DateTime64(3) DEFAULT now64(3)
)
ENGINE = MergeTree()
ORDER BY (tenant_id, collected_at, server_instance_id, metric_name, statistic)
""");
jdbc.execute("TRUNCATE TABLE server_metrics");
store = new ClickHouseServerMetricsStore(jdbc);
}
@Test
void insertBatch_roundTripsAllColumns() {
Instant ts = Instant.parse("2026-04-23T12:00:00Z");
store.insertBatch(List.of(
new ServerMetricSample("tenant-a", ts, "srv-1",
"cameleer.ingestion.drops", "counter", "count", 17.0,
Map.of("reason", "buffer_full")),
new ServerMetricSample("tenant-a", ts, "srv-1",
"jvm.memory.used", "gauge", "value", 1_048_576.0,
Map.of("area", "heap", "id", "G1 Eden Space"))
));
Integer count = jdbc.queryForObject(
"SELECT count() FROM server_metrics WHERE tenant_id = 'tenant-a'",
Integer.class);
assertThat(count).isEqualTo(2);
Double dropsValue = jdbc.queryForObject(
"""
SELECT metric_value FROM server_metrics
WHERE tenant_id = 'tenant-a'
AND server_instance_id = 'srv-1'
AND metric_name = 'cameleer.ingestion.drops'
AND statistic = 'count'
""",
Double.class);
assertThat(dropsValue).isEqualTo(17.0);
String heapArea = jdbc.queryForObject(
"""
SELECT tags['area'] FROM server_metrics
WHERE tenant_id = 'tenant-a'
AND metric_name = 'jvm.memory.used'
""",
String.class);
assertThat(heapArea).isEqualTo("heap");
}
@Test
void insertBatch_emptyList_doesNothing() {
store.insertBatch(List.of());
Integer count = jdbc.queryForObject(
"SELECT count() FROM server_metrics", Integer.class);
assertThat(count).isEqualTo(0);
}
@Test
void insertBatch_nullTags_storesEmptyMap() {
store.insertBatch(List.of(
new ServerMetricSample("default", Instant.parse("2026-04-23T12:00:00Z"),
"srv-2", "process.cpu.usage", "gauge", "value", 0.12, null)
));
Integer count = jdbc.queryForObject(
"SELECT count() FROM server_metrics WHERE server_instance_id = 'srv-2'",
Integer.class);
assertThat(count).isEqualTo(1);
}
}