feat(server): persist server self-metrics into ClickHouse
Snapshot the full Micrometer registry (cameleer business metrics, alerting metrics, and Spring Boot Actuator defaults) every 60s into a new server_metrics table so server health survives restarts without an external Prometheus. Includes a dashboard-builder reference for the SaaS team. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -9,6 +9,7 @@ import com.cameleer.server.app.storage.ClickHouseRouteCatalogStore;
|
||||
import com.cameleer.server.core.storage.RouteCatalogStore;
|
||||
import com.cameleer.server.app.storage.ClickHouseMetricsQueryStore;
|
||||
import com.cameleer.server.app.storage.ClickHouseMetricsStore;
|
||||
import com.cameleer.server.app.storage.ClickHouseServerMetricsStore;
|
||||
import com.cameleer.server.app.storage.ClickHouseStatsStore;
|
||||
import com.cameleer.server.core.admin.AuditRepository;
|
||||
import com.cameleer.server.core.admin.AuditService;
|
||||
@@ -67,6 +68,12 @@ public class StorageBeanConfig {
|
||||
return new ClickHouseMetricsQueryStore(tenantProperties.getId(), clickHouseJdbc);
|
||||
}
|
||||
|
||||
@Bean
|
||||
public ServerMetricsStore clickHouseServerMetricsStore(
|
||||
@Qualifier("clickHouseJdbcTemplate") JdbcTemplate clickHouseJdbc) {
|
||||
return new ClickHouseServerMetricsStore(clickHouseJdbc);
|
||||
}
|
||||
|
||||
// ── Execution Store ──────────────────────────────────────────────────
|
||||
|
||||
@Bean
|
||||
|
||||
@@ -0,0 +1,63 @@
|
||||
package com.cameleer.server.app.metrics;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import java.net.InetAddress;
|
||||
import java.net.UnknownHostException;
|
||||
import java.util.UUID;
|
||||
|
||||
/**
|
||||
* Resolves a stable identifier for this server process, used as the
|
||||
* {@code server_instance_id} on every server_metrics sample. The value is
|
||||
* fixed at boot, so counters restart cleanly whenever the id rotates.
|
||||
*
|
||||
* <p>Precedence:
|
||||
* <ol>
|
||||
* <li>{@code cameleer.server.instance-id} property / {@code CAMELEER_SERVER_INSTANCE_ID} env
|
||||
* <li>{@code HOSTNAME} env (populated by Docker/Kubernetes)
|
||||
* <li>{@link InetAddress#getLocalHost()} hostname
|
||||
* <li>Random UUID (fallback — only hit when DNS and env are both silent)
|
||||
* </ol>
|
||||
*/
|
||||
@Configuration
|
||||
public class ServerInstanceIdConfig {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(ServerInstanceIdConfig.class);
|
||||
|
||||
@Bean("serverInstanceId")
|
||||
public String serverInstanceId(
|
||||
@Value("${cameleer.server.instance-id:}") String configuredId) {
|
||||
if (!isBlank(configuredId)) {
|
||||
log.info("Server instance id resolved from configuration: {}", configuredId);
|
||||
return configuredId;
|
||||
}
|
||||
|
||||
String hostnameEnv = System.getenv("HOSTNAME");
|
||||
if (!isBlank(hostnameEnv)) {
|
||||
log.info("Server instance id resolved from HOSTNAME env: {}", hostnameEnv);
|
||||
return hostnameEnv;
|
||||
}
|
||||
|
||||
try {
|
||||
String localHost = InetAddress.getLocalHost().getHostName();
|
||||
if (!isBlank(localHost)) {
|
||||
log.info("Server instance id resolved from localhost lookup: {}", localHost);
|
||||
return localHost;
|
||||
}
|
||||
} catch (UnknownHostException e) {
|
||||
log.debug("InetAddress.getLocalHost() failed, falling back to UUID: {}", e.getMessage());
|
||||
}
|
||||
|
||||
String fallback = UUID.randomUUID().toString();
|
||||
log.warn("Server instance id could not be resolved; using random UUID {}", fallback);
|
||||
return fallback;
|
||||
}
|
||||
|
||||
private static boolean isBlank(String s) {
|
||||
return s == null || s.isBlank();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,106 @@
|
||||
package com.cameleer.server.app.metrics;
|
||||
|
||||
import com.cameleer.server.core.storage.ServerMetricsStore;
|
||||
import com.cameleer.server.core.storage.model.ServerMetricSample;
|
||||
import io.micrometer.core.instrument.Measurement;
|
||||
import io.micrometer.core.instrument.Meter;
|
||||
import io.micrometer.core.instrument.MeterRegistry;
|
||||
import io.micrometer.core.instrument.Tag;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Qualifier;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.scheduling.annotation.Scheduled;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Periodically snapshots every meter in the server's {@link MeterRegistry}
|
||||
* and writes the result to ClickHouse via {@link ServerMetricsStore}. This
|
||||
* gives us historical server-health data (buffer depths, agent transitions,
|
||||
* flush latency, JVM memory, HTTP response counts, etc.) without requiring
|
||||
* an external Prometheus.
|
||||
*
|
||||
* <p>Each Micrometer {@link Meter#measure() measurement} becomes one row, so
|
||||
* a single Timer produces rows for {@code count}, {@code total_time}, and
|
||||
* {@code max} each tick. Counter values are cumulative since meter
|
||||
* registration (Prometheus convention) — callers compute rate() themselves.
|
||||
*
|
||||
* <p>Disabled via {@code cameleer.server.self-metrics.enabled=false}.
|
||||
*/
|
||||
@Component
|
||||
@ConditionalOnProperty(
|
||||
prefix = "cameleer.server.self-metrics",
|
||||
name = "enabled",
|
||||
havingValue = "true",
|
||||
matchIfMissing = true)
|
||||
public class ServerMetricsSnapshotScheduler {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(ServerMetricsSnapshotScheduler.class);
|
||||
|
||||
private final MeterRegistry registry;
|
||||
private final ServerMetricsStore store;
|
||||
private final String tenantId;
|
||||
private final String serverInstanceId;
|
||||
|
||||
public ServerMetricsSnapshotScheduler(
|
||||
MeterRegistry registry,
|
||||
ServerMetricsStore store,
|
||||
@Value("${cameleer.server.tenant.id:default}") String tenantId,
|
||||
@Qualifier("serverInstanceId") String serverInstanceId) {
|
||||
this.registry = registry;
|
||||
this.store = store;
|
||||
this.tenantId = tenantId;
|
||||
this.serverInstanceId = serverInstanceId;
|
||||
}
|
||||
|
||||
@Scheduled(fixedDelayString = "${cameleer.server.self-metrics.interval-ms:60000}",
|
||||
initialDelayString = "${cameleer.server.self-metrics.interval-ms:60000}")
|
||||
public void snapshot() {
|
||||
try {
|
||||
Instant now = Instant.now();
|
||||
List<ServerMetricSample> batch = new ArrayList<>();
|
||||
|
||||
for (Meter meter : registry.getMeters()) {
|
||||
Meter.Id id = meter.getId();
|
||||
Map<String, String> tags = flattenTags(id.getTagsAsIterable());
|
||||
String type = id.getType().name().toLowerCase();
|
||||
|
||||
for (Measurement m : meter.measure()) {
|
||||
double v = m.getValue();
|
||||
if (!Double.isFinite(v)) continue;
|
||||
batch.add(new ServerMetricSample(
|
||||
tenantId,
|
||||
now,
|
||||
serverInstanceId,
|
||||
id.getName(),
|
||||
type,
|
||||
m.getStatistic().getTagValueRepresentation(),
|
||||
v,
|
||||
tags));
|
||||
}
|
||||
}
|
||||
|
||||
if (!batch.isEmpty()) {
|
||||
store.insertBatch(batch);
|
||||
log.debug("Persisted {} server self-metric samples", batch.size());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.warn("Server self-metrics snapshot failed: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
private static Map<String, String> flattenTags(Iterable<Tag> tags) {
|
||||
Map<String, String> out = new LinkedHashMap<>();
|
||||
for (Tag t : tags) {
|
||||
out.put(t.getKey(), t.getValue());
|
||||
}
|
||||
return out;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
package com.cameleer.server.app.storage;
|
||||
|
||||
import com.cameleer.server.core.storage.ServerMetricsStore;
|
||||
import com.cameleer.server.core.storage.model.ServerMetricSample;
|
||||
import org.springframework.jdbc.core.JdbcTemplate;
|
||||
|
||||
import java.sql.Timestamp;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class ClickHouseServerMetricsStore implements ServerMetricsStore {
|
||||
|
||||
private final JdbcTemplate jdbc;
|
||||
|
||||
public ClickHouseServerMetricsStore(JdbcTemplate jdbc) {
|
||||
this.jdbc = jdbc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void insertBatch(List<ServerMetricSample> samples) {
|
||||
if (samples.isEmpty()) return;
|
||||
|
||||
jdbc.batchUpdate("""
|
||||
INSERT INTO server_metrics
|
||||
(tenant_id, collected_at, server_instance_id, metric_name,
|
||||
metric_type, statistic, metric_value, tags)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""",
|
||||
samples.stream().map(s -> new Object[]{
|
||||
s.tenantId(),
|
||||
Timestamp.from(s.collectedAt()),
|
||||
s.serverInstanceId(),
|
||||
s.metricName(),
|
||||
s.metricType(),
|
||||
s.statistic(),
|
||||
s.value(),
|
||||
tagsToClickHouseMap(s.tags())
|
||||
}).toList());
|
||||
}
|
||||
|
||||
private Map<String, String> tagsToClickHouseMap(Map<String, String> tags) {
|
||||
if (tags == null || tags.isEmpty()) return new HashMap<>();
|
||||
return new HashMap<>(tags);
|
||||
}
|
||||
}
|
||||
@@ -112,6 +112,10 @@ cameleer:
|
||||
url: ${CAMELEER_SERVER_CLICKHOUSE_URL:jdbc:clickhouse://localhost:8123/cameleer}
|
||||
username: ${CAMELEER_SERVER_CLICKHOUSE_USERNAME:default}
|
||||
password: ${CAMELEER_SERVER_CLICKHOUSE_PASSWORD:}
|
||||
self-metrics:
|
||||
enabled: ${CAMELEER_SERVER_SELFMETRICS_ENABLED:true}
|
||||
interval-ms: ${CAMELEER_SERVER_SELFMETRICS_INTERVALMS:60000}
|
||||
instance-id: ${CAMELEER_SERVER_INSTANCE_ID:}
|
||||
|
||||
springdoc:
|
||||
api-docs:
|
||||
|
||||
@@ -401,6 +401,29 @@ CREATE TABLE IF NOT EXISTS route_catalog (
|
||||
ENGINE = ReplacingMergeTree(last_seen)
|
||||
ORDER BY (tenant_id, environment, application_id, route_id);
|
||||
|
||||
-- ── Server Self-Metrics ────────────────────────────────────────────────
-- Periodic snapshot of the server's own Micrometer registry, written by
-- ServerMetricsSnapshotScheduler: one row per meter measurement per tick.
-- There is no `environment` column because the server straddles environments.
-- `statistic` tells the sub-measurements of composite meters
-- (Timer/DistributionSummary: e.g. count, total, max) apart from plain
-- counter/gauge values.
-- NOTE(review): the scheduler stores Micrometer's tag-value label for each
-- statistic; presumably TOTAL_TIME arrives as 'total' and no 'mean' row is
-- ever written — confirm before building dashboards on those labels.

CREATE TABLE IF NOT EXISTS server_metrics (
    tenant_id           LowCardinality(String) DEFAULT 'default',
    collected_at        DateTime64(3),                          -- snapshot time, shared by all rows of one tick
    server_instance_id  LowCardinality(String),                 -- id of the server process that took the snapshot
    metric_name         LowCardinality(String),                 -- Micrometer meter name
    metric_type         LowCardinality(String),                 -- lower-cased Micrometer meter type
    statistic           LowCardinality(String) DEFAULT 'value',
    metric_value        Float64,                                -- only finite values are written
    tags                Map(String, String) DEFAULT map(),      -- meter tags, key -> value
    server_received_at  DateTime64(3) DEFAULT now64(3)          -- filled by ClickHouse at insert time
)
ENGINE = MergeTree()
PARTITION BY (tenant_id, toYYYYMM(collected_at))
ORDER BY (tenant_id, collected_at, server_instance_id, metric_name, statistic)
TTL toDateTime(collected_at) + INTERVAL 90 DAY DELETE
SETTINGS index_granularity = 8192;
|
||||
|
||||
-- insert_id tiebreak for keyset pagination (fixes same-millisecond cursor collision).
|
||||
-- IF NOT EXISTS on ADD COLUMN is idempotent. MATERIALIZE COLUMN is a background mutation,
|
||||
-- effectively a no-op once all parts are already materialized.
|
||||
|
||||
Reference in New Issue
Block a user