feat(clickhouse): add ClickHouseStatsStore with -Merge aggregate queries

Implements StatsStore interface for ClickHouse using AggregatingMergeTree
tables with -Merge combinators (countMerge, countIfMerge, sumMerge,
quantileMerge). Uses literal SQL for aggregate table queries to avoid
ClickHouse JDBC driver PreparedStatement issues with AggregateFunction
columns. Raw table queries (SLA, topErrors, activeErrorTypes) use normal
prepared statements.

Includes 14 integration tests covering stats, timeseries, grouped
timeseries, SLA compliance, SLA counts by app/route, top errors, active
error types, punchcard, and processor stats. Also fixes AggregateFunction
type signatures in V4 DDL (count() takes no args, countIf takes UInt8).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-03-31 21:49:22 +02:00
parent eb0d26814f
commit 052990bb59
3 changed files with 953 additions and 13 deletions

View File

@@ -0,0 +1,375 @@
package com.cameleer3.server.app.storage;
import com.cameleer3.server.core.search.ExecutionStats;
import com.cameleer3.server.core.search.StatsTimeseries;
import com.cameleer3.server.core.search.TopError;
import com.cameleer3.server.core.storage.StatsStore.PunchcardCell;
import com.zaxxer.hikari.HikariDataSource;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import org.springframework.jdbc.core.JdbcTemplate;
import org.testcontainers.clickhouse.ClickHouseContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
import java.time.Instant;
import java.util.List;
import java.util.Map;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Integration tests for {@link ClickHouseStatsStore}, run against a
 * {@code clickhouse/clickhouse-server:24.12} container managed by Testcontainers.
 *
 * <p>Before every test the schema scripts are applied, the {@code stats_1m_*} tables and their
 * {@code *_mv} views are dropped and recreated, the base tables are truncated and a fixed data
 * set is inserted (see {@link #seedTestData()}), so every test starts from the same rows.
 *
 * <p>NOTE(review): the shared connection pool is kept in a plain static field; this assumes the
 * tests of this class are not executed concurrently (they also share and rebuild the same
 * tables, so concurrent execution would not work anyway).
 */
@Testcontainers
class ClickHouseStatsStoreIT {

    @Container
    static final ClickHouseContainer clickhouse =
            new ClickHouseContainer("clickhouse/clickhouse-server:24.12");

    // base time: 2026-03-31T10:00:00Z (a Tuesday)
    private static final Instant BASE = Instant.parse("2026-03-31T10:00:00Z");

    // Query window used by every test: starts one minute before BASE and ends five minutes
    // after it, which covers all seeded start times (BASE .. BASE+181s).
    private static final Instant FROM = BASE.minusSeconds(60);
    private static final Instant TO = BASE.plusSeconds(300);

    // Stats tables created by the V4 script; each one has a companion view named "<table>_mv".
    private static final List<String> STATS_TABLES = List.of(
            "stats_1m_all",
            "stats_1m_app",
            "stats_1m_route",
            "stats_1m_processor",
            "stats_1m_processor_detail");

    // Maximum number of characters of an exception message printed by the diagnostics.
    private static final int MAX_ERROR_MESSAGE_LENGTH = 80;

    // One pool for the whole class. Creating a new HikariDataSource in every setUp() without
    // closing it leaked one connection pool per test method.
    private static HikariDataSource dataSource;

    private JdbcTemplate jdbc;
    private ClickHouseStatsStore store;

    /**
     * Rebuilds the schema and the seed data, prints query-log diagnostics and creates the store
     * under test.
     *
     * @throws Exception if one of the DDL classpath resources cannot be read
     */
    @BeforeEach
    void setUp() throws Exception {
        jdbc = new JdbcTemplate(sharedDataSource());

        // Load DDL from classpath resources
        String executionsDdl = new ClassPathResource("clickhouse/V2__executions.sql")
                .getContentAsString(StandardCharsets.UTF_8);
        String processorsDdl = new ClassPathResource("clickhouse/V3__processor_executions.sql")
                .getContentAsString(StandardCharsets.UTF_8);
        String statsDdl = new ClassPathResource("clickhouse/V4__stats_tables_and_mvs.sql")
                .getContentAsString(StandardCharsets.UTF_8);

        jdbc.execute(executionsDdl);
        jdbc.execute(processorsDdl);
        recreateStatsObjects(statsDdl);

        // Truncate base tables
        jdbc.execute("TRUNCATE TABLE executions");
        jdbc.execute("TRUNCATE TABLE processor_executions");

        seedTestData();
        printQueryDiagnostics();

        store = new ClickHouseStatsStore(jdbc);
    }

    /**
     * Returns the connection pool shared by all tests of this class, creating it on first use.
     *
     * <p>If the container's JDBC URL no longer matches the pool's URL (for example because the
     * container was restarted on another port), the stale pool is closed and replaced.
     *
     * @return an open pool pointing at the current container
     */
    private static HikariDataSource sharedDataSource() {
        String jdbcUrl = clickhouse.getJdbcUrl();
        if (dataSource != null && !jdbcUrl.equals(dataSource.getJdbcUrl())) {
            dataSource.close();
            dataSource = null;
        }
        if (dataSource == null) {
            HikariDataSource pool = new HikariDataSource();
            pool.setJdbcUrl(jdbcUrl);
            pool.setUsername(clickhouse.getUsername());
            pool.setPassword(clickhouse.getPassword());
            dataSource = pool;
        }
        return dataSource;
    }

    /**
     * Drops the stats views and tables and recreates them from the given script.
     *
     * @param statsDdl content of the V4 script; may contain several {@code ;}-separated
     *                 statements and {@code --} line comments
     */
    private void recreateStatsObjects(String statsDdl) {
        // Drop MVs first (they reference the stats tables), then the tables themselves.
        for (String suffix : List.of("_mv", "")) {
            for (String table : STATS_TABLES) {
                jdbc.execute("DROP TABLE IF EXISTS " + table + suffix);
            }
        }

        // Strip SQL line comments first (they may contain semicolons),
        // then split by ';' and execute non-empty statements.
        String cleanedDdl = statsDdl.replaceAll("--[^\n]*", "");
        for (String stmt : cleanedDdl.split(";")) {
            String trimmed = stmt.trim();
            if (!trimmed.isEmpty()) {
                jdbc.execute(trimmed);
            }
        }
    }

    /**
     * Best-effort diagnostics: runs one {@code -Merge} aggregate query against
     * {@code stats_1m_all}, flushes the server logs and prints the most recent entries of
     * {@code system.query_log} to standard output.
     *
     * <p>A failure of the probe query is reported and otherwise ignored; it must never fail the
     * test setup.
     */
    private void printQueryDiagnostics() {
        // Try the probe query so that it is captured in query_log.
        try {
            jdbc.queryForMap(
                    "SELECT countMerge(total_count) AS tc, countIfMerge(failed_count) AS fc, " +
                    "sumMerge(duration_sum) / greatest(countMerge(total_count), 1) AS avg, " +
                    "quantileMerge(0.99)(p99_duration) AS p99, " +
                    "countIfMerge(running_count) AS rc " +
                    "FROM stats_1m_all WHERE tenant_id = 'default' " +
                    "AND bucket >= '2026-03-31 09:59:00' AND bucket < '2026-03-31 10:05:00'");
        } catch (Exception e) {
            // Deliberately broad: this is diagnostic output only. The message is shortened
            // safely because it may be null or shorter than the limit.
            System.out.println("Expected error: "
                    + abbreviate(e.getMessage(), MAX_ERROR_MESSAGE_LENGTH));
        }

        jdbc.execute("SYSTEM FLUSH LOGS");

        // Get ALL recent queries to see what the driver sends
        List<Map<String, Object>> queryLog = jdbc.queryForList(
                "SELECT type, substring(query, 1, 200) AS q " +
                "FROM system.query_log WHERE event_time > now() - 30 " +
                "AND query NOT LIKE '%system.query_log%' AND query NOT LIKE '%FLUSH%' " +
                "ORDER BY event_time DESC LIMIT 20");
        for (Map<String, Object> entry : queryLog) {
            System.out.println("LOG: " + entry.get("type") + " | " + entry.get("q"));
        }
    }

    /**
     * Shortens a text to at most {@code maxLength} characters.
     *
     * @param text      text to shorten; {@code null} is rendered as {@code "null"}
     * @param maxLength maximum number of characters to keep
     * @return the text itself when it is short enough, otherwise its first {@code maxLength}
     *         characters
     */
    private static String abbreviate(String text, int maxLength) {
        String safe = String.valueOf(text);
        return safe.length() <= maxLength ? safe : safe.substring(0, maxLength);
    }

    /** Inserts the fixed data set (10 executions, 5 processor records) the assertions rely on. */
    private void seedTestData() {
        // 10 executions across 2 apps and 3 routes, start times from BASE to BASE+180s

        // app-1, route-a: 4 COMPLETED (200ms, 300ms, 400ms, 500ms)
        insertExecution("exec-01", BASE.plusSeconds(0), "app-1", "route-a", "agent-1",
                "COMPLETED", 200L, "", "");
        insertExecution("exec-02", BASE.plusSeconds(60), "app-1", "route-a", "agent-1",
                "COMPLETED", 300L, "", "");
        insertExecution("exec-03", BASE.plusSeconds(120), "app-1", "route-a", "agent-1",
                "COMPLETED", 400L, "", "");
        insertExecution("exec-04", BASE.plusSeconds(180), "app-1", "route-a", "agent-1",
                "COMPLETED", 500L, "", "");

        // app-1, route-a: 2 FAILED (100ms, 150ms) with error_type="NPE"
        insertExecution("exec-05", BASE.plusSeconds(60), "app-1", "route-a", "agent-1",
                "FAILED", 100L, "NPE", "null ref");
        insertExecution("exec-06", BASE.plusSeconds(120), "app-1", "route-a", "agent-1",
                "FAILED", 150L, "NPE", "null ref");

        // app-1, route-b: 2 COMPLETED (50ms, 60ms)
        insertExecution("exec-07", BASE.plusSeconds(60), "app-1", "route-b", "agent-1",
                "COMPLETED", 50L, "", "");
        insertExecution("exec-08", BASE.plusSeconds(120), "app-1", "route-b", "agent-1",
                "COMPLETED", 60L, "", "");

        // app-2, route-c: 1 COMPLETED (1000ms)
        insertExecution("exec-09", BASE.plusSeconds(60), "app-2", "route-c", "agent-2",
                "COMPLETED", 1000L, "", "");

        // app-2, route-c: 1 RUNNING (null duration)
        insertExecution("exec-10", BASE.plusSeconds(180), "app-2", "route-c", "agent-2",
                "RUNNING", null, "", "");

        // 5 processor records for processor stats testing
        // app-1, route-a, processor_type="to": 3 COMPLETED
        insertProcessor("exec-01", 1, "proc-to-1", "to", BASE.plusSeconds(0),
                "app-1", "route-a", "COMPLETED", 50L);
        insertProcessor("exec-02", 1, "proc-to-2", "to", BASE.plusSeconds(60),
                "app-1", "route-a", "COMPLETED", 80L);
        insertProcessor("exec-03", 1, "proc-to-3", "to", BASE.plusSeconds(120),
                "app-1", "route-a", "COMPLETED", 90L);

        // app-1, route-a, processor_type="log": 2 COMPLETED
        insertProcessor("exec-01", 2, "proc-log-1", "log", BASE.plusSeconds(1),
                "app-1", "route-a", "COMPLETED", 10L);
        insertProcessor("exec-02", 2, "proc-log-2", "log", BASE.plusSeconds(61),
                "app-1", "route-a", "COMPLETED", 15L);
    }

    /**
     * Inserts one row into {@code executions} for tenant {@code 'default'}.
     *
     * @param executionId  value for {@code execution_id}
     * @param startTime    value for {@code start_time}
     * @param appName      value for {@code application_name}
     * @param routeId      value for {@code route_id}
     * @param agentId      value for {@code agent_id}
     * @param status       value for {@code status}
     * @param durationMs   value for {@code duration_ms}; may be {@code null}
     * @param errorType    value for {@code error_type}
     * @param errorMessage value for {@code error_message}
     */
    private void insertExecution(String executionId, Instant startTime, String appName,
                                 String routeId, String agentId, String status,
                                 Long durationMs, String errorType, String errorMessage) {
        // Note the column order: route_id and agent_id come before application_name.
        jdbc.update(
                "INSERT INTO executions (tenant_id, execution_id, start_time, route_id, " +
                "agent_id, application_name, status, duration_ms, error_type, error_message) " +
                "VALUES ('default', ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                executionId, Timestamp.from(startTime), routeId, agentId, appName,
                status, durationMs, errorType, errorMessage);
    }

    /**
     * Inserts one row into {@code processor_executions} for tenant {@code 'default'}.
     *
     * @param executionId   value for {@code execution_id}
     * @param seq           value for {@code seq}
     * @param processorId   value for {@code processor_id}
     * @param processorType value for {@code processor_type}
     * @param startTime     value for {@code start_time}
     * @param appName       value for {@code application_name}
     * @param routeId       value for {@code route_id}
     * @param status        value for {@code status}
     * @param durationMs    value for {@code duration_ms}
     */
    private void insertProcessor(String executionId, int seq, String processorId,
                                 String processorType, Instant startTime,
                                 String appName, String routeId, String status,
                                 Long durationMs) {
        // Note the column order: route_id comes before application_name.
        jdbc.update(
                "INSERT INTO processor_executions (tenant_id, execution_id, seq, processor_id, " +
                "processor_type, start_time, route_id, application_name, status, duration_ms) " +
                "VALUES ('default', ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                executionId, seq, processorId, processorType, Timestamp.from(startTime),
                routeId, appName, status, durationMs);
    }

    // ── Stats Tests ──────────────────────────────────────────────────────

    /** Global stats count all 10 seeded executions, 2 of them failed and 1 still running. */
    @Test
    void stats_returnsCorrectGlobalTotals() {
        ExecutionStats stats = store.stats(FROM, TO);

        assertThat(stats.totalCount()).isEqualTo(10);
        assertThat(stats.failedCount()).isEqualTo(2);
        assertThat(stats.activeCount()).isEqualTo(1);
        assertThat(stats.avgDurationMs()).isGreaterThan(0);
        assertThat(stats.p99LatencyMs()).isGreaterThan(0);
    }

    /** Per-application stats: 8 executions were seeded for app-1 and 2 for app-2. */
    @Test
    void statsForApp_filtersCorrectly() {
        ExecutionStats app1 = store.statsForApp(FROM, TO, "app-1");
        assertThat(app1.totalCount()).isEqualTo(8);

        ExecutionStats app2 = store.statsForApp(FROM, TO, "app-2");
        assertThat(app2.totalCount()).isEqualTo(2);
    }

    /** Per-route stats: 6 executions were seeded for route-a. */
    @Test
    void statsForRoute_filtersCorrectly() {
        ExecutionStats routeA = store.statsForRoute(FROM, TO, "route-a", List.of());

        assertThat(routeA.totalCount()).isEqualTo(6);
    }

    // ── Timeseries Tests ─────────────────────────────────────────────────

    /** The bucket totals of the global timeseries add up to all 10 executions. */
    @Test
    void timeseries_returnsBuckets() {
        StatsTimeseries ts = store.timeseries(FROM, TO, 5);

        assertThat(ts.buckets()).isNotEmpty();
        long totalAcrossBuckets = ts.buckets().stream()
                .mapToLong(StatsTimeseries.TimeseriesBucket::totalCount).sum();
        assertThat(totalAcrossBuckets).isEqualTo(10);
    }

    /** The bucket totals of the app-1 timeseries add up to its 8 executions. */
    @Test
    void timeseriesForApp_filtersCorrectly() {
        StatsTimeseries ts = store.timeseriesForApp(FROM, TO, 5, "app-1");

        long totalAcrossBuckets = ts.buckets().stream()
                .mapToLong(StatsTimeseries.TimeseriesBucket::totalCount).sum();
        assertThat(totalAcrossBuckets).isEqualTo(8);
    }

    /** The timeseries grouped by application contains an entry for each seeded application. */
    @Test
    void timeseriesGroupedByApp_returnsMap() {
        Map<String, StatsTimeseries> grouped = store.timeseriesGroupedByApp(FROM, TO, 5);

        assertThat(grouped).containsKeys("app-1", "app-2");
    }

    /** The timeseries grouped by route for app-1 contains both of its routes. */
    @Test
    void timeseriesGroupedByRoute_returnsMap() {
        Map<String, StatsTimeseries> grouped =
                store.timeseriesGroupedByRoute(FROM, TO, 5, "app-1");

        assertThat(grouped).containsKeys("route-a", "route-b");
    }

    // ── SLA Tests ────────────────────────────────────────────────────────

    /** SLA compliance over all applications with a 250ms threshold. */
    @Test
    void slaCompliance_calculatesCorrectly() {
        // threshold=250ms: among 9 non-RUNNING executions:
        // compliant (<=250ms): exec-01(200), exec-05(100), exec-06(150), exec-07(50), exec-08(60) = 5
        // total non-running: 9
        // compliance = 5/9 * 100 ~ 55.56%
        double sla = store.slaCompliance(FROM, TO, 250, null, null);

        assertThat(sla).isBetween(55.0, 56.0);
    }

    // ── Top Errors Tests ─────────────────────────────────────────────────

    /** The only seeded error type, "NPE", is ranked first with its 2 occurrences. */
    @Test
    void topErrors_returnsRankedErrors() {
        List<TopError> errors = store.topErrors(FROM, TO, null, null, 10);

        assertThat(errors).isNotEmpty();
        assertThat(errors.get(0).errorType()).isEqualTo("NPE");
        assertThat(errors.get(0).count()).isEqualTo(2);
    }

    // ── Active Error Types Test ──────────────────────────────────────────

    /** app-1 has exactly one distinct error type in the seeded data. */
    @Test
    void activeErrorTypes_countsDistinct() {
        int count = store.activeErrorTypes(FROM, TO, "app-1");

        assertThat(count).isEqualTo(1); // only "NPE"
    }

    // ── Punchcard Test ───────────────────────────────────────────────────

    /** The punchcard cells together account for all 10 executions. */
    @Test
    void punchcard_returnsWeekdayHourCells() {
        List<PunchcardCell> cells = store.punchcard(FROM, TO, null);

        assertThat(cells).isNotEmpty();
        long totalCount = cells.stream().mapToLong(PunchcardCell::totalCount).sum();
        assertThat(totalCount).isEqualTo(10);
    }

    /** SLA counts per application; index 0 is asserted as compliant, index 1 as total. */
    @Test
    void slaCountsByApp_returnsMap() {
        // threshold=250ms
        Map<String, long[]> counts = store.slaCountsByApp(FROM, TO, 250);

        assertThat(counts).containsKeys("app-1", "app-2");

        // app-1: 8 total executions, all non-RUNNING
        // compliant (<=250ms): exec-01(200), exec-05(100), exec-06(150), exec-07(50), exec-08(60) = 5
        long[] app1 = counts.get("app-1");
        assertThat(app1[0]).isEqualTo(5); // compliant
        assertThat(app1[1]).isEqualTo(8); // total non-running

        // app-2: 1 COMPLETED(1000ms) + 1 RUNNING → 1 non-RUNNING, 0 compliant
        long[] app2 = counts.get("app-2");
        assertThat(app2[0]).isEqualTo(0); // compliant
        assertThat(app2[1]).isEqualTo(1); // total non-running
    }

    /** SLA counts per route of app-1; index 0 is asserted as compliant, index 1 as total. */
    @Test
    void slaCountsByRoute_returnsMap() {
        Map<String, long[]> counts = store.slaCountsByRoute(FROM, TO, "app-1", 250);

        assertThat(counts).containsKeys("route-a", "route-b");

        // route-a: exec-01(200)OK, exec-02(300)NO, exec-03(400)NO, exec-04(500)NO,
        // exec-05(100)OK, exec-06(150)OK → 3 compliant, 6 total
        long[] routeA = counts.get("route-a");
        assertThat(routeA[0]).isEqualTo(3); // compliant
        assertThat(routeA[1]).isEqualTo(6); // total

        // route-b: exec-07(50)OK, exec-08(60)OK → 2 compliant, 2 total
        long[] routeB = counts.get("route-b");
        assertThat(routeB[0]).isEqualTo(2);
        assertThat(routeB[1]).isEqualTo(2);
    }

    // ── Processor Stats Test ─────────────────────────────────────────────

    /** Processor stats for route-a: 3 records of type "to" and 2 of type "log" were seeded. */
    @Test
    void statsForProcessor_filtersCorrectly() {
        ExecutionStats toStats = store.statsForProcessor(FROM, TO, "route-a", "to");
        assertThat(toStats.totalCount()).isEqualTo(3);
        assertThat(toStats.activeCount()).isEqualTo(0); // processor stats have no running_count

        ExecutionStats logStats = store.statsForProcessor(FROM, TO, "route-a", "log");
        assertThat(logStats.totalCount()).isEqualTo(2);
    }
}