Fix ClickHouse OOM: PREWHERE on active-count query + per-query memory limits
The active-count query scanned all wide rows on the base table, exceeding the 3.6 GiB memory limit. Use PREWHERE status = 'RUNNING' so ClickHouse reads only the status column first. Add SETTINGS max_memory_usage = 1000000000 (10^9 bytes, about 1 GB) to all queries so that a single oversized query fails on its own limit and concurrent requests degrade gracefully instead of crashing the server.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -24,6 +24,9 @@ import java.util.List;
|
||||
*/
|
||||
public class ClickHouseSearchEngine implements SearchEngine {
|
||||
|
||||
/** Per-query memory cap (10^9 bytes, about 1 GB / 0.93 GiB) — prevents a single query from OOMing ClickHouse. */
|
||||
private static final String SETTINGS = " SETTINGS max_memory_usage = 1000000000";
|
||||
|
||||
private final JdbcTemplate jdbcTemplate;
|
||||
|
||||
public ClickHouseSearchEngine(JdbcTemplate jdbcTemplate) {
|
||||
@@ -42,7 +45,7 @@ public class ClickHouseSearchEngine implements SearchEngine {
|
||||
// Count query
|
||||
var countParams = params.toArray();
|
||||
Long total = jdbcTemplate.queryForObject(
|
||||
"SELECT count() FROM route_executions" + where, Long.class, countParams);
|
||||
"SELECT count() FROM route_executions" + where + SETTINGS, Long.class, countParams);
|
||||
if (total == null) total = 0L;
|
||||
|
||||
if (total == 0) {
|
||||
@@ -56,7 +59,7 @@ public class ClickHouseSearchEngine implements SearchEngine {
|
||||
String dataSql = "SELECT execution_id, route_id, agent_id, status, start_time, end_time, " +
|
||||
"duration_ms, correlation_id, error_message, diagram_content_hash " +
|
||||
"FROM route_executions" + where +
|
||||
" ORDER BY " + request.sortColumn() + " " + orderDir + " LIMIT ? OFFSET ?";
|
||||
" ORDER BY " + request.sortColumn() + " " + orderDir + " LIMIT ? OFFSET ?" + SETTINGS;
|
||||
|
||||
List<ExecutionSummary> data = jdbcTemplate.query(dataSql, (rs, rowNum) -> {
|
||||
Timestamp endTs = rs.getTimestamp("end_time");
|
||||
@@ -85,7 +88,7 @@ public class ClickHouseSearchEngine implements SearchEngine {
|
||||
|
||||
String where = conditions.isEmpty() ? "" : " WHERE " + String.join(" AND ", conditions);
|
||||
Long result = jdbcTemplate.queryForObject(
|
||||
"SELECT count() FROM route_executions" + where, Long.class, params.toArray());
|
||||
"SELECT count() FROM route_executions" + where + SETTINGS, Long.class, params.toArray());
|
||||
return result != null ? result : 0L;
|
||||
}
|
||||
|
||||
@@ -112,7 +115,7 @@ public class ClickHouseSearchEngine implements SearchEngine {
|
||||
"countIfMerge(failed_count) AS failed, " +
|
||||
"toInt64(ifNotFinite(sumMerge(duration_sum) / countMerge(total_count), 0)) AS avg_ms, " +
|
||||
"toInt64(ifNotFinite(quantileTDigestMerge(0.99)(p99_duration), 0)) AS p99_ms " +
|
||||
"FROM route_execution_stats_5m" + where;
|
||||
"FROM route_execution_stats_5m" + where + SETTINGS;
|
||||
|
||||
record PeriodStats(long totalCount, long failedCount, long avgDurationMs, long p99LatencyMs) {}
|
||||
PeriodStats current = jdbcTemplate.queryForObject(rollupSql,
|
||||
@@ -123,14 +126,13 @@ public class ClickHouseSearchEngine implements SearchEngine {
|
||||
rs.getLong("p99_ms")),
|
||||
params.toArray());
|
||||
|
||||
// Active count — lightweight real-time query on base table (RUNNING is transient)
|
||||
var activeConditions = new ArrayList<String>();
|
||||
// Active count — PREWHERE reads only the status column before touching wide rows
|
||||
var scopeConditions = new ArrayList<String>();
|
||||
var activeParams = new ArrayList<Object>();
|
||||
activeConditions.add("status = 'RUNNING'");
|
||||
addScopeFilters(routeId, agentIds, activeConditions, activeParams);
|
||||
String activeWhere = " WHERE " + String.join(" AND ", activeConditions);
|
||||
addScopeFilters(routeId, agentIds, scopeConditions, activeParams);
|
||||
String scopeWhere = scopeConditions.isEmpty() ? "" : " WHERE " + String.join(" AND ", scopeConditions);
|
||||
Long activeCount = jdbcTemplate.queryForObject(
|
||||
"SELECT count() FROM route_executions" + activeWhere,
|
||||
"SELECT count() FROM route_executions PREWHERE status = 'RUNNING'" + scopeWhere + SETTINGS,
|
||||
Long.class, activeParams.toArray());
|
||||
|
||||
// Previous period (same window shifted back 24h) — read from rollup
|
||||
@@ -151,7 +153,7 @@ public class ClickHouseSearchEngine implements SearchEngine {
|
||||
"countIfMerge(failed_count) AS failed, " +
|
||||
"toInt64(ifNotFinite(sumMerge(duration_sum) / countMerge(total_count), 0)) AS avg_ms, " +
|
||||
"toInt64(ifNotFinite(quantileTDigestMerge(0.99)(p99_duration), 0)) AS p99_ms " +
|
||||
"FROM route_execution_stats_5m" + prevWhere;
|
||||
"FROM route_execution_stats_5m" + prevWhere + SETTINGS;
|
||||
|
||||
PeriodStats prev = jdbcTemplate.queryForObject(prevRollupSql,
|
||||
(rs, rowNum) -> new PeriodStats(
|
||||
@@ -171,7 +173,7 @@ public class ClickHouseSearchEngine implements SearchEngine {
|
||||
String todayWhere = " WHERE " + String.join(" AND ", todayConditions);
|
||||
|
||||
Long totalToday = jdbcTemplate.queryForObject(
|
||||
"SELECT countMerge(total_count) FROM route_execution_stats_5m" + todayWhere,
|
||||
"SELECT countMerge(total_count) FROM route_execution_stats_5m" + todayWhere + SETTINGS,
|
||||
Long.class, todayParams.toArray());
|
||||
|
||||
return new ExecutionStats(
|
||||
@@ -210,7 +212,7 @@ public class ClickHouseSearchEngine implements SearchEngine {
|
||||
"toInt64(ifNotFinite(sumMerge(duration_sum) / countMerge(total_count), 0)) AS avg_ms, " +
|
||||
"toInt64(ifNotFinite(quantileTDigestMerge(0.99)(p99_duration), 0)) AS p99_ms " +
|
||||
"FROM route_execution_stats_5m" + where +
|
||||
" GROUP BY ts_bucket ORDER BY ts_bucket";
|
||||
" GROUP BY ts_bucket ORDER BY ts_bucket" + SETTINGS;
|
||||
|
||||
List<StatsTimeseries.TimeseriesBucket> buckets = jdbcTemplate.query(sql, (rs, rowNum) ->
|
||||
new StatsTimeseries.TimeseriesBucket(
|
||||
|
||||
Reference in New Issue
Block a user