fix: deduplicate all stats MVs and preserve loop iterations
All checks were successful
CI / build (push) Successful in 2m25s
CI / cleanup-branch (push) Has been skipped
CI / docker (push) Successful in 1m20s
CI / deploy-feature (push) Has been skipped
CI / deploy (push) Successful in 1m3s
SonarQube / sonarqube (push) Successful in 3m49s

Extend uniq-based dedup from processor tables to all stats tables
(stats_1m_all, stats_1m_app, stats_1m_route). Execution-level tables
use uniq(execution_id). Processor-level tables now use
uniq(concat(execution_id, toString(seq))) so loop iterations (same
exchange, different seq) are counted while chunk retry duplicates
(same exchange+seq) are collapsed.

All stats tables are dropped, recreated, and backfilled from raw
data on startup. All Java queries updated: countMerge -> uniqMerge,
countIfMerge -> uniqIfMerge.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-12 23:48:01 +02:00
parent 1872d46466
commit ae908fb382
6 changed files with 101 additions and 45 deletions

View File

@@ -126,14 +126,19 @@ SETTINGS index_granularity = 8192;
-- ── Stats: Materialized Views + AggregatingMergeTree ────────────────────
-- stats_1m_all (global)
-- All stats tables use uniq(execution_id) to deduplicate chunk retries
-- DROP + CREATE + backfill rebuilds from raw data on startup
DROP VIEW IF EXISTS stats_1m_all_mv;
DROP TABLE IF EXISTS stats_1m_all;
CREATE TABLE IF NOT EXISTS stats_1m_all (
tenant_id LowCardinality(String),
bucket DateTime,
environment LowCardinality(String) DEFAULT 'default',
total_count AggregateFunction(count),
failed_count AggregateFunction(countIf, UInt8),
running_count AggregateFunction(countIf, UInt8),
total_count AggregateFunction(uniq, String),
failed_count AggregateFunction(uniqIf, String, UInt8),
running_count AggregateFunction(uniqIf, String, UInt8),
duration_sum AggregateFunction(sum, Nullable(Int64)),
duration_max AggregateFunction(max, Nullable(Int64)),
p99_duration AggregateFunction(quantile(0.99), Nullable(Int64))
@@ -143,14 +148,28 @@ PARTITION BY (tenant_id, toYYYYMM(bucket))
ORDER BY (tenant_id, bucket, environment)
TTL bucket + INTERVAL 365 DAY DELETE;
INSERT INTO stats_1m_all
SELECT
tenant_id,
toStartOfMinute(start_time) AS bucket,
environment,
uniqState(execution_id) AS total_count,
uniqIfState(execution_id, status = 'FAILED') AS failed_count,
uniqIfState(execution_id, status = 'RUNNING') AS running_count,
sumState(duration_ms) AS duration_sum,
maxState(duration_ms) AS duration_max,
quantileState(0.99)(duration_ms) AS p99_duration
FROM executions
GROUP BY tenant_id, bucket, environment;
CREATE MATERIALIZED VIEW IF NOT EXISTS stats_1m_all_mv TO stats_1m_all AS
SELECT
tenant_id,
toStartOfMinute(start_time) AS bucket,
environment,
countState() AS total_count,
countIfState(status = 'FAILED') AS failed_count,
countIfState(status = 'RUNNING') AS running_count,
uniqState(execution_id) AS total_count,
uniqIfState(execution_id, status = 'FAILED') AS failed_count,
uniqIfState(execution_id, status = 'RUNNING') AS running_count,
sumState(duration_ms) AS duration_sum,
maxState(duration_ms) AS duration_max,
quantileState(0.99)(duration_ms) AS p99_duration
@@ -159,14 +178,17 @@ GROUP BY tenant_id, bucket, environment;
-- stats_1m_app (per-application)
DROP VIEW IF EXISTS stats_1m_app_mv;
DROP TABLE IF EXISTS stats_1m_app;
CREATE TABLE IF NOT EXISTS stats_1m_app (
tenant_id LowCardinality(String),
application_id LowCardinality(String),
bucket DateTime,
environment LowCardinality(String) DEFAULT 'default',
total_count AggregateFunction(count),
failed_count AggregateFunction(countIf, UInt8),
running_count AggregateFunction(countIf, UInt8),
total_count AggregateFunction(uniq, String),
failed_count AggregateFunction(uniqIf, String, UInt8),
running_count AggregateFunction(uniqIf, String, UInt8),
duration_sum AggregateFunction(sum, Nullable(Int64)),
duration_max AggregateFunction(max, Nullable(Int64)),
p99_duration AggregateFunction(quantile(0.99), Nullable(Int64))
@@ -176,15 +198,30 @@ PARTITION BY (tenant_id, toYYYYMM(bucket))
ORDER BY (tenant_id, bucket, environment, application_id)
TTL bucket + INTERVAL 365 DAY DELETE;
INSERT INTO stats_1m_app
SELECT
tenant_id,
application_id,
toStartOfMinute(start_time) AS bucket,
environment,
uniqState(execution_id) AS total_count,
uniqIfState(execution_id, status = 'FAILED') AS failed_count,
uniqIfState(execution_id, status = 'RUNNING') AS running_count,
sumState(duration_ms) AS duration_sum,
maxState(duration_ms) AS duration_max,
quantileState(0.99)(duration_ms) AS p99_duration
FROM executions
GROUP BY tenant_id, application_id, bucket, environment;
CREATE MATERIALIZED VIEW IF NOT EXISTS stats_1m_app_mv TO stats_1m_app AS
SELECT
tenant_id,
application_id,
toStartOfMinute(start_time) AS bucket,
environment,
countState() AS total_count,
countIfState(status = 'FAILED') AS failed_count,
countIfState(status = 'RUNNING') AS running_count,
uniqState(execution_id) AS total_count,
uniqIfState(execution_id, status = 'FAILED') AS failed_count,
uniqIfState(execution_id, status = 'RUNNING') AS running_count,
sumState(duration_ms) AS duration_sum,
maxState(duration_ms) AS duration_max,
quantileState(0.99)(duration_ms) AS p99_duration
@@ -193,15 +230,18 @@ GROUP BY tenant_id, application_id, bucket, environment;
-- stats_1m_route (per-route)
DROP VIEW IF EXISTS stats_1m_route_mv;
DROP TABLE IF EXISTS stats_1m_route;
CREATE TABLE IF NOT EXISTS stats_1m_route (
tenant_id LowCardinality(String),
application_id LowCardinality(String),
route_id LowCardinality(String),
bucket DateTime,
environment LowCardinality(String) DEFAULT 'default',
total_count AggregateFunction(count),
failed_count AggregateFunction(countIf, UInt8),
running_count AggregateFunction(countIf, UInt8),
total_count AggregateFunction(uniq, String),
failed_count AggregateFunction(uniqIf, String, UInt8),
running_count AggregateFunction(uniqIf, String, UInt8),
duration_sum AggregateFunction(sum, Nullable(Int64)),
duration_max AggregateFunction(max, Nullable(Int64)),
p99_duration AggregateFunction(quantile(0.99), Nullable(Int64))
@@ -211,6 +251,22 @@ PARTITION BY (tenant_id, toYYYYMM(bucket))
ORDER BY (tenant_id, bucket, environment, application_id, route_id)
TTL bucket + INTERVAL 365 DAY DELETE;
INSERT INTO stats_1m_route
SELECT
tenant_id,
application_id,
route_id,
toStartOfMinute(start_time) AS bucket,
environment,
uniqState(execution_id) AS total_count,
uniqIfState(execution_id, status = 'FAILED') AS failed_count,
uniqIfState(execution_id, status = 'RUNNING') AS running_count,
sumState(duration_ms) AS duration_sum,
maxState(duration_ms) AS duration_max,
quantileState(0.99)(duration_ms) AS p99_duration
FROM executions
GROUP BY tenant_id, application_id, route_id, bucket, environment;
CREATE MATERIALIZED VIEW IF NOT EXISTS stats_1m_route_mv TO stats_1m_route AS
SELECT
tenant_id,
@@ -218,9 +274,9 @@ SELECT
route_id,
toStartOfMinute(start_time) AS bucket,
environment,
countState() AS total_count,
countIfState(status = 'FAILED') AS failed_count,
countIfState(status = 'RUNNING') AS running_count,
uniqState(execution_id) AS total_count,
uniqIfState(execution_id, status = 'FAILED') AS failed_count,
uniqIfState(execution_id, status = 'RUNNING') AS running_count,
sumState(duration_ms) AS duration_sum,
maxState(duration_ms) AS duration_max,
quantileState(0.99)(duration_ms) AS p99_duration
@@ -258,8 +314,8 @@ SELECT
processor_type,
toStartOfMinute(start_time) AS bucket,
environment,
uniqState(execution_id) AS total_count,
uniqIfState(execution_id, status = 'FAILED') AS failed_count,
uniqState(concat(execution_id, toString(seq))) AS total_count,
uniqIfState(concat(execution_id, toString(seq)), status = 'FAILED') AS failed_count,
sumState(duration_ms) AS duration_sum,
maxState(duration_ms) AS duration_max,
quantileState(0.99)(duration_ms) AS p99_duration
@@ -273,8 +329,8 @@ SELECT
processor_type,
toStartOfMinute(start_time) AS bucket,
environment,
uniqState(execution_id) AS total_count,
uniqIfState(execution_id, status = 'FAILED') AS failed_count,
uniqState(concat(execution_id, toString(seq))) AS total_count,
uniqIfState(concat(execution_id, toString(seq)), status = 'FAILED') AS failed_count,
sumState(duration_ms) AS duration_sum,
maxState(duration_ms) AS duration_max,
quantileState(0.99)(duration_ms) AS p99_duration
@@ -314,8 +370,8 @@ SELECT
processor_type,
toStartOfMinute(start_time) AS bucket,
environment,
uniqState(execution_id) AS total_count,
uniqIfState(execution_id, status = 'FAILED') AS failed_count,
uniqState(concat(execution_id, toString(seq))) AS total_count,
uniqIfState(concat(execution_id, toString(seq)), status = 'FAILED') AS failed_count,
sumState(duration_ms) AS duration_sum,
maxState(duration_ms) AS duration_max,
quantileState(0.99)(duration_ms) AS p99_duration
@@ -331,8 +387,8 @@ SELECT
processor_type,
toStartOfMinute(start_time) AS bucket,
environment,
uniqState(execution_id) AS total_count,
uniqIfState(execution_id, status = 'FAILED') AS failed_count,
uniqState(concat(execution_id, toString(seq))) AS total_count,
uniqIfState(concat(execution_id, toString(seq)), status = 'FAILED') AS failed_count,
sumState(duration_ms) AS duration_sum,
maxState(duration_ms) AS duration_max,
quantileState(0.99)(duration_ms) AS p99_duration