feat: progressive drill-down dashboard with RED metrics and SLA compliance (#94)

Three-level dashboard driven by sidebar selection:
- L1 (no selection): all-apps overview with health table, per-app charts
- L2 (app selected): route performance table, error velocity, top errors
- L3 (route selected): processor table, latency heatmap data, bottleneck KPI

Backend: 3 new endpoints (timeseries/by-app, timeseries/by-route, errors/top),
per-app SLA settings (app_settings table, V12 migration), exact SLA compliance
computed from the executions hypertable, and error velocity with acceleration detection.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-03-29 23:29:20 +02:00
parent b2ae37637d
commit 213aa86c47
21 changed files with 2293 additions and 19 deletions

View File

@@ -0,0 +1,19 @@
package com.cameleer3.server.core.admin;
import java.time.Instant;
/**
 * Immutable per-application settings record: an SLA duration threshold,
 * four health thresholds and two bookkeeping timestamps.
 *
 * <p>NOTE(review): the {@code health*} values look like percentages (error
 * rate and SLA compliance) at which an app turns "warning" / "critical" —
 * confirm against the code that evaluates them.
 *
 * @param appId           identifier of the application these settings belong to
 * @param slaThresholdMs  SLA duration threshold in milliseconds
 * @param healthErrorWarn error-related warning threshold
 * @param healthErrorCrit error-related critical threshold
 * @param healthSlaWarn   SLA-related warning threshold
 * @param healthSlaCrit   SLA-related critical threshold
 * @param createdAt       creation timestamp
 * @param updatedAt       last-update timestamp
 */
public record AppSettings(
        String appId,
        int slaThresholdMs,
        double healthErrorWarn,
        double healthErrorCrit,
        double healthSlaWarn,
        double healthSlaCrit,
        Instant createdAt,
        Instant updatedAt) {

    // Values handed out by defaults(String).
    private static final int DEFAULT_SLA_THRESHOLD_MS = 300;
    private static final double DEFAULT_HEALTH_ERROR_WARN = 1.0;
    private static final double DEFAULT_HEALTH_ERROR_CRIT = 5.0;
    private static final double DEFAULT_HEALTH_SLA_WARN = 99.0;
    private static final double DEFAULT_HEALTH_SLA_CRIT = 95.0;

    /**
     * Builds the default settings for an application.
     *
     * <p>Both timestamps are taken from a single {@link Instant#now()} reading,
     * so {@code createdAt} and {@code updatedAt} are equal on the result.
     *
     * @param appId application identifier, stored as given (no validation)
     * @return settings with a 300 ms SLA threshold and health thresholds
     *         1.0 / 5.0 (error warn / crit) and 99.0 / 95.0 (SLA warn / crit)
     */
    public static AppSettings defaults(String appId) {
        final Instant timestamp = Instant.now();
        return new AppSettings(
                appId,
                DEFAULT_SLA_THRESHOLD_MS,
                DEFAULT_HEALTH_ERROR_WARN,
                DEFAULT_HEALTH_ERROR_CRIT,
                DEFAULT_HEALTH_SLA_WARN,
                DEFAULT_HEALTH_SLA_CRIT,
                timestamp,
                timestamp);
    }
}

View File

@@ -0,0 +1,11 @@
package com.cameleer3.server.core.admin;
import java.util.List;
import java.util.Optional;
/**
 * Storage abstraction for {@link AppSettings}, addressed by application id.
 *
 * <p>Only the contract is declared here; persistence details (and behaviour
 * for unknown ids) are up to the implementation.
 */
public interface AppSettingsRepository {
/**
 * Looks up the settings of one application.
 *
 * @param appId application identifier
 * @return the settings wrapped in an {@code Optional}; presumably empty when
 *         nothing is stored for {@code appId} — confirm with the implementation
 */
Optional<AppSettings> findByAppId(String appId);
/**
 * Returns the settings of all applications known to the repository.
 *
 * @return list of settings; ordering is not specified by this interface
 */
List<AppSettings> findAll();
/**
 * Persists the given settings.
 *
 * @param settings settings to store
 * @return an {@code AppSettings} instance; presumably the state as persisted
 *         (insert-vs-update semantics are not visible here — TODO confirm)
 */
AppSettings save(AppSettings settings);
/**
 * Removes the settings of one application.
 *
 * @param appId application identifier
 */
void delete(String appId);
}

View File

@@ -14,4 +14,23 @@ public record ExecutionStats(
long prevTotalCount,
long prevFailedCount,
long prevAvgDurationMs,
long prevP99LatencyMs) {}
long prevP99LatencyMs,
double slaCompliance) {
/**
 * Backward-compatible constructor for callers that do not supply an SLA
 * compliance value.
 *
 * <p>All ten arguments are forwarded unchanged to the canonical constructor
 * and {@code slaCompliance} is set to {@code -1.0}.
 * NOTE(review): {@code -1.0} presumably acts as a "not computed" marker —
 * confirm that consumers treat it that way.
 */
public ExecutionStats(long totalCount, long failedCount, long avgDurationMs,
long p99LatencyMs, long activeCount, long totalToday,
long prevTotalCount, long prevFailedCount,
long prevAvgDurationMs, long prevP99LatencyMs) {
this(totalCount, failedCount, avgDurationMs, p99LatencyMs, activeCount,
totalToday, prevTotalCount, prevFailedCount, prevAvgDurationMs,
prevP99LatencyMs, -1.0);
}
/**
 * Creates a new {@code ExecutionStats} that carries over every other
 * component of this instance and uses the supplied SLA compliance value.
 * This instance is not modified.
 *
 * @param slaCompliance SLA compliance value to place on the copy
 * @return a new record differing from this one only in {@code slaCompliance}
 */
public ExecutionStats withSlaCompliance(double slaCompliance) {
    return new ExecutionStats(
            this.totalCount,
            this.failedCount,
            this.avgDurationMs,
            this.p99LatencyMs,
            this.activeCount,
            this.totalToday,
            this.prevTotalCount,
            this.prevFailedCount,
            this.prevAvgDurationMs,
            this.prevP99LatencyMs,
            slaCompliance);
}
}

View File

@@ -5,6 +5,7 @@ import com.cameleer3.server.core.storage.StatsStore;
import java.time.Instant;
import java.util.List;
import java.util.Map;
public class SearchService {
@@ -48,4 +49,38 @@ public class SearchService {
String routeId, List<String> agentIds) {
return statsStore.timeseriesForRoute(from, to, bucketCount, routeId, agentIds);
}
// ── Dashboard-specific queries ────────────────────────────────────────
/**
 * Returns one timeseries per application for the given window.
 *
 * <p>Pure delegation to {@code statsStore.timeseriesGroupedByApp}; arguments
 * are passed through unchanged.
 *
 * @param from        start of the time window
 * @param to          end of the time window
 * @param bucketCount bucket count forwarded to the store
 * @return whatever the store returns: a map of timeseries, keyed per application
 */
public Map<String, StatsTimeseries> timeseriesGroupedByApp(Instant from, Instant to, int bucketCount) {
return statsStore.timeseriesGroupedByApp(from, to, bucketCount);
}
/**
 * Returns one timeseries per route of a single application.
 *
 * <p>Pure delegation to {@code statsStore.timeseriesGroupedByRoute}; arguments
 * are passed through unchanged.
 *
 * @param from            start of the time window
 * @param to              end of the time window
 * @param bucketCount     bucket count forwarded to the store
 * @param applicationName application whose routes are queried
 * @return whatever the store returns: a map of timeseries, keyed per route
 */
public Map<String, StatsTimeseries> timeseriesGroupedByRoute(Instant from, Instant to,
int bucketCount, String applicationName) {
return statsStore.timeseriesGroupedByRoute(from, to, bucketCount, applicationName);
}
/**
 * Returns the SLA compliance figure for the given window and threshold.
 *
 * <p>Pure delegation to {@code statsStore.slaCompliance}; arguments are passed
 * through unchanged.
 *
 * @param from            start of the time window
 * @param to              end of the time window
 * @param thresholdMs     SLA duration threshold in milliseconds
 * @param applicationName application filter forwarded to the store
 * @param routeId         route filter forwarded to the store
 *                        (NOTE(review): whether {@code null} means "no filter"
 *                        is decided by the store — confirm)
 * @return the compliance value computed by the store
 */
public double slaCompliance(Instant from, Instant to, int thresholdMs,
String applicationName, String routeId) {
return statsStore.slaCompliance(from, to, thresholdMs, applicationName, routeId);
}
/**
 * Returns SLA counts for all applications in one batch.
 *
 * <p>Pure delegation to {@code statsStore.slaCountsByApp}; arguments are
 * passed through unchanged.
 *
 * @param from               start of the time window
 * @param to                 end of the time window
 * @param defaultThresholdMs threshold in milliseconds forwarded to the store;
 *                           presumably applied where an app has no threshold
 *                           of its own — TODO confirm
 * @return whatever the store returns: a map from app to a {@code long[]} of counts
 */
public Map<String, long[]> slaCountsByApp(Instant from, Instant to, int defaultThresholdMs) {
return statsStore.slaCountsByApp(from, to, defaultThresholdMs);
}
/**
 * Returns SLA counts for the routes of one application in one batch.
 *
 * <p>Pure delegation to {@code statsStore.slaCountsByRoute}; arguments are
 * passed through unchanged.
 *
 * @param from            start of the time window
 * @param to              end of the time window
 * @param applicationName application whose routes are queried
 * @param thresholdMs     SLA duration threshold in milliseconds
 * @return whatever the store returns: a map from route to a {@code long[]} of counts
 */
public Map<String, long[]> slaCountsByRoute(Instant from, Instant to,
String applicationName, int thresholdMs) {
return statsStore.slaCountsByRoute(from, to, applicationName, thresholdMs);
}
/**
 * Returns the top errors for the given window.
 *
 * <p>Pure delegation to {@code statsStore.topErrors}; arguments are passed
 * through unchanged.
 *
 * @param from            start of the time window
 * @param to              end of the time window
 * @param applicationName application filter forwarded to the store
 * @param routeId         route filter forwarded to the store
 *                        (NOTE(review): {@code null} handling is decided by
 *                        the store — confirm)
 * @param limit           maximum number of entries requested from the store
 * @return the list of {@link TopError} entries produced by the store
 */
public List<TopError> topErrors(Instant from, Instant to, String applicationName,
String routeId, int limit) {
return statsStore.topErrors(from, to, applicationName, routeId, limit);
}
/**
 * Returns the number of active error types for the given window.
 *
 * <p>Pure delegation to {@code statsStore.activeErrorTypes}; arguments are
 * passed through unchanged.
 *
 * @param from            start of the time window
 * @param to              end of the time window
 * @param applicationName application filter forwarded to the store
 * @return the count reported by the store
 */
public int activeErrorTypes(Instant from, Instant to, String applicationName) {
return statsStore.activeErrorTypes(from, to, applicationName);
}
}

View File

@@ -0,0 +1,12 @@
package com.cameleer3.server.core.search;
import java.time.Instant;
/**
 * One entry of a "top errors" listing.
 *
 * @param errorType   error type this entry aggregates
 * @param routeId     id of the route associated with the error
 * @param processorId id of the processor associated with the error
 * @param count       number of occurrences
 * @param velocity    error velocity (NOTE(review): unit, e.g. errors per
 *                    minute, is not visible here — confirm with the producer)
 * @param trend       trend label as a free-form string (NOTE(review): the set
 *                    of allowed values is defined by the producer — confirm)
 * @param lastSeen    timestamp of the most recent occurrence
 */
public record TopError(
String errorType,
String routeId,
String processorId,
long count,
double velocity,
String trend,
Instant lastSeen) {}

View File

@@ -2,9 +2,11 @@ package com.cameleer3.server.core.storage;
import com.cameleer3.server.core.search.ExecutionStats;
import com.cameleer3.server.core.search.StatsTimeseries;
import com.cameleer3.server.core.search.TopError;
import java.time.Instant;
import java.util.List;
import java.util.Map;
public interface StatsStore {
@@ -33,4 +35,29 @@ public interface StatsStore {
/**
 * Per-processor timeseries.
 *
 * @param from          start of the time window
 * @param to            end of the time window
 * @param bucketCount   number of buckets requested
 * @param routeId       route the processor belongs to
 * @param processorType processor type to report on
 * @return the timeseries for the selected processor type
 */
StatsTimeseries timeseriesForProcessor(Instant from, Instant to, int bucketCount,
String routeId, String processorType);
/**
 * Grouped timeseries by application (for L1 dashboard charts).
 *
 * @param from        start of the time window
 * @param to          end of the time window
 * @param bucketCount number of buckets requested
 * @return map with one timeseries per application
 */
Map<String, StatsTimeseries> timeseriesGroupedByApp(Instant from, Instant to, int bucketCount);
/**
 * Grouped timeseries by route within an application (for L2 dashboard charts).
 *
 * @param from            start of the time window
 * @param to              end of the time window
 * @param bucketCount     number of buckets requested
 * @param applicationName application whose routes are grouped
 * @return map with one timeseries per route
 */
Map<String, StatsTimeseries> timeseriesGroupedByRoute(Instant from, Instant to, int bucketCount,
String applicationName);
/**
 * SLA compliance: % of completed exchanges with duration {@code <= thresholdMs}.
 *
 * @param from            start of the time window
 * @param to              end of the time window
 * @param thresholdMs     SLA duration threshold in milliseconds (inclusive)
 * @param applicationName application filter
 * @param routeId         route filter (NOTE(review): whether {@code null}
 *                        means "all routes" is implementation-defined — confirm)
 * @return compliance percentage
 */
double slaCompliance(Instant from, Instant to, int thresholdMs,
String applicationName, String routeId);
/**
 * Batch SLA counts by app: {@code appId -> [compliant, total]}.
 *
 * @param from               start of the time window
 * @param to                 end of the time window
 * @param defaultThresholdMs threshold in milliseconds; presumably used for
 *                           apps without their own threshold — TODO confirm
 * @return map from app id to a two-element array {@code [compliant, total]}
 */
Map<String, long[]> slaCountsByApp(Instant from, Instant to, int defaultThresholdMs);
/**
 * Batch SLA counts by route within an app: {@code routeId -> [compliant, total]}.
 *
 * @param from            start of the time window
 * @param to              end of the time window
 * @param applicationName application whose routes are counted
 * @param thresholdMs     SLA duration threshold in milliseconds
 * @return map from route id to a two-element array {@code [compliant, total]}
 */
Map<String, long[]> slaCountsByRoute(Instant from, Instant to, String applicationName,
int thresholdMs);
/**
 * Top N errors with velocity trend.
 *
 * @param from            start of the time window
 * @param to              end of the time window
 * @param applicationName application filter
 * @param routeId         route filter (NOTE(review): {@code null} handling is
 *                        implementation-defined — confirm)
 * @param limit           N, the maximum number of errors to return
 * @return the top errors as {@link TopError} entries
 */
List<TopError> topErrors(Instant from, Instant to, String applicationName,
String routeId, int limit);
/**
 * Count of distinct error types in window.
 *
 * @param from            start of the time window
 * @param to              end of the time window
 * @param applicationName application filter
 * @return number of distinct error types
 */
int activeErrorTypes(Instant from, Instant to, String applicationName);
}