feat: progressive drill-down dashboard with RED metrics and SLA compliance (#94)
Three-level dashboard driven by sidebar selection:
- L1 (no selection): all-apps overview with health table, per-app charts
- L2 (app selected): route performance table, error velocity, top errors
- L3 (route selected): processor table, latency heatmap data, bottleneck KPI

Backend: 3 new endpoints (timeseries/by-app, timeseries/by-route, errors/top), per-app SLA settings (app_settings table, V12 migration), exact SLA compliance from executions hypertable, error velocity with acceleration detection.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,19 @@
|
||||
package com.cameleer3.server.core.admin;
|
||||
|
||||
import java.time.Instant;
|
||||
|
||||
/**
 * Per-application dashboard settings: the SLA duration threshold plus the
 * warning/critical thresholds used for health classification.
 *
 * <p>NOTE(review): the units of the four {@code health*} thresholds are not
 * visible in this file; the default values (1.0 / 5.0 and 99.0 / 95.0) suggest
 * percentages. Confirm against the code that consumes them.
 *
 * @param appId           identifier of the application these settings apply to
 * @param slaThresholdMs  SLA duration threshold, in milliseconds
 * @param healthErrorWarn error threshold for the "warning" health level
 * @param healthErrorCrit error threshold for the "critical" health level
 * @param healthSlaWarn   SLA threshold for the "warning" health level
 * @param healthSlaCrit   SLA threshold for the "critical" health level
 * @param createdAt       creation timestamp of this settings record
 * @param updatedAt       last-update timestamp of this settings record
 */
public record AppSettings(
        String appId,
        int slaThresholdMs,
        double healthErrorWarn,
        double healthErrorCrit,
        double healthSlaWarn,
        double healthSlaCrit,
        Instant createdAt,
        Instant updatedAt) {

    // Default values applied by defaults(String); named so the call site is self-describing.
    private static final int DEFAULT_SLA_THRESHOLD_MS = 300;
    private static final double DEFAULT_HEALTH_ERROR_WARN = 1.0;
    private static final double DEFAULT_HEALTH_ERROR_CRIT = 5.0;
    private static final double DEFAULT_HEALTH_SLA_WARN = 99.0;
    private static final double DEFAULT_HEALTH_SLA_CRIT = 95.0;

    /**
     * Creates settings for {@code appId} populated with the built-in defaults.
     * {@code createdAt} and {@code updatedAt} are both set to the same
     * {@link Instant#now()} reading.
     *
     * @param appId identifier of the application; passed through unchanged
     * @return a new settings instance carrying the default thresholds
     */
    public static AppSettings defaults(String appId) {
        // Read the clock once so both timestamps are guaranteed to be equal.
        Instant now = Instant.now();
        return new AppSettings(
                appId,
                DEFAULT_SLA_THRESHOLD_MS,
                DEFAULT_HEALTH_ERROR_WARN,
                DEFAULT_HEALTH_ERROR_CRIT,
                DEFAULT_HEALTH_SLA_WARN,
                DEFAULT_HEALTH_SLA_CRIT,
                now,
                now);
    }
}
|
||||
@@ -0,0 +1,11 @@
|
||||
package com.cameleer3.server.core.admin;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
public interface AppSettingsRepository {
|
||||
Optional<AppSettings> findByAppId(String appId);
|
||||
List<AppSettings> findAll();
|
||||
AppSettings save(AppSettings settings);
|
||||
void delete(String appId);
|
||||
}
|
||||
@@ -14,4 +14,23 @@ public record ExecutionStats(
|
||||
long prevTotalCount,
|
||||
long prevFailedCount,
|
||||
long prevAvgDurationMs,
|
||||
long prevP99LatencyMs) {}
|
||||
long prevP99LatencyMs,
|
||||
double slaCompliance) {
|
||||
|
||||
/**
 * Backward-compatible constructor for callers that do not supply SLA compliance.
 *
 * <p>Delegates to the canonical constructor, passing every argument through
 * unchanged and setting {@code slaCompliance} to the sentinel {@code -1.0}.
 */
public ExecutionStats(long totalCount, long failedCount, long avgDurationMs,
                      long p99LatencyMs, long activeCount, long totalToday,
                      long prevTotalCount, long prevFailedCount,
                      long prevAvgDurationMs, long prevP99LatencyMs) {
    this(totalCount,
            failedCount,
            avgDurationMs,
            p99LatencyMs,
            activeCount,
            totalToday,
            prevTotalCount,
            prevFailedCount,
            prevAvgDurationMs,
            prevP99LatencyMs,
            -1.0); // sentinel: SLA compliance not supplied
}
|
||||
|
||||
/** Return a copy with the given SLA compliance value. */
|
||||
public ExecutionStats withSlaCompliance(double slaCompliance) {
|
||||
return new ExecutionStats(totalCount, failedCount, avgDurationMs, p99LatencyMs,
|
||||
activeCount, totalToday, prevTotalCount, prevFailedCount,
|
||||
prevAvgDurationMs, prevP99LatencyMs, slaCompliance);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ import com.cameleer3.server.core.storage.StatsStore;
|
||||
|
||||
import java.time.Instant;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class SearchService {
|
||||
|
||||
@@ -48,4 +49,38 @@ public class SearchService {
|
||||
String routeId, List<String> agentIds) {
|
||||
return statsStore.timeseriesForRoute(from, to, bucketCount, routeId, agentIds);
|
||||
}
|
||||
|
||||
// ── Dashboard-specific queries ────────────────────────────────────────
|
||||
|
||||
public Map<String, StatsTimeseries> timeseriesGroupedByApp(Instant from, Instant to, int bucketCount) {
|
||||
return statsStore.timeseriesGroupedByApp(from, to, bucketCount);
|
||||
}
|
||||
|
||||
public Map<String, StatsTimeseries> timeseriesGroupedByRoute(Instant from, Instant to,
|
||||
int bucketCount, String applicationName) {
|
||||
return statsStore.timeseriesGroupedByRoute(from, to, bucketCount, applicationName);
|
||||
}
|
||||
|
||||
public double slaCompliance(Instant from, Instant to, int thresholdMs,
|
||||
String applicationName, String routeId) {
|
||||
return statsStore.slaCompliance(from, to, thresholdMs, applicationName, routeId);
|
||||
}
|
||||
|
||||
public Map<String, long[]> slaCountsByApp(Instant from, Instant to, int defaultThresholdMs) {
|
||||
return statsStore.slaCountsByApp(from, to, defaultThresholdMs);
|
||||
}
|
||||
|
||||
public Map<String, long[]> slaCountsByRoute(Instant from, Instant to,
|
||||
String applicationName, int thresholdMs) {
|
||||
return statsStore.slaCountsByRoute(from, to, applicationName, thresholdMs);
|
||||
}
|
||||
|
||||
public List<TopError> topErrors(Instant from, Instant to, String applicationName,
|
||||
String routeId, int limit) {
|
||||
return statsStore.topErrors(from, to, applicationName, routeId, limit);
|
||||
}
|
||||
|
||||
public int activeErrorTypes(Instant from, Instant to, String applicationName) {
|
||||
return statsStore.activeErrorTypes(from, to, applicationName);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
package com.cameleer3.server.core.search;
|
||||
|
||||
import java.time.Instant;
|
||||
|
||||
/**
 * One entry of a "top errors" listing.
 *
 * <p>NOTE(review): the unit of {@code velocity} and the set of values
 * {@code trend} can take are not defined in this file; confirm against the
 * store implementation that populates this record.
 *
 * @param errorType   type of the error
 * @param routeId     id of the route the error is attributed to
 * @param processorId id of the processor the error is attributed to
 * @param count       number of occurrences
 * @param velocity    error velocity figure computed by the store
 * @param trend       textual trend indicator computed by the store
 * @param lastSeen    timestamp of the most recent occurrence
 */
public record TopError(
        String errorType,
        String routeId,
        String processorId,
        long count,
        double velocity,
        String trend,
        Instant lastSeen) {
}
|
||||
@@ -2,9 +2,11 @@ package com.cameleer3.server.core.storage;
|
||||
|
||||
import com.cameleer3.server.core.search.ExecutionStats;
|
||||
import com.cameleer3.server.core.search.StatsTimeseries;
|
||||
import com.cameleer3.server.core.search.TopError;
|
||||
|
||||
import java.time.Instant;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public interface StatsStore {
|
||||
|
||||
@@ -33,4 +35,29 @@ public interface StatsStore {
|
||||
// Per-processor timeseries
|
||||
StatsTimeseries timeseriesForProcessor(Instant from, Instant to, int bucketCount,
|
||||
String routeId, String processorType);
|
||||
|
||||
// Grouped timeseries by application (for L1 dashboard charts)
|
||||
Map<String, StatsTimeseries> timeseriesGroupedByApp(Instant from, Instant to, int bucketCount);
|
||||
|
||||
// Grouped timeseries by route within an application (for L2 dashboard charts)
|
||||
Map<String, StatsTimeseries> timeseriesGroupedByRoute(Instant from, Instant to, int bucketCount,
|
||||
String applicationName);
|
||||
|
||||
// SLA compliance: % of completed exchanges with duration <= thresholdMs
|
||||
double slaCompliance(Instant from, Instant to, int thresholdMs,
|
||||
String applicationName, String routeId);
|
||||
|
||||
// Batch SLA counts by app: {appId -> [compliant, total]}
|
||||
Map<String, long[]> slaCountsByApp(Instant from, Instant to, int defaultThresholdMs);
|
||||
|
||||
// Batch SLA counts by route within an app: {routeId -> [compliant, total]}
|
||||
Map<String, long[]> slaCountsByRoute(Instant from, Instant to, String applicationName,
|
||||
int thresholdMs);
|
||||
|
||||
// Top N errors with velocity trend
|
||||
List<TopError> topErrors(Instant from, Instant to, String applicationName,
|
||||
String routeId, int limit);
|
||||
|
||||
// Count of distinct error types in window
|
||||
int activeErrorTypes(Instant from, Instant to, String applicationName);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user