feat: progressive drill-down dashboard with RED metrics and SLA compliance (#94)
Three-level dashboard driven by sidebar selection: - L1 (no selection): all-apps overview with health table, per-app charts - L2 (app selected): route performance table, error velocity, top errors - L3 (route selected): processor table, latency heatmap data, bottleneck KPI Backend: 3 new endpoints (timeseries/by-app, timeseries/by-route, errors/top), per-app SLA settings (app_settings table, V12 migration), exact SLA compliance from executions hypertable, error velocity with acceleration detection. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
442
ui/src/pages/DashboardTab/DashboardL1.tsx
Normal file
442
ui/src/pages/DashboardTab/DashboardL1.tsx
Normal file
@@ -0,0 +1,442 @@
|
||||
import { useMemo } from 'react';
|
||||
import { useNavigate } from 'react-router';
|
||||
import {
|
||||
KpiStrip,
|
||||
DataTable,
|
||||
AreaChart,
|
||||
LineChart,
|
||||
Card,
|
||||
Sparkline,
|
||||
MonoText,
|
||||
StatusDot,
|
||||
Badge,
|
||||
} from '@cameleer/design-system';
|
||||
import type { KpiItem, Column } from '@cameleer/design-system';
|
||||
import { useGlobalFilters } from '@cameleer/design-system';
|
||||
import { useRouteMetrics } from '../../api/queries/catalog';
|
||||
import { useExecutionStats, useStatsTimeseries } from '../../api/queries/executions';
|
||||
import { useTimeseriesByApp, useTopErrors, useAllAppSettings } from '../../api/queries/dashboard';
|
||||
import type { AppSettings } from '../../api/queries/dashboard';
|
||||
import type { RouteMetrics } from '../../api/types';
|
||||
import {
|
||||
computeHealthDot,
|
||||
formatThroughput,
|
||||
formatSlaCompliance,
|
||||
trendIndicator,
|
||||
type HealthStatus,
|
||||
} from './dashboard-utils';
|
||||
import styles from './DashboardTab.module.css';
|
||||
|
||||
// ── Row type for application health table ───────────────────────────────────
|
||||
|
||||
/**
 * One row of the "Application Health" table: every route metric of a single
 * application folded into one record by `aggregateByApp`.
 */
interface AppRow {
  /** Row key; `aggregateByApp` sets it to the application id. */
  id: string;
  /** Application identifier shown in the "Application" column. */
  appId: string;
  /** Health indicator rendered as the status dot; also drives the default row order. */
  health: HealthStatus;
  /** Total exchange count over the selected window (a raw count, not a rate). */
  throughput: number;
  /** Display string for the throughput, produced by `formatThroughput`. */
  throughputLabel: string;
  /** Share of non-failed exchanges, as a percentage. */
  successRate: number;
  /** Exchange-weighted mean of the per-route P99 durations, in milliseconds. */
  p99DurationMs: number;
  /** Exchange-weighted mean of the per-route SLA compliance values, as a percentage. */
  slaCompliance: number;
  /** Estimated failed exchanges: sum of `exchangeCount * errorRate`, rounded. */
  errorCount: number;
  /** Per-bucket sum of the route sparklines. */
  sparkline: number[];
}
|
||||
|
||||
// ── Table columns ───────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * CSS class for a "higher is better" metric: good at or above `goodFrom`,
 * warn at or above `warnFrom`, bad otherwise.
 */
function classWhenAtLeast(value: number, goodFrom: number, warnFrom: number) {
  if (value >= goodFrom) return styles.rateGood;
  if (value >= warnFrom) return styles.rateWarn;
  return styles.rateBad;
}

/**
 * CSS class for a "lower is better" metric: bad strictly above `badAbove`,
 * warn strictly above `warnAbove`, good otherwise.
 */
function classWhenAbove(value: number, badAbove: number, warnAbove: number) {
  if (value > badAbove) return styles.rateBad;
  if (value > warnAbove) return styles.rateWarn;
  return styles.rateGood;
}

/**
 * Column definitions for the Application Health table.
 *
 * Colour thresholds: success 99 / 97 %, P99 300 / 200 ms, SLA 99 / 95 %,
 * errors 10 / 0. The status-dot and trend columns are not sortable.
 */
const APP_COLUMNS: Column<AppRow>[] = [
  {
    key: 'health',
    header: '',
    render: (_, row) => <StatusDot status={row.health} />,
  },
  {
    key: 'appId',
    header: 'Application',
    sortable: true,
    render: (_, row) => <span className={styles.appNameCell}>{row.appId}</span>,
  },
  {
    key: 'throughput',
    header: 'Throughput',
    sortable: true,
    // Sorting uses the numeric `throughput`; the cell shows the formatted label.
    render: (_, row) => <MonoText size="sm">{row.throughputLabel}</MonoText>,
  },
  {
    key: 'successRate',
    header: 'Success %',
    sortable: true,
    render: (_, row) => (
      <MonoText size="sm" className={classWhenAtLeast(row.successRate, 99, 97)}>{row.successRate.toFixed(1)}%</MonoText>
    ),
  },
  {
    key: 'p99DurationMs',
    header: 'P99',
    sortable: true,
    render: (_, row) => (
      <MonoText size="sm" className={classWhenAbove(row.p99DurationMs, 300, 200)}>{Math.round(row.p99DurationMs)}ms</MonoText>
    ),
  },
  {
    key: 'slaCompliance',
    header: 'SLA %',
    sortable: true,
    render: (_, row) => (
      <MonoText size="sm" className={classWhenAtLeast(row.slaCompliance, 99, 95)}>{formatSlaCompliance(row.slaCompliance)}</MonoText>
    ),
  },
  {
    key: 'errorCount',
    header: 'Errors',
    sortable: true,
    render: (_, row) => (
      <MonoText size="sm" className={classWhenAbove(row.errorCount, 10, 0)}>{row.errorCount.toLocaleString()}</MonoText>
    ),
  },
  {
    key: 'sparkline',
    header: 'Trend',
    render: (_, row) => <Sparkline data={row.sparkline} width={80} height={24} />,
  },
];
|
||||
|
||||
// ── Aggregate RouteMetrics by appId ─────────────────────────────────────────
|
||||
|
||||
/**
 * Collapses per-route metrics into one `AppRow` per application.
 *
 * Each route's `errorRate` is multiplied by its `exchangeCount` to estimate
 * failures. P99 and SLA compliance are exchange-weighted means of the
 * per-route values, with a missing `slaCompliance` counted as 100. Sparklines
 * are summed bucket by bucket.
 *
 * @param metrics Route-level metrics for every application.
 * @param windowSeconds Length of the selected time window, passed to `formatThroughput`.
 * @param settingsMap Per-application settings keyed by app id, passed to `computeHealthDot`.
 * @returns Rows ordered error → warning → success; ties keep first-seen order.
 */
function aggregateByApp(
  metrics: RouteMetrics[],
  windowSeconds: number,
  settingsMap: Map<string, AppSettings>,
): AppRow[] {
  // Bucket routes by application, preserving first-seen order.
  const routesByApp = new Map<string, RouteMetrics[]>();
  for (const metric of metrics) {
    const bucket = routesByApp.get(metric.appId);
    if (bucket) {
      bucket.push(metric);
    } else {
      routesByApp.set(metric.appId, [metric]);
    }
  }

  const healthRank: Record<HealthStatus, number> = { error: 0, warning: 1, success: 2 };

  const rows = Array.from(routesByApp, ([appId, routes]): AppRow => {
    // Single pass over the routes; each accumulator sums in route order.
    let totalExchanges = 0;
    let totalFailed = 0;
    let p99Weighted = 0;
    let slaWeighted = 0;
    let bucketCount = 0;
    for (const route of routes) {
      totalExchanges += route.exchangeCount;
      totalFailed += route.exchangeCount * route.errorRate;
      p99Weighted += route.p99DurationMs * route.exchangeCount;
      slaWeighted += (route.slaCompliance ?? 100) * route.exchangeCount;
      bucketCount = Math.max(bucketCount, (route.sparkline ?? []).length);
    }

    // With no traffic, fall back to "perfect" values instead of dividing by zero.
    const hasTraffic = totalExchanges > 0;
    const successRate = hasTraffic ? ((totalExchanges - totalFailed) / totalExchanges) * 100 : 100;
    const errorRate = hasTraffic ? totalFailed / totalExchanges : 0;
    const p99DurationMs = hasTraffic ? p99Weighted / totalExchanges : 0;
    const slaCompliance = hasTraffic ? slaWeighted / totalExchanges : 100;

    // Routes with shorter (or missing) sparklines contribute 0 to the extra buckets.
    const sparkline = Array.from({ length: bucketCount }, (_, i) =>
      routes.reduce((sum, route) => sum + (route.sparkline?.[i] ?? 0), 0),
    );

    return {
      id: appId,
      appId,
      health: computeHealthDot(errorRate, slaCompliance, settingsMap.get(appId)),
      throughput: totalExchanges,
      throughputLabel: formatThroughput(totalExchanges, windowSeconds),
      successRate,
      p99DurationMs,
      slaCompliance,
      errorCount: Math.round(totalFailed),
      sparkline,
    };
  });

  // Worst health first.
  return rows.sort((a, b) => healthRank[a.health] - healthRank[b.health]);
}
|
||||
|
||||
// ── Build KPI items ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Builds the five KPI cards shown at the top of the dashboard.
 *
 * @param stats Current- and previous-window totals; when `undefined`, every figure falls back to 0.
 * @param windowSeconds Length of the selected window in seconds; a non-positive value yields zero throughput.
 * @param slaCompliance SLA compliance percentage to display.
 * @param activeErrorCount Number shown on the "Active Errors" card.
 * @param throughputSparkline Sparkline for the Throughput card.
 * @param successSparkline Sparkline for the Success Rate card.
 * @param latencySparkline Sparkline for the P99 Latency card.
 * @param slaSparkline Sparkline for the SLA Compliance card.
 * @param errorSparkline Sparkline for the Active Errors card.
 * @returns KPI items in display order: Throughput, Success Rate, P99 Latency, SLA Compliance, Active Errors.
 */
function buildKpiItems(
  stats: {
    totalCount: number;
    failedCount: number;
    p99LatencyMs: number;
    prevTotalCount: number;
    prevFailedCount: number;
    prevP99LatencyMs: number;
  } | undefined,
  windowSeconds: number,
  slaCompliance: number,
  activeErrorCount: number,
  throughputSparkline: number[],
  successSparkline: number[],
  latencySparkline: number[],
  slaSparkline: number[],
  errorSparkline: number[],
): KpiItem[] {
  type Tone = 'success' | 'warning' | 'error';

  // `part` as a percentage of `whole`, or `whenEmpty` when there is nothing to divide by.
  const pctOf = (part: number, whole: number, whenEmpty: number): number =>
    whole > 0 ? (part / whole) * 100 : whenEmpty;
  // Per-second rate over the window; zero when the window is empty or inverted.
  const perSecond = (count: number): number => (windowSeconds > 0 ? count / windowSeconds : 0);
  // Tone for "higher is better" percentages.
  const toneAtLeast = (value: number, goodFrom: number, warnFrom: number): Tone =>
    value >= goodFrom ? 'success' : value >= warnFrom ? 'warning' : 'error';

  const totalCount = stats?.totalCount ?? 0;
  const failedCount = stats?.failedCount ?? 0;
  const prevTotalCount = stats?.prevTotalCount ?? 0;
  const prevFailedCount = stats?.prevFailedCount ?? 0;
  const p99Ms = stats?.p99LatencyMs ?? 0;
  const prevP99Ms = stats?.prevP99LatencyMs ?? 0;

  // Throughput: current vs. previous window, both divided by the same window length.
  const throughputTrend = trendIndicator(perSecond(totalCount), perSecond(prevTotalCount));
  let throughputVariant: 'success' | 'error' | 'muted' = 'muted';
  if (throughputTrend.direction === 'up') {
    throughputVariant = 'success';
  } else if (throughputTrend.direction === 'down') {
    throughputVariant = 'error';
  }

  // Success rate: an empty window counts as 100 %.
  const successPct = pctOf(totalCount - failedCount, totalCount, 100);
  const prevSuccessPct = pctOf(prevTotalCount - prevFailedCount, prevTotalCount, 100);
  const successTrend = trendIndicator(successPct, prevSuccessPct);

  // P99 latency.
  const p99Trend = trendIndicator(p99Ms, prevP99Ms);
  const p99Variant: Tone = p99Ms > 300 ? 'error' : p99Ms > 200 ? 'warning' : 'success';

  // SLA: the trend is measured against a fixed 100 % baseline
  // (`stats` carries no previous-window SLA value).
  const slaTrend = trendIndicator(slaCompliance, 100);

  // Errors: the trend compares error *rates*, while the card value is `activeErrorCount`.
  const prevErrorRate = pctOf(prevFailedCount, prevTotalCount, 0);
  const currentErrorRate = pctOf(failedCount, totalCount, 0);
  const errorTrend = trendIndicator(currentErrorRate, prevErrorRate);
  const noActiveErrors = activeErrorCount === 0;

  const throughputCard = {
    label: 'Throughput',
    value: formatThroughput(totalCount, windowSeconds),
    trend: { label: throughputTrend.label, variant: throughputVariant },
    subtitle: `${totalCount.toLocaleString()} msg total`,
    sparkline: throughputSparkline,
    borderColor: 'var(--amber)',
  };

  const successCard = {
    label: 'Success Rate',
    value: `${successPct.toFixed(1)}%`,
    trend: { label: successTrend.label, variant: toneAtLeast(successPct, 99, 97) },
    subtitle: `${(totalCount - failedCount).toLocaleString()} succeeded`,
    sparkline: successSparkline,
    borderColor: successPct >= 99 ? 'var(--success)' : 'var(--error)',
  };

  const latencyCard = {
    label: 'P99 Latency',
    value: `${Math.round(p99Ms)}ms`,
    trend: { label: p99Trend.label, variant: p99Variant },
    subtitle: `prev ${Math.round(prevP99Ms)}ms`,
    sparkline: latencySparkline,
    borderColor: p99Ms > 300 ? 'var(--warning)' : 'var(--success)',
  };

  const slaCard = {
    label: 'SLA Compliance',
    value: formatSlaCompliance(slaCompliance),
    trend: { label: slaTrend.label, variant: toneAtLeast(slaCompliance, 99, 95) },
    subtitle: 'P99 within threshold',
    sparkline: slaSparkline,
    borderColor: slaCompliance >= 99 ? 'var(--success)' : 'var(--warning)',
  };

  const errorVariant: Tone = noActiveErrors ? 'success' : 'error';
  const errorsCard = {
    label: 'Active Errors',
    value: String(activeErrorCount),
    trend: { label: errorTrend.label, variant: errorVariant },
    subtitle: `${failedCount.toLocaleString()} failures total`,
    sparkline: errorSparkline,
    borderColor: noActiveErrors ? 'var(--success)' : 'var(--error)',
  };

  return [throughputCard, successCard, latencyCard, slaCard, errorsCard];
}
|
||||
|
||||
// ── Component ───────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Level-1 dashboard (no sidebar selection): an all-applications overview with
 * a KPI strip, the Application Health table and two per-application charts.
 *
 * All queries are scoped to the global time range from `useGlobalFilters`.
 * Clicking a table row navigates to `/dashboard/<appId>`.
 */
export default function DashboardL1() {
  const navigate = useNavigate();
  const { timeRange } = useGlobalFilters();
  const timeFrom = timeRange.start.toISOString();
  const timeTo = timeRange.end.toISOString();
  const windowSeconds = (timeRange.end.getTime() - timeRange.start.getTime()) / 1000;

  const { data: metrics } = useRouteMetrics(timeFrom, timeTo);
  const { data: stats } = useExecutionStats(timeFrom, timeTo);
  const { data: timeseries } = useStatsTimeseries(timeFrom, timeTo);
  const { data: timeseriesByApp } = useTimeseriesByApp(timeFrom, timeTo);
  const { data: topErrors } = useTopErrors(timeFrom, timeTo);
  const { data: allAppSettings } = useAllAppSettings();

  // Per-application settings keyed by app id.
  const settingsMap = useMemo(() => {
    const map = new Map<string, AppSettings>();
    for (const settings of allAppSettings ?? []) {
      map.set(settings.appId, settings);
    }
    return map;
  }, [allAppSettings]);

  // One table row per application, aggregated from the route-level metrics.
  const appRows = useMemo(
    () => aggregateByApp(metrics ?? [], windowSeconds, settingsMap),
    [metrics, windowSeconds, settingsMap],
  );

  // Global SLA compliance from the stats payload. The field is read through an
  // untyped view of `stats`, so it is narrowed at runtime: anything that is not
  // a non-negative number (missing, null, negative, NaN) is shown as 100.
  const rawSlaCompliance = (stats as Record<string, unknown> | undefined)?.slaCompliance;
  const effectiveSlaCompliance =
    typeof rawSlaCompliance === 'number' && rawSlaCompliance >= 0 ? rawSlaCompliance : 100;

  // Active error count = number of entries returned by the top-errors query.
  const activeErrorCount = (topErrors ?? []).length;

  // KPI sparklines, one point per timeseries bucket.
  const throughputSparkline = useMemo(
    () => (timeseries?.buckets ?? []).map((b) => b.totalCount),
    [timeseries],
  );
  const successSparkline = useMemo(
    () =>
      (timeseries?.buckets ?? []).map((b) =>
        b.totalCount > 0 ? ((b.totalCount - b.failedCount) / b.totalCount) * 100 : 100,
      ),
    [timeseries],
  );
  const latencySparkline = useMemo(
    () => (timeseries?.buckets ?? []).map((b) => b.p99DurationMs),
    [timeseries],
  );
  // NOTE(review): binary 100/0 per bucket against a fixed 300 ms threshold; the
  // per-app SLA settings are not consulted here — confirm this is intended.
  const slaSparkline = useMemo(
    () => (timeseries?.buckets ?? []).map((b) => (b.p99DurationMs <= 300 ? 100 : 0)),
    [timeseries],
  );
  const errorSparkline = useMemo(
    () => (timeseries?.buckets ?? []).map((b) => b.failedCount),
    [timeseries],
  );

  const kpiItems = useMemo(
    () =>
      buildKpiItems(
        stats,
        windowSeconds,
        effectiveSlaCompliance,
        activeErrorCount,
        throughputSparkline,
        successSparkline,
        latencySparkline,
        slaSparkline,
        errorSparkline,
      ),
    [
      stats,
      windowSeconds,
      effectiveSlaCompliance,
      activeErrorCount,
      throughputSparkline,
      successSparkline,
      latencySparkline,
      slaSparkline,
      errorSparkline,
    ],
  );

  // Per-app series for the stacked throughput chart; x is the bucket index.
  // NOTE(review): y is the raw exchange count per bucket although the chart is
  // labelled "msg/s" — confirm whether it should be divided by the bucket width.
  const throughputByAppSeries = useMemo(() => {
    if (!timeseriesByApp) return [];
    return Object.entries(timeseriesByApp).map(([appId, { buckets }]) => ({
      label: appId,
      data: buckets.map((b, i) => ({ x: i, y: b.totalCount })),
    }));
  }, [timeseriesByApp]);

  // Per-app series for the error-rate chart (percentage; 0 for empty buckets).
  const errorRateByAppSeries = useMemo(() => {
    if (!timeseriesByApp) return [];
    return Object.entries(timeseriesByApp).map(([appId, { buckets }]) => ({
      label: appId,
      data: buckets.map((b, i) => ({
        x: i,
        y: b.totalCount > 0 ? (b.failedCount / b.totalCount) * 100 : 0,
      })),
    }));
  }, [timeseriesByApp]);

  return (
    <div className={styles.content}>
      {/* Static label; the refresh interval itself is not configured in this component. */}
      <div className={styles.refreshIndicator}>
        <span className={styles.refreshDot} />
        <span className={styles.refreshText}>Auto-refresh: 30s</span>
      </div>

      {/* KPI header cards */}
      <KpiStrip items={kpiItems} />

      {/* Application Health table */}
      <div className={styles.tableSection}>
        <div className={styles.tableHeader}>
          <span className={styles.tableTitle}>Application Health</span>
          <div className={styles.tableRight}>
            <span className={styles.tableMeta}>{appRows.length} applications</span>
            <Badge label="ALL" color="muted" />
          </div>
        </div>
        <DataTable
          columns={APP_COLUMNS}
          data={appRows}
          sortable
          // Encode the id so characters such as "/", "?", "#" or spaces cannot
          // change the route that is matched.
          onRowClick={(row) => navigate(`/dashboard/${encodeURIComponent(row.appId)}`)}
        />
      </div>

      {/* Side-by-side charts; hidden until per-app timeseries data is available */}
      {throughputByAppSeries.length > 0 && (
        <div className={styles.chartGrid}>
          <Card title="Throughput by Application (msg/s)">
            <AreaChart
              series={throughputByAppSeries}
              yLabel="msg/s"
              stacked
              height={200}
              className={styles.chart}
            />
          </Card>

          <Card title="Error Rate by Application (%)">
            <LineChart
              series={errorRateByAppSeries}
              yLabel="%"
              height={200}
              className={styles.chart}
            />
          </Card>
        </div>
      )}
    </div>
  );
}
|
||||
Reference in New Issue
Block a user