feat: progressive drill-down dashboard with RED metrics and SLA compliance (#94)
Three-level dashboard driven by sidebar selection:
- L1 (no selection): all-apps overview with health table, per-app charts
- L2 (app selected): route performance table, error velocity, top errors
- L3 (route selected): processor table, latency heatmap data, bottleneck KPI

Backend: 3 new endpoints (timeseries/by-app, timeseries/by-route, errors/top), per-app SLA settings (app_settings table, V12 migration), exact SLA compliance from executions hypertable, error velocity with acceleration detection.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
434
ui/src/pages/DashboardTab/DashboardL3.tsx
Normal file
434
ui/src/pages/DashboardTab/DashboardL3.tsx
Normal file
@@ -0,0 +1,434 @@
|
||||
import { useMemo } from 'react';
|
||||
import { useParams } from 'react-router';
|
||||
import {
|
||||
KpiStrip,
|
||||
DataTable,
|
||||
AreaChart,
|
||||
LineChart,
|
||||
Card,
|
||||
MonoText,
|
||||
Badge,
|
||||
} from '@cameleer/design-system';
|
||||
import type { KpiItem, Column } from '@cameleer/design-system';
|
||||
import { useGlobalFilters } from '@cameleer/design-system';
|
||||
import { useExecutionStats, useStatsTimeseries } from '../../api/queries/executions';
|
||||
import { useProcessorMetrics } from '../../api/queries/processor-metrics';
|
||||
import { useTopErrors, useAppSettings } from '../../api/queries/dashboard';
|
||||
import type { TopError } from '../../api/queries/dashboard';
|
||||
import { useDiagramByRoute } from '../../api/queries/diagrams';
|
||||
import { ProcessDiagram } from '../../components/ProcessDiagram';
|
||||
import {
|
||||
formatRelativeTime,
|
||||
trendArrow,
|
||||
formatThroughput,
|
||||
formatSlaCompliance,
|
||||
trendIndicator,
|
||||
} from './dashboard-utils';
|
||||
import styles from './DashboardTab.module.css';
|
||||
|
||||
// ── Row types ───────────────────────────────────────────────────────────────
|
||||
|
||||
/** One row of the processor metrics table. */
interface ProcessorRow {
  /** Row key for the table; populated with the processor id when rows are built. */
  id: string;
  processorId: string;
  processorType: string;
  /** Invocation count, shown in the "Invocations" column. */
  totalCount: number;
  avgDurationMs: number;
  p99DurationMs: number;
  /** Fraction rather than a percentage: the column renderer multiplies it by 100. */
  errorRate: number;
  /** Processor average duration as a percentage of the route's average duration. */
  pctTime: number;
}
|
||||
|
||||
/** A top-error entry extended with the `id` row key that the table rows carry. */
interface ErrorRow extends TopError {
  id: string;
}
|
||||
|
||||
// ── Processor table columns ─────────────────────────────────────────────────
|
||||
|
||||
/**
 * Column definitions for the processor metrics table.
 *
 * Every column is sortable. Two columns are colour-coded through the
 * `rateBad` / `rateWarn` / `rateGood` style classes:
 *  - P99: above 300 ms is bad, above 200 ms is a warning, otherwise good.
 *  - Error rate: above 5 % is bad, above 1 % is a warning, otherwise good.
 */
const PROCESSOR_COLUMNS: Column<ProcessorRow>[] = [
  {
    key: 'processorId',
    header: 'Processor ID',
    sortable: true,
    render: (_, row) => <MonoText size="sm">{row.processorId}</MonoText>,
  },
  {
    key: 'processorType',
    header: 'Type',
    sortable: true,
    render: (_, row) => <Badge label={row.processorType} color="muted" />,
  },
  {
    key: 'totalCount',
    header: 'Invocations',
    sortable: true,
    render: (_, row) => (
      <MonoText size="sm">{row.totalCount.toLocaleString()}</MonoText>
    ),
  },
  {
    key: 'avgDurationMs',
    header: 'Avg(ms)',
    sortable: true,
    render: (_, row) => (
      <MonoText size="sm">{Math.round(row.avgDurationMs)}</MonoText>
    ),
  },
  {
    key: 'p99DurationMs',
    header: 'P99(ms)',
    sortable: true,
    render: (_, row) => {
      const cls = row.p99DurationMs > 300
        ? styles.rateBad
        : row.p99DurationMs > 200
          ? styles.rateWarn
          : styles.rateGood;
      return <MonoText size="sm" className={cls}>{Math.round(row.p99DurationMs)}</MonoText>;
    },
  },
  {
    key: 'errorRate',
    header: 'Error Rate(%)',
    sortable: true,
    render: (_, row) => {
      // errorRate is stored as a fraction; convert to a percentage for display.
      const pct = row.errorRate * 100;
      const cls = pct > 5 ? styles.rateBad : pct > 1 ? styles.rateWarn : styles.rateGood;
      return <MonoText size="sm" className={cls}>{pct.toFixed(2)}%</MonoText>;
    },
  },
  {
    key: 'pctTime',
    header: '% Time',
    sortable: true,
    render: (_, row) => (
      <MonoText size="sm">{row.pctTime.toFixed(1)}%</MonoText>
    ),
  },
];
|
||||
|
||||
// ── Error table columns ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Column definitions for the top-errors table.
 *
 * All columns except "Velocity" are sortable. A missing processor id is
 * rendered as an em dash.
 */
const ERROR_COLUMNS: Column<ErrorRow>[] = [
  {
    key: 'errorType',
    header: 'Error Type',
    sortable: true,
    render: (_, row) => <MonoText size="sm">{row.errorType}</MonoText>,
  },
  {
    key: 'processorId',
    header: 'Processor',
    sortable: true,
    render: (_, row) => (
      <MonoText size="sm">{row.processorId ?? '\u2014'}</MonoText>
    ),
  },
  {
    key: 'count',
    header: 'Count',
    sortable: true,
    render: (_, row) => (
      <MonoText size="sm">{row.count.toLocaleString()}</MonoText>
    ),
  },
  {
    key: 'trend',
    header: 'Velocity',
    render: (_, row) => (
      <span>{trendArrow(row.trend)} {row.trend}</span>
    ),
  },
  {
    key: 'lastSeen',
    header: 'Last Seen',
    sortable: true,
    render: (_, row) => (
      <span>{formatRelativeTime(row.lastSeen)}</span>
    ),
  },
];
|
||||
|
||||
// ── Build KPI items ─────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Builds the five KPI cards of the route-level dashboard: Throughput,
 * Success Rate, P99 Latency, SLA Compliance and Bottleneck.
 *
 * @param stats Aggregated execution stats for the window; when `undefined`
 *   every count and duration falls back to 0.
 * @param slaThresholdMs Latency threshold the P99 value is compared against.
 * @param bottleneck Slowest processor summary, or `null` when no processor
 *   data is available.
 * @param throughputSparkline Data points for the throughput card sparkline.
 * @param windowSeconds Length of the selected time window in seconds, passed
 *   to `formatThroughput` together with the total count.
 * @returns The KPI items in display order.
 */
function buildKpiItems(
  stats: {
    totalCount: number;
    failedCount: number;
    avgDurationMs: number;
    p99LatencyMs: number;
    activeCount: number;
    prevTotalCount: number;
    prevFailedCount: number;
    prevP99LatencyMs: number;
  } | undefined,
  slaThresholdMs: number,
  bottleneck: { processorId: string; avgMs: number; pct: number } | null,
  throughputSparkline: number[],
  windowSeconds: number,
): KpiItem[] {
  const totalCount = stats?.totalCount ?? 0;
  const failedCount = stats?.failedCount ?? 0;
  const prevTotalCount = stats?.prevTotalCount ?? 0;
  const p99Ms = stats?.p99LatencyMs ?? 0;
  const avgMs = stats?.avgDurationMs ?? 0;
  const succeededCount = totalCount - failedCount;

  // With no traffic in the window there is nothing that failed: report 100 %.
  const successRate = totalCount > 0 ? (succeededCount / totalCount) * 100 : 100;
  // NOTE(review): SLA compliance is currently the same figure as the success
  // rate; slaThresholdMs is not part of this calculation. Confirm whether a
  // latency-based compliance value should be supplied by the stats payload.
  const slaCompliance = successRate;

  const throughputTrend = trendIndicator(totalCount, prevTotalCount);
  const slaBreached = p99Ms > slaThresholdMs;

  return [
    {
      label: 'Throughput',
      value: formatThroughput(totalCount, windowSeconds),
      trend: {
        label: throughputTrend.label,
        variant: throughputTrend.direction === 'up'
          ? 'success' as const
          : throughputTrend.direction === 'down'
            ? 'error' as const
            : 'muted' as const,
      },
      subtitle: `${totalCount.toLocaleString()} total exchanges`,
      sparkline: throughputSparkline,
      borderColor: 'var(--amber)',
    },
    {
      label: 'Success Rate',
      value: `${successRate.toFixed(2)}%`,
      trend: {
        // Counts are locale-formatted so they match the total shown beside them.
        label: failedCount > 0 ? `${failedCount.toLocaleString()} failed` : 'No errors',
        variant: successRate >= 99
          ? 'success' as const
          : successRate >= 97
            ? 'warning' as const
            : 'error' as const,
      },
      subtitle: `${succeededCount.toLocaleString()} succeeded / ${totalCount.toLocaleString()} total`,
      borderColor: successRate >= 99 ? 'var(--success)' : 'var(--error)',
    },
    {
      label: 'P99 Latency',
      value: `${Math.round(p99Ms)}ms`,
      trend: {
        label: slaBreached ? 'BREACH' : 'OK',
        variant: slaBreached ? 'error' as const : 'success' as const,
      },
      subtitle: `SLA threshold: ${slaThresholdMs}ms \u00B7 Avg: ${Math.round(avgMs)}ms`,
      borderColor: slaBreached ? 'var(--warning)' : 'var(--success)',
    },
    {
      label: 'SLA Compliance',
      value: formatSlaCompliance(slaCompliance),
      trend: {
        label: slaCompliance >= 99.9 ? 'Excellent' : slaCompliance >= 99 ? 'Good' : 'Degraded',
        variant: slaCompliance >= 99
          ? 'success' as const
          : slaCompliance >= 95
            ? 'warning' as const
            : 'error' as const,
      },
      subtitle: `Target: 99.9%`,
      borderColor: slaCompliance >= 99 ? 'var(--success)' : 'var(--warning)',
    },
    {
      label: 'Bottleneck',
      value: bottleneck ? `${Math.round(bottleneck.avgMs)}ms` : '\u2014',
      trend: {
        label: bottleneck ? `${bottleneck.pct.toFixed(1)}% of total` : '\u2014',
        variant: bottleneck && bottleneck.pct > 50 ? 'error' as const : 'muted' as const,
      },
      subtitle: bottleneck
        ? `${bottleneck.processorId} \u00B7 ${Math.round(bottleneck.avgMs)}ms \u00B7 ${bottleneck.pct.toFixed(1)}% of total`
        : 'No processor data',
      borderColor: 'var(--running)',
    },
  ];
}
|
||||
|
||||
// ── Component ───────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Route-level (L3) dashboard view.
 *
 * Reads `appId` / `routeId` from the URL and the time range from the global
 * filters, then renders, top to bottom: the KPI strip, three timeseries
 * charts (only when timeseries buckets exist), the process diagram with a
 * latency heatmap (only when both ids are present), the processor metrics
 * table and the top-errors table (only when there are errors).
 */
export default function DashboardL3() {
  const { appId, routeId } = useParams<{ appId: string; routeId: string }>();
  const { timeRange } = useGlobalFilters();
  const timeFrom = timeRange.start.toISOString();
  const timeTo = timeRange.end.toISOString();
  const windowSeconds = (timeRange.end.getTime() - timeRange.start.getTime()) / 1000;

  // ── Data hooks ──────────────────────────────────────────────────────────
  const { data: stats } = useExecutionStats(timeFrom, timeTo, routeId, appId);
  const { data: timeseries } = useStatsTimeseries(timeFrom, timeTo, routeId, appId);
  const { data: processorMetrics } = useProcessorMetrics(routeId ?? null, appId);
  const { data: topErrors } = useTopErrors(timeFrom, timeTo, appId, routeId);
  const { data: diagramLayout } = useDiagramByRoute(appId, routeId);
  const { data: appSettings } = useAppSettings(appId);

  // Fall back to 300 ms when the app has no SLA threshold configured.
  const slaThresholdMs = appSettings?.slaThresholdMs ?? 300;
  // Route-wide average duration; the denominator for per-processor percentages.
  const routeAvgMs = stats?.avgDurationMs ?? 0;

  // ── Bottleneck (processor with highest avgDurationMs) ───────────────────
  const bottleneck = useMemo(() => {
    if (!processorMetrics?.length) return null;
    // Single pass; the strict comparison keeps the first processor on ties.
    const top = processorMetrics.reduce(
      (slowest: any, m: any) => (m.avgDurationMs > slowest.avgDurationMs ? m : slowest),
    );
    const pct = routeAvgMs > 0 ? (top.avgDurationMs / routeAvgMs) * 100 : 0;
    return { processorId: top.processorId, avgMs: top.avgDurationMs, pct };
  }, [processorMetrics, routeAvgMs]);

  // ── Sparklines from timeseries ──────────────────────────────────────────
  const throughputSparkline = useMemo(
    () => (timeseries?.buckets || []).map((b: any) => b.totalCount),
    [timeseries],
  );

  // ── KPI strip ───────────────────────────────────────────────────────────
  const kpiItems = useMemo(
    () => buildKpiItems(stats, slaThresholdMs, bottleneck, throughputSparkline, windowSeconds),
    [stats, slaThresholdMs, bottleneck, throughputSparkline, windowSeconds],
  );

  // ── Chart series (x is the bucket index) ────────────────────────────────
  // NOTE(review): the throughput chart is labelled "msg/s" but plots the raw
  // per-bucket count; confirm the bucket duration before converting to a rate.
  const throughputChartSeries = useMemo(() => [{
    label: 'Throughput',
    data: (timeseries?.buckets || []).map((b: any, i: number) => ({
      x: i,
      y: b.totalCount,
    })),
  }], [timeseries]);

  const latencyChartSeries = useMemo(() => [{
    label: 'P99',
    data: (timeseries?.buckets || []).map((b: any, i: number) => ({
      x: i,
      y: b.p99DurationMs,
    })),
  }], [timeseries]);

  const errorRateChartSeries = useMemo(() => [{
    label: 'Error Rate',
    data: (timeseries?.buckets || []).map((b: any, i: number) => ({
      x: i,
      // Empty buckets report a 0 % error rate instead of dividing by zero.
      y: b.totalCount > 0 ? (b.failedCount / b.totalCount) * 100 : 0,
    })),
    color: 'var(--error)',
  }], [timeseries]);

  // ── Processor table rows ────────────────────────────────────────────────
  const processorRows: ProcessorRow[] = useMemo(() => {
    if (!processorMetrics?.length) return [];
    return processorMetrics.map((m: any) => ({
      id: m.processorId,
      processorId: m.processorId,
      processorType: m.processorType,
      totalCount: m.totalCount,
      avgDurationMs: m.avgDurationMs,
      p99DurationMs: m.p99DurationMs,
      errorRate: m.errorRate,
      pctTime: routeAvgMs > 0 ? (m.avgDurationMs / routeAvgMs) * 100 : 0,
    }));
  }, [processorMetrics, routeAvgMs]);

  // ── Latency heatmap for ProcessDiagram ──────────────────────────────────
  const latencyHeatmap = useMemo(() => {
    const map = new Map<string, { avgDurationMs: number; p99DurationMs: number; pctOfRoute: number }>();
    if (!processorMetrics?.length) return map;
    // Here the share is relative to the sum of all processor averages.
    const totalAvg = processorMetrics.reduce(
      (sum: number, m: any) => sum + m.avgDurationMs, 0,
    );
    for (const m of processorMetrics) {
      map.set(m.processorId, {
        avgDurationMs: m.avgDurationMs,
        p99DurationMs: m.p99DurationMs,
        pctOfRoute: totalAvg > 0 ? (m.avgDurationMs / totalAvg) * 100 : 0,
      });
    }
    return map;
  }, [processorMetrics]);

  // ── Error table rows ────────────────────────────────────────────────────
  const errorRows: ErrorRow[] = useMemo(
    // The index suffix keeps row ids unique when an error type repeats.
    () => (topErrors || []).map((e, i) => ({ ...e, id: `${e.errorType}-${i}` })),
    [topErrors],
  );

  return (
    <div className={styles.content}>
      <div className={styles.refreshIndicator}>
        <span className={styles.refreshDot} />
        <span className={styles.refreshText}>Auto-refresh: 30s</span>
      </div>

      {/* KPI Strip */}
      <KpiStrip items={kpiItems} />

      {/* Charts — 3 in a row */}
      {(timeseries?.buckets?.length ?? 0) > 0 && (
        <div className={styles.chartRow}>
          <Card title="Throughput">
            <AreaChart
              series={throughputChartSeries}
              yLabel="msg/s"
              height={200}
            />
          </Card>

          <Card title="Latency Percentiles">
            <LineChart
              series={latencyChartSeries}
              yLabel="ms"
              threshold={{ value: slaThresholdMs, label: `SLA ${slaThresholdMs}ms` }}
              height={200}
            />
          </Card>

          <Card title="Error Rate">
            <AreaChart
              series={errorRateChartSeries}
              yLabel="%"
              height={200}
            />
          </Card>
        </div>
      )}

      {/* Process Diagram with Latency Heatmap */}
      {appId && routeId && (
        <div className={styles.diagramSection}>
          <ProcessDiagram
            application={appId}
            routeId={routeId}
            diagramLayout={diagramLayout}
            latencyHeatmap={latencyHeatmap}
          />
        </div>
      )}

      {/* Processor Metrics Table */}
      <div className={styles.tableSection}>
        <div className={styles.tableHeader}>
          <span className={styles.tableTitle}>Processor Metrics</span>
          <div>
            <span className={styles.tableMeta}>
              {processorRows.length} processor{processorRows.length !== 1 ? 's' : ''}
            </span>
          </div>
        </div>
        <DataTable
          columns={PROCESSOR_COLUMNS}
          data={processorRows}
          sortable
          defaultSort={{ key: 'p99DurationMs', direction: 'desc' }}
        />
      </div>

      {/* Top 5 Errors — hidden if empty */}
      {errorRows.length > 0 && (
        <div className={styles.errorsSection}>
          <div className={styles.tableHeader}>
            <span className={styles.tableTitle}>Top 5 Errors</span>
            <Badge label={`${errorRows.length}`} color="error" />
          </div>
          <DataTable
            columns={ERROR_COLUMNS}
            data={errorRows}
            sortable
          />
        </div>
      )}
    </div>
  );
}
|
||||
Reference in New Issue
Block a user