Indeterminate progress bars were misleading when agents don't report JVM metrics — replaced with plain "N/A" text. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
524 lines
18 KiB
TypeScript
524 lines
18 KiB
TypeScript
import { useState, useMemo } from 'react';
|
|
import { useParams, Link } from 'react-router';
|
|
import {
|
|
StatCard, StatusDot, Badge, MonoText, ProgressBar,
|
|
GroupCard, DataTable, LineChart, EventFeed, DetailPanel,
|
|
} from '@cameleer/design-system';
|
|
import type { Column, FeedEvent } from '@cameleer/design-system';
|
|
import styles from './AgentHealth.module.css';
|
|
import { useAgents, useAgentEvents } from '../../api/queries/agents';
|
|
import { useAgentMetrics } from '../../api/queries/agent-metrics';
|
|
import type { AgentInstance } from '../../api/types';
|
|
|
|
// ── Helpers ──────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Formats a timestamp as a coarse relative age such as "12s ago" or "3h ago".
 *
 * @param iso - Timestamp string parseable by `Date`; may be omitted.
 * @returns An em dash when `iso` is missing or cannot be parsed, otherwise the
 *   age expressed in the largest fitting unit (seconds, minutes, hours, days).
 */
function timeAgo(iso?: string): string {
  if (!iso) return '\u2014';

  const then = new Date(iso).getTime();
  // An unparseable timestamp yields NaN, which would otherwise render as "NaNd ago".
  if (Number.isNaN(then)) return '\u2014';

  // Clamp at zero so a timestamp slightly in the future (clock skew between
  // the reporting side and the browser) reads "0s ago" rather than a negative age.
  const secs = Math.max(0, Math.floor((Date.now() - then) / 1000));
  if (secs < 60) return `${secs}s ago`;

  const mins = Math.floor(secs / 60);
  if (mins < 60) return `${mins}m ago`;

  const hours = Math.floor(mins / 60);
  if (hours < 24) return `${hours}h ago`;

  return `${Math.floor(hours / 24)}d ago`;
}
|
|
|
|
/**
 * Formats an uptime in seconds using its two most significant units.
 *
 * @param seconds - Uptime in seconds; may be omitted.
 * @returns An em dash when the value is missing, negative, or not finite;
 *   otherwise "Xd Yh", "Xh Ym", or "Xm" (anything under a minute is "0m").
 */
function formatUptime(seconds?: number): string {
  // Check for absence explicitly: 0 is a legitimate uptime (just started) and
  // must not be confused with "no value".
  if (seconds == null || !Number.isFinite(seconds) || seconds < 0) return '\u2014';

  const days = Math.floor(seconds / 86400);
  const hours = Math.floor((seconds % 86400) / 3600);
  const mins = Math.floor((seconds % 3600) / 60);

  if (days > 0) return `${days}d ${hours}h`;
  if (hours > 0) return `${hours}h ${mins}m`;
  return `${mins}m`;
}
|
|
|
|
/**
 * Renders a fractional error rate (0.05 means 5%) as a percentage with one decimal.
 *
 * @param rate - Error rate as a fraction; may be omitted.
 * @returns An em dash when `rate` is null or undefined, otherwise e.g. "5.0%".
 */
function formatErrorRate(rate?: number): string {
  if (rate != null) {
    const percent = rate * 100;
    return `${percent.toFixed(1)}%`;
  }
  return '\u2014';
}
|
|
|
|
/** Lower-case agent status vocabulary used throughout this page. */
type NormStatus = 'live' | 'stale' | 'dead';

/**
 * Maps a raw agent status string onto the page's status vocabulary.
 *
 * Matching is case-insensitive. Any value that is not "live" or "stale"
 * (including a missing or non-string value) is reported as 'dead', which
 * mirrors `statusColor`, where everything other than live/stale is an error.
 *
 * @param status - Raw status string as delivered with the agent record.
 * @returns The validated, normalized status.
 */
function normalizeStatus(status: string): NormStatus {
  // Validate instead of casting: a blind `as NormStatus` would let unknown
  // strings flow into components that only understand the three variants.
  const lowered = typeof status === 'string' ? status.toLowerCase() : '';
  if (lowered === 'live' || lowered === 'stale') return lowered;
  return 'dead';
}
|
|
|
|
/**
 * Picks the semantic colour for a normalized agent status.
 *
 * @param s - Normalized status.
 * @returns 'success' for live, 'warning' for stale, 'error' for anything else.
 */
function statusColor(s: NormStatus): 'success' | 'warning' | 'error' {
  switch (s) {
    case 'live':
      return 'success';
    case 'stale':
      return 'warning';
    default:
      return 'error';
  }
}
|
|
|
|
// ── Data grouping ────────────────────────────────────────────────────────────
|
|
|
|
/** Agent instances of one application together with their aggregated figures. */
interface AppGroup {
  /** Application identifier shared by every instance in the group. */
  appId: string;
  /** Instances belonging to the application, in the order they were received. */
  instances: AgentInstance[];

  /** Number of instances whose normalized status is 'live'. */
  liveCount: number;
  /** Number of instances whose normalized status is 'stale'. */
  staleCount: number;
  /** Number of instances whose normalized status is 'dead'. */
  deadCount: number;

  /** Sum of the instances' `tps`; a missing value counts as 0. */
  totalTps: number;
  /** Sum of the instances' `activeRoutes`; a missing value counts as 0. */
  totalActiveRoutes: number;
  /** Sum of the instances' `totalRoutes`; a missing value counts as 0. */
  totalRoutes: number;
}
|
|
|
|
/**
 * Buckets agent instances by their `application` and aggregates per-bucket
 * status counts and throughput/route totals.
 *
 * @param agentList - Instances to group.
 * @returns One group per distinct application, ordered by first appearance.
 */
function groupByApp(agentList: AgentInstance[]): AppGroup[] {
  const buckets = new Map<string, AppGroup>();

  agentList.forEach((agent) => {
    let bucket = buckets.get(agent.application);
    if (bucket === undefined) {
      bucket = {
        appId: agent.application,
        instances: [],
        liveCount: 0,
        staleCount: 0,
        deadCount: 0,
        totalTps: 0,
        totalActiveRoutes: 0,
        totalRoutes: 0,
      };
      buckets.set(agent.application, bucket);
    }

    bucket.instances.push(agent);

    // Tally the instance under exactly one status counter (or none, if the
    // normalized status is not one of the three known values).
    const state = normalizeStatus(agent.status);
    if (state === 'live') {
      bucket.liveCount += 1;
    } else if (state === 'stale') {
      bucket.staleCount += 1;
    } else if (state === 'dead') {
      bucket.deadCount += 1;
    }

    // Missing metrics contribute nothing to the totals.
    bucket.totalTps += agent.tps ?? 0;
    bucket.totalActiveRoutes += agent.activeRoutes ?? 0;
    bucket.totalRoutes += agent.totalRoutes ?? 0;
  });

  return Array.from(buckets.values());
}
|
|
|
|
/**
 * Derives the overall health colour of an application group.
 *
 * @param group - Aggregated application group.
 * @returns 'error' if any instance is dead, else 'warning' if any is stale,
 *   else 'success'.
 */
function appHealth(group: AppGroup): 'success' | 'warning' | 'error' {
  const { deadCount, staleCount } = group;
  return deadCount > 0 ? 'error' : staleCount > 0 ? 'warning' : 'success';
}
|
|
|
|
// ── Detail sub-components ────────────────────────────────────────────────────
|
|
|
|
/**
 * "Overview" tab of the agent detail panel: status, identity, throughput and
 * route figures, plus heap and CPU gauges built from the first sample returned
 * for each JVM metric. A gauge whose metric is unavailable shows "N/A".
 */
function AgentOverviewContent({ agent }: { agent: AgentInstance }) {
  const memQuery = useAgentMetrics(
    agent.id,
    ['jvm.memory.heap.used', 'jvm.memory.heap.max'],
    1,
  );
  const cpuQuery = useAgentMetrics(agent.id, ['jvm.cpu.process'], 1);

  const memSamples = memQuery.data?.metrics;
  const heapUsed = memSamples?.['jvm.memory.heap.used']?.[0]?.value;
  const heapMax = memSamples?.['jvm.memory.heap.max']?.[0]?.value;
  const cpuValue = cpuQuery.data?.metrics?.['jvm.cpu.process']?.[0]?.value;

  // Heap usage is only meaningful when both figures exist and max is positive.
  let heapPercent: number | undefined;
  if (heapUsed != null && heapMax != null && heapMax > 0) {
    heapPercent = Math.round((heapUsed / heapMax) * 100);
  }

  // The CPU sample is scaled by 100 to obtain a percentage.
  let cpuPercent: number | undefined;
  if (cpuValue != null) {
    cpuPercent = Math.round(cpuValue * 100);
  }

  const ns = normalizeStatus(agent.status);
  const throughput = agent.tps != null ? `${agent.tps.toFixed(1)}/s` : '\u2014';

  // Shared renderer for the heap and CPU rows; thresholds are exclusive.
  const renderGauge = (percent: number | undefined, warnAbove: number, errorAbove: number) => {
    if (percent == null) {
      return <MonoText size="xs">N/A</MonoText>;
    }
    let variant: 'error' | 'warning' | 'success' = 'success';
    if (percent > errorAbove) {
      variant = 'error';
    } else if (percent > warnAbove) {
      variant = 'warning';
    }
    return (
      <div className={styles.detailProgress}>
        <ProgressBar
          value={percent}
          variant={variant}
          size="sm"
        />
        <MonoText size="xs">{percent}%</MonoText>
      </div>
    );
  };

  return (
    <div className={styles.detailContent}>
      <div className={styles.detailRow}>
        <span className={styles.detailLabel}>Status</span>
        <Badge label={agent.status} color={statusColor(ns)} variant="filled" />
      </div>
      <div className={styles.detailRow}>
        <span className={styles.detailLabel}>Application</span>
        <MonoText size="xs">{agent.application}</MonoText>
      </div>
      <div className={styles.detailRow}>
        <span className={styles.detailLabel}>Uptime</span>
        <MonoText size="xs">{formatUptime(agent.uptimeSeconds)}</MonoText>
      </div>
      <div className={styles.detailRow}>
        <span className={styles.detailLabel}>Last Seen</span>
        <MonoText size="xs">{timeAgo(agent.lastHeartbeat)}</MonoText>
      </div>
      <div className={styles.detailRow}>
        <span className={styles.detailLabel}>Throughput</span>
        <MonoText size="xs">{throughput}</MonoText>
      </div>
      <div className={styles.detailRow}>
        <span className={styles.detailLabel}>Errors</span>
        <MonoText size="xs" className={agent.errorRate ? styles.instanceError : undefined}>
          {formatErrorRate(agent.errorRate)}
        </MonoText>
      </div>
      <div className={styles.detailRow}>
        <span className={styles.detailLabel}>Routes</span>
        <span>{agent.activeRoutes ?? 0}/{agent.totalRoutes ?? 0} active</span>
      </div>
      <div className={styles.detailRow}>
        <span className={styles.detailLabel}>Heap Memory</span>
        {renderGauge(heapPercent, 70, 85)}
      </div>
      <div className={styles.detailRow}>
        <span className={styles.detailLabel}>CPU</span>
        {renderGauge(cpuPercent, 60, 80)}
      </div>
    </div>
  );
}
|
|
|
|
/**
 * "Performance" tab of the agent detail panel: line charts for throughput and
 * error rate, each replaced by a placeholder when its series has no points.
 */
function AgentPerformanceContent({ agent }: { agent: AgentInstance }) {
  const { data: tpsMetrics } = useAgentMetrics(agent.id, ['cameleer.tps'], 60);
  const { data: errMetrics } = useAgentMetrics(agent.id, ['cameleer.error.rate'], 60);

  const tpsSeries = useMemo(() => {
    const samples = tpsMetrics?.metrics?.['cameleer.tps'] ?? [];
    const points = samples.map((p) => ({ x: new Date(p.time), y: p.value }));
    return [{ label: 'TPS', data: points }];
  }, [tpsMetrics]);

  const errSeries = useMemo(() => {
    const samples = errMetrics?.metrics?.['cameleer.error.rate'] ?? [];
    // Samples are scaled by 100 so the axis reads in percent.
    const points = samples.map((p) => ({ x: new Date(p.time), y: p.value * 100 }));
    return [{ label: 'Error Rate', data: points, color: 'var(--error)' }];
  }, [errMetrics]);

  const hasTpsData = tpsSeries[0].data.length > 0;
  const hasErrData = errSeries[0].data.length > 0;

  return (
    <div className={styles.detailContent}>
      <div className={styles.chartPanel}>
        <div className={styles.chartTitle}>Throughput (msg/s)</div>
        {hasTpsData ? (
          <LineChart series={tpsSeries} height={160} yLabel="msg/s" />
        ) : (
          <div className={styles.emptyChart}>No data available</div>
        )}
      </div>
      <div className={styles.chartPanel}>
        <div className={styles.chartTitle}>Error Rate (%)</div>
        {hasErrData ? (
          <LineChart series={errSeries} height={160} yLabel="%" />
        ) : (
          <div className={styles.emptyChart}>No data available</div>
        )}
      </div>
    </div>
  );
}
|
|
|
|
// ── AgentHealth page ─────────────────────────────────────────────────────────
|
|
|
|
/**
 * Agent health page.
 *
 * Shows a strip of aggregate stat cards, one card per application with a table
 * of its instances, an event timeline, and a detail panel for the clicked
 * instance. When the route carries an `appId` parameter, the agent and event
 * queries are scoped to it and the single-column grid layout is used.
 */
export default function AgentHealth() {
  const { appId } = useParams();
  const { data: agents } = useAgents(undefined, appId);
  const { data: events } = useAgentEvents(appId);

  const [selectedInstance, setSelectedInstance] = useState<AgentInstance | null>(null);
  const [panelOpen, setPanelOpen] = useState(false);

  // Memoize the fallback: a bare `agents ?? []` creates a new array on every
  // render while `agents` is undefined, which would invalidate `groups` each time.
  const agentList = useMemo(() => agents ?? [], [agents]);

  const groups = useMemo(() => groupByApp(agentList), [agentList]);

  // Resolve the selection against the current list so the detail panel follows
  // refreshed data; fall back to the clicked snapshot if the agent is no
  // longer present in the list.
  const activeInstance = selectedInstance
    ? agentList.find((a) => a.id === selectedInstance.id) ?? selectedInstance
    : null;

  // Aggregate stats
  const totalInstances = agentList.length;
  const liveCount = agentList.filter((a) => normalizeStatus(a.status) === 'live').length;
  const staleCount = agentList.filter((a) => normalizeStatus(a.status) === 'stale').length;
  const deadCount = agentList.filter((a) => normalizeStatus(a.status) === 'dead').length;
  const totalTps = agentList.reduce((s, a) => s + (a.tps ?? 0), 0);
  const totalActiveRoutes = agentList.reduce((s, a) => s + (a.activeRoutes ?? 0), 0);
  const totalRoutes = agentList.reduce((s, a) => s + (a.totalRoutes ?? 0), 0);

  // Map events to FeedEvent
  const feedEvents: FeedEvent[] = useMemo(
    () =>
      (events ?? []).map((e: { id: number; agentId: string; eventType: string; detail: string; timestamp: string }) => ({
        id: String(e.id),
        severity:
          e.eventType === 'WENT_DEAD'
            ? ('error' as const)
            : e.eventType === 'WENT_STALE'
              ? ('warning' as const)
              : e.eventType === 'RECOVERED'
                ? ('success' as const)
                : ('running' as const),
        message: `${e.agentId}: ${e.eventType}${e.detail ? ' \u2014 ' + e.detail : ''}`,
        timestamp: new Date(e.timestamp),
      })),
    [events],
  );

  // Column definitions for the instance DataTable. The renderers only use
  // module-level helpers, so the empty dependency list is safe.
  const instanceColumns: Column<AgentInstance>[] = useMemo(
    () => [
      {
        key: 'status',
        header: '',
        width: '12px',
        render: (_val, row) => <StatusDot variant={normalizeStatus(row.status)} />,
      },
      {
        key: 'name',
        header: 'Instance',
        render: (_val, row) => (
          <MonoText size="sm" className={styles.instanceName}>{row.name ?? row.id}</MonoText>
        ),
      },
      {
        key: 'state',
        header: 'State',
        render: (_val, row) => {
          const ns = normalizeStatus(row.status);
          return <Badge label={row.status} color={statusColor(ns)} variant="filled" />;
        },
      },
      {
        key: 'uptime',
        header: 'Uptime',
        render: (_val, row) => (
          <MonoText size="xs" className={styles.instanceMeta}>{formatUptime(row.uptimeSeconds)}</MonoText>
        ),
      },
      {
        key: 'tps',
        header: 'TPS',
        render: (_val, row) => (
          <MonoText size="xs" className={styles.instanceMeta}>
            {row.tps != null ? `${row.tps.toFixed(1)}/s` : '\u2014'}
          </MonoText>
        ),
      },
      {
        key: 'errorRate',
        header: 'Errors',
        render: (_val, row) => (
          <MonoText size="xs" className={row.errorRate ? styles.instanceError : styles.instanceMeta}>
            {formatErrorRate(row.errorRate)}
          </MonoText>
        ),
      },
      {
        key: 'lastHeartbeat',
        header: 'Heartbeat',
        render: (_val, row) => {
          const ns = normalizeStatus(row.status);
          return (
            <MonoText
              size="xs"
              className={
                ns === 'dead'
                  ? styles.instanceHeartbeatDead
                  : ns === 'stale'
                    ? styles.instanceHeartbeatStale
                    : styles.instanceMeta
              }
            >
              {timeAgo(row.lastHeartbeat)}
            </MonoText>
          );
        },
      },
    ],
    [],
  );

  function handleInstanceClick(inst: AgentInstance) {
    setSelectedInstance(inst);
    setPanelOpen(true);
  }

  // Detail panel tabs
  const detailTabs = activeInstance
    ? [
        {
          label: 'Overview',
          value: 'overview',
          content: <AgentOverviewContent agent={activeInstance} />,
        },
        {
          label: 'Performance',
          value: 'performance',
          content: <AgentPerformanceContent agent={activeInstance} />,
        },
      ]
    : [];

  const isFullWidth = !!appId;

  return (
    <div className={styles.content}>
      {/* Stat strip */}
      <div className={styles.statStrip}>
        <StatCard
          label="Total Agents"
          value={String(totalInstances)}
          accent={deadCount > 0 ? 'warning' : 'amber'}
          detail={
            <span className={styles.breakdown}>
              <span className={styles.bpLive}><StatusDot variant="live" /> {liveCount} live</span>
              <span className={styles.bpStale}><StatusDot variant="stale" /> {staleCount} stale</span>
              <span className={styles.bpDead}><StatusDot variant="dead" /> {deadCount} dead</span>
            </span>
          }
        />
        <StatCard
          label="Applications"
          value={String(groups.length)}
          accent="running"
          detail={
            <span className={styles.breakdown}>
              <span className={styles.bpLive}>
                <StatusDot variant="live" /> {groups.filter((g) => g.deadCount === 0 && g.staleCount === 0).length} healthy
              </span>
              <span className={styles.bpStale}>
                <StatusDot variant="stale" /> {groups.filter((g) => g.staleCount > 0 && g.deadCount === 0).length} degraded
              </span>
              <span className={styles.bpDead}>
                <StatusDot variant="dead" /> {groups.filter((g) => g.deadCount > 0).length} critical
              </span>
            </span>
          }
        />
        <StatCard
          label="Active Routes"
          value={
            <span
              className={
                styles[
                  totalActiveRoutes === 0
                    ? 'routesError'
                    : totalActiveRoutes < totalRoutes
                      ? 'routesWarning'
                      : 'routesSuccess'
                ]
              }
            >
              {totalActiveRoutes}/{totalRoutes}
            </span>
          }
          accent={totalActiveRoutes === 0 ? 'error' : totalActiveRoutes < totalRoutes ? 'warning' : 'success'}
          detail={totalActiveRoutes < totalRoutes ? `${totalRoutes - totalActiveRoutes} suspended` : 'all routes active'}
        />
        <StatCard
          label="Total TPS"
          value={totalTps.toFixed(1)}
          accent="amber"
          detail="msg/s"
        />
        <StatCard
          label="Dead"
          value={String(deadCount)}
          accent={deadCount > 0 ? 'error' : 'success'}
          detail={deadCount > 0 ? 'requires attention' : 'all healthy'}
        />
      </div>

      {/* Scope trail + badges */}
      <div className={styles.scopeTrail}>
        {appId && (
          <>
            <Link to="/agents" className={styles.scopeLink}>All Agents</Link>
            <span className={styles.scopeSep}>▸</span>
            <span className={styles.scopeCurrent}>{appId}</span>
          </>
        )}
        <Badge
          label={`${liveCount}/${totalInstances} live`}
          color={deadCount > 0 ? 'error' : staleCount > 0 ? 'warning' : 'success'}
          variant="filled"
        />
      </div>

      {/* Group cards grid */}
      <div className={isFullWidth ? styles.groupGridSingle : styles.groupGrid}>
        {groups.map((group) => (
          <GroupCard
            key={group.appId}
            title={group.appId}
            accent={appHealth(group)}
            headerRight={
              <Badge
                label={`${group.liveCount}/${group.instances.length} LIVE`}
                color={appHealth(group)}
                variant="filled"
              />
            }
            meta={
              <div className={styles.groupMeta}>
                <span><strong>{group.totalTps.toFixed(1)}</strong> msg/s</span>
                <span><strong>{group.totalActiveRoutes}</strong>/{group.totalRoutes} routes</span>
                <span>
                  <StatusDot
                    variant={
                      appHealth(group) === 'success'
                        ? 'live'
                        : appHealth(group) === 'warning'
                          ? 'stale'
                          : 'dead'
                    }
                  />
                </span>
              </div>
            }
            footer={
              group.deadCount > 0 ? (
                <div className={styles.alertBanner}>
                  <span className={styles.alertIcon}>⚠</span>
                  <span>
                    Single point of failure —{' '}
                    {group.deadCount === group.instances.length
                      ? 'no redundancy'
                      : `${group.deadCount} dead instance${group.deadCount > 1 ? 's' : ''}`}
                  </span>
                </div>
              ) : undefined
            }
          >
            <DataTable<AgentInstance>
              columns={instanceColumns}
              data={group.instances}
              onRowClick={handleInstanceClick}
              selectedId={panelOpen ? selectedInstance?.id : undefined}
              pageSize={50}
              flush
            />
          </GroupCard>
        ))}
      </div>

      {/* EventFeed */}
      {feedEvents.length > 0 && (
        <div className={styles.eventCard}>
          <div className={styles.eventCardHeader}>
            <span className={styles.sectionTitle}>Timeline</span>
            <span className={styles.sectionMeta}>{feedEvents.length} events</span>
          </div>
          <EventFeed events={feedEvents} maxItems={100} />
        </div>
      )}

      {/* Detail panel — auto-portals to AppShell level via design system */}
      {activeInstance && (
        <DetailPanel
          open={panelOpen}
          onClose={() => { setPanelOpen(false); setSelectedInstance(null); }}
          title={activeInstance.name ?? activeInstance.id}
          tabs={detailTabs}
        />
      )}
    </div>
  );
}
|