# Compare commits

`5373ac6541...main` — 328 commits
@@ -35,30 +35,44 @@ These paths intentionally stay flat (no `/environments/{envSlug}` prefix). Every

ClickHouse is shared across tenants. Every ClickHouse query must filter by `tenant_id` (from `CAMELEER_SERVER_TENANT_ID` env var, resolved via `TenantContext`/config) in addition to `environment`. New controllers added under `/environments/{envSlug}/...` must preserve this — the env filter from the path does not replace the tenant filter.

## User ID conventions

`users.user_id` stores the **bare** identifier:

- Local users: `<username>` (e.g. `admin`, `alice`)
- OIDC users: `oidc:<sub>` (e.g. `oidc:c7a93b…`)

JWT subjects carry a `user:` namespace prefix (`user:admin`, `user:oidc:<sub>`) so `JwtAuthenticationFilter` can distinguish user tokens from agent tokens. All three write paths upsert the **bare** form:

- `UiAuthController.login` — computes `userId = request.username()`, signs with `subject = "user:" + userId`.
- `OidcAuthController.callback` — `userId = "oidc:" + oidcUser.subject()`, signs with `subject = "user:" + userId`.
- `UserAdminController.createUser` — `userId = request.username()`.

Env-scoped read-path controllers (`AlertController`, `AlertRuleController`, `AlertSilenceController`, `OutboundConnectionAdminController`) strip `"user:"` from `SecurityContextHolder.authentication.name` before using it as an FK. All FKs to `users(user_id)` (e.g. `alert_rules.created_by`, `outbound_connections.created_by`, `alert_reads.user_id`, `user_roles.user_id`, `user_groups.user_id`) therefore reference the bare form. If you add a new controller that needs the acting user id for an FK insert, follow the same strip pattern.
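A minimal sketch of that strip pattern, assuming a standard Spring Security context (the helper class and method names are illustrative, not the actual code):

```java
import org.springframework.security.core.Authentication;
import org.springframework.security.core.context.SecurityContextHolder;

/** Hypothetical helper illustrating the "user:" prefix-strip convention. */
final class ActingUser {

    private static final String USER_PREFIX = "user:";

    /**
     * Resolves the acting user id in the bare form stored in users.user_id.
     * JWT subjects arrive as "user:admin" or "user:oidc:<sub>"; FK columns
     * reference the bare "admin" / "oidc:<sub>" form.
     */
    static String currentUserId() {
        Authentication auth = SecurityContextHolder.getContext().getAuthentication();
        if (auth == null) {
            throw new IllegalStateException("no authenticated principal");
        }
        String name = auth.getName();                  // e.g. "user:oidc:c7a93b…"
        return name.startsWith(USER_PREFIX)
                ? name.substring(USER_PREFIX.length()) // -> "oidc:c7a93b…"
                : name;                                // non-user subjects pass through
    }
}
```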
## controller/ — REST endpoints

### Env-scoped (user-facing data & config)

- `AppController` — `/api/v1/environments/{envSlug}/apps`. GET list / POST create / GET `{appSlug}` / DELETE `{appSlug}` / GET `{appSlug}/versions` / POST `{appSlug}/versions` (JAR upload) / PUT `{appSlug}/container-config` / GET `{appSlug}/dirty-state` (returns `DirtyStateResponse{dirty, lastSuccessfulDeploymentId, differences}` — compares current JAR+config against the last RUNNING deployment snapshot; dirty=true when no snapshot exists). App slug uniqueness is per-env (`(env, app_slug)` is the natural key). `CreateAppRequest` body has no env (path), validates slug regex. Injects the `DirtyStateCalculator` bean (registered in `RuntimeBeanConfig`, requires an `ObjectMapper` with `JavaTimeModule`).
- `DeploymentController` — `/api/v1/environments/{envSlug}/apps/{appSlug}/deployments`. GET list / POST create (body `{ appVersionId }`) / POST `{id}/stop` / POST `{id}/promote` (body `{ targetEnvironment: slug }` — target app slug must exist in target env) / GET `{id}/logs`. All lifecycle ops (`POST /` deploy, `POST /{id}/stop`, `POST /{id}/promote`) are audited under `AuditCategory.DEPLOYMENT`. Action codes: `deploy_app`, `stop_deployment`, `promote_deployment`. Acting user resolved via the `user:` prefix-strip convention; both SUCCESS and FAILURE branches write audit rows. `created_by` (TEXT, nullable) populated from `SecurityContextHolder` and surfaced on the `Deployment` DTO.
- `ApplicationConfigController` — `/api/v1/environments/{envSlug}`. GET `/config` (list), GET/PUT `/apps/{appSlug}/config`, GET `/apps/{appSlug}/processor-routes`, POST `/apps/{appSlug}/config/test-expression`. PUT accepts `?apply=staged|live` (default `live`). `live` saves to DB and pushes a `CONFIG_UPDATE` SSE to live agents in this env (existing behavior); `staged` saves to DB only, skipping the SSE push — used by the unified app deployment page. Audit action is `stage_app_config` for staged writes, `update_app_config` for live. Invalid `apply` values return 400.
- `AppSettingsController` — `/api/v1/environments/{envSlug}`. GET `/app-settings` (list), GET/PUT/DELETE `/apps/{appSlug}/settings`. ADMIN/OPERATOR only.
- `SearchController` — `/api/v1/environments/{envSlug}`. GET `/executions`, POST `/executions/search`, GET `/stats`, `/stats/timeseries`, `/stats/timeseries/by-app`, `/stats/timeseries/by-route`, `/stats/punchcard`, `/attributes/keys`, `/errors/top`. GET `/executions` accepts repeat `attr` query params: `attr=order` (key-exists), `attr=order:47` (exact), `attr=order:4*` (wildcard — `*` maps to SQL LIKE `%`). The first `:` splits key/value; later colons stay in the value. Invalid keys → 400. POST `/executions/search` accepts the same filters via `SearchRequest.attributeFilters` in the body.
- `LogQueryController` — GET `/api/v1/environments/{envSlug}/logs` (filters: source (multi, comma-split, OR-joined), level (multi, comma-split, OR-joined), application, agentId, exchangeId, logger, q, time range, instanceIds (multi, comma-split, applied as `WHERE instance_id IN (...)` and ANDed with the other filters — used by the Checkpoint detail drawer to scope logs to a deployment's replicas); sort asc/desc). Cursor-paginated, returns `{ data, nextCursor, hasMore, levelCounts }`; the cursor is base64url of `"{timestampIso}|{insert_id_uuid}"` — same-millisecond tiebreak via the `insert_id` UUID column on `logs` (the cursor codec is sketched after this list).
- `RouteCatalogController` — GET `/api/v1/environments/{envSlug}/routes` (merged route catalog from registry + ClickHouse; env filter unconditional).
- `RouteMetricsController` — GET `/api/v1/environments/{envSlug}/routes/metrics`, GET `/api/v1/environments/{envSlug}/routes/metrics/processors`.
- `AgentListController` — GET `/api/v1/environments/{envSlug}/agents` (registered agents with runtime metrics, filtered to env).
- `AgentEventsController` — GET `/api/v1/environments/{envSlug}/agents/events` (lifecycle events; cursor-paginated, returns `{ data, nextCursor, hasMore }`; order `(timestamp DESC, insert_id DESC)`; cursor is base64url of `"{timestampIso}|{insert_id_uuid}"` — `insert_id` is a stable UUID column used as a same-millisecond tiebreak).
- `AgentMetricsController` — GET `/api/v1/environments/{envSlug}/agents/{agentId}/metrics` (JVM/Camel metrics). Rejects cross-env agents (404) as defence-in-depth.
- `DiagramRenderController` — GET `/api/v1/environments/{envSlug}/apps/{appSlug}/routes/{routeId}/diagram` returns the most recent diagram for (app, env, route) via `DiagramStore.findLatestContentHashForAppRoute`. Registry-independent — routes whose publishing agents were removed still resolve. Also GET `/api/v1/diagrams/{contentHash}/render` (flat — content hashes are globally unique), the point-in-time path consumed by the exchange viewer via `ExecutionDetail.diagramContentHash`.
- `AlertRuleController` — `/api/v1/environments/{envSlug}/alerts/rules`. GET list / POST create / GET `{id}` / PUT `{id}` / DELETE `{id}` / POST `{id}/enable` / POST `{id}/disable` / POST `{id}/render-preview` / POST `{id}/test-evaluate`. OPERATOR+ for mutations, VIEWER+ for reads. CRITICAL: attribute keys in `ExchangeMatchCondition.filter.attributes` are validated at rule-save time against `^[a-zA-Z0-9._-]+$` — they are later inlined into ClickHouse SQL. `AgentLifecycleCondition` is allowlist-only — the `AgentLifecycleEventType` enum (REGISTERED / RE_REGISTERED / DEREGISTERED / WENT_STALE / WENT_DEAD / RECOVERED) plus the record compact ctor (non-empty `eventTypes`, `withinSeconds ≥ 1`) do the validation; custom agent-emitted event types are tracked in backlog issue #145. Webhook validation: verifies `outboundConnectionId` exists and `isAllowedInEnvironment`. Null notification templates default to `""` (NOT NULL constraint). Audit: `ALERT_RULE_CHANGE`.
- `AlertController` — `/api/v1/environments/{envSlug}/alerts`. GET list (inbox filtered by userId/groupIds/roleNames via `InAppInboxQuery`; optional multi-value `state`, `severity`, tri-state `acked`, tri-state `read` query params; soft-deleted rows always excluded) / GET `/unread-count` / GET `{id}` / POST `{id}/ack` / POST `{id}/read` / POST `/bulk-read` / POST `/bulk-ack` (VIEWER+) / DELETE `{id}` (OPERATOR+, soft-delete) / POST `/bulk-delete` (OPERATOR+) / POST `{id}/restore` (OPERATOR+, clears `deleted_at`). The `requireLiveInstance` helper returns 404 on soft-deleted rows; `restore` explicitly fetches regardless of `deleted_at`. `BulkIdsRequest` is the shared body for bulk-read/ack/delete (`{ instanceIds }`). `AlertDto` includes `readAt`; `deletedAt` is intentionally NOT on the wire. Inbox SQL: `? = ANY(target_user_ids) OR target_group_ids && ? OR target_role_names && ?` — requires at least one matching target (no broadcast concept).
- `AlertSilenceController` — `/api/v1/environments/{envSlug}/alerts/silences`. GET list / POST create / DELETE `{id}`. 422 if `endsAt <= startsAt`. OPERATOR+ for mutations, VIEWER+ for list. Audit: `ALERT_SILENCE_CHANGE`.
- `AlertNotificationController` — Dual-path (no class-level prefix). GET `/api/v1/environments/{envSlug}/alerts/{alertId}/notifications` (VIEWER+); POST `/api/v1/alerts/notifications/{id}/retry` (OPERATOR+, flat — notification IDs globally unique). Retry resets attempts to 0 and sets `nextAttemptAt = now`.
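The two cursor-paginated endpoints above (`LogQueryController`, `AgentEventsController`) share the same keyset cursor shape. A minimal sketch of the encode/decode under the `"{timestampIso}|{insert_id_uuid}"` layout described above (class and method names are illustrative):

```java
import java.nio.charset.StandardCharsets;
import java.time.Instant;
import java.util.Base64;
import java.util.UUID;

/** Hypothetical codec for the "{timestampIso}|{insert_id_uuid}" keyset cursor. */
final class PageCursor {

    record Key(Instant timestamp, UUID insertId) {}

    /** base64url-encode the last row's (timestamp, insert_id) as the next cursor. */
    static String encode(Key key) {
        String raw = key.timestamp() + "|" + key.insertId();
        return Base64.getUrlEncoder().withoutPadding()
                .encodeToString(raw.getBytes(StandardCharsets.UTF_8));
    }

    /** Decode an incoming cursor; insert_id breaks same-millisecond ties. */
    static Key decode(String cursor) {
        String raw = new String(Base64.getUrlDecoder().decode(cursor), StandardCharsets.UTF_8);
        int sep = raw.indexOf('|');
        return new Key(Instant.parse(raw.substring(0, sep)),
                UUID.fromString(raw.substring(sep + 1)));
    }
}
```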
### Env admin (env-slug-parameterized, not env-scoped data)

- `EnvironmentAdminController` — `/api/v1/admin/environments`. GET list / POST create / GET `{envSlug}` / PUT `{envSlug}` / DELETE `{envSlug}` / PUT `{envSlug}/default-container-config` / PUT `{envSlug}/jar-retention`. Slug immutable — the PUT body has no slug field; any slug supplied is dropped by Jackson. Slug validated on POST. `UpdateEnvironmentRequest` carries `color` (nullable); unknown values are rejected with 400 via `EnvironmentColor.isValid`. Null/absent color preserves the existing value.

### Agent-only (JWT-authoritative, intentionally flat)

@@ -71,8 +85,7 @@ ClickHouse is shared across tenants. Every ClickHouse query must filter by `tena
- `LogIngestionController` — POST `/api/v1/data/logs` (accepts `List<LogEntry>`; WARNs on missing identity, unregistered agents, empty payloads, buffer-full drops).
- `EventIngestionController` — POST `/api/v1/data/events`.
- `ChunkIngestionController` — POST `/api/v1/data/executions`. Accepts a single `ExecutionChunk` or an array (fields include `exchangeId`, `applicationId`, `instanceId`, `routeId`, `status`, `startTime`, `endTime`, `durationMs`, `chunkSeq`, `final`, `processors: FlatProcessorRecord[]`). The accumulator merges non-final chunks by exchangeId and emits the merged envelope on the final chunk or on stale timeout. The legacy `ExecutionController` / `RouteExecution` shape is retired.
- `MetricsController` — POST `/api/v1/data/metrics`.
- `DiagramController` — POST `/api/v1/data/diagrams` (resolves applicationId + environment from the agent registry keyed on the JWT subject; stamps both on the stored `TaggedDiagram`).
@@ -96,6 +109,7 @@ ClickHouse is shared across tenants. Every ClickHouse query must filter by `tena

- `UsageAnalyticsController` — GET `/api/v1/admin/usage` (ClickHouse `usage_events`).
- `ClickHouseAdminController` — GET `/api/v1/admin/clickhouse/**` (conditional on `infrastructureendpoints` flag).
- `DatabaseAdminController` — GET `/api/v1/admin/database/**` (conditional on `infrastructureendpoints` flag).
- `ServerMetricsAdminController` — `/api/v1/admin/server-metrics/**`. GET `/catalog`, GET `/instances`, POST `/query`. Generic read API over the `server_metrics` ClickHouse table so SaaS dashboards don't need direct CH access. Delegates to `ServerMetricsQueryStore` (impl `ClickHouseServerMetricsQueryStore`). Visibility matches ClickHouse/Database admin: `@ConditionalOnProperty(infrastructureendpoints, matchIfMissing=true)` + class-level `@PreAuthorize("hasRole('ADMIN')")`. Validation: metric/tag regex `^[a-zA-Z0-9._]+$`, statistic regex `^[a-z_]+$`, `to - from ≤ 31 days`, stepSeconds ∈ [10, 3600], response capped at 500 series. `IllegalArgumentException` → 400. `/query` supports `raw` + `delta` modes (delta does per-`server_instance_id` positive-clipped differences, then aggregates across instances). Derived `statistic=mean` for timers computes `sum(total|total_time)/sum(count)` per bucket.

### Other (flat)
@@ -105,10 +119,10 @@ ClickHouse is shared across tenants. Every ClickHouse query must filter by `tena

## runtime/ — Docker orchestration

- `DockerRuntimeOrchestrator` — implements RuntimeOrchestrator; Docker Java client (zerodep transport), container lifecycle
- `DeploymentExecutor` — @Async staged deploy: PRE_FLIGHT -> PULL_IMAGE -> CREATE_NETWORK -> START_REPLICAS -> HEALTH_CHECK -> SWAP_TRAFFIC -> COMPLETE. Container names are `{tenantId}-{envSlug}-{appSlug}-{replicaIndex}-{generation}`, where `generation` is the first 8 chars of the deployment UUID — old and new replicas coexist during a blue/green swap. Per-replica `CAMELEER_AGENT_INSTANCEID` env var is `{envSlug}-{appSlug}-{replicaIndex}-{generation}`. Branches on `DeploymentStrategy.fromWire(config.deploymentStrategy())`: **blue-green** (default) starts all N → waits for all healthy → stops old (partial health = FAILED, preserves old untouched); **rolling** replaces replicas one at a time with rollback only for in-flight new containers (already-replaced old replicas stay stopped; un-replaced old replicas keep serving). DEGRADED is now only set by `DockerEventMonitor` post-deploy, never by the executor.
- `DockerNetworkManager` — ensures bridge networks (cameleer-traefik, cameleer-env-{slug}), connects containers
- `DockerEventMonitor` — persistent Docker event stream listener (die, oom, start, stop), updates deployment status
- `TraefikLabelBuilder` — generates Traefik Docker labels for path-based or subdomain routing. Per-container identity labels: `cameleer.replica` (index), `cameleer.generation` (deployment-scoped 8-char id — for Prometheus/Grafana deploy-boundary annotations), `cameleer.instance-id` (`{envSlug}-{appSlug}-{replicaIndex}-{generation}`). Router/service label keys are generation-agnostic so load balancing spans old + new replicas during a blue/green overlap.
- `PrometheusLabelBuilder` — generates Prometheus Docker labels (`prometheus.scrape/path/port`) per runtime type for `docker_sd_configs` auto-discovery
- `ContainerLogForwarder` — streams Docker container stdout/stderr to ClickHouse with `source='container'`. One follow-stream thread per container, batches lines every 2s/50 lines via `ClickHouseLogStore.insertBufferedBatch()`. 60-second max capture timeout.
- `DisabledRuntimeOrchestrator` — no-op when runtime not enabled
@@ -116,11 +130,13 @@ ClickHouse is shared across tenants. Every ClickHouse query must filter by `tena

## metrics/ — Prometheus observability

- `ServerMetrics` — centralized business metrics: gauges (agents by state, SSE connections, buffer depths), counters (ingestion drops, agent transitions, deployment outcomes, auth failures), timers (flush duration, deployment duration). Exposed via `/api/v1/prometheus`.
- `ServerInstanceIdConfig` — `@Configuration`, exposes `@Bean("serverInstanceId") String`. Resolution precedence: `cameleer.server.instance-id` property → `HOSTNAME` env → `InetAddress.getLocalHost()` → random UUID. Fixed at boot; rotates across restarts so counters restart cleanly.
- `ServerMetricsSnapshotScheduler` — `@Scheduled(fixedDelayString = "${cameleer.server.self-metrics.interval-ms:60000}")`. Walks `MeterRegistry.getMeters()` each tick, emits one `ServerMetricSample` per `Measurement` (Timer/DistributionSummary produce multiple rows per meter — one per Micrometer `Statistic`). Skips non-finite values; logs and swallows store failures. Disabled via `cameleer.server.self-metrics.enabled=false` (`@ConditionalOnProperty`). Write-only — reads go through `ServerMetricsAdminController` / `ClickHouseServerMetricsQueryStore`. A sketch of the registry walk follows this list.
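A minimal sketch of that registry walk using Micrometer's standard API (the `ServerMetricSample` shape and the lowercasing of the statistic are assumptions, not the actual row format):

```java
import io.micrometer.core.instrument.Measurement;
import io.micrometer.core.instrument.Meter;
import io.micrometer.core.instrument.MeterRegistry;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;

/** Hypothetical snapshot pass: one row per Measurement of every meter. */
final class RegistryWalk {

    record ServerMetricSample(Instant collectedAt, String name, String statistic, double value) {}

    static List<ServerMetricSample> snapshot(MeterRegistry registry) {
        Instant now = Instant.now();
        List<ServerMetricSample> rows = new ArrayList<>();
        for (Meter meter : registry.getMeters()) {
            // Timers/DistributionSummaries expose several Measurements
            // (COUNT, TOTAL_TIME, MAX, ...) — one row per Statistic.
            for (Measurement m : meter.measure()) {
                double v = m.getValue();
                if (!Double.isFinite(v)) continue; // skip NaN/Inf samples
                rows.add(new ServerMetricSample(now, meter.getId().getName(),
                        m.getStatistic().name().toLowerCase(), v));
            }
        }
        return rows;
    }
}
```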
## storage/ — PostgreSQL repositories (JdbcTemplate)

- `PostgresAppRepository`, `PostgresAppVersionRepository`, `PostgresEnvironmentRepository`
- `PostgresDeploymentRepository` — includes JSONB replica_states, deploy_stage, findByContainerId. Also carries `deployed_config_snapshot` JSONB (Flyway V3) populated by `DeploymentExecutor` via `saveDeployedConfigSnapshot(UUID, DeploymentConfigSnapshot)` on a successful RUNNING transition. Consumed by `DirtyStateCalculator` for the `/apps/{slug}/dirty-state` endpoint and by the UI for checkpoint restore.
- `PostgresUserRepository`, `PostgresRoleRepository`, `PostgresGroupRepository`
- `PostgresAuditRepository`, `PostgresOidcConfigRepository`, `PostgresClaimMappingRepository`, `PostgresSensitiveKeysRepository`
- `PostgresAppSettingsRepository`, `PostgresApplicationConfigRepository`, `PostgresThresholdRepository`. Both `app_settings` and `application_config` are env-scoped (PK `(app_id, environment)` / `(application, environment)`); finders take `(app, env)` — no env-agnostic variants.

@@ -132,6 +148,8 @@ ClickHouse is shared across tenants. Every ClickHouse query must filter by `tena

- `ClickHouseDiagramStore`, `ClickHouseAgentEventRepository`
- `ClickHouseUsageTracker` — usage_events for billing
- `ClickHouseRouteCatalogStore` — persistent route catalog with first_seen cache, warm-loaded on startup
- `ClickHouseServerMetricsStore` — periodic dumps of the server's own Micrometer registry into the `server_metrics` table. Tenant-stamped (bound at the scheduler, not the bean); no `environment` column (the server straddles envs). Batch-insert via `JdbcTemplate.batchUpdate` with `Map(String, String)` tag binding. Written by `ServerMetricsSnapshotScheduler`.
- `ClickHouseServerMetricsQueryStore` — read side of `server_metrics` for dashboards. Implements `ServerMetricsQueryStore`. `catalog(from,to)` returns name+type+statistics+tagKeys, `listInstances(from,to)` returns server_instance_ids with first/last seen, `query(request)` builds bucketed time-series with `raw` or `delta` mode and supports a derived `mean` statistic for timers (a sketch of delta mode follows this list). All identifier inputs regex-validated; tenant_id always bound; max range 31 days; series count capped at 500. Exposed via `ServerMetricsAdminController`.
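A minimal sketch of the delta mode described above — consecutive per-instance differences clipped at zero (counter restarts), then summed across instances per time bucket. Types and names are illustrative, not the store's actual signatures:

```java
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/** Hypothetical delta computation for monotonically increasing counters. */
final class DeltaMode {

    record Sample(String instanceId, long bucketEpoch, double value) {}

    /** Input must be sorted by (instanceId, bucketEpoch). */
    static Map<Long, Double> delta(List<Sample> samples) {
        Map<Long, Double> perBucket = new TreeMap<>();
        Map<String, Double> lastValue = new HashMap<>();
        for (Sample s : samples) {
            Double prev = lastValue.put(s.instanceId(), s.value());
            if (prev == null) continue;                 // first sample: no delta yet
            double d = Math.max(0.0, s.value() - prev); // positive-clip counter resets
            perBucket.merge(s.bucketEpoch(), d, Double::sum); // aggregate across instances
        }
        return perBucket;
    }
}
```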
## search/ — ClickHouse search and log stores

@@ -143,7 +161,8 @@ ClickHouse is shared across tenants. Every ClickHouse query must filter by `tena

- `SecurityConfig` — WebSecurityFilterChain, JWT filter, CORS, OIDC conditional. `/api/v1/admin/outbound-connections/**` GETs permit OPERATOR in addition to ADMIN (defense-in-depth at controller level); mutations remain ADMIN-only. Alerting matchers: GET `/environments/*/alerts/**` VIEWER+; POST/PUT/DELETE rules and silences OPERATOR+; ack/read/bulk-read VIEWER+; POST `/alerts/notifications/*/retry` OPERATOR+.
- `JwtAuthenticationFilter` — OncePerRequestFilter, validates Bearer tokens
- `JwtServiceImpl` — HMAC-SHA256 JWT (Nimbus JOSE)
- `UiAuthController` — `/api/v1/auth` (login, refresh, me). Upserts `users.user_id = request.username()` (bare); signs JWTs with `subject = "user:" + userId`. `refresh`/`me` strip the `"user:"` prefix from incoming subjects via `stripSubjectPrefix()` before any DB/RBAC lookup.
- `OidcAuthController` — `/api/v1/auth/oidc` (login-uri, token-exchange, logout). Upserts `users.user_id = "oidc:" + oidcUser.subject()` (no `user:` prefix); signs JWTs with `subject = "user:oidc:" + oidcUser.subject()`. `applyClaimMappings` + `getSystemRoleNames` calls all use the bare `oidc:<sub>` form.
- `OidcTokenExchanger` — code -> tokens, role extraction from access_token then id_token
- `OidcProviderHelper` — OIDC discovery, JWK source cache
@@ -157,6 +176,14 @@ ClickHouse is shared across tenants. Every ClickHouse query must filter by `tena

- `JarRetentionJob` — @Scheduled 03:00 daily, per-environment retention, skips deployed versions

## alerting/eval/ — Rule evaluation

- `AlertEvaluatorJob` — @Scheduled tick driver; per-rule claim/release via `AlertRuleRepository`, dispatches to per-kind `ConditionEvaluator`, persists advanced cursor on release via `AlertRule.withEvalState`.
- `BatchResultApplier` — `@Component` that wraps a single rule's tick outcome (`EvalResult.Batch` = `firings` + `nextEvalState`) in one `@Transactional` boundary: instance upserts + notification enqueues + cursor advance commit atomically or roll back together. This is the exactly-once-per-exchange guarantee for `PER_EXCHANGE` fire mode.
- `ConditionEvaluator` — interface; per-kind implementations: `ExchangeMatchEvaluator`, `AgentLifecycleEvaluator`, `AgentStateEvaluator`, `DeploymentStateEvaluator`, `JvmMetricEvaluator`, `LogPatternEvaluator`, `RouteMetricEvaluator`.
- `AlertStateTransitions` — PER_EXCHANGE vs rule-level FSM helpers (fire/resolve/ack).
- `PerKindCircuitBreaker` — trips noisy per-kind evaluators; `TickCache` — per-tick shared lookups (apps, envs, silences).

## http/ — Outbound HTTP client implementation

- `SslContextBuilder` — composes SSL context from `OutboundHttpProperties` + `OutboundHttpRequestContext`. Supports SYSTEM_DEFAULT (JDK roots + configured CA extras), TRUST_ALL (short-circuit no-op TrustManager), TRUST_PATHS (JDK roots + system extras + per-request extras). Throws `IllegalArgumentException("CA file not found: ...")` on missing PEM.
@@ -168,7 +195,7 @@ ClickHouse is shared across tenants. Every ClickHouse query must filter by `tena

- `crypto/SecretCipher` — AES-GCM symmetric cipher with key derived via HMAC-SHA256(jwtSecret, "cameleer-outbound-secret-v1"). Ciphertext format: base64(IV(12 bytes) || GCM output with 128-bit tag). `encrypt` throws `IllegalStateException`; `decrypt` throws `IllegalArgumentException` on tamper/wrong-key/malformed. A framing sketch follows this list.
- `storage/PostgresOutboundConnectionRepository` — JdbcTemplate impl. `save()` upserts by id; JSONB serialization via ObjectMapper; UUID arrays via `ConnectionCallback`. Reads `created_by`/`updated_by` as String (= `users.user_id` TEXT).
- `OutboundConnectionServiceImpl` — service layer. Tenant bound at construction via the `cameleer.server.tenant.id` property. Uniqueness check via `findByName`. Narrowing-envs guard: rejects an update that removes envs while rules reference the connection (rulesReferencing stubbed in Plan 01, wired in Plan 02). Delete guard: rejects if referenced by rules.
- `controller/OutboundConnectionAdminController` — REST controller. Class-level `@PreAuthorize("hasRole('ADMIN')")` defaults; GETs relaxed to ADMIN|OPERATOR. Resolves the acting user id via the user-id convention (strip `"user:"` from `authentication.name` → matches the `users.user_id` FK). Audit via `AuditCategory.OUTBOUND_CONNECTION_CHANGE`.
- `dto/OutboundConnectionRequest` — Bean Validation: `@NotBlank` name, `@Pattern("^https://.+")` url, `@NotNull` method/tlsTrustMode/auth. Compact ctor throws `IllegalArgumentException` if TRUST_PATHS with an empty paths list.
- `dto/OutboundConnectionDto` — response DTO. `hmacSecretSet: boolean` instead of the ciphertext; `authKind: OutboundAuthKind` instead of the full auth config.
- `dto/OutboundConnectionTestResult` — result of POST `/{id}/test`: status, latencyMs, responseSnippet (first 512 chars), tlsProtocol/cipherSuite/peerCertSubject (protocol is a "TLS" stub; enriched in a Plan 02 follow-up), error (nullable).
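A minimal sketch of the ciphertext framing above — a 12-byte IV prepended to the GCM output with a 128-bit tag. Plain JCA; the key derivation and the documented exception wrapping are elided, and the class name is illustrative:

```java
import javax.crypto.Cipher;
import javax.crypto.SecretKey;
import javax.crypto.spec.GCMParameterSpec;
import java.security.SecureRandom;
import java.util.Arrays;
import java.util.Base64;

/** Hypothetical AES-GCM round-trip matching base64(IV(12) || ciphertext+tag). */
final class GcmFormat {

    private static final int IV_LEN = 12, TAG_BITS = 128;

    static String encrypt(SecretKey key, byte[] plaintext) throws Exception {
        byte[] iv = new byte[IV_LEN];
        new SecureRandom().nextBytes(iv);
        Cipher c = Cipher.getInstance("AES/GCM/NoPadding");
        c.init(Cipher.ENCRYPT_MODE, key, new GCMParameterSpec(TAG_BITS, iv));
        byte[] out = c.doFinal(plaintext);
        byte[] framed = new byte[IV_LEN + out.length];
        System.arraycopy(iv, 0, framed, 0, IV_LEN);           // IV first
        System.arraycopy(out, 0, framed, IV_LEN, out.length); // then GCM output (incl. tag)
        return Base64.getEncoder().encodeToString(framed);
    }

    static byte[] decrypt(SecretKey key, String ciphertext) throws Exception {
        byte[] framed = Base64.getDecoder().decode(ciphertext);
        Cipher c = Cipher.getInstance("AES/GCM/NoPadding");
        c.init(Cipher.DECRYPT_MODE, key,
                new GCMParameterSpec(TAG_BITS, Arrays.copyOfRange(framed, 0, IV_LEN)));
        // Throws AEADBadTagException on tamper or wrong key.
        return c.doFinal(framed, IV_LEN, framed.length - IV_LEN);
    }
}
```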
@@ -8,8 +8,11 @@ paths:

# CI/CD & Deployment

- CI workflow: `.gitea/workflows/ci.yml` — build -> docker -> deploy on push to main or feature branches. `paths-ignore` skips the whole pipeline for docs-only / `.planning/` / `.claude/` / `*.md` changes (push and PR triggers).
- Build step skips integration tests (`-DskipITs`) — Testcontainers needs a Docker daemon
- Build caches (parallel `actions/cache@v4` steps in the `build` job): `~/.m2/repository` (key on all `pom.xml`), `~/.npm` (key on `ui/package-lock.json`), `ui/node_modules/.vite` (key on `ui/package-lock.json` + `ui/vite.config.ts`). UI install uses `npm ci --prefer-offline --no-audit --fund=false` so the npm cache is the primary source.
- Maven build performance (set in `pom.xml` and `cameleer-server-app/pom.xml`): `useIncrementalCompilation=true` on the compiler plugin; Surefire uses `forkCount=1C` + `reuseForks=true` (one JVM per CPU core, reused across test classes); Failsafe keeps `forkCount=1` + `reuseForks=true`. Unit tests must not rely on per-class JVM isolation.
- UI build script (`ui/package.json`): `build` is `vite build` only — the type-check pass was split out into `npm run typecheck` (run separately when you want a full `tsc --noEmit` sweep).
- Docker: multi-stage build (`Dockerfile`), `$BUILDPLATFORM` for native Maven on the ARM64 runner, amd64 runtime. `docker-entrypoint.sh` imports `/certs/ca.pem` into the JVM truststore before starting the app (supports custom CAs for OIDC discovery without `CAMELEER_SERVER_SECURITY_OIDCTLSSKIPVERIFY`).
- `REGISTRY_TOKEN` build arg required for `cameleer-common` dependency resolution
- Registry: `gitea.siegeln.net/cameleer/cameleer-server` (container images)
@@ -17,7 +17,7 @@ paths:

- `CommandType` — enum for command types (config-update, deep-trace, replay, route-control, etc.)
- `CommandStatus` — enum for command acknowledgement states
- `CommandReply` — record: command execution result from agent
- `AgentEventRecord`, `AgentEventRepository` — event persistence. `AgentEventRepository.queryPage(...)` is cursor-paginated (`AgentEventPage{data, nextCursor, hasMore}`); the legacy non-paginated `query(...)` path is gone. `AgentEventRepository.findInWindow(env, appSlug, agentId, eventTypes, from, to, limit)` returns matching events ordered by `(timestamp ASC, insert_id ASC)` — consumed by `AgentLifecycleEvaluator`.
- `AgentEventPage` — record: `(List<AgentEventRecord> data, String nextCursor, boolean hasMore)` returned by `AgentEventRepository.queryPage`
- `AgentEventListener` — callback interface for agent events
- `RouteStateRegistry` — tracks per-agent route states
@@ -26,16 +26,18 @@ paths:

- `App` — record: id, environmentId, slug, displayName, containerConfig (JSONB)
- `AppVersion` — record: id, appId, version, jarPath, detectedRuntimeType, detectedMainClass
- `Environment` — record: id, slug, displayName, production, enabled, defaultContainerConfig, jarRetentionCount, color, createdAt. `color` is one of the 8 preset palette values validated by `EnvironmentColor.VALUES` and CHECK-constrained in PostgreSQL (V2 migration).
- `EnvironmentColor` — constants: `DEFAULT = "slate"`, `VALUES = {slate,red,amber,green,teal,blue,purple,pink}`, `isValid(String)`.
- `Deployment` — record: id, appId, appVersionId, environmentId, status, targetState, deploymentStrategy, replicaStates (JSONB), deployStage, containerId, containerName, createdBy (String, user_id reference; nullable for pre-V4 historical rows)
- `DeploymentStatus` — enum: STOPPED, STARTING, RUNNING, DEGRADED, STOPPING, FAILED. `DEGRADED` is reserved for post-deploy drift (a replica died after RUNNING); `DeploymentExecutor` now marks partial-healthy deploys FAILED, not DEGRADED.
- `DeployStage` — enum: PRE_FLIGHT, PULL_IMAGE, CREATE_NETWORK, START_REPLICAS, HEALTH_CHECK, SWAP_TRAFFIC, COMPLETE
- `DeploymentStrategy` — enum: BLUE_GREEN, ROLLING. Stored on `ResolvedContainerConfig.deploymentStrategy` as a kebab-case string (`"blue-green"` / `"rolling"`). `fromWire(String)` is the only conversion entry point; unknown/null inputs fall back to BLUE_GREEN so the executor dispatch site never null-checks or throws; see the sketch after this list.
- `DeploymentService` — createDeployment (calls `deleteFailedByAppAndEnvironment` first so FAILED rows don't pile up; STOPPED rows are preserved as restorable checkpoints), markRunning, markFailed, markStopped
- `RuntimeType` — enum: AUTO, SPRING_BOOT, QUARKUS, PLAIN_JAVA, NATIVE
- `RuntimeDetector` — probes JAR files at upload time: detects the runtime from the manifest Main-Class (Spring Boot loader, Quarkus entry point, plain Java) or a native binary (non-ZIP magic bytes)
- `ContainerRequest` — record: 20 fields for Docker container creation (includes runtimeType, customArgs, mainClass)
- `ContainerStatus` — record: state, running, exitCode, error
- `ResolvedContainerConfig` — record: typed config with memoryLimitMb, memoryReserveMb, cpuRequest, cpuLimit, appPort, exposedPorts, customEnvVars, stripPathPrefix, sslOffloading, routingMode, routingDomain, serverUrl, replicas, deploymentStrategy, routeControlEnabled, replayEnabled, runtimeType, customArgs, extraNetworks, externalRouting (default `true`; when `false`, `TraefikLabelBuilder` strips all `traefik.*` labels so the container is not publicly routed), certResolver (server-wide, sourced from `CAMELEER_SERVER_RUNTIME_CERTRESOLVER`; when blank the `tls.certresolver` label is omitted — use for dev installs with a static TLS store)
- `RoutingMode` — enum for routing strategies
- `ConfigMerger` — pure function: resolve(globalDefaults, envConfig, appConfig) -> ResolvedContainerConfig
- `RuntimeOrchestrator` — interface: startContainer, stopContainer, getContainerStatus, getLogs, startLogCapture, stopLogCapture
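A minimal sketch of the `fromWire` fallback described above (enum constants and wire values are from the notes; the field layout is illustrative):

```java
/** Hypothetical wire mapping: kebab-case strings with a safe BLUE_GREEN default. */
enum DeploymentStrategy {
    BLUE_GREEN("blue-green"),
    ROLLING("rolling");

    private final String wire;

    DeploymentStrategy(String wire) { this.wire = wire; }

    /** Sole conversion entry point: unknown or null input falls back to BLUE_GREEN. */
    static DeploymentStrategy fromWire(String value) {
        for (DeploymentStrategy s : values()) {
            // equalsIgnoreCase(null) is false, so null input safely falls through.
            if (s.wire.equalsIgnoreCase(value)) return s;
        }
        return BLUE_GREEN; // dispatch site never null-checks or throws
    }
}
```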
@@ -45,14 +47,15 @@ paths:

## search/ — Execution search and stats

- `SearchService` — search, count, stats, statsForApp, statsForRoute, timeseries, timeseriesForApp, timeseriesForRoute, timeseriesGroupedByApp, timeseriesGroupedByRoute, slaCompliance, slaCountsByApp, slaCountsByRoute, topErrors, activeErrorTypes, punchcard, distinctAttributeKeys. `statsForRoute`/`timeseriesForRoute` take `(routeId, applicationId)` — the app filter is applied to `stats_1m_route`.
- `SearchRequest` / `SearchResult` — search DTOs. `SearchRequest.attributeFilters: List<AttributeFilter>` carries structured facet filters for execution attributes — key-only (exists), exact (key=value), or wildcard (`*` in value). The 21-arg legacy ctor is preserved for call-site churn; the compact ctor normalises null → `List.of()`.
- `AttributeFilter(key, value)` — record with key regex `^[a-zA-Z0-9._-]+$` (inlined into SQL, same constraint as alerting), `value == null` means key-exists, and a `value` containing `*` becomes a SQL LIKE pattern via `toLikePattern()` (see the sketch after this list).
- `ExecutionStats`, `ExecutionSummary` — stats aggregation records
- `StatsTimeseries`, `TopError` — timeseries and error DTOs
- `LogSearchRequest` / `LogSearchResponse` — log search DTOs. `LogSearchRequest.sources` / `levels` are `List<String>` (null-normalized, multi-value OR); `cursor` + `limit` + `sort` drive keyset pagination. Response carries `nextCursor` + `hasMore` + per-level `levelCounts`.
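A minimal sketch of the filter parsing and LIKE conversion, assuming the `attr=key[:value]` query shape and key regex described above (the escaping of literal `%`/`_` is an assumption):

```java
import java.util.regex.Pattern;

/** Hypothetical attribute filter: null value = key-exists, '*' = wildcard. */
record AttributeFilter(String key, String value) {

    private static final Pattern KEY = Pattern.compile("^[a-zA-Z0-9._-]+$");

    AttributeFilter {
        if (!KEY.matcher(key).matches()) {
            throw new IllegalArgumentException("invalid attribute key: " + key); // -> 400
        }
    }

    /** Parses "order", "order:47", "order:4*" — the first ':' splits; later colons stay in the value. */
    static AttributeFilter parse(String raw) {
        int sep = raw.indexOf(':');
        return sep < 0
                ? new AttributeFilter(raw, null)
                : new AttributeFilter(raw.substring(0, sep), raw.substring(sep + 1));
    }

    /** '*' maps to SQL LIKE '%'; literal LIKE metacharacters are escaped to match themselves. */
    String toLikePattern() {
        if (value == null) {
            throw new IllegalStateException("key-exists filter has no LIKE pattern");
        }
        return value.replace("\\", "\\\\")
                .replace("%", "\\%")
                .replace("_", "\\_")
                .replace('*', '%');
    }
}
```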
## storage/ — Storage abstractions

- `ExecutionStore`, `MetricsStore`, `MetricsQueryStore`, `StatsStore`, `DiagramStore`, `RouteCatalogStore`, `SearchIndex`, `LogIndex` — interfaces. `DiagramStore.findLatestContentHashForAppRoute(appId, routeId, env)` resolves the latest diagram by (app, env, route) without consulting the agent registry, so routes whose publishing agents were removed between app versions still resolve. `findContentHashForRoute(route, instance)` is retained for the ingestion path that stamps a per-execution `diagramContentHash` at ingest time (the point-in-time link from `ExecutionDetail`/`ExecutionSummary`).
- `RouteCatalogEntry` — record: applicationId, routeId, environment, firstSeen, lastSeen
- `LogEntryResult` — log query result record
- `model/` — `ExecutionDocument`, `MetricTimeSeries`, `MetricsSnapshot`
@@ -78,7 +81,7 @@ paths:

- `AppSettings`, `AppSettingsRepository` — per-app-per-env settings config and persistence. The record carries `(applicationId, environment, …)`; repository methods are `findByApplicationAndEnvironment`, `findByEnvironment`, `save`, `delete(appId, env)`. `AppSettings.defaults(appId, env)` produces a default instance scoped to an environment.
- `ThresholdConfig`, `ThresholdRepository` — alerting threshold config and persistence
- `AuditService` — audit logging facade
- `AuditRecord`, `AuditResult`, `AuditCategory` (enum: `INFRA, AUTH, USER_MGMT, CONFIG, RBAC, AGENT, OUTBOUND_CONNECTION_CHANGE, OUTBOUND_HTTP_TRUST_CHANGE, ALERT_RULE_CHANGE, ALERT_SILENCE_CHANGE, DEPLOYMENT`), `AuditRepository` — audit trail records and persistence

## http/ — Outbound HTTP primitives (cross-cutting)
@@ -107,8 +110,8 @@ paths:

## ingestion/ — Buffered data pipeline

- `IngestionService` — diagram + metrics facade (`ingestDiagram`, `acceptMetrics`, `getMetricsBuffer`). Execution ingestion went through here via the legacy `RouteExecution` shape until `ChunkAccumulator` took over writes from the chunked pipeline — the `ingestExecution` path plus its `ExecutionStore.upsert` / `upsertProcessors` dependencies were removed.
- `ChunkAccumulator` — batches data for efficient flush; owns the execution write path (chunks → buffers → flush scheduler → `ClickHouseExecutionStore.insertExecutionBatch`). A sketch of the merge-by-exchangeId step follows this list.
- `WriteBuffer` — bounded ring buffer for async flush
- `BufferedLogEntry` — log entry wrapper with metadata
- `MergedExecution`, `TaggedDiagram` — tagged ingestion records. `TaggedDiagram` carries `(instanceId, applicationId, environment, graph)` — env is resolved from the agent registry in the controller and stamped on the ClickHouse `route_diagrams` row.
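A minimal sketch of the merge step described for `ChunkIngestionController` — non-final chunks accumulate by exchangeId until the final chunk arrives or the envelope goes stale (the types and emit callback are illustrative, not the actual accumulator):

```java
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Consumer;

/** Hypothetical merge step: chunks accumulate until 'final' or staleness. */
final class ChunkMerger {

    record Chunk(String exchangeId, int chunkSeq, boolean isFinal) {}

    static final class Pending {
        final List<Chunk> chunks = new ArrayList<>();
        volatile Instant lastSeen = Instant.now();
    }

    private final Map<String, Pending> open = new ConcurrentHashMap<>();
    private final Consumer<List<Chunk>> emit; // downstream flush path
    private final Duration staleAfter;

    ChunkMerger(Consumer<List<Chunk>> emit, Duration staleAfter) {
        this.emit = emit;
        this.staleAfter = staleAfter;
    }

    void accept(Chunk chunk) {
        Pending p = open.computeIfAbsent(chunk.exchangeId(), id -> new Pending());
        synchronized (p) {
            p.chunks.add(chunk);
            p.lastSeen = Instant.now();
        }
        if (chunk.isFinal()) { // the final chunk closes and emits the merged envelope
            Pending done = open.remove(chunk.exchangeId());
            if (done != null) emit.accept(done.chunks);
        }
    }

    /** Scheduler tick: emit envelopes whose final chunk never arrived. */
    void sweepStale() {
        Instant cutoff = Instant.now().minus(staleAfter);
        for (Iterator<Map.Entry<String, Pending>> it = open.entrySet().iterator(); it.hasNext(); ) {
            Map.Entry<String, Pending> e = it.next();
            if (e.getValue().lastSeen.isBefore(cutoff)) {
                it.remove();
                emit.accept(e.getValue().chunks);
            }
        }
    }
}
```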
@@ -13,19 +13,28 @@ paths:

When deployed via the cameleer-saas platform, this server orchestrates customer app containers using Docker. Key components:

- **ConfigMerger** (`core/runtime/ConfigMerger.java`) — pure function: resolve(globalDefaults, envConfig, appConfig) -> ResolvedContainerConfig. Three-layer merge: global (application.yml) -> environment (defaultContainerConfig JSONB) -> app (containerConfig JSONB). Includes `runtimeType` (default `"auto"`) and `customArgs` (default `""`).
- **TraefikLabelBuilder** (`app/runtime/TraefikLabelBuilder.java`) — generates Traefik Docker labels for path-based (`/{envSlug}/{appSlug}/`) or subdomain-based (`{appSlug}-{envSlug}.{domain}`) routing. Supports strip-prefix and SSL offloading toggles. Per-replica identity labels: `cameleer.replica` (index), `cameleer.generation` (8-char deployment UUID prefix — pin Prometheus/Grafana deploy boundaries with this), `cameleer.instance-id` (`{envSlug}-{appSlug}-{replicaIndex}-{generation}`). Traefik router/service keys deliberately omit the generation so load balancing spans old + new replicas during a blue/green overlap. When `ResolvedContainerConfig.externalRouting()` is `false` (UI: Resources → External Routing, default `true`), the builder emits ONLY the identity labels (`managed-by`, `cameleer.*`) and skips every `traefik.*` label — the container stays on `cameleer-traefik` and the per-env network (so sibling containers can still reach it via Docker DNS) but is invisible to Traefik. The `tls.certresolver` label is emitted only when `CAMELEER_SERVER_RUNTIME_CERTRESOLVER` is set to a non-blank resolver name (matching a resolver configured in the Traefik static config). When unset (dev installs backed by a static TLS store) only `tls=true` is emitted and Traefik serves the default cert from the TLS store. A label-building sketch follows this list.
- **PrometheusLabelBuilder** (`app/runtime/PrometheusLabelBuilder.java`) — generates Prometheus `docker_sd_configs` labels per resolved runtime type: Spring Boot `/actuator/prometheus:8081`, Quarkus/native `/q/metrics:9000`, plain Java `/metrics:9464`. Labels merged into container metadata alongside Traefik labels at deploy time.
- **DockerNetworkManager** (`app/runtime/DockerNetworkManager.java`) — manages two Docker network tiers:
  - `cameleer-traefik` — shared network; Traefik, the server, and all app containers attach here. Server joined via docker-compose with the `cameleer-server` DNS alias.
  - `cameleer-env-{slug}` — per-environment isolated network; containers in the same environment discover each other via Docker DNS. In SaaS mode, env networks are tenant-scoped: `cameleer-env-{tenantId}-{envSlug}` (overloaded `envNetworkName(tenantId, envSlug)` method) to prevent cross-tenant collisions when multiple tenants have identically-named environments.
- **DockerEventMonitor** (`app/runtime/DockerEventMonitor.java`) — persistent Docker event stream listener for containers with the `managed-by=cameleer-server` label. Detects die/oom/start/stop events and updates deployment replica states. Periodic reconciliation (@Scheduled every 30s) inspects actual container state and corrects deployment status mismatches (fixes stale DEGRADED with all replicas healthy).
- **DeploymentProgress** (`ui/src/components/DeploymentProgress.tsx`) — UI step indicator showing 7 deploy stages with amber active/green completed styling.
- **ContainerLogForwarder** (`app/runtime/ContainerLogForwarder.java`) — streams Docker container stdout/stderr to the ClickHouse `logs` table with `source='container'`. Uses `docker logs --follow` per container, batches lines every 2s or 50 lines. Parses the Docker timestamp prefix, infers log level via regex. `DeploymentExecutor` starts capture after each replica launches with the replica's `instanceId` (`{envSlug}-{appSlug}-{replicaIndex}-{generation}`); `DockerEventMonitor` stops capture on die/oom. 60-second max capture timeout with a 30s cleanup scheduler. Thread pool of 10 daemon threads. Container logs use the same `instanceId` as the agent (set via the `CAMELEER_AGENT_INSTANCEID` env var) for unified log correlation at the instance level. The instance-id changes per deployment — cross-deploy queries aggregate on `application + environment` (and optionally `replica_index`).
- **StartupLogPanel** (`ui/src/components/StartupLogPanel.tsx`) — collapsible log panel rendered below `DeploymentProgress`. Queries `/api/v1/logs?source=container&application={appSlug}&environment={envSlug}`. Auto-polls every 3s while the deployment is STARTING; shows a green "live" badge during polling, a red "stopped" badge on FAILED. Uses the `useStartupLogs` hook and `LogViewer` (design system).
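A minimal sketch of the externalRouting branch described above, assuming a plain label map (router naming and any label keys beyond those quoted above are illustrative):

```java
import java.util.LinkedHashMap;
import java.util.Map;

/** Hypothetical label assembly: identity labels always, traefik.* only when routed. */
final class LabelSketch {

    static Map<String, String> build(String envSlug, String appSlug, int replica,
                                     String generation, boolean externalRouting,
                                     String certResolver) {
        Map<String, String> labels = new LinkedHashMap<>();
        // Identity labels are emitted unconditionally.
        labels.put("managed-by", "cameleer-server");
        labels.put("cameleer.replica", String.valueOf(replica));
        labels.put("cameleer.generation", generation);
        labels.put("cameleer.instance-id",
                envSlug + "-" + appSlug + "-" + replica + "-" + generation);

        if (!externalRouting) {
            return labels; // no traefik.* labels: the container is invisible to Traefik
        }

        // Router/service keys omit the generation so old + new replicas share one service.
        String router = envSlug + "-" + appSlug; // illustrative router naming
        labels.put("traefik.enable", "true");
        labels.put("traefik.http.routers." + router + ".tls", "true");
        if (certResolver != null && !certResolver.isBlank()) {
            // Only when CAMELEER_SERVER_RUNTIME_CERTRESOLVER names a resolver.
            labels.put("traefik.http.routers." + router + ".tls.certresolver", certResolver);
        }
        return labels;
    }
}
```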
## DeploymentExecutor Details

Primary network for app containers is set via the `CAMELEER_SERVER_RUNTIME_DOCKERNETWORK` env var (in SaaS mode: `cameleer-tenant-{slug}`); apps also connect to `cameleer-traefik` (routing) and `cameleer-env-{tenantId}-{envSlug}` (per-environment discovery) as additional networks. Resolves `runtimeType: auto` to a concrete type from `AppVersion.detectedRuntimeType` at PRE_FLIGHT (fails the deployment if unresolvable). Builds the Docker entrypoint per runtime type (all JVM types use `-javaagent:/app/agent.jar -jar`, plain Java uses `-cp` with the main class, native runs the binary directly). Sets the per-replica `CAMELEER_AGENT_INSTANCEID` env var to `{envSlug}-{appSlug}-{replicaIndex}-{generation}` so container logs and agent logs share the same instance identity. Sets `CAMELEER_AGENT_*` env vars from `ResolvedContainerConfig` (routeControlEnabled, replayEnabled, health port). These are startup-only agent properties — changing them requires redeployment.

**Container naming** — `{tenantId}-{envSlug}-{appSlug}-{replicaIndex}-{generation}`, where `generation` is the first 8 characters of the deployment UUID. The generation suffix lets old + new replicas coexist during a blue/green swap (deterministic names without a generation used to 409). All lookups across the executor, `DockerEventMonitor`, and `ContainerLogForwarder` key on container **id**, not name — the name is operator-visibility only.
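A minimal sketch of the naming scheme (the helper shape is illustrative):

```java
import java.util.UUID;

/** Hypothetical builder for {tenantId}-{envSlug}-{appSlug}-{replicaIndex}-{generation}. */
final class ContainerNames {

    /** generation = first 8 chars of the deployment UUID, stable for the whole deploy. */
    static String generation(UUID deploymentId) {
        return deploymentId.toString().substring(0, 8);
    }

    static String containerName(String tenantId, String envSlug, String appSlug,
                                int replicaIndex, UUID deploymentId) {
        // Old and new generations coexist during a blue/green swap without name 409s.
        return tenantId + "-" + envSlug + "-" + appSlug + "-"
                + replicaIndex + "-" + generation(deploymentId);
    }
}
```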
**Strategy dispatch** — `DeploymentStrategy.fromWire(config.deploymentStrategy())` branches the executor. Unknown values fall back to BLUE_GREEN so misconfiguration never throws at runtime.

- **Blue/green** (default): start all N new replicas → wait for ALL healthy → stop the previous deployment. Resource peak ≈ 2× replicas for the health-check window. Partial health aborts with status FAILED; the previous deployment is preserved untouched (user's safety net).
- **Rolling**: replace replicas one at a time — start new[i] → wait healthy → stop old[i] → next. Resource peak = replicas + 1. Mid-rollout health failure stops in-flight new containers and aborts; already-replaced old replicas are NOT restored (not reversible) but un-replaced old[i+1..N] keep serving traffic. User redeploys to recover.

Traffic routing is implicit: Traefik labels (`cameleer.app`, `cameleer.environment`) are generation-agnostic, so new replicas attract load balancing as soon as they come up healthy — no explicit swap step.

## Deployment Status Model
@@ -34,17 +43,13 @@ Primary network for app containers is set via `CAMELEER_SERVER_RUNTIME_DOCKERNET
|
||||
| `STOPPED` | Intentionally stopped or initial state |
|
||||
| `STARTING` | Deploy in progress |
|
||||
| `RUNNING` | All replicas healthy and serving |
|
||||
| `DEGRADED` | Some replicas healthy, some dead |
|
||||
| `DEGRADED` | Post-deploy: a replica died after the deploy was marked RUNNING. Set by `DockerEventMonitor` reconciliation, never by `DeploymentExecutor` directly. |
| `STOPPING` | Graceful shutdown in progress |
| `FAILED` | Terminal failure (pre-flight, health check, or crash) |
| `FAILED` | Terminal failure (pre-flight, health check, or crash). Partial-healthy deploys now mark FAILED — DEGRADED is reserved for post-deploy drift. |
**Replica support**: deployments can specify a replica count. `DEGRADED` is used when at least one but not all replicas are healthy.
**Deploy stages** (`DeployStage`): PRE_FLIGHT -> PULL_IMAGE -> CREATE_NETWORK -> START_REPLICAS -> HEALTH_CHECK -> SWAP_TRAFFIC -> COMPLETE (or FAILED at any stage). Rolling reuses the same stage labels inside the per-replica loop; the UI progress bar shows the most recent stage.
**Deploy stages** (`DeployStage`): PRE_FLIGHT -> PULL_IMAGE -> CREATE_NETWORK -> START_REPLICAS -> HEALTH_CHECK -> SWAP_TRAFFIC -> COMPLETE (or FAILED at any stage).
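
Restated as an enum (the concrete `DeployStage` declaration is assumed from the names above):

```java
enum DeployStage {
    PRE_FLIGHT, PULL_IMAGE, CREATE_NETWORK, START_REPLICAS,
    HEALTH_CHECK, SWAP_TRAFFIC, COMPLETE, FAILED
}
```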
**Blue/green strategy**: when re-deploying, new replicas are started and health-checked before old ones are stopped, minimising downtime.
**Deployment uniqueness**: `DeploymentService.createDeployment()` deletes any STOPPED/FAILED deployments for the same app+environment before creating a new one, preventing duplicate rows.
**Deployment retention**: `DeploymentService.createDeployment()` deletes FAILED deployments for the same app+environment before creating a new one, preventing failed-attempt buildup. STOPPED deployments are preserved as restorable checkpoints — the UI Checkpoints disclosure lists every deployment with a non-null `deployed_config_snapshot` (RUNNING, DEGRADED, STOPPED) minus the current one.
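
A hypothetical sketch of that cleanup — table and column names are assumptions, not the real schema:

```java
import org.springframework.jdbc.core.JdbcTemplate;

class DeploymentRetentionSketch {
    // Prune failed attempts; STOPPED rows are intentionally kept as restorable checkpoints.
    static void pruneFailed(JdbcTemplate jdbc, long appId, String environment) {
        jdbc.update("DELETE FROM deployments WHERE app_id = ? AND environment = ? AND status = 'FAILED'",
                appId, environment);
    }
}
```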
## JAR Management
@@ -8,7 +8,9 @@ paths:
# Prometheus Metrics
Server exposes `/api/v1/prometheus` (unauthenticated, Prometheus text format). Spring Boot Actuator provides JVM, GC, thread pool, and `http.server.requests` metrics automatically. Business metrics via `ServerMetrics` component:
Server exposes `/api/v1/prometheus` (unauthenticated, Prometheus text format). Spring Boot Actuator provides JVM, GC, thread pool, and `http.server.requests` metrics automatically. Business metrics via `ServerMetrics` component.
The same `MeterRegistry` is also snapshotted to ClickHouse every 60 s by `ServerMetricsSnapshotScheduler` (see "Server self-metrics persistence" at the bottom of this file) — so historical server-health data survives restarts without an external Prometheus.
## Gauges (auto-polled)
@@ -83,3 +85,23 @@ Mean processing time = `camel.route.policy.total_time / camel.route.policy.count
| `cameleer.sse.reconnects.count` | counter | `instanceId` |
| `cameleer.taps.evaluated.count` | counter | `instanceId` |
| `cameleer.metrics.exported.count` | counter | `instanceId` |
## Server self-metrics persistence
`ServerMetricsSnapshotScheduler` walks `MeterRegistry.getMeters()` every 60 s (configurable via `cameleer.server.self-metrics.interval-ms`) and writes one row per Micrometer `Measurement` to the ClickHouse `server_metrics` table. Full registry is captured — Spring Boot Actuator series (`jvm.*`, `process.*`, `http.server.requests`, `hikaricp.*`, `jdbc.*`, `tomcat.*`, `logback.events`, `system.*`) plus `cameleer.*` and `alerting_*`.
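
The walk maps directly onto Micrometer's public API; a sketch with the ClickHouse insert stubbed out:

```java
import io.micrometer.core.instrument.Measurement;
import io.micrometer.core.instrument.Meter;
import io.micrometer.core.instrument.MeterRegistry;

class SnapshotSketch {
    // One row per Measurement, as described above; writeRow() stands in for the batched insert.
    static void snapshot(MeterRegistry registry) {
        for (Meter meter : registry.getMeters()) {
            String metricType = meter.getId().getType().name().toLowerCase(); // counter, gauge, timer, ...
            for (Measurement m : meter.measure()) {
                String statistic = m.getStatistic().getTagValueRepresentation(); // value, count, total_time, ...
                writeRow(meter.getId().getName(), metricType, statistic, m.getValue());
            }
        }
    }

    static void writeRow(String name, String type, String statistic, double value) {
        // stand-in for the INSERT into server_metrics
    }
}
```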
**Table** (`cameleer-server-app/src/main/resources/clickhouse/init.sql`):
```
server_metrics(tenant_id, collected_at, server_instance_id,
metric_name, metric_type, statistic, metric_value,
tags Map(String,String), server_received_at)
```
- `metric_type` — lowercase Micrometer `Meter.Type` (counter, gauge, timer, distribution_summary, long_task_timer, other)
- `statistic` — Micrometer `Statistic.getTagValueRepresentation()` (value, count, total, total_time, max, mean, active_tasks, duration). Timers emit 3 rows per tick (count + total_time + max); gauges/counters emit 1 (`statistic='value'` or `'count'`).
- No `environment` column — the server is env-agnostic.
- `tenant_id` threaded from `cameleer.server.tenant.id` (single-tenant per server).
- `server_instance_id` resolved once at boot by `ServerInstanceIdConfig` (property → HOSTNAME → localhost → UUID fallback; the chain is sketched after this list). Rotates across restarts so counter resets are unambiguous.
- TTL: 90 days (vs 365 for `agent_metrics`). Write-only in v1 — no query endpoint or UI page. Inspect via ClickHouse admin: `/api/v1/admin/clickhouse/query` or direct SQL.
- Toggle off entirely with `cameleer.server.self-metrics.enabled=false` (uses `@ConditionalOnProperty`).
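
The `server_instance_id` fallback chain, as a sketch (the property plumbing is omitted):

```java
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.UUID;

class InstanceIdSketch {
    // property → HOSTNAME → localhost hostname → random UUID, resolved once at boot
    static String resolve(String configured) {
        if (configured != null && !configured.isBlank()) return configured;
        String hostname = System.getenv("HOSTNAME");
        if (hostname != null && !hostname.isBlank()) return hostname;
        try {
            return InetAddress.getLocalHost().getHostName();
        } catch (UnknownHostException e) {
            return UUID.randomUUID().toString();
        }
    }
}
```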
@@ -10,13 +10,18 @@ The UI has 4 main tabs: **Exchanges**, **Dashboard**, **Runtime**, **Deployments
- **Exchanges** — route execution search and detail (`ui/src/pages/Exchanges/`)
- **Dashboard** — metrics and stats with L1/L2/L3 drill-down (`ui/src/pages/DashboardTab/`)
- **Runtime** — live agent status, logs, commands (`ui/src/pages/RuntimeTab/`). AgentHealth supports compact view (dense health-tinted cards) and expanded view (full GroupCard+DataTable per app). View mode persisted to localStorage.
- **Deployments** — app management, JAR upload, deployment lifecycle (`ui/src/pages/AppsTab/`)
- Config sub-tabs: **Monitoring | Resources | Variables | Traces & Taps | Route Recording**
- Create app: full page at `/apps/new` (not a modal)
- Deployment progress: `ui/src/components/DeploymentProgress.tsx` (7-stage step indicator)
- **Deployments** — unified app deployment page (`ui/src/pages/AppsTab/`)
- Routes: `/apps` (list, `AppListView` in `AppsTab.tsx`), `/apps/new` + `/apps/:slug` (both render `AppDeploymentPage`).
- Identity & Artifact section always visible; name editable pre-first-deploy, read-only after. JAR picker client-stages; new JAR + any form edits flip the primary button from `Save` to `Redeploy`. Environment fixed to the currently-selected env (no selector).
- Config sub-tabs: **Monitoring | Resources | Variables | Sensitive Keys | Deployment | ● Traces & Taps | ● Route Recording**. The four staged tabs feed dirty detection; the `●` live tabs apply in real-time (amber LiveBanner + default `?apply=live` on their writes) and never mark dirty.
- Primary action state machine: `Save` → `Uploading… N%` (during JAR upload; button shows percent with a tinted progress-fill overlay) → `Redeploy` → `Deploying…` during active deploy. Upload progress sourced from `useUploadJar` (XHR `upload.onprogress` → page-level `uploadPct` state). The button is disabled during `uploading` and `deploying`.
- Checkpoints render as a collapsible `CheckpointsTable` (default **collapsed**) **inside the Identity & Artifact `configGrid`** as an in-grid row (`Checkpoints | ▸ Expand (N)` / `▾ Collapse (N)`). `CheckpointsTable` returns a React.Fragment of grid-ready children so the label + trigger align with the other identity rows; when opened, a third grid child spans both columns via `grid-column: 1 / -1` so the 7-column table gets full width. Wired through `IdentitySection.checkpointsSlot` — `CheckpointDetailDrawer` stays in `IdentitySection.children` because it portals. Columns: Version · JAR (filename) · Deployed by · Deployed (relative `timeAgo` + user-locale sub-line via `new Date(iso).toLocaleString()`) · Strategy · Outcome · ›. Row click opens the drawer. Drawer tabs are ordered **Config | Logs** with `Config` as the default. Config panel has Snapshot / Diff vs current view modes. Replica filter in the Logs panel uses DS `Select`. Restore lives in the drawer footer (forces review). Visible row cap = `Environment.jarRetentionCount` (default 10 if 0/null); older rows accessible via "Show older (N)" expander. Currently-running deployment is excluded — represented separately by `StatusCard`. The empty-checkpoints case returns `null` (no row). The legacy `Checkpoints.tsx` row-list component is gone.
- Deployment tab: `StatusCard` + `DeploymentProgress` (during STARTING / FAILED) + flex-grow `StartupLogPanel` (no fixed maxHeight). Auto-activates when a deploy starts. The former `HistoryDisclosure` is retired — per-deployment config and logs live in the Checkpoints drawer. `StartupLogPanel` header mirrors the Runtime Application Log pattern: title + live/stopped badge + `N entries` + sort toggle (↑/↓, default **desc**) + refresh icon (`RefreshCw`). Sort drives the backend fetch via `useStartupLogs(…, sort)` so the 500-line limit returns the window closest to the user's interest; display order matches fetch order. Refresh scrolls to the latest edge (top for desc, bottom for asc). Sort + refresh buttons disable while a refetch is in flight. 3s polling while STARTING is unchanged.
- Unsaved-change router blocker uses DS `AlertDialog` (not `window.beforeunload`). Env switch intentionally discards edits without warning.
**Admin pages** (ADMIN-only, under `/admin/`):
- **Sensitive Keys** (`ui/src/pages/Admin/SensitiveKeysPage.tsx`) — global sensitive key masking config. Shows agent built-in defaults as outlined Badge reference, editable Tag pills for custom keys, amber-highlighted push-to-agents toggle. Keys add to (not replace) agent defaults. Per-app sensitive key additions managed via `ApplicationConfigController` API. Note: `AppConfigDetailPage.tsx` exists but is not routed in `router.tsx`.
- **Server Metrics** (`ui/src/pages/Admin/ServerMetricsAdminPage.tsx`) — dashboard over the `server_metrics` ClickHouse table. Visibility matches Database/ClickHouse pages: gated on `capabilities.infrastructureEndpoints` in `buildAdminTreeNodes`; backend is `@ConditionalOnProperty(infrastructureendpoints) + @PreAuthorize('hasRole(ADMIN)')`. Uses the generic `/api/v1/admin/server-metrics/{catalog,instances,query}` API via `ui/src/api/queries/admin/serverMetrics.ts` hooks (`useServerMetricsCatalog`, `useServerMetricsInstances`, `useServerMetricsSeries`), all three of which take a `ServerMetricsRange = { from: Date; to: Date }`. Time range is driven by the global TopBar picker via `useGlobalFilters()` — no page-local selector; bucket size auto-scales through `stepSecondsFor(windowSeconds)` (10 s up to 1 h buckets). Toolbar is just server-instance badges. Sections: Server health (agents/ingestion/auth), JVM (memory/CPU/GC/threads), HTTP & DB pools, Alerting (conditional on catalog), Deployments (conditional on catalog). Each panel is a `ThemedChart` with `Line`/`Area` children from the design system; multi-series responses are flattened into overlap rows by bucket timestamp. Alerting and Deployments rows are hidden when their metrics aren't in the catalog (zero-deploy / alerting-disabled installs).
## Key UI Files
@@ -25,6 +30,8 @@ The UI has 4 main tabs: **Exchanges**, **Dashboard**, **Runtime**, **Deployments
- `ui/src/auth/auth-store.ts` — Zustand: accessToken, user, roles, login/logout
- `ui/src/api/environment-store.ts` — Zustand: selected environment (localStorage)
- `ui/src/components/ContentTabs.tsx` — main tab switcher
- `ui/src/components/EnvironmentSwitcherButton.tsx` + `EnvironmentSwitcherModal.tsx` — explicit env picker (button in TopBar; DS `Modal`-based list). Replaces the retired `EnvironmentSelector` (All-Envs dropdown). When `envRecords.length > 0` and the stored `selectedEnv` no longer matches any env, `LayoutShell` opens the modal in `forced` mode (non-dismissible). Switcher pulls env records from `useEnvironments()` (admin endpoint; readable by VIEWER+).
- `ui/src/components/env-colors.ts` + `ui/src/styles/env-colors.css` — 8-swatch preset palette for the per-environment color indicator. Tokens `--env-color-slate/red/amber/green/teal/blue/purple/pink` are defined for both light and dark themes. `envColorVar(name)` falls back to `slate` for unknown values. `LayoutShell` renders a 3px fixed top bar in the current env's color (z-index 900, below DS modals).
- `ui/src/components/ExecutionDiagram/` — interactive trace view (canvas)
- `ui/src/components/ProcessDiagram/` — ELK-rendered route diagram
- `ui/src/hooks/useScope.ts` — TabKey type, scope inference
@@ -33,6 +40,28 @@ The UI has 4 main tabs: **Exchanges**, **Dashboard**, **Runtime**, **Deployments
- `ui/src/api/queries/agents.ts` — `useAgents` for agent list, `useInfiniteAgentEvents` for cursor-paginated timeline stream
- `ui/src/hooks/useInfiniteStream.ts` — tanstack `useInfiniteQuery` wrapper with top-gated auto-refetch, flattened `items[]`, and `refresh()` invalidator
- `ui/src/components/InfiniteScrollArea.tsx` — scrollable container with IntersectionObserver top/bottom sentinels. Streaming log/event views use this + `useInfiniteStream`. Bounded views (LogTab, StartupLogPanel) keep `useLogs`/`useStartupLogs`
- `ui/src/components/SideDrawer.tsx` — project-local right-slide drawer (DS has Modal but no Drawer). Portal-rendered, ESC + transparent-backdrop click closes, sticky header/footer, sizes md/lg/xl. Currently consumed only by `CheckpointDetailDrawer` — promote to `@cameleer/design-system` once a second consumer appears.
## Alerts
- **Sidebar section** (`buildAlertsTreeNodes` in `ui/src/components/sidebar-utils.ts`) — Inbox, Rules, Silences.
- **Routes** in `ui/src/router.tsx`: `/alerts` (redirect to inbox), `/alerts/inbox`, `/alerts/rules`, `/alerts/rules/new`, `/alerts/rules/:id`, `/alerts/silences`. No redirects for the retired `/alerts/all` and `/alerts/history` — stale URLs 404 per the clean-break policy.
- **Pages** under `ui/src/pages/Alerts/`:
- `InboxPage.tsx` — single filterable inbox. Filters: severity (multi), state (PENDING/FIRING/RESOLVED, default FIRING), Hide acked toggle (default on), Hide read toggle (default on). Row actions: Acknowledge, Mark read, Silence rule… (duration quick menu), Delete (OPERATOR+, soft-delete with undo toast wired to `useRestoreAlert`). Bulk toolbar (selection-driven): Acknowledge N · Mark N read · Silence rules · Delete N (ConfirmDialog; OPERATOR+).
- `SilenceRuleMenu.tsx` — DS `Dropdown`-based duration picker (1h / 8h / 24h / Custom…). Used by the row-level and bulk silence actions. "Custom…" navigates to `/alerts/silences?ruleId=<id>`.
- `RulesListPage.tsx` — CRUD + enable/disable toggle + env-promotion dropdown (pure UI prefill, no new endpoint).
- `RuleEditor/RuleEditorWizard.tsx` — 5-step wizard (Scope / Condition / Trigger / Notify / Review). `form-state.ts` is the single source of truth (`initialForm` / `toRequest` / `validateStep`). Seven condition-form subcomponents under `RuleEditor/condition-forms/` — including `AgentLifecycleForm.tsx` (multi-select event-type chips for the six-entry `AgentLifecycleEventType` allowlist + lookback-window input).
- `SilencesPage.tsx` — matcher-based create + end-early. Reads `?ruleId=` search param to prefill the Rule ID field (driven by InboxPage's "Silence rule… → Custom…" flow).
- `AlertRow.tsx` shared list row; `alerts-page.module.css` shared styling.
- **Components**:
- `NotificationBell.tsx` — polls `/alerts/unread-count` every 30 s (paused when tab hidden via TanStack Query `refetchIntervalInBackground: false`).
- `AlertStateChip.tsx`, `SeverityBadge.tsx` — shared state/severity indicators.
- `MustacheEditor/` — CodeMirror 6 editor with variable autocomplete + inline linter. Shared between rule title/message, webhook body/header overrides, and (future) Admin Outbound Connection editor (reduced-context mode for URL).
- `MustacheEditor/alert-variables.ts` — variable registry aligned with `NotificationContextBuilder.java`. Add new leaves here whenever the backend context grows.
- **API queries** under `ui/src/api/queries/`: `alerts.ts`, `alertRules.ts`, `alertSilences.ts`, `alertNotifications.ts`, `alertMeta.ts`. All env-scoped via `useSelectedEnv` from `alertMeta`.
- **CMD-K**: `buildAlertSearchData` in `LayoutShell.tsx` registers `alert` and `alertRule` result categories. Badges convey severity + state. Palette navigates directly to the deep-link path — no sidebar-reveal state for alerts.
- **Sidebar accordion**: entering `/alerts/*` collapses Applications + Admin + Starred (mirrors Admin accordion).
- **Top-nav**: `<NotificationBell />` is the first child of `<TopBar>`, sitting alongside `SearchTrigger` + status `ButtonGroup` + `TimeRangeDropdown` + `AutoRefreshToggle`.
## UI Styling
@@ -5,8 +5,20 @@ on:
branches: [main, 'feature/**', 'fix/**', 'feat/**']
tags-ignore:
- 'v*'
paths-ignore:
- '.planning/**'
- 'docs/**'
- '**/*.md'
- '.claude/**'
- 'AGENTS.md'
- 'CLAUDE.md'
pull_request:
branches: [main]
paths-ignore:
- '.planning/**'
- 'docs/**'
- '**/*.md'
- '.claude/**'
delete:
jobs:
@@ -45,11 +57,25 @@ jobs:
key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: ${{ runner.os }}-maven-
- name: Cache npm registry
uses: actions/cache@v4
with:
path: ~/.npm
key: ${{ runner.os }}-npm-${{ hashFiles('ui/package-lock.json') }}
restore-keys: ${{ runner.os }}-npm-
- name: Cache Vite build artifacts
uses: actions/cache@v4
with:
path: ui/node_modules/.vite
key: ${{ runner.os }}-vite-${{ hashFiles('ui/package-lock.json', 'ui/vite.config.ts') }}
restore-keys: ${{ runner.os }}-vite-
- name: Build UI
working-directory: ui
run: |
echo '//gitea.siegeln.net/api/packages/cameleer/npm/:_authToken=${REGISTRY_TOKEN}' >> .npmrc
npm ci
npm ci --prefer-offline --no-audit --fund=false
npm run build
env:
REGISTRY_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
.planning/it-triage-report.md — new file (120 lines)
@@ -0,0 +1,120 @@
# IT Triage Report — 2026-04-21
Branch: `main`, starting HEAD `90460705` (chore: refresh GitNexus index stats).
## Summary
- **Starting state**: 65 IT failures (46 F + 19 E) out of 555 tests on a clean build. Side-note: `target/classes` incremental-build staleness from the `90083f88` V1..V18 → V1 schema collapse makes the number look worse (every context load dies on `Flyway V2__claim_mapping.sql failed`). A fresh `mvn clean verify` gives the real 65.
- **Final state**: **12 failures across 3 test classes** (`AgentSseControllerIT`, `SseSigningIT`, `ClickHouseStatsStoreIT`). **53 failures closed across 14 test classes.**
- **11 commits landed on local `main`** (not pushed).
- No new env vars, endpoints, tables, or columns added. `V1__init.sql` untouched. No tests rewritten to pass-by-weakening — every assertion change is accompanied by a comment explaining the contract it now captures.
## Commits (in order)
| SHA | Test classes | What changed |
|---|---|---|
| `7436a37b` | AgentRegistrationControllerIT | environmentId, flat→env URL, heartbeat auto-heal, absolute sseEndpoint |
| `97a6b2e0` | AgentCommandControllerIT | environmentId, CommandGroupResponse new shape (200 w/ aggregate replies) |
| `e955302f` | BootstrapTokenIT / JwtRefreshIT / RegistrationSecurityIT / SseSigningIT / AgentSseControllerIT | environmentId in register bodies; AGENT-role smoke target; drop flaky iat-coupled assertion |
| `10e2b699` | SecurityFilterIT | env-scoped agent list URL |
| `9bda4d8f` | FlywayMigrationIT, ConfigEnvIsolationIT | decouple from shared Testcontainers Postgres state |
| `36571013` | (docs) | first version of this report |
| `dfacedb0` | DetailControllerIT | **Cluster B template**: ExecutionChunk envelope + REST-driven lookup |
| `87bada1f` | ExecutionControllerIT, MetricsControllerIT | Chunk payloads + REST flush-visibility probes |
| `a6e7458a` | DiagramControllerIT, DiagramRenderControllerIT | Env-scoped render + execution-detail-derived content hash for flat SVG path |
| `56844799` | SearchControllerIT | 10 seed payloads → ExecutionChunk; fix AGENT→VIEWER token on search GET |
| `d5adaaab` | DiagramLinkingIT, IngestionSchemaIT | REST for diagramContentHash + processor-tree/snapshot assertions |
| `8283d531` | ClickHouseChunkPipelineIT, ClickHouseExecutionReadIT | Replace removed `/clickhouse/V2_.sql` with consolidated init.sql; correct `iteration` vs `loopIndex` on seq-based tree path |
| `95f90f43` | ForwardCompatIT, ProtocolVersionIT, BackpressureIT | Chunk payload; fix wrong property-key prefix in BackpressureIT (+ MetricsFlushScheduler's separate `ingestion.flush-interval-ms` key) |
| `b55221e9` | SensitiveKeysAdminControllerIT | assert pushResult shape, not exact 0 (shared registry across ITs) |
## The single biggest insight
**`ExecutionController` (legacy PG path) is dead code.** It's `@ConditionalOnMissingBean(ChunkAccumulator.class)` and `ChunkAccumulator` is registered **unconditionally** in `StorageBeanConfig.java:92`, so `ExecutionController` never binds. Even if it did, `IngestionService.upsert` → `ClickHouseExecutionStore.upsert` throws `UnsupportedOperationException("ClickHouse writes use the chunked pipeline")` — the only `ExecutionStore` impl in `src/main/java` is ClickHouse, the Postgres variant lives in a planning doc only.
Practical consequences for every IT that was exercising `/api/v1/data/executions`:
1. `ChunkIngestionController` owns the URL and expects an `ExecutionChunk` envelope (`exchangeId`, `applicationId`, `instanceId`, `routeId`, `status`, `startTime`, `endTime`, `durationMs`, `chunkSeq`, `final`, `processors: FlatProcessorRecord[]`) — the legacy `RouteExecution` shape was being silently degraded to an empty/degenerate chunk. The envelope is restated as a record sketch after this list.
2. The test payload changes are accompanied by assertion changes that now go through REST endpoints instead of raw SQL against the (ClickHouse-resident) `executions` / `processor_executions` / `route_diagrams` / `agent_metrics` tables.
3. **Recommendation for cleanup**: remove `ExecutionController` + the `upsert` path in `IngestionService` + the stubbed `ClickHouseExecutionStore.upsert` throwers. Separate PR. Happy to file.
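
For reference, the envelope from point 1 as a record sketch — field types are inferred from the names, and `FlatProcessorRecord` is left as a placeholder:

```java
import com.fasterxml.jackson.annotation.JsonProperty;
import java.util.List;

record ExecutionChunk(
        String exchangeId,
        String applicationId,
        String instanceId,
        String routeId,
        String status,
        long startTime,
        long endTime,
        long durationMs,
        int chunkSeq,
        @JsonProperty("final") boolean isFinal, // "final" is a Java keyword, hence the rename
        List<FlatProcessorRecord> processors) {

    record FlatProcessorRecord(String processorId) {} // real fields not listed in this report
}
```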
## Cluster breakdown
**Cluster A — missing `environmentId` in register bodies (DONE)**
Root cause: `POST /api/v1/agents/register` now 400s without `environmentId`. Test payloads minted before this requirement. Fixed across all agent-registering ITs plus side-cleanups (flaky iat-coupled assertion in JwtRefreshIT, wrong RBAC target in can-access tests, absolute vs relative sseEndpoint).
**Cluster B — ingestion payload drift (DONE per user direction)**
All controller + storage ITs that posted `RouteExecution` JSON now post `ExecutionChunk` envelopes. All CH-side assertions now go through REST endpoints (`/api/v1/environments/{env}/executions` search + `/api/v1/executions/{id}` detail + `/agents/{id}/metrics` + `/apps/{app}/routes/{route}/diagram`). DiagramRenderControllerIT's SVG tests still need a content hash, so they read it off the execution-detail REST response rather than querying `route_diagrams`.
**Cluster C — flat URL drift (DONE)**
`/api/v1/agents` → `/api/v1/environments/{envSlug}/agents`. Two test classes touched.
**Cluster D — heartbeat auto-heal contract (DONE)**
`heartbeatUnknownAgent_returns404` renamed and asserts the 200 auto-heal path that `fb54f9cb` made the contract.
**Cluster E — individual drifts (DONE except three parked)**
| Test class | Status |
|---|---|
| FlywayMigrationIT | DONE (decouple from shared PG state) |
| ConfigEnvIsolationIT.findByEnvironment_excludesOtherEnvs | DONE (unique slug prefix) |
| ForwardCompatIT | DONE (chunk payload) |
| ProtocolVersionIT | DONE (chunk payload) |
| BackpressureIT | DONE (property-key prefix fix — see note below) |
| SensitiveKeysAdminControllerIT | DONE (assert shape not count) |
| ClickHouseChunkPipelineIT | DONE (consolidated init.sql) |
| ClickHouseExecutionReadIT | DONE (iteration vs loopIndex mapping) |
## PARKED — what you'll want to look at next
### 1. ClickHouseStatsStoreIT (8 failures) — timezone bug in production code
`ClickHouseStatsStore.buildStatsSql` uses `lit(Instant)` which formats as `'yyyy-MM-dd HH:mm:ss'` in UTC but with no timezone marker. ClickHouse parses that literal in the session timezone when comparing against the `DateTime`-typed `bucket` column in `stats_1m_*`. On a non-UTC CH host (e.g. CEST docker on a CEST laptop), the filter endpoint is off by the tz offset in hours and misses every row the MV bucketed.
I confirmed this by instrumenting the test: `toDateTime(bucket)` returned `12:00:00` for a row inserted with `start_time=10:00:00Z` (i.e. the stored UTC Unix timestamp but displayed in CEST), and the filter literal `'2026-03-31 10:05:00'` was being parsed as CEST → UTC 08:05 → excluded all rows.
**I didn't fix this** because the repair is in `src/main/java`, not the test. Two reasonable options:
- **Test-side**: pin the container TZ via `.withEnv("TZ", "UTC")` + include `use_time_zone=UTC` in the JDBC URL. I tried both; neither was sufficient on its own — the CH server reads `timezone` from its own config, not `$TZ`. Getting all three layers (container env, CH server config, JDBC driver) aligned needs dedicated effort.
- **Production-side (preferred)**: change `lit(Instant)` to `toDateTime('...', 'UTC')` or use the 3-arg `DateTime(3, 'UTC')` column type for `bucket`. That's a store change, and it would be caught by a matching unit test; a sketch of the helper follows this list.
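
A sketch of that production-side helper (name and placement illustrative):

```java
import java.time.Instant;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;

class UtcLiteralSketch {
    private static final DateTimeFormatter FMT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss").withZone(ZoneOffset.UTC);

    // Emit an explicitly-UTC ClickHouse literal so the session timezone is irrelevant.
    static String lit(Instant ts) {
        return "toDateTime('" + FMT.format(ts) + "', 'UTC')";
    }
}
```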
I did add the explicit `'default'` env to the seed `INSERT`s per your directive, but reverted it locally because the timezone bug swallowed the fix. The raw unchanged test is what's committed.
### 2. AgentSseControllerIT (3 failures) & SseSigningIT (1 failure) — SSE connection timing
All failing assertions are `awaitConnection(5000)` timeouts or `ConditionTimeoutException` on SSE stream observation. Not related to any spec drift I could identify — the SSE server is up (other tests in the same classes connect fine), and auth/JWT is accepted. Looks like a real race on either the SseConnectionManager registration or on the HTTP client's first-read flush. Needs a dedicated debug session with a minimal reproducer; not something I wanted to hack around with sleeps.
Specific tests:
- `AgentSseControllerIT.sseConnect_unknownAgent_returns404` — 5s `CompletableFuture.get` timeout on an HTTP GET that should return 404 synchronously. Suggests the client is waiting on body data that never arrives (SSE stream opens even on 404?).
- `AgentSseControllerIT.lastEventIdHeader_connectionSucceeds` — `stream.awaitConnection(5000)` false.
- `AgentSseControllerIT.pingKeepalive_receivedViaSseStream` — waits for an event line in the stream snapshot, never sees it.
- `SseSigningIT.deepTraceEvent_containsValidSignature` — same pattern.
The sibling test (`SseSigningIT.configUpdateEvent_containsValidEd25519Signature`) passes in isolation, which strongly suggests order-dependent flakiness rather than a protocol break.
## Final verify command
```bash
mvn -pl cameleer-server-app -am -Dit.test='!SchemaBootstrapIT' -Dtest='!*' -DfailIfNoTests=false -Dsurefire.failIfNoSpecifiedTests=false verify
```
Reports land in `cameleer-server-app/target/failsafe-reports/`. Expect **12 failures** in the three classes above. Everything else is green.
## Side notes worth flagging
- **Property-key inconsistency in the main code** — surfaced via BackpressureIT. `IngestionConfig` is bound under `cameleer.server.ingestion.*`, but `MetricsFlushScheduler.@Scheduled` reads `ingestion.flush-interval-ms` (no prefix, hyphenated). In production this means the flush-interval in `application.yml` isn't actually being honoured by the metrics flush — it stays at the 1s fallback. Separate cleanup.
- **Shared Testcontainers PG across IT classes** — several of the "cross-test state" fixes (FlywayMigrationIT, ConfigEnvIsolationIT, SensitiveKeysAdminControllerIT) are symptoms of one underlying issue: `AbstractPostgresIT` uses a singleton PG container, and nothing cleans between test classes. Could do with a global `@Sql("/test-reset.sql")` on `@BeforeAll`, but out of scope here.
- **Agent registry shared across ITs** — same class of issue. Doesn't bite until a test explicitly inspects registry membership (SensitiveKeys `pushResult.total`).
## Follow-up (2026-04-22) — 12 parked failures closed
All three parked clusters now green. 560/560 tests passing.
- **ClickHouseStatsStoreIT (8 failures)** — fixed in `a9a6b465`. Two-layer TZ fix: JVM default TZ pinned to UTC in `CameleerServerApplication.main()` (the ClickHouse JDBC 0.9.7 driver formats `java.sql.Timestamp` via `Timestamp.toString()`, which uses JVM default TZ — a CEST JVM shipping to a UTC CH server stored off-by-offset Unix timestamps), plus column-level `bucket DateTime('UTC')` on all `stats_1m_*` tables with explicit `toDateTime(..., 'UTC')` casts in MV projections and `ClickHouseStatsStore.lit(Instant)` as defence in depth.
- **MetricsFlushScheduler property-key drift** — fixed in `a6944911`. Scheduler now reads `${cameleer.server.ingestion.flush-interval-ms:1000}` (the SpEL-via-`@ingestionConfig` approach doesn't work because `@EnableConfigurationProperties` uses a compound bean name). BackpressureIT workaround property removed.
- **SSE flakiness (4 failures, `AgentSseControllerIT` + `SseSigningIT`)** — fixed in `41df042e`. Triage's "order-dependent flakiness" theory was wrong — all four reproduced in isolation. Three root causes: (a) `AgentSseController.events` auto-heal was over-permissive (spoofing vector), fixed with JWT-subject-equals-path-id check; (b) `SseConnectionManager.pingAll` read an unprefixed property key (`agent-registry.ping-interval-ms`), same family of bug as (a6944911); (c) SSE response headers didn't flush until the first `emitter.send()`, so `awaitConnection(5s)` assertions timed out under the 15s ping cadence — fixed by sending an initial `: connected` comment on `connect()`. Full diagnosis in `.planning/sse-flakiness-diagnosis.md`.
Plus the two prod-code cleanups from the ExecutionController-removal follow-ons:
- **Dead `SearchIndexer` subsystem** — removed in `98cbf8f3`. `ExecutionUpdatedEvent` had no publisher after `0f635576`, so the whole indexer + stats + `/admin/clickhouse/pipeline` endpoint + UI pipeline card carried zero signal.
- **Unused `TaggedExecution` record** — removed in `06c6f53b`.
Final verify: `mvn -pl cameleer-server-app -am -Dit.test='!SchemaBootstrapIT' ... verify` → **Tests run: 560, Failures: 0, Errors: 0, Skipped: 0**.

.planning/sse-flakiness-diagnosis.md — new file (81 lines)
@@ -0,0 +1,81 @@
# SSE Flakiness — Root-Cause Analysis
**Date:** 2026-04-21
**Tests:** `AgentSseControllerIT.sseConnect_unknownAgent_returns404`, `.lastEventIdHeader_connectionSucceeds`, `.pingKeepalive_receivedViaSseStream`, `SseSigningIT.deepTraceEvent_containsValidSignature`
## Summary
Not order-dependent flakiness (the triage report was wrong). Three distinct root causes — two production bugs (one of them a security hole) and one test-timing dependency — all reproducible when running the classes in isolation.
## Reproduction
```bash
mvn -pl cameleer-server-app -am -Dit.test='AgentSseControllerIT' -Dtest='!*' \
-DfailIfNoTests=false -Dsurefire.failIfNoSpecifiedTests=false verify
```
Result: 3 failures out of 7 tests with a cold CH container. Not order-dependent.
## Root causes
### 1. `AgentSseController.events` auto-heal is over-permissive (security bug)
**File:** `cameleer-server-app/src/main/java/com/cameleer/server/app/controller/AgentSseController.java:63-76`
```java
AgentInfo agent = registryService.findById(id);
if (agent == null) {
var jwtResult = ...;
if (jwtResult != null) { // ← only checks JWT presence
registryService.register(id, id, application, env, ...);
} else {
throw 404;
}
}
```
**Bug:** auto-heal registers *any* path id when any valid JWT is present, regardless of whether the JWT subject matches the path id. A holder of agent X's JWT can open SSE for any path-id Y, silently spoofing Y.
**Surface symptom:** `sseConnect_unknownAgent_returns404` sends a JWT for `test-agent-sse-it` and requests SSE for `unknown-sse-agent`. Auto-heal kicks in, returns 200 with an infinite empty stream. Test's `statusFuture.get(5s)` — which uses `BodyHandlers.ofString()` and waits for the full body — times out instead of getting a synchronous 404.
**Fix:** only auto-heal when `jwtResult.subject().equals(id)`.
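
Spelled out against the pseudocode above (same illustrative names, still not the compiled controller code):

```java
// Auto-heal only when the JWT subject matches the path id.
AgentInfo agent = registryService.findById(id);
if (agent == null) {
    if (jwtResult != null && jwtResult.subject().equals(id)) {
        registryService.register(id, id, application, env /* , ... */);
    } else {
        throw new ResponseStatusException(HttpStatus.NOT_FOUND);
    }
}
```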
### 2. `SseConnectionManager.pingAll` reads an unprefixed property key (production bug)
**File:** `cameleer-server-app/src/main/java/com/cameleer/server/app/agent/SseConnectionManager.java:172`
```java
@Scheduled(fixedDelayString = "${agent-registry.ping-interval-ms:15000}")
```
**Bug:** `AgentRegistryConfig` is `@ConfigurationProperties(prefix = "cameleer.server.agentregistry")`. The scheduler reads an unprefixed `agent-registry.*` key that the YAML never defines — so the default 15s always applies, regardless of config. Same family of bug as the `MetricsFlushScheduler` fix in commit `a6944911`.
**Fix:** `${cameleer.server.agentregistry.ping-interval-ms:15000}`.
### 3. SSE response body doesn't flush until first event (test timing dependency)
**File:** `cameleer-server-app/src/main/java/com/cameleer/server/app/agent/SseConnectionManager.java:connect()`
Spring's `SseEmitter` holds the response open but doesn't flush headers to the client until the first `emitter.send()`. Until then, clients using `HttpResponse.BodyHandlers.ofInputStream()` block on the first byte.
**Surface symptom:**
- `lastEventIdHeader_connectionSucceeds` — asserts `awaitConnection(5000)` is `true`. The latch counts down in `.thenAccept(response -> ...)`, which in practice only fires once body bytes start flowing (JDK 21 behaviour with SSE streams). Default ping cadence is 15s → 5s assertion times out.
- `pingKeepalive_receivedViaSseStream` — waits 5s for a `:ping` line. The scheduler runs every 15s (both by default, and because of bug #2, unconditionally).
- `SseSigningIT.deepTraceEvent_containsValidSignature` — same family: `awaitConnection(5000).isTrue()`.
**Fix:** send an initial `: connected` comment as part of `connect()`. Spring flushes on the first `.send()`, so an immediate comment forces the response headers + first byte to hit the wire, which triggers the client's `thenAccept` callback. Also solves the ping-test: the initial comment is observed as a keepalive line within the test's polling window.
## Hypothesis ladder (ruled out)
- **Order-dependent singleton leak** — ruled out: every failure reproduces when the class is run solo.
- **Tomcat async thread pool exhaustion** — ruled out: `SseEmitter(Long.MAX_VALUE)` does hold threads, but the 7-test class comes nowhere near exhausting Tomcat's default pool.
- **SseConnectionManager emitter-map contamination** — ruled out: each test uses a unique agent id (UUID-suffixed), and the `@Component` is the same instance across tests but the emitter map is keyed by agent id, no collisions.
## Verification
```
mvn -pl cameleer-server-app -am -Dit.test='AgentSseControllerIT,SseSigningIT' ... verify
# Tests run: 9, Failures: 0, Errors: 0, Skipped: 0
```
All 9 tests green with the three fixes applied.

AGENTS.md — 12 lines changed
@@ -1,7 +1,7 @@
<!-- gitnexus:start -->
# GitNexus — Code Intelligence
This project is indexed by GitNexus as **alerting-02** (7810 symbols, 20082 relationships, 300 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
This project is indexed by GitNexus as **cameleer-server** (9731 symbols, 24987 relationships, 300 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
> If any GitNexus tool warns the index is stale, run `npx gitnexus analyze` in terminal first.
@@ -17,7 +17,7 @@ This project is indexed by GitNexus as **alerting-02** (7810 symbols, 20082 rela
1. `gitnexus_query({query: "<error or symptom>"})` — find execution flows related to the issue
2. `gitnexus_context({name: "<suspect function>"})` — see all callers, callees, and process participation
3. `READ gitnexus://repo/alerting-02/process/{processName}` — trace the full execution flow step by step
3. `READ gitnexus://repo/cameleer-server/process/{processName}` — trace the full execution flow step by step
4. For regressions: `gitnexus_detect_changes({scope: "compare", base_ref: "main"})` — see what your branch changed
## When Refactoring
@@ -56,10 +56,10 @@ This project is indexed by GitNexus as **alerting-02** (7810 symbols, 20082 rela
| Resource | Use for |
|----------|---------|
| `gitnexus://repo/alerting-02/context` | Codebase overview, check index freshness |
| `gitnexus://repo/alerting-02/clusters` | All functional areas |
| `gitnexus://repo/alerting-02/processes` | All execution flows |
| `gitnexus://repo/alerting-02/process/{name}` | Step-by-step execution trace |
| `gitnexus://repo/cameleer-server/context` | Codebase overview, check index freshness |
| `gitnexus://repo/cameleer-server/clusters` | All functional areas |
| `gitnexus://repo/cameleer-server/processes` | All execution flows |
| `gitnexus://repo/cameleer-server/process/{name}` | Step-by-step execution trace |
## Self-Check Before Finishing
CLAUDE.md — 39 lines changed
@@ -22,8 +22,19 @@ Cameleer Server — observability server that receives, stores, and serves Camel
```bash
mvn clean compile # Compile all modules
mvn clean verify # Full build with tests
mvn clean verify -DskipITs # Fast: unit tests only (no Testcontainers)
```
### Faster local builds
- **Surefire reuses forks** (`cameleer-server-app/pom.xml`): unit tests run with `forkCount=1C` + `reuseForks=true` — one JVM per CPU core, reused across classes. Test classes that mutate static state must clean up after themselves.
- **Testcontainers reuse** — opt-in per developer. Add to `~/.testcontainers.properties`:
```
testcontainers.reuse.enable=true
```
Then `AbstractPostgresIT` containers persist across `mvn verify` runs (saves ~20s per run). Stop them manually when you need a clean DB: `docker rm -f $(docker ps -aq --filter label=org.testcontainers.reuse=true)`.
- **UI build** dropped redundant `tsc --noEmit` from `npm run build` (Vite/esbuild type-checks during bundling). Run `npm run typecheck` explicitly when you want a full type-check pass.
## Run
```bash
@@ -51,25 +62,13 @@ java -jar cameleer-server-app/target/cameleer-server-app-1.0-SNAPSHOT.jar
- OIDC: Optional external identity provider support (token exchange pattern). Configured via admin API/UI, stored in database (`server_config` table). Resource server mode: accepts external access tokens (Logto M2M) via JWKS validation when `CAMELEER_SERVER_SECURITY_OIDCISSUERURI` is set. Scope-based role mapping via `SystemRole.normalizeScope()`. System roles synced on every OIDC login via `applyClaimMappings()` in `OidcAuthController` (calls `clearManagedAssignments` + `assignManagedRole` on `RbacService`) — always overwrites managed role assignments; uses managed assignment origin to avoid touching group-inherited or directly-assigned roles. Supports ES384, ES256, RS256.
- OIDC role extraction: `OidcTokenExchanger` reads roles from the **access_token** first (JWT with `at+jwt` type), then falls back to id_token. `OidcConfig` includes `audience` (RFC 8707 resource indicator) and `additionalScopes`. All provider-specific configuration is external — no provider-specific code in the server.
- Sensitive keys: Global enforced baseline for masking sensitive data in agent payloads. Merge rule: `final = global UNION per-app` (case-insensitive dedup, per-app can only add, never remove global keys).
- User persistence: PostgreSQL `users` table, admin CRUD at `/api/v1/admin/users`
- User persistence: PostgreSQL `users` table, admin CRUD at `/api/v1/admin/users`. `users.user_id` is the **bare** identifier — local users as `<username>`, OIDC users as `oidc:<sub>`. JWT `sub` carries the `user:` namespace prefix so `JwtAuthenticationFilter` can tell user tokens from agent tokens; write paths (`UiAuthController`, `OidcAuthController`, `UserAdminController`) all upsert unprefixed, and env-scoped read-path controllers strip the `user:` prefix before using the value as an FK to `users.user_id` / `user_roles.user_id`. Alerting / outbound FKs (`alert_rules.created_by`, `outbound_connections.created_by`, …) therefore all reference the bare form.
- Usage analytics: ClickHouse `usage_events` table tracks authenticated UI requests, flushed every 5s
## Database Migrations
PostgreSQL (Flyway): `cameleer-server-app/src/main/resources/db/migration/`
- V1 — RBAC (users, roles, groups, audit_log). `application_config` PK is `(application, environment)`; `app_settings` PK is `(application_id, environment)` — both tables are env-scoped.
- V2 — Claim mappings (OIDC)
- V3 — Runtime management (apps, environments, deployments, app_versions)
- V4 — Environment config (default_container_config JSONB)
- V5 — App container config (container_config JSONB on apps)
- V6 — JAR retention policy (jar_retention_count on environments)
- V7 — Deployment orchestration (target_state, deployment_strategy, replica_states JSONB, deploy_stage)
- V8 — Deployment active config (resolved_config JSONB on deployments)
- V9 — Password hardening (failed_login_attempts, locked_until, token_revoked_before on users)
- V10 — Runtime type detection (detected_runtime_type, detected_main_class on app_versions)
- V11 — Outbound connections (outbound_connections table, enums)
- V12 — Alerting tables (alert_rules, alert_rule_targets, alert_instances, alert_notifications, alert_reads, alert_silences)
- V13 — alert_instances open-rule unique index (alert_instances_open_rule_uq partial index on rule_id WHERE state IN PENDING/FIRING/ACKNOWLEDGED)
- V1 — Consolidated baseline schema. All prior V1–V18 evolution was collapsed before first prod deploy. Contains: RBAC (users, roles, groups, user_roles, user_groups, group_roles, claim_mapping_rules), runtime management (environments, apps, app_versions, deployments), env-scoped application config (application_config PK `(application, environment)`, app_settings PK `(application_id, environment)`), audit_log, outbound_connections, server_config, and the full alerting subsystem (alert_rules, alert_rule_targets, alert_instances, alert_silences, alert_notifications). Seeds the 4 system roles (AGENT/VIEWER/OPERATOR/ADMIN), the `Admins` group with ADMIN role, and a default environment. Invariants covered by `SchemaBootstrapIT`.
ClickHouse: `cameleer-server-app/src/main/resources/clickhouse/init.sql` (run idempotently on startup)
@@ -97,7 +96,7 @@ When adding, removing, or renaming classes, controllers, endpoints, UI component
<!-- gitnexus:start -->
# GitNexus — Code Intelligence
This project is indexed by GitNexus as **alerting-02** (7810 symbols, 20082 relationships, 300 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
This project is indexed by GitNexus as **cameleer-server** (9731 symbols, 24987 relationships, 300 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
> If any GitNexus tool warns the index is stale, run `npx gitnexus analyze` in terminal first.
@@ -113,7 +112,7 @@ This project is indexed by GitNexus as **alerting-02** (7810 symbols, 20082 rela
1. `gitnexus_query({query: "<error or symptom>"})` — find execution flows related to the issue
2. `gitnexus_context({name: "<suspect function>"})` — see all callers, callees, and process participation
3. `READ gitnexus://repo/alerting-02/process/{processName}` — trace the full execution flow step by step
3. `READ gitnexus://repo/cameleer-server/process/{processName}` — trace the full execution flow step by step
4. For regressions: `gitnexus_detect_changes({scope: "compare", base_ref: "main"})` — see what your branch changed
## When Refactoring
@@ -152,10 +151,10 @@ This project is indexed by GitNexus as **alerting-02** (7810 symbols, 20082 rela
| Resource | Use for |
|----------|---------|
| `gitnexus://repo/alerting-02/context` | Codebase overview, check index freshness |
| `gitnexus://repo/alerting-02/clusters` | All functional areas |
| `gitnexus://repo/alerting-02/processes` | All execution flows |
| `gitnexus://repo/alerting-02/process/{name}` | Step-by-step execution trace |
| `gitnexus://repo/cameleer-server/context` | Codebase overview, check index freshness |
| `gitnexus://repo/cameleer-server/clusters` | All functional areas |
| `gitnexus://repo/cameleer-server/processes` | All execution flows |
| `gitnexus://repo/cameleer-server/process/{name}` | Step-by-step execution trace |
## Self-Check Before Finishing
Dockerfile — 12 lines changed
@@ -1,10 +1,14 @@
FROM --platform=$BUILDPLATFORM maven:3.9-eclipse-temurin-17 AS build
WORKDIR /build
# Configure Gitea Maven Registry for cameleer-common dependency
ARG REGISTRY_TOKEN
RUN mkdir -p ~/.m2 && \
echo '<settings><servers><server><id>gitea</id><username>cameleer</username><password>'${REGISTRY_TOKEN}'</password></server></servers></settings>' > ~/.m2/settings.xml
# Optional auth for Gitea Maven Registry. The `cameleer/cameleer-common` package
# is published publicly, so empty token → anonymous pull (no settings.xml).
# Private packages require a non-empty token.
ARG REGISTRY_TOKEN=""
RUN if [ -n "$REGISTRY_TOKEN" ]; then \
mkdir -p ~/.m2 && \
printf '<settings><servers><server><id>gitea</id><username>cameleer</username><password>%s</password></server></servers></settings>\n' "$REGISTRY_TOKEN" > ~/.m2/settings.xml; \
fi
COPY pom.xml .
COPY cameleer-server-core/pom.xml cameleer-server-core/
HOWTO.md — 94 lines changed
@@ -19,38 +19,99 @@ mvn clean compile # compile only
mvn clean verify # compile + run all tests (needs Docker for integration tests)
```
## Infrastructure Setup
## Start a brand-new local environment (Docker)
Start PostgreSQL:
The repo ships a `docker-compose.yml` with the full stack: PostgreSQL, ClickHouse, the Spring Boot server, and the nginx-served SPA. All dev defaults are baked into the compose file — no `.env` file or extra config needed for a first run.
```bash
# 1. Clean slate (safe if this is already a first run — noop when no volumes exist)
docker compose down -v
# 2. Build + start everything. First run rebuilds both images (~2–4 min).
docker compose up -d --build
# 3. Watch the server come up (health check goes green in ~60–90s after Flyway + ClickHouse init)
docker compose logs -f cameleer-server
# ready when you see "Started CameleerServerApplication in ...".
# Ctrl+C when ready — containers keep running.
# 4. Smoke test
curl -s http://localhost:8081/api/v1/health # → {"status":"UP"}
```
Open the UI at **http://localhost:8080** (nginx) and log in with **admin / admin**.
| Service | Host port | URL / notes |
|------------|-----------|-------------|
| Web UI (nginx) | 8080 | http://localhost:8080 — proxies `/api` to the server |
| Server API | 8081 | http://localhost:8081/api/v1/health, http://localhost:8081/api/v1/swagger-ui.html |
| PostgreSQL | 5432 | user `cameleer`, password `cameleer_dev`, db `cameleer` |
| ClickHouse | 8123 (HTTP), 9000 (native) | user `default`, no password, db `cameleer` |
**Dev credentials baked into compose (do not use in production):**
| Purpose | Value |
|---|---|
| UI login | `admin` / `admin` |
| Bootstrap token (agent registration) | `dev-bootstrap-token-for-local-agent-registration` |
| JWT secret | `dev-jwt-secret-32-bytes-min-0123456789abcdef0123456789abcdef` |
| `CAMELEER_SERVER_RUNTIME_ENABLED` | `false` (Docker-in-Docker app orchestration off for the local stack) |
Override any of these by editing `docker-compose.yml` or passing `-e KEY=value` to `docker compose run`.
### Common lifecycle commands
```bash
# Stop everything but keep volumes (quick restart later)
docker compose stop
# Start again after a stop
docker compose start
# Apply changes to the server code / UI — rebuild just what changed
docker compose up -d --build cameleer-server
docker compose up -d --build cameleer-ui
# Wipe the environment completely (drops PG + ClickHouse volumes — all data gone)
docker compose down -v
# Fresh Flyway run by dropping just the PG volume (keeps ClickHouse data)
docker compose down
docker volume rm cameleer-server_cameleer-pgdata
docker compose up -d
```
This starts PostgreSQL 16. The database schema is applied automatically via Flyway migrations on server startup. ClickHouse tables are created by the schema initializer on startup.
### Infra-only mode (backend via `mvn` / UI via Vite)
| Service | Port | Purpose |
|------------|------|----------------------|
| PostgreSQL | 5432 | JDBC (Spring JDBC) |
PostgreSQL credentials: `cameleer` / `cameleer_dev`, database `cameleer`.
## Run the Server
If you want to iterate on backend/UI code without rebuilding the server image on every change, start just the databases and run the server + UI locally:
```bash
# 1. Only infra containers
docker compose up -d cameleer-postgres cameleer-clickhouse
# 2. Build and run the server jar against those containers
mvn clean package -DskipTests
SPRING_DATASOURCE_URL=jdbc:postgresql://localhost:5432/cameleer \
SPRING_DATASOURCE_URL="jdbc:postgresql://localhost:5432/cameleer?currentSchema=tenant_default&ApplicationName=tenant_default" \
SPRING_DATASOURCE_USERNAME=cameleer \
SPRING_DATASOURCE_PASSWORD=cameleer_dev \
CAMELEER_SERVER_SECURITY_BOOTSTRAPTOKEN=my-secret-token \
SPRING_FLYWAY_USER=cameleer \
SPRING_FLYWAY_PASSWORD=cameleer_dev \
CAMELEER_SERVER_CLICKHOUSE_URL="jdbc:clickhouse://localhost:8123/cameleer" \
CAMELEER_SERVER_CLICKHOUSE_USERNAME=default \
CAMELEER_SERVER_CLICKHOUSE_PASSWORD= \
CAMELEER_SERVER_SECURITY_BOOTSTRAPTOKEN=dev-bootstrap-token-for-local-agent-registration \
CAMELEER_SERVER_SECURITY_JWTSECRET=dev-jwt-secret-32-bytes-min-0123456789abcdef0123456789abcdef \
CAMELEER_SERVER_RUNTIME_ENABLED=false \
CAMELEER_SERVER_TENANT_ID=default \
java -jar cameleer-server-app/target/cameleer-server-app-1.0-SNAPSHOT.jar
# 3. In another terminal — Vite dev server on :5173 (proxies /api → :8081)
cd ui && npm install && npm run dev
```
> **Note:** The Docker image no longer includes default database credentials. When running via `docker run`, pass `-e SPRING_DATASOURCE_URL=...` etc. The docker-compose setup provides these automatically.
Database schema is applied automatically: PostgreSQL via Flyway migrations on server startup, ClickHouse tables via `ClickHouseSchemaInitializer`. No manual DDL needed.
The server starts on **port 8081**. The `CAMELEER_SERVER_SECURITY_BOOTSTRAPTOKEN` environment variable is **required** — the server fails fast on startup if it is not set.
For token rotation without downtime, set `CAMELEER_SERVER_SECURITY_BOOTSTRAPTOKENPREVIOUS` to the old token while rolling out the new one. The server accepts both during the overlap window.
`CAMELEER_SERVER_SECURITY_BOOTSTRAPTOKEN` is **required** for agent registration — the server fails fast on startup if it's not set. For token rotation without downtime, set `CAMELEER_SERVER_SECURITY_BOOTSTRAPTOKENPREVIOUS` to the old token while rolling out the new one — the server accepts both during the overlap window.
## API Endpoints
@@ -438,6 +499,7 @@ Key settings in `cameleer-server-app/src/main/resources/application.yml`. All cu
| `cameleer.server.runtime.routingmode` | `path` | `CAMELEER_SERVER_RUNTIME_ROUTINGMODE` | `path` or `subdomain` Traefik routing |
| `cameleer.server.runtime.routingdomain` | `localhost` | `CAMELEER_SERVER_RUNTIME_ROUTINGDOMAIN` | Domain for Traefik routing labels |
| `cameleer.server.runtime.serverurl` | *(empty)* | `CAMELEER_SERVER_RUNTIME_SERVERURL` | Server URL injected into app containers |
| `cameleer.server.runtime.certresolver` | *(empty)* | `CAMELEER_SERVER_RUNTIME_CERTRESOLVER` | Traefik TLS cert resolver name (e.g. `letsencrypt`). Blank = omit the `tls.certresolver` label and let Traefik serve the default TLS-store cert |
| `cameleer.server.runtime.agenthealthport` | `9464` | `CAMELEER_SERVER_RUNTIME_AGENTHEALTHPORT` | Agent health check port |
| `cameleer.server.runtime.healthchecktimeout` | `60` | `CAMELEER_SERVER_RUNTIME_HEALTHCHECKTIMEOUT` | Health check timeout (seconds) |
| `cameleer.server.runtime.container.memorylimit` | `512m` | `CAMELEER_SERVER_RUNTIME_CONTAINER_MEMORYLIMIT` | Default memory limit for app containers |
@@ -189,8 +189,8 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<forkCount>1</forkCount>
<reuseForks>false</reuseForks>
<forkCount>1C</forkCount>
<reuseForks>true</reuseForks>
</configuration>
</plugin>
<plugin>
@@ -8,6 +8,8 @@ import org.springframework.boot.context.properties.EnableConfigurationProperties
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.annotation.EnableScheduling;
import java.util.TimeZone;
/**
* Main entry point for the Cameleer Server application.
* <p>
@@ -23,6 +25,11 @@ import org.springframework.scheduling.annotation.EnableScheduling;
public class CameleerServerApplication {
public static void main(String[] args) {
// Pin JVM default TZ to UTC. The ClickHouse JDBC driver formats
// java.sql.Timestamp via toString() which uses JVM default TZ; a
// non-UTC JVM would then send CH timestamps off by the TZ offset.
// Standard practice for observability servers.
|
||||
TimeZone.setDefault(TimeZone.getTimeZone("UTC"));
|
||||
SpringApplication.run(CameleerServerApplication.class, args);
|
||||
}
|
||||
}
|
||||
|
||||
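
The failure mode that comment describes is easy to reproduce; a standalone sketch (not from this repo) showing that `java.sql.Timestamp.toString()` renders in the JVM default zone:

```java
import java.sql.Timestamp;
import java.util.TimeZone;

public class TimestampTzDemo {
    public static void main(String[] args) {
        Timestamp epoch = new Timestamp(0L); // 1970-01-01T00:00:00Z, always the same instant

        TimeZone.setDefault(TimeZone.getTimeZone("UTC"));
        System.out.println(epoch); // 1970-01-01 00:00:00.0

        TimeZone.setDefault(TimeZone.getTimeZone("America/New_York"));
        System.out.println(epoch); // 1969-12-31 19:00:00.0, same instant but shifted text
    }
}
```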
```diff
@@ -70,7 +70,7 @@ public class AgentLifecycleMonitor {
     private String mapTransitionEvent(AgentState from, AgentState to) {
         if (from == AgentState.LIVE && to == AgentState.STALE) return "WENT_STALE";
         if (from == AgentState.STALE && to == AgentState.DEAD) return "WENT_DEAD";
-        if (from == AgentState.STALE && to == AgentState.LIVE) return "RECOVERED";
+        if (to == AgentState.LIVE && (from == AgentState.STALE || from == AgentState.DEAD)) return "RECOVERED";
         return null;
     }
 }
```
```diff
@@ -80,6 +80,17 @@ public class SseConnectionManager implements AgentEventListener {
             log.debug("SSE connection error for agent {}: {}", agentId, ex.getMessage());
         });
 
+        // Send an initial keepalive comment so Spring flushes the response
+        // headers immediately. Without this, clients blocking on the first
+        // body byte can hang for a full ping interval before observing the
+        // connection — surface symptom in ITs that assert awaitConnection().
+        try {
+            emitter.send(SseEmitter.event().comment("connected"));
+        } catch (IOException e) {
+            log.debug("Initial keepalive failed for agent {}: {}", agentId, e.getMessage());
+            emitters.remove(agentId, emitter);
+        }
+
         log.info("SSE connection established for agent {}", agentId);
 
         return emitter;
@@ -169,7 +180,7 @@ public class SseConnectionManager implements AgentEventListener {
     /**
      * Scheduled ping keepalive to all connected agents.
      */
-    @Scheduled(fixedDelayString = "${agent-registry.ping-interval-ms:15000}")
+    @Scheduled(fixedDelayString = "${cameleer.server.agentregistry.ping-interval-ms:15000}")
    void pingAll() {
        if (!emitters.isEmpty()) {
            sendPingToAll();
```
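
With the initial comment in place, an SSE client sees bytes as soon as it connects. A quick manual check (the stream path here is hypothetical; substitute the actual SSE endpoint):

```bash
# -N disables curl's output buffering so the comment shows the moment it arrives
curl -N -H 'Accept: text/event-stream' http://localhost:8081/api/v1/agents/agent-1/stream
# :connected        <- initial keepalive comment, flushed together with the headers
```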
```diff
@@ -3,7 +3,10 @@ package com.cameleer.server.app.alerting.config;
 import com.cameleer.server.app.alerting.eval.PerKindCircuitBreaker;
 import com.cameleer.server.app.alerting.metrics.AlertingMetrics;
 import com.cameleer.server.app.alerting.storage.*;
-import com.cameleer.server.core.alerting.*;
+import com.cameleer.server.core.alerting.AlertInstanceRepository;
+import com.cameleer.server.core.alerting.AlertNotificationRepository;
+import com.cameleer.server.core.alerting.AlertRuleRepository;
+import com.cameleer.server.core.alerting.AlertSilenceRepository;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -41,11 +44,6 @@ public class AlertingBeanConfig {
         return new PostgresAlertNotificationRepository(jdbc, om);
     }
 
-    @Bean
-    public AlertReadRepository alertReadRepository(JdbcTemplate jdbc) {
-        return new PostgresAlertReadRepository(jdbc);
-    }
-
     @Bean
     public Clock alertingClock() {
         return Clock.systemDefaultZone();
```
```diff
@@ -16,7 +16,8 @@ public record AlertingProperties(
         Integer eventRetentionDays,
         Integer notificationRetentionDays,
         Integer webhookTimeoutMs,
-        Integer webhookMaxAttempts) {
+        Integer webhookMaxAttempts,
+        Integer perExchangeDeployBacklogCapSeconds) {
 
     public int effectiveEvaluatorTickIntervalMs() {
         int raw = evaluatorTickIntervalMs == null ? 5000 : evaluatorTickIntervalMs;
@@ -70,4 +71,9 @@ public record AlertingProperties(
     public int cbCooldownSeconds() {
         return circuitBreakerCooldownSeconds == null ? 60 : circuitBreakerCooldownSeconds;
     }
+
+    public int effectivePerExchangeDeployBacklogCapSeconds() {
+        // Default 24 h. Zero or negative = disabled (no clamp — first-run uses rule.createdAt as today).
+        return perExchangeDeployBacklogCapSeconds == null ? 86_400 : perExchangeDeployBacklogCapSeconds;
+    }
 }
```
```diff
@@ -1,19 +1,22 @@
 package com.cameleer.server.app.alerting.controller;
 
 import com.cameleer.server.app.alerting.dto.AlertDto;
-import com.cameleer.server.app.alerting.dto.BulkReadRequest;
+import com.cameleer.server.app.alerting.dto.BulkIdsRequest;
 import com.cameleer.server.app.alerting.dto.UnreadCountResponse;
 import com.cameleer.server.app.alerting.notify.InAppInboxQuery;
 import com.cameleer.server.app.web.EnvPath;
 import com.cameleer.server.core.alerting.AlertInstance;
 import com.cameleer.server.core.alerting.AlertInstanceRepository;
-import com.cameleer.server.core.alerting.AlertReadRepository;
+import com.cameleer.server.core.alerting.AlertSeverity;
+import com.cameleer.server.core.alerting.AlertState;
 import com.cameleer.server.core.runtime.Environment;
 import io.swagger.v3.oas.annotations.tags.Tag;
 import jakarta.validation.Valid;
 import org.springframework.http.HttpStatus;
+import org.springframework.http.ResponseEntity;
 import org.springframework.security.access.prepost.PreAuthorize;
 import org.springframework.security.core.context.SecurityContextHolder;
+import org.springframework.web.bind.annotation.DeleteMapping;
 import org.springframework.web.bind.annotation.GetMapping;
 import org.springframework.web.bind.annotation.PathVariable;
 import org.springframework.web.bind.annotation.PostMapping;
@@ -29,7 +32,7 @@ import java.util.UUID;
 
 /**
  * REST controller for the in-app alert inbox (env-scoped).
- * VIEWER+ can read their own inbox; OPERATOR+ can ack any alert.
+ * VIEWER+ can read their own inbox; OPERATOR+ can soft-delete and restore alerts.
  */
 @RestController
 @RequestMapping("/api/v1/environments/{envSlug}/alerts")
@@ -37,46 +40,43 @@ import java.util.UUID.
 @PreAuthorize("hasAnyRole('VIEWER','OPERATOR','ADMIN')")
 public class AlertController {
 
     private static final int DEFAULT_LIMIT = 50;
 
     private final InAppInboxQuery inboxQuery;
     private final AlertInstanceRepository instanceRepo;
-    private final AlertReadRepository readRepo;
 
     public AlertController(InAppInboxQuery inboxQuery,
-                           AlertInstanceRepository instanceRepo,
-                           AlertReadRepository readRepo) {
+                           AlertInstanceRepository instanceRepo) {
         this.inboxQuery = inboxQuery;
         this.instanceRepo = instanceRepo;
-        this.readRepo = readRepo;
     }
 
     @GetMapping
     public List<AlertDto> list(
             @EnvPath Environment env,
-            @RequestParam(defaultValue = "50") int limit) {
+            @RequestParam(defaultValue = "50") int limit,
+            @RequestParam(required = false) List<AlertState> state,
+            @RequestParam(required = false) List<AlertSeverity> severity,
+            @RequestParam(required = false) Boolean acked,
+            @RequestParam(required = false) Boolean read) {
         String userId = currentUserId();
         int effectiveLimit = Math.min(limit, 200);
-        return inboxQuery.listInbox(env.id(), userId, effectiveLimit)
+        return inboxQuery.listInbox(env.id(), userId, state, severity, acked, read, effectiveLimit)
                 .stream().map(AlertDto::from).toList();
     }
 
     @GetMapping("/unread-count")
     public UnreadCountResponse unreadCount(@EnvPath Environment env) {
-        String userId = currentUserId();
-        long count = inboxQuery.countUnread(env.id(), userId);
-        return new UnreadCountResponse(count);
+        return inboxQuery.countUnread(env.id(), currentUserId());
     }
 
     @GetMapping("/{id}")
     public AlertDto get(@EnvPath Environment env, @PathVariable UUID id) {
-        AlertInstance instance = requireInstance(id, env.id());
+        AlertInstance instance = requireLiveInstance(id, env.id());
         return AlertDto.from(instance);
     }
 
     @PostMapping("/{id}/ack")
     public AlertDto ack(@EnvPath Environment env, @PathVariable UUID id) {
-        AlertInstance instance = requireInstance(id, env.id());
+        AlertInstance instance = requireLiveInstance(id, env.id());
         String userId = currentUserId();
         instanceRepo.ack(id, userId, Instant.now());
         // Re-fetch to return fresh state
@@ -86,39 +86,72 @@ public class AlertController {
 
     @PostMapping("/{id}/read")
     public void read(@EnvPath Environment env, @PathVariable UUID id) {
-        requireInstance(id, env.id());
-        String userId = currentUserId();
-        readRepo.markRead(userId, id);
+        requireLiveInstance(id, env.id());
+        instanceRepo.markRead(id, Instant.now());
     }
 
     @PostMapping("/bulk-read")
     public void bulkRead(@EnvPath Environment env,
-                         @Valid @RequestBody BulkReadRequest req) {
-        String userId = currentUserId();
-        // filter to only instances in this env
-        List<UUID> filtered = req.instanceIds().stream()
-                .filter(instanceId -> instanceRepo.findById(instanceId)
-                        .map(i -> i.environmentId().equals(env.id()))
-                        .orElse(false))
-                .toList();
+                         @Valid @RequestBody BulkIdsRequest req) {
+        List<UUID> filtered = inEnvLiveIds(req.instanceIds(), env.id());
         if (!filtered.isEmpty()) {
-            readRepo.bulkMarkRead(userId, filtered);
+            instanceRepo.bulkMarkRead(filtered, Instant.now());
         }
     }
 
+    @PostMapping("/bulk-ack")
+    public void bulkAck(@EnvPath Environment env,
+                        @Valid @RequestBody BulkIdsRequest req) {
+        List<UUID> filtered = inEnvLiveIds(req.instanceIds(), env.id());
+        if (!filtered.isEmpty()) {
+            instanceRepo.bulkAck(filtered, currentUserId(), Instant.now());
+        }
+    }
+
+    @DeleteMapping("/{id}")
+    @PreAuthorize("hasAnyRole('OPERATOR','ADMIN')")
+    public ResponseEntity<Void> delete(@EnvPath Environment env, @PathVariable UUID id) {
+        requireLiveInstance(id, env.id());
+        instanceRepo.softDelete(id, Instant.now());
+        return ResponseEntity.noContent().build();
+    }
+
+    @PostMapping("/bulk-delete")
+    @PreAuthorize("hasAnyRole('OPERATOR','ADMIN')")
+    public void bulkDelete(@EnvPath Environment env,
+                           @Valid @RequestBody BulkIdsRequest req) {
+        List<UUID> filtered = inEnvLiveIds(req.instanceIds(), env.id());
+        if (!filtered.isEmpty()) {
+            instanceRepo.bulkSoftDelete(filtered, Instant.now());
+        }
+    }
+
+    @PostMapping("/{id}/restore")
+    @PreAuthorize("hasAnyRole('OPERATOR','ADMIN')")
+    public ResponseEntity<Void> restore(@EnvPath Environment env, @PathVariable UUID id) {
+        // Unlike requireLiveInstance, restore explicitly targets soft-deleted rows
+        AlertInstance inst = instanceRepo.findById(id)
+                .orElseThrow(() -> new ResponseStatusException(HttpStatus.NOT_FOUND, "Alert not found"));
+        if (!inst.environmentId().equals(env.id()))
+            throw new ResponseStatusException(HttpStatus.NOT_FOUND, "Alert not found in env");
+        instanceRepo.restore(id);
+        return ResponseEntity.noContent().build();
+    }
+
     // -------------------------------------------------------------------------
     // Helpers
     // -------------------------------------------------------------------------
 
-    private AlertInstance requireInstance(UUID id, UUID envId) {
-        AlertInstance instance = instanceRepo.findById(id)
-                .orElseThrow(() -> new ResponseStatusException(HttpStatus.NOT_FOUND,
-                        "Alert not found: " + id));
-        if (!instance.environmentId().equals(envId)) {
-            throw new ResponseStatusException(HttpStatus.NOT_FOUND,
-                    "Alert not found in this environment: " + id);
-        }
-        return instance;
+    private AlertInstance requireLiveInstance(UUID id, UUID envId) {
+        AlertInstance i = instanceRepo.findById(id)
+                .orElseThrow(() -> new ResponseStatusException(HttpStatus.NOT_FOUND, "Alert not found"));
+        if (!i.environmentId().equals(envId) || i.deletedAt() != null)
+            throw new ResponseStatusException(HttpStatus.NOT_FOUND, "Alert not found in env");
+        return i;
    }
 
+    private List<UUID> inEnvLiveIds(List<UUID> ids, UUID envId) {
+        return instanceRepo.filterInEnvLive(ids, envId);
+    }
+
     private String currentUserId() {
```
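
Exercising the new endpoints from the command line (env slug, IDs, and auth are placeholders; the route shape comes from the controller's `@RequestMapping` above, and these calls need whatever authentication the deployment enforces):

```bash
# Bulk-ack two alerts (OPERATOR role implied by the deployment's auth)
curl -X POST 'http://localhost:8081/api/v1/environments/prod/alerts/bulk-ack' \
  -H 'Content-Type: application/json' \
  -d '{"instanceIds": ["00000000-0000-0000-0000-000000000001",
                       "00000000-0000-0000-0000-000000000002"]}'

# Soft-delete a single alert, then bring it back
curl -X DELETE 'http://localhost:8081/api/v1/environments/prod/alerts/00000000-0000-0000-0000-000000000001'
curl -X POST   'http://localhost:8081/api/v1/environments/prod/alerts/00000000-0000-0000-0000-000000000001/restore'
```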
```diff
@@ -22,6 +22,7 @@ import com.cameleer.server.core.alerting.AlertRuleRepository;
 import com.cameleer.server.core.alerting.AlertRuleTarget;
 import com.cameleer.server.core.alerting.ConditionKind;
 import com.cameleer.server.core.alerting.ExchangeMatchCondition;
+import com.cameleer.server.core.alerting.FireMode;
 import com.cameleer.server.core.alerting.WebhookBinding;
 import com.cameleer.server.core.outbound.OutboundConnection;
 import com.cameleer.server.core.outbound.OutboundConnectionService;
@@ -126,6 +127,7 @@ public class AlertRuleController {
             HttpServletRequest httpRequest) {
 
         validateAttributeKeys(req.condition());
+        validateBusinessRules(req);
         validateWebhooks(req.webhooks(), env.id());
 
         AlertRule draft = buildRule(null, env.id(), req, currentUserId());
@@ -147,6 +149,7 @@ public class AlertRuleController {
 
         AlertRule existing = requireRule(id, env.id());
         validateAttributeKeys(req.condition());
+        validateBusinessRules(req);
         validateWebhooks(req.webhooks(), env.id());
 
         AlertRule updated = buildRule(existing, env.id(), req, currentUserId());
@@ -258,6 +261,36 @@ public class AlertRuleController {
     // Helpers
     // -------------------------------------------------------------------------
 
+    /**
+     * Cross-field business-rule validation for {@link AlertRuleRequest}.
+     *
+     * <p>PER_EXCHANGE rules: re-notify and for-duration are nonsensical (each fire is its own
+     * exchange — there's no "still firing" window and nothing to re-notify about). Reject 400
+     * if either is non-zero.
+     *
+     * <p>All rules: reject 400 if both webhooks and targets are empty — such a rule can never
+     * notify anyone and is a pure footgun.
+     */
+    private void validateBusinessRules(AlertRuleRequest req) {
+        if (req.condition() instanceof ExchangeMatchCondition ex
+                && ex.fireMode() == FireMode.PER_EXCHANGE) {
+            if (req.reNotifyMinutes() != null && req.reNotifyMinutes() != 0) {
+                throw new ResponseStatusException(HttpStatus.BAD_REQUEST,
+                        "reNotifyMinutes must be 0 for PER_EXCHANGE rules (re-notify does not apply)");
+            }
+            if (req.forDurationSeconds() != null && req.forDurationSeconds() != 0) {
+                throw new ResponseStatusException(HttpStatus.BAD_REQUEST,
+                        "forDurationSeconds must be 0 for PER_EXCHANGE rules");
+            }
+        }
+        boolean noWebhooks = req.webhooks() == null || req.webhooks().isEmpty();
+        boolean noTargets = req.targets() == null || req.targets().isEmpty();
+        if (noWebhooks && noTargets) {
+            throw new ResponseStatusException(HttpStatus.BAD_REQUEST,
+                    "rule must have at least one webhook or target — otherwise it never notifies anyone");
+        }
+    }
+
     /**
      * Validates that all attribute keys in an {@link ExchangeMatchCondition} match
      * {@code ^[a-zA-Z0-9._-]+$}. Keys are inlined into ClickHouse SQL, making this
```
```diff
@@ -20,6 +20,7 @@ public record AlertDto(
         Instant ackedAt,
         String ackedBy,
         Instant resolvedAt,
+        Instant readAt,   // global "has anyone read this"
         boolean silenced,
         Double currentValue,
         Double threshold,
@@ -29,6 +30,7 @@ public record AlertDto(
         return new AlertDto(
                 i.id(), i.ruleId(), i.environmentId(), i.state(), i.severity(),
                 i.title(), i.message(), i.firedAt(), i.ackedAt(), i.ackedBy(),
-                i.resolvedAt(), i.silenced(), i.currentValue(), i.threshold(), i.context());
+                i.resolvedAt(), i.readAt(), i.silenced(),
+                i.currentValue(), i.threshold(), i.context());
     }
 }
```
```diff
@@ -0,0 +1,10 @@
+package com.cameleer.server.app.alerting.dto;
+
+import jakarta.validation.constraints.NotNull;
+import jakarta.validation.constraints.Size;
+
+import java.util.List;
+import java.util.UUID;
+
+/** Shared body for bulk-read / bulk-ack / bulk-delete requests. */
+public record BulkIdsRequest(@NotNull @Size(min = 1, max = 500) List<UUID> instanceIds) {}
```
```diff
@@ -1,12 +0,0 @@
-package com.cameleer.server.app.alerting.dto;
-
-import jakarta.validation.constraints.NotNull;
-
-import java.util.List;
-import java.util.UUID;
-
-public record BulkReadRequest(@NotNull List<UUID> instanceIds) {
-    public BulkReadRequest {
-        instanceIds = instanceIds == null ? List.of() : List.copyOf(instanceIds);
-    }
-}
```
```diff
@@ -1,3 +1,29 @@
 package com.cameleer.server.app.alerting.dto;
 
-public record UnreadCountResponse(long count) {}
+import com.cameleer.server.core.alerting.AlertSeverity;
+
+import java.util.EnumMap;
+import java.util.Map;
+
+/**
+ * Response shape for {@code GET /alerts/unread-count}.
+ * <p>
+ * {@code total} is the sum of {@code bySeverity} values. The UI branches bell colour on
+ * the highest severity present, so callers can inspect the map directly.
+ */
+public record UnreadCountResponse(long total, Map<AlertSeverity, Long> bySeverity) {
+
+    public UnreadCountResponse {
+        // Defensive copy + fill in missing severities as 0 so the UI never sees null/undefined.
+        EnumMap<AlertSeverity, Long> normalized = new EnumMap<>(AlertSeverity.class);
+        for (AlertSeverity s : AlertSeverity.values()) normalized.put(s, 0L);
+        if (bySeverity != null) bySeverity.forEach((k, v) -> normalized.put(k, v == null ? 0L : v));
+        bySeverity = Map.copyOf(normalized);
+    }
+
+    public static UnreadCountResponse from(Map<AlertSeverity, Long> counts) {
+        long total = counts == null ? 0L
+                : counts.values().stream().filter(v -> v != null).mapToLong(Long::longValue).sum();
+        return new UnreadCountResponse(total, counts == null ? Map.of() : counts);
+    }
+}
```
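
Serialized, the response looks roughly like this (the severity constants are illustrative; the enum's actual values are not shown in this diff):

```json
{
  "total": 3,
  "bySeverity": {
    "INFO": 0,
    "WARNING": 1,
    "CRITICAL": 2
  }
}
```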
```diff
@@ -0,0 +1,95 @@
+package com.cameleer.server.app.alerting.eval;
+
+import com.cameleer.server.core.agent.AgentEventRecord;
+import com.cameleer.server.core.agent.AgentEventRepository;
+import com.cameleer.server.core.alerting.AgentLifecycleCondition;
+import com.cameleer.server.core.alerting.AgentLifecycleEventType;
+import com.cameleer.server.core.alerting.AlertRule;
+import com.cameleer.server.core.alerting.AlertScope;
+import com.cameleer.server.core.alerting.ConditionKind;
+import com.cameleer.server.core.runtime.EnvironmentRepository;
+import org.springframework.stereotype.Component;
+
+import java.time.Instant;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Evaluator for {@link AgentLifecycleCondition}.
+ * <p>
+ * Each matching row in {@code agent_events} produces its own {@link EvalResult.Firing}
+ * in an {@link EvalResult.Batch}, so every {@code (agent, eventType, timestamp)}
+ * tuple gets its own {@code AlertInstance} — operationally distinct outages /
+ * restarts / shutdowns are independently ackable. Deduplication across ticks
+ * is enforced by {@code alert_instances_open_rule_uq} via the canonical
+ * {@code _subjectFingerprint} key in the instance context (see V16 migration).
+ */
+@Component
+public class AgentLifecycleEvaluator implements ConditionEvaluator<AgentLifecycleCondition> {
+
+    /** Hard cap on rows returned per tick — prevents a flood of stale events from overwhelming the job. */
+    private static final int MAX_EVENTS_PER_TICK = 500;
+
+    private final AgentEventRepository eventRepo;
+    private final EnvironmentRepository envRepo;
+
+    public AgentLifecycleEvaluator(AgentEventRepository eventRepo, EnvironmentRepository envRepo) {
+        this.eventRepo = eventRepo;
+        this.envRepo = envRepo;
+    }
+
+    @Override
+    public ConditionKind kind() { return ConditionKind.AGENT_LIFECYCLE; }
+
+    @Override
+    public EvalResult evaluate(AgentLifecycleCondition c, AlertRule rule, EvalContext ctx) {
+        String envSlug = envRepo.findById(rule.environmentId())
+                .map(e -> e.slug())
+                .orElse(null);
+        if (envSlug == null) return EvalResult.Clear.INSTANCE;
+
+        AlertScope scope = c.scope();
+        String appSlug = scope != null ? scope.appSlug() : null;
+        String agentId = scope != null ? scope.agentId() : null;
+
+        List<String> typeNames = c.eventTypes().stream()
+                .map(AgentLifecycleEventType::name)
+                .toList();
+
+        Instant from = ctx.now().minusSeconds(c.withinSeconds());
+        Instant to = ctx.now();
+
+        List<AgentEventRecord> matches = eventRepo.findInWindow(
+                envSlug, appSlug, agentId, typeNames, from, to, MAX_EVENTS_PER_TICK);
+
+        if (matches.isEmpty()) return new EvalResult.Batch(List.of(), Map.of());
+
+        List<EvalResult.Firing> firings = new ArrayList<>(matches.size());
+        for (AgentEventRecord ev : matches) {
+            firings.add(toFiring(ev));
+        }
+        return new EvalResult.Batch(firings, Map.of());
+    }
+
+    private static EvalResult.Firing toFiring(AgentEventRecord ev) {
+        String fingerprint = (ev.instanceId() == null ? "" : ev.instanceId())
+                + ":" + (ev.eventType() == null ? "" : ev.eventType())
+                + ":" + (ev.timestamp() == null ? "0" : Long.toString(ev.timestamp().toEpochMilli()));
+
+        Map<String, Object> context = new LinkedHashMap<>();
+        context.put("agent", Map.of(
+                "id", ev.instanceId() == null ? "" : ev.instanceId(),
+                "app", ev.applicationId() == null ? "" : ev.applicationId()
+        ));
+        context.put("event", Map.of(
+                "type", ev.eventType() == null ? "" : ev.eventType(),
+                "timestamp", ev.timestamp() == null ? "" : ev.timestamp().toString(),
+                "detail", ev.detail() == null ? "" : ev.detail()
+        ));
+        context.put("_subjectFingerprint", fingerprint);
+
+        return new EvalResult.Firing(1.0, null, context);
+    }
+}
```
```diff
@@ -47,6 +47,7 @@ public class AlertEvaluatorJob implements SchedulingConfigurer {
     private final NotificationContextBuilder contextBuilder;
     private final EnvironmentRepository environmentRepo;
     private final ObjectMapper objectMapper;
+    private final BatchResultApplier batchResultApplier;
     private final String instanceId;
     private final String tenantId;
     private final Clock clock;
@@ -64,26 +65,28 @@
             NotificationContextBuilder contextBuilder,
             EnvironmentRepository environmentRepo,
             ObjectMapper objectMapper,
+            BatchResultApplier batchResultApplier,
             @Qualifier("alertingInstanceId") String instanceId,
             @Value("${cameleer.server.tenant.id:default}") String tenantId,
             Clock alertingClock,
             AlertingMetrics metrics) {
 
         this.props = props;
         this.ruleRepo = ruleRepo;
         this.instanceRepo = instanceRepo;
         this.notificationRepo = notificationRepo;
         this.evaluators = evaluatorList.stream()
                 .collect(Collectors.toMap(ConditionEvaluator::kind, e -> e));
         this.circuitBreaker = circuitBreaker;
         this.renderer = renderer;
         this.contextBuilder = contextBuilder;
         this.environmentRepo = environmentRepo;
         this.objectMapper = objectMapper;
+        this.batchResultApplier = batchResultApplier;
         this.instanceId = instanceId;
         this.tenantId = tenantId;
         this.clock = alertingClock;
         this.metrics = metrics;
     }
 
     // -------------------------------------------------------------------------
@@ -112,21 +115,61 @@
 
         for (AlertRule rule : claimed) {
             Instant nextRun = Instant.now(clock).plusSeconds(rule.evaluationIntervalSeconds());
+            if (circuitBreaker.isOpen(rule.conditionKind())) {
+                log.debug("Circuit breaker open for {}; skipping rule {}", rule.conditionKind(), rule.id());
+                reschedule(rule, nextRun);
+                continue;
+            }
+
+            EvalResult result;
             try {
-                if (circuitBreaker.isOpen(rule.conditionKind())) {
-                    log.debug("Circuit breaker open for {}; skipping rule {}", rule.conditionKind(), rule.id());
-                    continue;
-                }
-                EvalResult result = metrics.evalDuration(rule.conditionKind())
+                result = metrics.evalDuration(rule.conditionKind())
                         .recordCallable(() -> evaluateSafely(rule, ctx));
-                applyResult(rule, result);
-                circuitBreaker.recordSuccess(rule.conditionKind());
             } catch (Exception e) {
                 metrics.evalError(rule.conditionKind(), rule.id());
                 circuitBreaker.recordFailure(rule.conditionKind());
                 log.warn("Evaluator error for rule {} ({}): {}", rule.id(), rule.conditionKind(), e.toString());
-            } finally {
+                // Evaluation itself failed — release the claim so the rule can be
+                // retried on the next tick. Cursor stays put.
                 reschedule(rule, nextRun);
+                continue;
             }
+
+            if (result instanceof EvalResult.Batch b) {
+                // Phase 2: the Batch path is atomic. The @Transactional apply() on
+                // BatchResultApplier wraps instance writes, notification enqueues,
+                // AND the cursor advance + releaseClaim into a single tx. A
+                // mid-batch fault rolls everything back — including the cursor —
+                // so the next tick replays the whole batch exactly once.
+                try {
+                    batchResultApplier.apply(rule, b, nextRun);
+                    circuitBreaker.recordSuccess(rule.conditionKind());
+                } catch (Exception e) {
+                    metrics.evalError(rule.conditionKind(), rule.id());
+                    circuitBreaker.recordFailure(rule.conditionKind());
+                    log.warn("Batch apply failed for rule {} ({}): {} — rolling back; next tick will retry",
+                            rule.id(), rule.conditionKind(), e.toString());
+                    // The transaction rolled back. Do NOT call reschedule here —
+                    // leaving claim + next_evaluation_at as they were means the
+                    // claim TTL takes over and the rule becomes due on its own.
+                    // Rethrowing is unnecessary for correctness — the cursor
+                    // stayed put, so exactly-once-per-exchange is preserved.
+                }
+            } else {
+                // Non-Batch path (FIRING / Clear / Error): classic apply + rule
+                // reschedule. Not wrapped in a single tx — semantics unchanged
+                // from pre-Phase-2.
+                try {
+                    applyResult(rule, result);
+                    circuitBreaker.recordSuccess(rule.conditionKind());
+                } catch (Exception e) {
+                    metrics.evalError(rule.conditionKind(), rule.id());
+                    circuitBreaker.recordFailure(rule.conditionKind());
+                    log.warn("applyResult failed for rule {} ({}): {}",
+                            rule.id(), rule.conditionKind(), e.toString());
+                } finally {
+                    reschedule(rule, nextRun);
+                }
+            }
         }
@@ -171,14 +214,10 @@
     // -------------------------------------------------------------------------
 
     private void applyResult(AlertRule rule, EvalResult result) {
-        if (result instanceof EvalResult.Batch b) {
-            // PER_EXCHANGE mode: each Firing in the batch creates its own AlertInstance
-            for (EvalResult.Firing f : b.firings()) {
-                applyBatchFiring(rule, f);
-            }
-            return;
-        }
-
+        // Note: the Batch path is handled by BatchResultApplier (transactional) —
+        // tick() routes Batch results there directly and never calls applyResult
+        // for them. This method only handles FIRING / Clear / Error state-machine
+        // transitions for the classic (non-PER_EXCHANGE) path.
         AlertInstance current = instanceRepo.findOpenForRule(rule.id()).orElse(null);
         Instant now = Instant.now(clock);
 
@@ -199,19 +238,6 @@
         });
     }
 
-    /**
-     * Batch (PER_EXCHANGE) mode: always create a fresh FIRING instance per Firing entry.
-     * No forDuration check — each exchange is its own event.
-     */
-    private void applyBatchFiring(AlertRule rule, EvalResult.Firing f) {
-        Instant now = Instant.now(clock);
-        AlertInstance instance = AlertStateTransitions.newInstance(rule, f, AlertState.FIRING, now)
-                .withRuleSnapshot(snapshotRule(rule));
-        AlertInstance enriched = enrichTitleMessage(rule, instance);
-        AlertInstance persisted = instanceRepo.save(enriched);
-        enqueueNotifications(rule, persisted, now);
-    }
-
     // -------------------------------------------------------------------------
     // Title / message rendering
     // -------------------------------------------------------------------------
```
```diff
@@ -28,7 +28,7 @@ public final class AlertStateTransitions {
     /**
      * Apply an EvalResult to the current open AlertInstance.
      *
-     * @param current the open instance for this rule (PENDING / FIRING / ACKNOWLEDGED), or null if none
+     * @param current the open instance for this rule (PENDING / FIRING), or null if none
      * @param result  the evaluator outcome
      * @param rule    the rule being evaluated
      * @param now     wall-clock instant for the current tick
@@ -50,7 +50,7 @@ public final class AlertStateTransitions {
     private static Optional<AlertInstance> onClear(AlertInstance current, Instant now) {
         if (current == null) return Optional.empty(); // no open instance — no-op
         if (current.state() == AlertState.RESOLVED) return Optional.empty(); // already resolved
-        // Any open state (PENDING / FIRING / ACKNOWLEDGED) → RESOLVED
+        // Any open state (PENDING / FIRING) → RESOLVED
         return Optional.of(current
                 .withState(AlertState.RESOLVED)
                 .withResolvedAt(now));
@@ -84,8 +84,8 @@ public final class AlertStateTransitions {
                 // Still within forDuration — stay PENDING, nothing to persist
                 yield Optional.empty();
             }
-            // FIRING / ACKNOWLEDGED — re-notification cadence handled by the dispatcher
-            case FIRING, ACKNOWLEDGED -> Optional.empty();
+            // FIRING — re-notification cadence handled by the dispatcher
+            case FIRING -> Optional.empty();
             // RESOLVED should never appear as the "current open" instance, but guard anyway
             case RESOLVED -> Optional.empty();
         };
@@ -126,6 +126,8 @@ public final class AlertStateTransitions {
                 null,  // ackedBy
                 null,  // resolvedAt
                 null,  // lastNotifiedAt
+                null,  // readAt
+                null,  // deletedAt
                 false, // silenced
                 f.currentValue(),
                 f.threshold(),
```
```diff
@@ -0,0 +1,144 @@
+package com.cameleer.server.app.alerting.eval;
+
+import com.cameleer.server.app.alerting.notify.MustacheRenderer;
+import com.cameleer.server.app.alerting.notify.NotificationContextBuilder;
+import com.cameleer.server.core.alerting.*;
+import com.cameleer.server.core.runtime.Environment;
+import com.cameleer.server.core.runtime.EnvironmentRepository;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.stereotype.Component;
+import org.springframework.transaction.annotation.Transactional;
+
+import java.time.Clock;
+import java.time.Instant;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.UUID;
+
+/**
+ * Applies a {@link EvalResult.Batch} result to persistent state inside a single
+ * transaction: instance writes, notification enqueues, and the rule's cursor
+ * advance + {@code releaseClaim} either all commit or all roll back together.
+ * <p>
+ * Lives in its own bean so the {@code @Transactional} annotation engages via the
+ * Spring proxy when invoked from {@link AlertEvaluatorJob#tick()}; calling it as
+ * {@code this.apply(...)} from {@code AlertEvaluatorJob} (a bean calling its own
+ * method) would bypass the proxy and silently disable the transaction.
+ * <p>
+ * Phase 2 of the per-exchange exactly-once plan (see
+ * {@code docs/superpowers/plans/2026-04-22-per-exchange-exactly-once.md}).
+ */
+@Component
+public class BatchResultApplier {
+
+    private static final Logger log = LoggerFactory.getLogger(BatchResultApplier.class);
+
+    private final AlertRuleRepository ruleRepo;
+    private final AlertInstanceRepository instanceRepo;
+    private final AlertNotificationRepository notificationRepo;
+    private final MustacheRenderer renderer;
+    private final NotificationContextBuilder contextBuilder;
+    private final EnvironmentRepository environmentRepo;
+    private final ObjectMapper objectMapper;
+    private final Clock clock;
+
+    public BatchResultApplier(
+            AlertRuleRepository ruleRepo,
+            AlertInstanceRepository instanceRepo,
+            AlertNotificationRepository notificationRepo,
+            MustacheRenderer renderer,
+            NotificationContextBuilder contextBuilder,
+            EnvironmentRepository environmentRepo,
+            ObjectMapper objectMapper,
+            Clock alertingClock) {
+        this.ruleRepo = ruleRepo;
+        this.instanceRepo = instanceRepo;
+        this.notificationRepo = notificationRepo;
+        this.renderer = renderer;
+        this.contextBuilder = contextBuilder;
+        this.environmentRepo = environmentRepo;
+        this.objectMapper = objectMapper;
+        this.clock = alertingClock;
+    }
+
+    /**
+     * Atomically apply a Batch result for a single rule:
+     * <ol>
+     *   <li>persist a FIRING instance per firing + enqueue its notifications</li>
+     *   <li>advance the rule's cursor ({@code evalState}) iff the batch supplied one</li>
+     *   <li>release the claim with the new {@code nextRun} + {@code evalState}</li>
+     * </ol>
+     * Any exception thrown from the repo calls rolls back every write — including
+     * the cursor advance — so the rule is replayable on the next tick.
+     */
+    @Transactional
+    public void apply(AlertRule rule, EvalResult.Batch batch, Instant nextRun) {
+        for (EvalResult.Firing f : batch.firings()) {
+            applyBatchFiring(rule, f);
+        }
+        Map<String, Object> nextEvalState =
+                batch.nextEvalState().isEmpty() ? rule.evalState() : batch.nextEvalState();
+        ruleRepo.releaseClaim(rule.id(), nextRun, nextEvalState);
+    }
+
+    /**
+     * Batch (PER_EXCHANGE) mode: always create a fresh FIRING instance per Firing entry.
+     * No forDuration check — each exchange is its own event.
+     */
+    private void applyBatchFiring(AlertRule rule, EvalResult.Firing f) {
+        Instant now = Instant.now(clock);
+        AlertInstance instance = AlertStateTransitions.newInstance(rule, f, AlertState.FIRING, now)
+                .withRuleSnapshot(snapshotRule(rule));
+        AlertInstance enriched = enrichTitleMessage(rule, instance);
+        AlertInstance persisted = instanceRepo.save(enriched);
+        enqueueNotifications(rule, persisted, now);
+    }
+
+    private AlertInstance enrichTitleMessage(AlertRule rule, AlertInstance instance) {
+        Environment env = environmentRepo.findById(rule.environmentId()).orElse(null);
+        Map<String, Object> ctx = contextBuilder.build(rule, instance, env, null);
+        String title = renderer.render(rule.notificationTitleTmpl(), ctx);
+        String message = renderer.render(rule.notificationMessageTmpl(), ctx);
+        return instance.withTitleMessage(title, message);
+    }
+
+    private void enqueueNotifications(AlertRule rule, AlertInstance instance, Instant now) {
+        for (WebhookBinding w : rule.webhooks()) {
+            Map<String, Object> payload = buildPayload(rule, instance);
+            notificationRepo.save(new AlertNotification(
+                    UUID.randomUUID(),
+                    instance.id(),
+                    w.id(),
+                    w.outboundConnectionId(),
+                    NotificationStatus.PENDING,
+                    0,
+                    now,
+                    null, null, null, null,
+                    payload,
+                    null,
+                    now));
+        }
+    }
+
+    private Map<String, Object> buildPayload(AlertRule rule, AlertInstance instance) {
+        Environment env = environmentRepo.findById(rule.environmentId()).orElse(null);
+        return contextBuilder.build(rule, instance, env, null);
+    }
+
+    @SuppressWarnings("unchecked")
+    private Map<String, Object> snapshotRule(AlertRule rule) {
+        try {
+            Map<String, Object> raw = objectMapper.convertValue(rule, Map.class);
+            // Map.copyOf (used in AlertInstance compact ctor) rejects null values —
+            // strip them so the snapshot is safe to store.
+            Map<String, Object> safe = new LinkedHashMap<>();
+            raw.forEach((k, v) -> { if (v != null) safe.put(k, v); });
+            return safe;
+        } catch (Exception e) {
+            log.warn("Failed to snapshot rule {}: {}", rule.id(), e.getMessage());
+            return Map.of("id", rule.id().toString(), "name", rule.name());
+        }
+    }
+}
```
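
The self-invocation caveat in that javadoc is a general property of Spring's proxy-based AOP, not something specific to this repo; a minimal illustration with a hypothetical bean:

```java
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;

@Component
class TxDemo {
    @Transactional
    public void inTx() {
        // Transactional only when entered through the Spring proxy.
    }

    public void selfCall() {
        inTx(); // plain this.inTx(): the call never crosses the proxy, so NO transaction
    }
}

// txDemo.inTx()     -> proxy applies the transaction advice
// txDemo.selfCall() -> proxy wraps selfCall(), but the nested inTx() call stays
//                      inside the target object, so its @Transactional is ignored
```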
```diff
@@ -17,9 +17,14 @@ public sealed interface EvalResult {
 
     record Error(Throwable cause) implements EvalResult {}
 
-    record Batch(List<Firing> firings) implements EvalResult {
+    record Batch(List<Firing> firings, Map<String, Object> nextEvalState) implements EvalResult {
         public Batch {
             firings = firings == null ? List.of() : List.copyOf(firings);
+            nextEvalState = nextEvalState == null ? Map.of() : Map.copyOf(nextEvalState);
         }
+        /** Convenience: a Batch with no cursor update (first-run empty, or no matches). */
+        public static Batch empty() {
+            return new Batch(List.of(), Map.of());
+        }
     }
 }
```
```diff
@@ -1,5 +1,6 @@
 package com.cameleer.server.app.alerting.eval;
 
+import com.cameleer.server.app.alerting.config.AlertingProperties;
 import com.cameleer.server.app.search.ClickHouseSearchIndex;
 import com.cameleer.server.core.alerting.AlertMatchSpec;
 import com.cameleer.server.core.alerting.AlertRule;
@@ -14,6 +15,7 @@ import org.springframework.stereotype.Component;
 
 import java.time.Instant;
 import java.util.ArrayList;
+import java.util.Comparator;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -23,10 +25,14 @@ public class ExchangeMatchEvaluator implements ConditionEvaluator<ExchangeMatchCondition> {
 
     private final ClickHouseSearchIndex searchIndex;
     private final EnvironmentRepository envRepo;
+    private final AlertingProperties alertingProperties;
 
-    public ExchangeMatchEvaluator(ClickHouseSearchIndex searchIndex, EnvironmentRepository envRepo) {
+    public ExchangeMatchEvaluator(ClickHouseSearchIndex searchIndex,
+                                  EnvironmentRepository envRepo,
+                                  AlertingProperties alertingProperties) {
         this.searchIndex = searchIndex;
         this.envRepo = envRepo;
+        this.alertingProperties = alertingProperties;
     }
 
     @Override
@@ -85,19 +91,31 @@ public class ExchangeMatchEvaluator implements ConditionEvaluator<ExchangeMatchCondition> {
         String routeId = c.scope() != null ? c.scope().routeId() : null;
         ExchangeMatchCondition.ExchangeFilter filter = c.filter();
 
-        // Resolve cursor from evalState
-        Instant cursor = null;
-        Object raw = rule.evalState().get("lastExchangeTs");
+        // Resolve composite cursor: (startTime, executionId)
+        Instant cursorTs;
+        String cursorId;
+        Object raw = rule.evalState().get("lastExchangeCursor");
         if (raw instanceof String s && !s.isBlank()) {
-            try { cursor = Instant.parse(s); } catch (Exception ignored) {}
-        } else if (raw instanceof Instant i) {
-            cursor = i;
+            int pipe = s.indexOf('|');
+            if (pipe < 0) {
+                // Malformed — treat as first-run (with deploy-backlog-cap clamp).
+                cursorTs = firstRunCursorTs(rule, ctx);
+                cursorId = "";
+            } else {
+                cursorTs = Instant.parse(s.substring(0, pipe));
+                cursorId = s.substring(pipe + 1);
+            }
+        } else {
+            // First run — bounded by rule.createdAt, empty executionId so any real id sorts after it.
+            // Clamp to deploy-backlog-cap to avoid backlog flooding for long-lived rules on first
+            // post-deploy tick. Normal-advance path (valid cursor above) is intentionally unaffected.
+            cursorTs = firstRunCursorTs(rule, ctx);
+            cursorId = "";
         }
 
         // Build SearchRequest — use cursor as timeFrom so we only see exchanges after last run
         var req = new SearchRequest(
                 filter != null ? filter.status() : null,
-                cursor,      // timeFrom = cursor (or null for first run)
+                cursorTs,    // timeFrom
                 ctx.now(),   // timeTo
                 null, null, null,        // durationMin/Max, correlationId
                 null, null, null, null,  // text variants
@@ -110,23 +128,26 @@ public class ExchangeMatchEvaluator implements ConditionEvaluator<ExchangeMatchCondition> {
                 50,
                 "startTime",
                 "asc",  // asc so we process oldest first
+                cursorId.isEmpty() ? null : cursorId,  // afterExecutionId — null on first run enables >=
                 envSlug
         );
 
         SearchResult<ExecutionSummary> result = searchIndex.search(req);
         List<ExecutionSummary> matches = result.data();
 
-        if (matches.isEmpty()) return new EvalResult.Batch(List.of());
+        if (matches.isEmpty()) return EvalResult.Batch.empty();
 
-        // Find the latest startTime across all matches — becomes the next cursor
-        Instant latestTs = matches.stream()
-                .map(ExecutionSummary::startTime)
-                .max(Instant::compareTo)
-                .orElse(ctx.now());
+        // Ensure deterministic ordering for cursor advance
+        matches = new ArrayList<>(matches);
+        matches.sort(Comparator
+                .comparing(ExecutionSummary::startTime)
+                .thenComparing(ExecutionSummary::executionId));
+
+        ExecutionSummary last = matches.get(matches.size() - 1);
+        String nextCursorSerialized = last.startTime().toString() + "|" + last.executionId();
 
         List<EvalResult.Firing> firings = new ArrayList<>();
-        for (int i = 0; i < matches.size(); i++) {
-            ExecutionSummary ex = matches.get(i);
+        for (ExecutionSummary ex : matches) {
             Map<String, Object> ctx2 = new HashMap<>();
             ctx2.put("exchange", Map.of(
                     "id", ex.executionId(),
@@ -135,15 +156,32 @@ public class ExchangeMatchEvaluator implements ConditionEvaluator<ExchangeMatchCondition> {
                     "startTime", ex.startTime() == null ? "" : ex.startTime().toString()
             ));
             ctx2.put("app", Map.of("slug", ex.applicationId() == null ? "" : ex.applicationId()));
 
-            // Attach the next-cursor to the last firing so the job can extract it
-            if (i == matches.size() - 1) {
-                ctx2.put("_nextCursor", latestTs);
-            }
-
             firings.add(new EvalResult.Firing(1.0, null, ctx2));
         }
 
-        return new EvalResult.Batch(firings);
+        Map<String, Object> nextEvalState = new HashMap<>(rule.evalState());
+        nextEvalState.put("lastExchangeCursor", nextCursorSerialized);
+        return new EvalResult.Batch(firings, nextEvalState);
     }
 
+    /**
+     * First-run cursor timestamp: {@code rule.createdAt()}, clamped to
+     * {@code now - perExchangeDeployBacklogCapSeconds} so a long-lived PER_EXCHANGE rule
+     * doesn't scan from its creation date forward on first post-deploy tick.
+     * <p>
+     * Cap ≤ 0 disables the clamp (first-run falls back to {@code rule.createdAt()} verbatim).
+     * Applied only on first-run / malformed-cursor paths — the normal-advance path is
+     * intentionally unaffected so legitimate missed ticks are not silently skipped.
+     */
+    private Instant firstRunCursorTs(AlertRule rule, EvalContext ctx) {
+        Instant cursorTs = rule.createdAt();
+        int capSeconds = alertingProperties.effectivePerExchangeDeployBacklogCapSeconds();
+        if (capSeconds > 0) {
+            Instant capFloor = ctx.now().minusSeconds(capSeconds);
+            if (cursorTs == null || cursorTs.isBefore(capFloor)) {
+                cursorTs = capFloor;
+            }
+        }
+        return cursorTs;
+    }
 }
```
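
The composite cursor serializes as `startTime|executionId`, which assumes execution IDs never contain `|`. A tiny standalone round-trip sketch of that format:

```java
import java.time.Instant;

class CursorDemo {
    public static void main(String[] args) {
        // Serialize: ISO-8601 instant, a pipe, then the execution id
        String serialized = Instant.parse("2026-05-01T12:00:00Z") + "|exec-42";

        // Parse back: split on the first pipe only
        int pipe = serialized.indexOf('|');
        Instant cursorTs = Instant.parse(serialized.substring(0, pipe));
        String cursorId  = serialized.substring(pipe + 1);

        System.out.println(cursorTs + " / " + cursorId); // 2026-05-01T12:00:00Z / exec-42
    }
}
```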
```diff
@@ -61,7 +61,8 @@ public class LogPatternEvaluator implements ConditionEvaluator<LogPatternCondition> {
                     to,
                     null,   // cursor
                     1,      // limit (count query; value irrelevant)
-                    "desc"  // sort
+                    "desc", // sort
+                    null    // instanceIds
             );
             return logStore.countLogs(req);
         });
```
```diff
@@ -9,12 +9,20 @@ import io.micrometer.core.instrument.MeterRegistry;
 import io.micrometer.core.instrument.Timer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.jdbc.core.JdbcTemplate;
 import org.springframework.stereotype.Component;
 
+import java.time.Duration;
 import java.time.Instant;
+import java.util.ArrayList;
+import java.util.EnumMap;
+import java.util.List;
 import java.util.Map;
 import java.util.UUID;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
+import java.util.function.Supplier;
 
 /**
  * Micrometer-based metrics for the alerting subsystem.
@@ -30,10 +38,11 @@ import java.util.concurrent.ConcurrentMap;
  *   <li>{@code alerting_eval_duration_seconds{kind}} — per-kind evaluation latency</li>
  *   <li>{@code alerting_webhook_delivery_duration_seconds} — webhook POST latency</li>
  * </ul>
- * Gauges (read from PostgreSQL on each scrape; low scrape frequency = low DB load):
+ * Gauges (read from PostgreSQL, cached for {@link #DEFAULT_GAUGE_TTL} to amortise
+ * Prometheus scrapes that may fire every few seconds):
  * <ul>
  *   <li>{@code alerting_rules_total{state=enabled|disabled}} — rule counts from {@code alert_rules}</li>
- *   <li>{@code alerting_instances_total{state,severity}} — instance counts grouped from {@code alert_instances}</li>
+ *   <li>{@code alerting_instances_total{state}} — instance counts grouped from {@code alert_instances}</li>
  * </ul>
 */
 @Component
@@ -41,11 +50,13 @@ public class AlertingMetrics {
 
     private static final Logger log = LoggerFactory.getLogger(AlertingMetrics.class);
 
+    /** Default time-to-live for the gauge-supplier caches. */
+    static final Duration DEFAULT_GAUGE_TTL = Duration.ofSeconds(30);
+
     private final MeterRegistry registry;
-    private final JdbcTemplate jdbc;
 
     // Cached counters per kind (lazy-initialized)
     private final ConcurrentMap<String, Counter> evalErrorCounters = new ConcurrentHashMap<>();
     private final ConcurrentMap<String, Counter> circuitOpenCounters = new ConcurrentHashMap<>();
     private final ConcurrentMap<String, Timer> evalDurationTimers = new ConcurrentHashMap<>();
@@ -55,33 +66,81 @@ public class AlertingMetrics {
     // Shared delivery timer
     private final Timer webhookDeliveryTimer;
 
+    // TTL-cached gauge suppliers registered so tests can force a read cycle.
+    private final TtlCache enabledRulesCache;
+    private final TtlCache disabledRulesCache;
+    private final Map<AlertState, TtlCache> instancesByStateCaches;
+
+    /**
+     * Production constructor: wraps the Postgres-backed gauge suppliers in a
+     * 30-second TTL cache so Prometheus scrapes don't cause per-scrape DB queries.
+     */
+    @Autowired
     public AlertingMetrics(MeterRegistry registry, JdbcTemplate jdbc) {
+        this(registry,
+             () -> countRules(jdbc, true),
+             () -> countRules(jdbc, false),
+             state -> countInstances(jdbc, state),
+             DEFAULT_GAUGE_TTL,
+             Instant::now);
+    }
+
+    /**
+     * Test-friendly constructor accepting the three gauge suppliers that are
+     * exercised in the {@link AlertingMetricsCachingTest} plan sketch. The
+     * {@code instancesSupplier} is used for every {@link AlertState}.
+     */
+    AlertingMetrics(MeterRegistry registry,
+                    Supplier<Long> enabledRulesSupplier,
+                    Supplier<Long> disabledRulesSupplier,
+                    Supplier<Long> instancesSupplier,
+                    Duration gaugeTtl,
+                    Supplier<Instant> clock) {
+        this(registry,
+             enabledRulesSupplier,
+             disabledRulesSupplier,
+             state -> instancesSupplier.get(),
+             gaugeTtl,
+             clock);
+    }
+
+    /**
+     * Core constructor: accepts per-state instance supplier so production can
+     * query PostgreSQL with a different value per {@link AlertState}.
+     */
+    private AlertingMetrics(MeterRegistry registry,
+                            Supplier<Long> enabledRulesSupplier,
+                            Supplier<Long> disabledRulesSupplier,
+                            java.util.function.Function<AlertState, Long> instancesSupplier,
+                            Duration gaugeTtl,
+                            Supplier<Instant> clock) {
         this.registry = registry;
-        this.jdbc = jdbc;
 
         // ── Static timers ───────────────────────────────────────────────
         this.webhookDeliveryTimer = Timer.builder("alerting_webhook_delivery_duration_seconds")
                 .description("Latency of outbound webhook POST requests")
                 .register(registry);
 
-        // ── Gauge: rules by enabled/disabled ────────────────────────────
-        Gauge.builder("alerting_rules_total", this, m -> m.countRules(true))
+        // ── Gauge: rules by enabled/disabled (cached) ───────────────────
+        this.enabledRulesCache = new TtlCache(enabledRulesSupplier, gaugeTtl, clock);
+        this.disabledRulesCache = new TtlCache(disabledRulesSupplier, gaugeTtl, clock);
+
+        Gauge.builder("alerting_rules_total", enabledRulesCache, TtlCache::getAsDouble)
                 .tag("state", "enabled")
                 .description("Number of enabled alert rules")
                 .register(registry);
-        Gauge.builder("alerting_rules_total", this, m -> m.countRules(false))
+        Gauge.builder("alerting_rules_total", disabledRulesCache, TtlCache::getAsDouble)
                 .tag("state", "disabled")
                 .description("Number of disabled alert rules")
                 .register(registry);
 
-        // ── Gauges: alert instances by state × severity ─────────────────
+        // ── Gauges: alert instances by state (cached) ───────────────────
+        this.instancesByStateCaches = new EnumMap<>(AlertState.class);
         for (AlertState state : AlertState.values()) {
-            // Capture state as effectively-final for lambda
-            AlertState capturedState = state;
-            // We register one gauge per state (summed across severities) for simplicity;
-            // per-severity breakdown would require a dynamic MultiGauge.
-            Gauge.builder("alerting_instances_total", this,
-                    m -> m.countInstances(capturedState))
+            AlertState captured = state;
+            TtlCache cache = new TtlCache(() -> instancesSupplier.apply(captured), gaugeTtl, clock);
+            this.instancesByStateCaches.put(state, cache);
+            Gauge.builder("alerting_instances_total", cache, TtlCache::getAsDouble)
                 .tag("state", state.name().toLowerCase())
                 .description("Number of alert instances by state")
                 .register(registry);
@@ -148,28 +207,73 @@ public class AlertingMetrics {
                 .increment();
     }
 
-    // ── Gauge suppliers (called on each Prometheus scrape) ──────────────
-
-    private double countRules(boolean enabled) {
-        try {
-            Long count = jdbc.queryForObject(
-                    "SELECT COUNT(*) FROM alert_rules WHERE enabled = ?", Long.class, enabled);
-            return count == null ? 0.0 : count.doubleValue();
-        } catch (Exception e) {
-            log.debug("alerting_rules gauge query failed: {}", e.getMessage());
-            return 0.0;
+    /**
+     * Force a read of every TTL-cached gauge supplier. Used by tests to simulate
+     * a Prometheus scrape without needing a real registry scrape pipeline.
+     */
+    void snapshotAllGauges() {
+        List<TtlCache> all = new ArrayList<>();
+        all.add(enabledRulesCache);
+        all.add(disabledRulesCache);
+        all.addAll(instancesByStateCaches.values());
+        for (TtlCache c : all) {
+            c.getAsDouble();
         }
     }
 
-    private double countInstances(AlertState state) {
+    // ── Gauge suppliers (queried at most once per TTL) ──────────────────
+
+    private static long countRules(JdbcTemplate jdbc, boolean enabled) {
+        try {
+            Long count = jdbc.queryForObject(
+                    "SELECT COUNT(*) FROM alert_rules WHERE enabled = ?", Long.class, enabled);
+            return count == null ? 0L : count;
+        } catch (Exception e) {
+            log.debug("alerting_rules gauge query failed: {}", e.getMessage());
+            return 0L;
+        }
+    }
+
+    private static long countInstances(JdbcTemplate jdbc, AlertState state) {
         try {
             Long count = jdbc.queryForObject(
                     "SELECT COUNT(*) FROM alert_instances WHERE state = ?::alert_state_enum",
                     Long.class, state.name());
-            return count == null ? 0.0 : count.doubleValue();
+            return count == null ? 0L : count;
         } catch (Exception e) {
             log.debug("alerting_instances gauge query failed: {}", e.getMessage());
-            return 0.0;
+            return 0L;
         }
     }
+
+    /**
+     * Lightweight TTL cache around a {@code Supplier<Long>}. Every call to
+     * {@link #getAsDouble()} either returns the cached value (if {@code clock.get()
+     * - lastRead < ttl}) or invokes the delegate and refreshes the cache.
+     *
+     * <p>Used to amortise Postgres queries behind Prometheus gauges over a
+     * 30-second TTL (see {@link AlertingMetrics#DEFAULT_GAUGE_TTL}).
+     */
+    static final class TtlCache {
+        private final Supplier<Long> delegate;
+        private final Duration ttl;
+        private final Supplier<Instant> clock;
+        private volatile Instant lastRead = Instant.MIN;
+        private volatile long cached = 0L;
+
+        TtlCache(Supplier<Long> delegate, Duration ttl, Supplier<Instant> clock) {
+            this.delegate = delegate;
+            this.ttl = ttl;
+            this.clock = clock;
+        }
+
+        synchronized double getAsDouble() {
+            Instant now = clock.get();
+            if (lastRead == Instant.MIN || Duration.between(lastRead, now).compareTo(ttl) >= 0) {
+                cached = delegate.get();
+                lastRead = now;
+            }
+            return cached;
+        }
+    }
 }
```
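
A sketch of how the package-private constructor plus `snapshotAllGauges()` make the cache deterministically testable (assumes a same-package test class and a mutable clock supplier; `SimpleMeterRegistry` is Micrometer's in-memory registry):

```java
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
import java.time.Duration;
import java.time.Instant;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;

class AlertingMetricsCachingSketch {
    public static void main(String[] args) {
        AtomicReference<Instant> now = new AtomicReference<>(Instant.parse("2026-01-01T00:00:00Z"));
        AtomicLong dbCalls = new AtomicLong();

        AlertingMetrics metrics = new AlertingMetrics(new SimpleMeterRegistry(),
                () -> { dbCalls.incrementAndGet(); return 1L; }, // enabled-rules "query"
                () -> 0L,                                        // disabled-rules
                () -> 0L,                                        // instances (all states)
                Duration.ofSeconds(30), now::get);

        metrics.snapshotAllGauges();       // first scrape: supplier invoked
        metrics.snapshotAllGauges();       // within the TTL: served from cache
        now.set(now.get().plusSeconds(31));
        metrics.snapshotAllGauges();       // TTL elapsed: supplier invoked again

        System.out.println(dbCalls.get()); // 2
    }
}
```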
@@ -1,7 +1,10 @@
package com.cameleer.server.app.alerting.notify;

import com.cameleer.server.app.alerting.dto.UnreadCountResponse;
import com.cameleer.server.core.alerting.AlertInstance;
import com.cameleer.server.core.alerting.AlertInstanceRepository;
import com.cameleer.server.core.alerting.AlertSeverity;
import com.cameleer.server.core.alerting.AlertState;
import com.cameleer.server.core.rbac.RbacService;
import org.springframework.stereotype.Component;

@@ -17,7 +20,8 @@ import java.util.concurrent.ConcurrentHashMap;
 * <p>
 * {@link #listInbox} returns alerts the user is allowed to see (targeted directly or via group/role).
 * {@link #countUnread} is memoized per {@code (envId, userId)} for 5 seconds to avoid hammering
- * the database on every page render.
+ * the database on every page render. The memo caches the full per-severity breakdown so
+ * the UI can branch bell colour on the highest unread severity without a second call.
 */
@Component
public class InAppInboxQuery {
@@ -31,8 +35,8 @@ public class InAppInboxQuery {
/** Cache key for the unread count memo. */
private record Key(UUID envId, String userId) {}

-/** Cache entry: cached count + expiry timestamp. */
-private record Entry(long count, Instant expiresAt) {}
+/** Cache entry: cached response + expiry timestamp. */
+private record Entry(UnreadCountResponse response, Instant expiresAt) {}

private final ConcurrentHashMap<Key, Entry> memo = new ConcurrentHashMap<>();

@@ -45,32 +49,42 @@ public class InAppInboxQuery {
}

/**
 * Returns the most recent {@code limit} alert instances visible to the given user.
 * <p>
 * Visibility: the instance must target this user directly, or target a group the user belongs to,
 * or target a role the user holds. Empty target lists mean "broadcast to all".
+ * Full filtered variant: optional {@code states}, {@code severities}, {@code acked},
+ * and {@code read} narrow the result set. {@code null} or empty lists mean
+ * "no filter on that dimension". {@code acked}/{@code read} are tri-state:
+ * {@code null} = no filter, {@code TRUE} = only acked/read, {@code FALSE} = only unacked/unread.
 */
-public List<AlertInstance> listInbox(UUID envId, String userId, int limit) {
+public List<AlertInstance> listInbox(UUID envId,
+                                     String userId,
+                                     List<AlertState> states,
+                                     List<AlertSeverity> severities,
+                                     Boolean acked,
+                                     Boolean read,
+                                     int limit) {
    List<String> groupIds = resolveGroupIds(userId);
    List<String> roleNames = resolveRoleNames(userId);
-   return instanceRepo.listForInbox(envId, groupIds, userId, roleNames, limit);
+   return instanceRepo.listForInbox(envId, groupIds, userId, roleNames,
+           states, severities, acked, read, limit);
}

/**
- * Returns the count of unread (un-acked) alert instances visible to the user.
+ * Returns the unread (un-acked) alert count for the user, broken down by severity.
 * <p>
- * The result is memoized for 5 seconds per {@code (envId, userId)}.
+ * Memoized for 5 seconds per {@code (envId, userId)}.
 */
-public long countUnread(UUID envId, String userId) {
+public UnreadCountResponse countUnread(UUID envId, String userId) {
    Key key = new Key(envId, userId);
    Instant now = Instant.now(clock);
    Entry cached = memo.get(key);
    if (cached != null && now.isBefore(cached.expiresAt())) {
-       return cached.count();
+       return cached.response();
    }
-   long count = instanceRepo.countUnreadForUser(envId, userId);
-   memo.put(key, new Entry(count, now.plusMillis(MEMO_TTL_MS)));
-   return count;
+   List<String> groupIds = resolveGroupIds(userId);
+   List<String> roleNames = resolveRoleNames(userId);
+   Map<AlertSeverity, Long> bySeverity = instanceRepo.countUnreadBySeverity(envId, userId, groupIds, roleNames);
+   UnreadCountResponse response = UnreadCountResponse.from(bySeverity);
+   memo.put(key, new Entry(response, now.plusMillis(MEMO_TTL_MS)));
+   return response;
}

// -------------------------------------------------------------------------

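The countUnread memoization above generalises to a small reusable pattern. A minimal sketch; TtlMemo and its method names are invented here, and, like the patch, it accepts the benign race where two threads recompute the same key concurrently.

import java.time.Duration;
import java.time.Instant;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.Supplier;

final class TtlMemo<K, V> {
    private record Entry<V>(V value, Instant expiresAt) {}

    private final ConcurrentHashMap<K, Entry<V>> memo = new ConcurrentHashMap<>();
    private final Duration ttl;
    private final Supplier<Instant> clock;

    TtlMemo(Duration ttl, Supplier<Instant> clock) {
        this.ttl = ttl;
        this.clock = clock;
    }

    V get(K key, Function<K, V> compute) {
        Instant now = clock.get();
        Entry<V> cached = memo.get(key);
        if (cached != null && now.isBefore(cached.expiresAt())) {
            return cached.value();        // fresh enough: skip the database
        }
        V value = compute.apply(key);     // stale or absent: recompute
        memo.put(key, new Entry<>(value, now.plus(ttl)));
        return value;
    }

    public static void main(String[] args) {
        TtlMemo<String, Long> memo = new TtlMemo<>(Duration.ofSeconds(5), Instant::now);
        System.out.println(memo.get("env1/alice", k -> 7L));  // computes: 7
        System.out.println(memo.get("env1/alice", k -> 8L));  // still 7: cached
    }
}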
@@ -64,6 +64,10 @@ public class NotificationContextBuilder {
    ctx.put("agent", subtree(instance, "agent.id", "agent.name", "agent.state"));
    ctx.put("app", subtree(instance, "app.slug", "app.id"));
}
case AGENT_LIFECYCLE -> {
    ctx.put("agent", subtree(instance, "agent.id", "agent.app"));
    ctx.put("event", subtree(instance, "event.type", "event.timestamp", "event.detail"));
}
case DEPLOYMENT_STATE -> {
    ctx.put("deployment", subtree(instance, "deployment.id", "deployment.status"));
    ctx.put("app", subtree(instance, "app.slug", "app.id"));

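The hunk calls subtree(instance, ...) without showing its definition. One plausible shape, assuming the helper copies dot-separated keys from a flat context map into a nested map for the template engine; ContextPaths and its contract are guesses, not the project's actual helper.

import java.util.HashMap;
import java.util.Map;

final class ContextPaths {
    @SuppressWarnings("unchecked")
    static Map<String, Object> subtree(Map<String, Object> flat, String... paths) {
        Map<String, Object> out = new HashMap<>();
        for (String path : paths) {
            Object value = flat.get(path);
            if (value == null) continue;              // missing keys are skipped
            String[] parts = path.split("\\.");
            Map<String, Object> node = out;
            for (int i = 0; i < parts.length - 1; i++) {
                // descend, creating intermediate maps as needed
                node = (Map<String, Object>) node.computeIfAbsent(parts[i], k -> new HashMap<String, Object>());
            }
            node.put(parts[parts.length - 1], value);
        }
        return out;
    }

    public static void main(String[] args) {
        Map<String, Object> flat = Map.of("agent.id", "a-1", "agent.app", "billing");
        System.out.println(subtree(flat, "agent.id", "agent.app"));
        // prints a nested map: {agent={id=a-1, app=billing}}
    }
}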
@@ -34,10 +34,12 @@ public class PostgresAlertInstanceRepository implements AlertInstanceRepository
    INSERT INTO alert_instances (
        id, rule_id, rule_snapshot, environment_id, state, severity,
        fired_at, acked_at, acked_by, resolved_at, last_notified_at,
        read_at, deleted_at,
        silenced, current_value, threshold, context, title, message,
        target_user_ids, target_group_ids, target_role_names)
    VALUES (?, ?, ?::jsonb, ?, ?::alert_state_enum, ?::severity_enum,
        ?, ?, ?, ?, ?,
        ?, ?,
        ?, ?, ?, ?::jsonb, ?, ?,
        ?, ?, ?)
    ON CONFLICT (id) DO UPDATE SET
@@ -46,6 +48,8 @@ public class PostgresAlertInstanceRepository implements AlertInstanceRepository
        acked_by = EXCLUDED.acked_by,
        resolved_at = EXCLUDED.resolved_at,
        last_notified_at = EXCLUDED.last_notified_at,
        read_at = EXCLUDED.read_at,
        deleted_at = EXCLUDED.deleted_at,
        silenced = EXCLUDED.silenced,
        current_value = EXCLUDED.current_value,
        threshold = EXCLUDED.threshold,
@@ -66,6 +70,7 @@ public class PostgresAlertInstanceRepository implements AlertInstanceRepository
    i.environmentId(), i.state().name(), i.severity().name(),
    ts(i.firedAt()), ts(i.ackedAt()), i.ackedBy(),
    ts(i.resolvedAt()), ts(i.lastNotifiedAt()),
    ts(i.readAt()), ts(i.deletedAt()),
    i.silenced(), i.currentValue(), i.threshold(),
    writeJson(i.context()), i.title(), i.message(),
    userIds, groupIds, roleNames);
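The save() method relies on the Postgres upsert idiom, so the two new columns only had to be threaded through one statement: the column list, the VALUES placeholders, and the ON CONFLICT SET list. A compilable sketch of the idiom reduced to two columns; it is not the project's real schema.

import org.springframework.jdbc.core.JdbcTemplate;

import java.sql.Timestamp;
import java.time.Instant;
import java.util.UUID;

final class UpsertSketch {
    // INSERT ... ON CONFLICT (id) DO UPDATE lets one statement serve both
    // create and update; EXCLUDED refers to the row that failed to insert.
    static void save(JdbcTemplate jdbc, UUID id, Instant readAt) {
        jdbc.update("""
                INSERT INTO alert_instances (id, read_at)
                VALUES (?, ?)
                ON CONFLICT (id) DO UPDATE SET
                    read_at = EXCLUDED.read_at
                """, id, readAt == null ? null : Timestamp.from(readAt));
    }
}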
@@ -87,7 +92,8 @@ public class PostgresAlertInstanceRepository implements AlertInstanceRepository
    var list = jdbc.query("""
        SELECT * FROM alert_instances
        WHERE rule_id = ?
-         AND state IN ('PENDING','FIRING','ACKNOWLEDGED')
+         AND state IN ('PENDING','FIRING')
+         AND deleted_at IS NULL
        LIMIT 1
        """, rowMapper(), ruleId);
    return list.isEmpty() ? Optional.empty() : Optional.of(list.get(0));
@@ -98,12 +104,15 @@ public class PostgresAlertInstanceRepository implements AlertInstanceRepository
        List<String> userGroupIdFilter,
        String userId,
        List<String> userRoleNames,
+       List<AlertState> states,
+       List<AlertSeverity> severities,
+       Boolean acked,
+       Boolean read,
        int limit) {
    // Build arrays for group UUIDs and role names
    Array groupArray = toUuidArrayFromStrings(userGroupIdFilter);
    Array roleArray = toTextArray(userRoleNames);

-   String sql = """
+   StringBuilder sql = new StringBuilder("""
        SELECT * FROM alert_instances
        WHERE environment_id = ?
        AND (
@@ -111,37 +120,119 @@ public class PostgresAlertInstanceRepository implements AlertInstanceRepository
            OR target_group_ids && ?
            OR target_role_names && ?
        )
-       ORDER BY fired_at DESC
-       LIMIT ?
-       """;
-   return jdbc.query(sql, rowMapper(), environmentId, userId, groupArray, roleArray, limit);
+       """);
+   List<Object> args = new ArrayList<>(List.of(environmentId, userId, groupArray, roleArray));

+   if (states != null && !states.isEmpty()) {
+       Array stateArray = toTextArray(states.stream().map(Enum::name).toList());
+       sql.append(" AND state::text = ANY(?)");
+       args.add(stateArray);
+   }
+   if (severities != null && !severities.isEmpty()) {
+       Array severityArray = toTextArray(severities.stream().map(Enum::name).toList());
+       sql.append(" AND severity::text = ANY(?)");
+       args.add(severityArray);
+   }
+   if (acked != null) {
+       sql.append(acked ? " AND acked_at IS NOT NULL" : " AND acked_at IS NULL");
+   }
+   if (read != null) {
+       sql.append(read ? " AND read_at IS NOT NULL" : " AND read_at IS NULL");
+   }
+   sql.append(" AND deleted_at IS NULL");
+   sql.append(" ORDER BY fired_at DESC LIMIT ?");
+   args.add(limit);

+   return jdbc.query(sql.toString(), rowMapper(), args.toArray());
}

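The rewritten listForInbox builds SQL text and bind arguments in lockstep: each optional filter appends its predicate and its argument in the same branch, which is what keeps placeholder order correct. The same pattern as a standalone sketch with invented column names.

import java.util.ArrayList;
import java.util.List;

final class DynamicWhere {
    record Query(String sql, Object[] args) {}

    static Query build(String user, List<String> states, Boolean read, int limit) {
        StringBuilder sql = new StringBuilder("SELECT * FROM alert_instances WHERE target = ?");
        List<Object> args = new ArrayList<>(List.of(user));

        if (states != null && !states.isEmpty()) {
            // expand to one placeholder per element; an SQL array would also work
            sql.append(" AND state IN (").append("?,".repeat(states.size() - 1)).append("?)");
            args.addAll(states);
        }
        if (read != null) {                       // tri-state: null means "no filter"
            sql.append(read ? " AND read_at IS NOT NULL" : " AND read_at IS NULL");
        }
        sql.append(" ORDER BY fired_at DESC LIMIT ?");
        args.add(limit);
        return new Query(sql.toString(), args.toArray());
    }

    public static void main(String[] args) {
        Query q = build("u1", List.of("FIRING", "PENDING"), Boolean.FALSE, 50);
        System.out.println(q.sql());   // placeholders match q.args() one-to-one
    }
}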
@Override
-public long countUnreadForUser(UUID environmentId, String userId) {
+public Map<AlertSeverity, Long> countUnreadBySeverity(UUID environmentId,
+                                                      String userId,
+                                                      List<String> groupIds,
+                                                      List<String> roleNames) {
+   Array groupArray = toUuidArrayFromStrings(groupIds);
+   Array roleArray = toTextArray(roleNames);
    String sql = """
-       SELECT COUNT(*) FROM alert_instances ai
-       WHERE ai.environment_id = ?
-         AND ? = ANY(ai.target_user_ids)
-         AND NOT EXISTS (
-             SELECT 1 FROM alert_reads ar
-             WHERE ar.user_id = ? AND ar.alert_instance_id = ai.id
+       SELECT severity::text AS severity, COUNT(*) AS cnt
+       FROM alert_instances
+       WHERE environment_id = ?
+         AND read_at IS NULL
+         AND deleted_at IS NULL
+         AND (
+             ? = ANY(target_user_ids)
+             OR target_group_ids && ?
+             OR target_role_names && ?
+         )
+       GROUP BY severity
        """;
-   Long count = jdbc.queryForObject(sql, Long.class, environmentId, userId, userId);
-   return count == null ? 0L : count;
+   EnumMap<AlertSeverity, Long> counts = new EnumMap<>(AlertSeverity.class);
+   for (AlertSeverity s : AlertSeverity.values()) counts.put(s, 0L);
+   jdbc.query(sql, (org.springframework.jdbc.core.RowCallbackHandler) rs -> counts.put(
+           AlertSeverity.valueOf(rs.getString("severity")), rs.getLong("cnt")
+   ), environmentId, userId, groupArray, roleArray);
+   return counts;
}

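countUnreadBySeverity pre-fills an EnumMap before reading the GROUP BY rows because severities with zero unread alerts produce no rows at all, while callers want a stable map with explicit zeros. A minimal standalone rendition with a stand-in enum.

import java.util.EnumMap;
import java.util.Map;

final class SeverityCounts {
    enum Severity { INFO, WARNING, CRITICAL }

    static Map<Severity, Long> fromRows(Map<String, Long> groupByRows) {
        EnumMap<Severity, Long> counts = new EnumMap<>(Severity.class);
        for (Severity s : Severity.values()) counts.put(s, 0L);   // zero-fill first
        groupByRows.forEach((name, cnt) -> counts.put(Severity.valueOf(name), cnt));
        return counts;
    }

    public static void main(String[] args) {
        // Only CRITICAL had rows; INFO and WARNING still appear as 0.
        System.out.println(fromRows(Map.of("CRITICAL", 3L)));
    }
}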
@Override
public void ack(UUID id, String userId, Instant when) {
    jdbc.update("""
        UPDATE alert_instances
-       SET state = 'ACKNOWLEDGED'::alert_state_enum,
-           acked_at = ?, acked_by = ?
-       WHERE id = ?
+       SET acked_at = ?, acked_by = ?
+       WHERE id = ? AND acked_at IS NULL AND deleted_at IS NULL
        """, Timestamp.from(when), userId, id);
}

@Override
public void markRead(UUID id, Instant when) {
    jdbc.update("UPDATE alert_instances SET read_at = ? WHERE id = ? AND read_at IS NULL",
            Timestamp.from(when), id);
}

@Override
public void bulkMarkRead(List<UUID> ids, Instant when) {
    if (ids == null || ids.isEmpty()) return;
    Array idArray = jdbc.execute((ConnectionCallback<Array>) c ->
            c.createArrayOf("uuid", ids.toArray()));
    jdbc.update("""
        UPDATE alert_instances SET read_at = ?
        WHERE id = ANY(?) AND read_at IS NULL AND deleted_at IS NULL
        """, Timestamp.from(when), idArray);
}

@Override
public void softDelete(UUID id, Instant when) {
    jdbc.update("UPDATE alert_instances SET deleted_at = ? WHERE id = ? AND deleted_at IS NULL",
            Timestamp.from(when), id);
}

@Override
public void bulkSoftDelete(List<UUID> ids, Instant when) {
    if (ids == null || ids.isEmpty()) return;
    Array idArray = jdbc.execute((ConnectionCallback<Array>) c ->
            c.createArrayOf("uuid", ids.toArray()));
    jdbc.update("""
        UPDATE alert_instances SET deleted_at = ?
        WHERE id = ANY(?) AND deleted_at IS NULL
        """, Timestamp.from(when), idArray);
}

@Override
public void restore(UUID id) {
    jdbc.update("UPDATE alert_instances SET deleted_at = NULL WHERE id = ?", id);
}

@Override
public void bulkAck(List<UUID> ids, String userId, Instant when) {
    if (ids == null || ids.isEmpty()) return;
    Array idArray = jdbc.execute((ConnectionCallback<Array>) c ->
            c.createArrayOf("uuid", ids.toArray()));
    jdbc.update("""
        UPDATE alert_instances SET acked_at = ?, acked_by = ?
        WHERE id = ANY(?) AND acked_at IS NULL AND deleted_at IS NULL
        """, Timestamp.from(when), userId, idArray);
}

@Override
public void resolve(UUID id, Instant when) {
    jdbc.update("""
@@ -171,6 +262,17 @@ public class PostgresAlertInstanceRepository implements AlertInstanceRepository
        """, rowMapper(), Timestamp.from(now));
}

@Override
public List<UUID> filterInEnvLive(List<UUID> ids, UUID environmentId) {
    if (ids == null || ids.isEmpty()) return List.of();
    Array idArray = jdbc.execute((ConnectionCallback<Array>) c ->
            c.createArrayOf("uuid", ids.toArray()));
    return jdbc.query("""
        SELECT id FROM alert_instances
        WHERE id = ANY(?) AND environment_id = ? AND deleted_at IS NULL
        """, (rs, i) -> (UUID) rs.getObject("id"), idArray, environmentId);
}

@Override
public void deleteResolvedBefore(Instant cutoff) {
    jdbc.update("""
@@ -193,6 +295,8 @@ public class PostgresAlertInstanceRepository implements AlertInstanceRepository
    Timestamp ackedAt = rs.getTimestamp("acked_at");
    Timestamp resolvedAt = rs.getTimestamp("resolved_at");
    Timestamp lastNotifiedAt = rs.getTimestamp("last_notified_at");
    Timestamp readAt = rs.getTimestamp("read_at");
    Timestamp deletedAt = rs.getTimestamp("deleted_at");

    Object cvObj = rs.getObject("current_value");
    Double currentValue = cvObj == null ? null : ((Number) cvObj).doubleValue();
@@ -213,6 +317,8 @@ public class PostgresAlertInstanceRepository implements AlertInstanceRepository
    rs.getString("acked_by"),
    resolvedAt == null ? null : resolvedAt.toInstant(),
    lastNotifiedAt == null ? null : lastNotifiedAt.toInstant(),
    readAt == null ? null : readAt.toInstant(),
    deletedAt == null ? null : deletedAt.toInstant(),
    rs.getBoolean("silenced"),
    currentValue,
    threshold,

@@ -1,35 +0,0 @@
package com.cameleer.server.app.alerting.storage;

import com.cameleer.server.core.alerting.AlertReadRepository;
import org.springframework.jdbc.core.JdbcTemplate;

import java.util.List;
import java.util.UUID;

public class PostgresAlertReadRepository implements AlertReadRepository {

    private final JdbcTemplate jdbc;

    public PostgresAlertReadRepository(JdbcTemplate jdbc) {
        this.jdbc = jdbc;
    }

    @Override
    public void markRead(String userId, UUID alertInstanceId) {
        jdbc.update("""
            INSERT INTO alert_reads (user_id, alert_instance_id)
            VALUES (?, ?)
            ON CONFLICT (user_id, alert_instance_id) DO NOTHING
            """, userId, alertInstanceId);
    }

    @Override
    public void bulkMarkRead(String userId, List<UUID> alertInstanceIds) {
        if (alertInstanceIds == null || alertInstanceIds.isEmpty()) {
            return;
        }
        for (UUID id : alertInstanceIds) {
            markRead(userId, id);
        }
    }
}
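The deleted repository above marked reads one row at a time against an alert_reads join table; the new model stores read_at on alert_instances itself and updates in bulk with a uuid array. A sketch of that replacement idiom under the same assumptions as the diff (Spring JdbcTemplate against Postgres); it needs a live DataSource to actually run.

import org.springframework.jdbc.core.ConnectionCallback;
import org.springframework.jdbc.core.JdbcTemplate;

import java.sql.Array;
import java.sql.Timestamp;
import java.time.Instant;
import java.util.List;
import java.util.UUID;

final class BulkReadMarker {
    private final JdbcTemplate jdbc;

    BulkReadMarker(JdbcTemplate jdbc) {
        this.jdbc = jdbc;
    }

    /** One UPDATE instead of N INSERTs; already-read or deleted rows are left alone. */
    int bulkMarkRead(List<UUID> ids, Instant when) {
        if (ids == null || ids.isEmpty()) return 0;
        // Build a java.sql.Array on the raw Connection so Postgres can filter
        // with id = ANY(?) in a single statement.
        Array idArray = jdbc.execute((ConnectionCallback<Array>) c ->
                c.createArrayOf("uuid", ids.toArray()));
        return jdbc.update("""
                UPDATE alert_instances SET read_at = ?
                WHERE id = ANY(?) AND read_at IS NULL AND deleted_at IS NULL
                """, Timestamp.from(when), idArray);
    }
}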
@@ -9,6 +9,7 @@ import com.cameleer.server.core.runtime.AppService;
import com.cameleer.server.core.runtime.AppVersionRepository;
import com.cameleer.server.core.runtime.DeploymentRepository;
import com.cameleer.server.core.runtime.DeploymentService;
+import com.cameleer.server.core.runtime.DirtyStateCalculator;
import com.cameleer.server.core.runtime.EnvironmentRepository;
import com.cameleer.server.core.runtime.EnvironmentService;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -64,6 +65,11 @@ public class RuntimeBeanConfig {
    return new DeploymentService(deployRepo, appService, envService);
}

+@Bean
+public DirtyStateCalculator dirtyStateCalculator(ObjectMapper objectMapper) {
+    return new DirtyStateCalculator(objectMapper);
+}

@Bean(name = "deploymentTaskExecutor")
public Executor deploymentTaskExecutor() {
    ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();

@@ -9,6 +9,8 @@ import com.cameleer.server.app.storage.ClickHouseRouteCatalogStore;
import com.cameleer.server.core.storage.RouteCatalogStore;
import com.cameleer.server.app.storage.ClickHouseMetricsQueryStore;
import com.cameleer.server.app.storage.ClickHouseMetricsStore;
+import com.cameleer.server.app.storage.ClickHouseServerMetricsQueryStore;
+import com.cameleer.server.app.storage.ClickHouseServerMetricsStore;
import com.cameleer.server.app.storage.ClickHouseStatsStore;
import com.cameleer.server.core.admin.AuditRepository;
import com.cameleer.server.core.admin.AuditService;
@@ -16,7 +18,6 @@ import com.cameleer.server.core.agent.AgentEventRepository;
import com.cameleer.server.core.agent.AgentInfo;
import com.cameleer.server.core.agent.AgentRegistryService;
import com.cameleer.server.core.detail.DetailService;
-import com.cameleer.server.core.indexing.SearchIndexer;
import com.cameleer.server.app.ingestion.ExecutionFlushScheduler;
import com.cameleer.server.app.search.ClickHouseSearchIndex;
import com.cameleer.server.app.storage.ClickHouseExecutionStore;
@@ -43,26 +44,15 @@ public class StorageBeanConfig {
    return new DetailService(executionStore);
}

-@Bean(destroyMethod = "shutdown")
-public SearchIndexer searchIndexer(ExecutionStore executionStore, SearchIndex searchIndex,
-        @Value("${cameleer.server.indexer.debouncems:2000}") long debounceMs,
-        @Value("${cameleer.server.indexer.queuesize:10000}") int queueSize) {
-    return new SearchIndexer(executionStore, searchIndex, debounceMs, queueSize);
-}

@Bean
public AuditService auditService(AuditRepository auditRepository) {
    return new AuditService(auditRepository);
}

@Bean
-public IngestionService ingestionService(ExecutionStore executionStore,
-        DiagramStore diagramStore,
-        WriteBuffer<MetricsSnapshot> metricsBuffer,
-        SearchIndexer searchIndexer,
-        @Value("${cameleer.server.ingestion.bodysizelimit:16384}") int bodySizeLimit) {
-    return new IngestionService(executionStore, diagramStore, metricsBuffer,
-            searchIndexer::onExecutionUpdated, bodySizeLimit);
+public IngestionService ingestionService(DiagramStore diagramStore,
+        WriteBuffer<MetricsSnapshot> metricsBuffer) {
+    return new IngestionService(diagramStore, metricsBuffer);
}

@Bean
@@ -79,6 +69,19 @@ public class StorageBeanConfig {
    return new ClickHouseMetricsQueryStore(tenantProperties.getId(), clickHouseJdbc);
}

+@Bean
+public ServerMetricsStore clickHouseServerMetricsStore(
+        @Qualifier("clickHouseJdbcTemplate") JdbcTemplate clickHouseJdbc) {
+    return new ClickHouseServerMetricsStore(clickHouseJdbc);
+}

+@Bean
+public ServerMetricsQueryStore clickHouseServerMetricsQueryStore(
+        TenantProperties tenantProperties,
+        @Qualifier("clickHouseJdbcTemplate") JdbcTemplate clickHouseJdbc) {
+    return new ClickHouseServerMetricsQueryStore(tenantProperties.getId(), clickHouseJdbc);
+}

// ── Execution Store ──────────────────────────────────────────────────

@Bean

@@ -62,10 +62,13 @@ public class AgentSseController {

AgentInfo agent = registryService.findById(id);
if (agent == null) {
-   // Auto-heal: re-register agent from JWT claims after server restart
+   // Auto-heal re-registers an agent from JWT claims after a server
+   // restart, but only when the JWT subject matches the path id.
+   // Otherwise a holder of any valid agent JWT could spoof an
+   // arbitrary agentId in the URL.
    var jwtResult = (JwtService.JwtValidationResult) httpRequest.getAttribute(
            JwtAuthenticationFilter.JWT_RESULT_ATTR);
-   if (jwtResult != null) {
+   if (jwtResult != null && id.equals(jwtResult.subject())) {
        String application = jwtResult.application() != null ? jwtResult.application() : "default";
        String env = jwtResult.environment() != null ? jwtResult.environment() : "default";
        registryService.register(id, id, application, env, "unknown", List.of(), Map.of());

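The security-relevant part of this fix fits in one predicate. A tiny self-contained restatement; JwtClaims is a stand-in for the real JwtService.JwtValidationResult.

final class AutoHealGuard {
    record JwtClaims(String subject, String application, String environment) {}

    // Re-registration is allowed only when the authenticated JWT subject and
    // the agent id from the URL path agree.
    static boolean mayAutoRegister(String pathAgentId, JwtClaims jwt) {
        return jwt != null && pathAgentId.equals(jwt.subject());
    }

    public static void main(String[] args) {
        JwtClaims claims = new JwtClaims("agent-1", "billing", "dev");
        System.out.println(mayAutoRegister("agent-1", claims)); // true: subject matches
        System.out.println(mayAutoRegister("agent-2", claims)); // false: spoofed path id
    }
}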
@@ -1,14 +1,24 @@
package com.cameleer.server.app.controller;

import com.cameleer.common.model.ApplicationConfig;
import com.cameleer.server.app.dto.DirtyStateResponse;
import com.cameleer.server.app.storage.PostgresApplicationConfigRepository;
import com.cameleer.server.app.storage.PostgresDeploymentRepository;
import com.cameleer.server.app.web.EnvPath;
import com.cameleer.server.core.runtime.App;
import com.cameleer.server.core.runtime.AppService;
import com.cameleer.server.core.runtime.AppVersion;
import com.cameleer.server.core.runtime.AppVersionRepository;
import com.cameleer.server.core.runtime.Deployment;
import com.cameleer.server.core.runtime.DeploymentConfigSnapshot;
import com.cameleer.server.core.runtime.DirtyStateCalculator;
import com.cameleer.server.core.runtime.DirtyStateResult;
import com.cameleer.server.core.runtime.Environment;
import com.cameleer.server.core.runtime.RuntimeType;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.responses.ApiResponse;
import io.swagger.v3.oas.annotations.tags.Tag;
import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.security.access.prepost.PreAuthorize;
@@ -22,8 +32,10 @@ import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.server.ResponseStatusException;

import java.io.IOException;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
@@ -40,9 +52,21 @@ import java.util.UUID;
public class AppController {

    private final AppService appService;
+   private final AppVersionRepository appVersionRepository;
+   private final PostgresApplicationConfigRepository configRepository;
+   private final PostgresDeploymentRepository deploymentRepository;
+   private final DirtyStateCalculator dirtyCalc;

-   public AppController(AppService appService) {
+   public AppController(AppService appService,
+                        AppVersionRepository appVersionRepository,
+                        PostgresApplicationConfigRepository configRepository,
+                        PostgresDeploymentRepository deploymentRepository,
+                        DirtyStateCalculator dirtyCalc) {
        this.appService = appService;
+       this.appVersionRepository = appVersionRepository;
+       this.configRepository = configRepository;
+       this.deploymentRepository = deploymentRepository;
+       this.dirtyCalc = dirtyCalc;
    }

@GetMapping
@@ -120,6 +144,47 @@ public class AppController {
    }
}

@GetMapping("/{appSlug}/dirty-state")
@Operation(summary = "Check whether the app's current config differs from the last successful deploy",
        description = "Returns dirty=true when the desired state (current JAR + agent config + container config) "
                + "would produce a changed deployment. When no successful deploy exists yet, dirty=true.")
@ApiResponse(responseCode = "200", description = "Dirty-state computed")
@ApiResponse(responseCode = "404", description = "App not found in this environment")
public ResponseEntity<DirtyStateResponse> getDirtyState(@EnvPath Environment env,
                                                        @PathVariable String appSlug) {
    App app;
    try {
        app = appService.getByEnvironmentAndSlug(env.id(), appSlug);
    } catch (IllegalArgumentException e) {
        throw new ResponseStatusException(HttpStatus.NOT_FOUND, "App not found");
    }

    // Latest JAR version (newest first — findByAppId orders by version DESC)
    List<AppVersion> versions = appVersionRepository.findByAppId(app.id());
    UUID latestVersionId = versions.isEmpty() ? null
            : versions.stream().max(Comparator.comparingInt(AppVersion::version))
                    .map(AppVersion::id).orElse(null);

    // Desired agent config
    ApplicationConfig agentConfig = configRepository
            .findByApplicationAndEnvironment(appSlug, env.slug())
            .orElse(null);

    // Container config
    Map<String, Object> containerConfig = app.containerConfig();

    // Last successful deployment snapshot
    Deployment lastSuccessful = deploymentRepository
            .findLatestSuccessfulByAppAndEnv(app.id(), env.id())
            .orElse(null);
    DeploymentConfigSnapshot snapshot = lastSuccessful != null ? lastSuccessful.deployedConfigSnapshot() : null;

    DirtyStateResult result = dirtyCalc.compute(latestVersionId, agentConfig, containerConfig, snapshot);

    String lastId = lastSuccessful != null ? lastSuccessful.id().toString() : null;
    return ResponseEntity.ok(new DirtyStateResponse(result.dirty(), lastId, result.differences()));
}

private static final java.util.regex.Pattern CUSTOM_ARGS_PATTERN =
        java.util.regex.Pattern.compile("^[-a-zA-Z0-9_.=:/\\s+\"']*$");


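DirtyStateCalculator itself is not shown in this diff. One plausible reading of what compute() has to decide, with invented field names; treat it as an illustration of the endpoint's contract (no snapshot means dirty), not the real implementation.

import java.util.Map;
import java.util.Objects;
import java.util.UUID;

final class DirtySketch {
    record Snapshot(UUID versionId, Map<String, Object> containerConfig) {}

    // Dirty when any component of the desired state differs from the last
    // successful snapshot, or when nothing was ever deployed.
    static boolean isDirty(UUID desiredVersionId, Map<String, Object> desiredContainerConfig,
                           Snapshot lastSuccessful) {
        if (lastSuccessful == null) return true;   // never deployed: always dirty
        return !Objects.equals(desiredVersionId, lastSuccessful.versionId())
                || !Objects.equals(desiredContainerConfig, lastSuccessful.containerConfig());
    }

    public static void main(String[] args) {
        UUID v1 = UUID.randomUUID();
        Snapshot snap = new Snapshot(v1, Map.of("memory", "512m"));
        System.out.println(isDirty(v1, Map.of("memory", "512m"), snap));  // false
        System.out.println(isDirty(v1, Map.of("memory", "1g"), snap));    // true
        System.out.println(isDirty(v1, Map.of("memory", "512m"), null));  // true
    }
}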
@@ -24,6 +24,7 @@ import com.cameleer.server.core.storage.DiagramStore;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import io.swagger.v3.oas.annotations.Operation;
+import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.responses.ApiResponse;
import io.swagger.v3.oas.annotations.tags.Tag;
import jakarta.servlet.http.HttpServletRequest;
@@ -33,6 +34,7 @@ import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.security.core.Authentication;
import org.springframework.web.bind.annotation.*;
+import org.springframework.web.server.ResponseStatusException;

import java.util.ArrayList;
import java.util.List;
@@ -108,13 +110,23 @@ public class ApplicationConfigController {

@PutMapping("/apps/{appSlug}/config")
@Operation(summary = "Update application config for this environment",
-       description = "Saves config and pushes CONFIG_UPDATE to LIVE agents of this application in the given environment")
-@ApiResponse(responseCode = "200", description = "Config saved and pushed")
+       description = "Saves config. When apply=live (default), also pushes CONFIG_UPDATE to LIVE agents. "
+               + "When apply=staged, persists without a live push — the next successful deploy applies it.")
+@ApiResponse(responseCode = "200", description = "Config saved (and pushed if apply=live)")
+@ApiResponse(responseCode = "400", description = "Unknown apply value (must be 'staged' or 'live')")
public ResponseEntity<ConfigUpdateResponse> updateConfig(@EnvPath Environment env,
        @PathVariable String appSlug,
+       @Parameter(name = "apply",
+               description = "When to apply: 'live' (default) saves and pushes CONFIG_UPDATE to live agents immediately; 'staged' saves without pushing — the next successful deploy applies it.")
+       @RequestParam(name = "apply", defaultValue = "live") String apply,
        @RequestBody ApplicationConfig config,
        Authentication auth,
        HttpServletRequest httpRequest) {
+   if (!"staged".equalsIgnoreCase(apply) && !"live".equalsIgnoreCase(apply)) {
+       throw new ResponseStatusException(HttpStatus.BAD_REQUEST,
+               "Unknown apply value '" + apply + "' — must be 'staged' or 'live'");
+   }

    String updatedBy = auth != null ? auth.getName() : "system";

    config.setApplication(appSlug);
@@ -126,14 +138,24 @@ public class ApplicationConfigController {
    List<String> perAppKeys = extractSensitiveKeys(saved);
    List<String> mergedKeys = SensitiveKeysMerger.merge(globalKeys, perAppKeys);

-   CommandGroupResponse pushResult = pushConfigToAgentsWithMergedKeys(appSlug, env.slug(), saved, mergedKeys);
-   log.info("Config v{} saved for '{}', pushed to {} agent(s), {} responded",
-           saved.getVersion(), appSlug, pushResult.total(), pushResult.responded());
+   CommandGroupResponse pushResult;
+   if ("staged".equalsIgnoreCase(apply)) {
+       pushResult = new CommandGroupResponse(true, 0, 0, List.of(), List.of());
+       log.info("Config v{} staged for '{}' (no live push)", saved.getVersion(), appSlug);
+   } else {
+       pushResult = pushConfigToAgentsWithMergedKeys(appSlug, env.slug(), saved, mergedKeys);
+       log.info("Config v{} saved for '{}', pushed to {} agent(s), {} responded",
+               saved.getVersion(), appSlug, pushResult.total(), pushResult.responded());
+   }

-   auditService.log("update_app_config", AuditCategory.CONFIG, appSlug,
+   auditService.log(
+           "staged".equalsIgnoreCase(apply) ? "stage_app_config" : "update_app_config",
+           AuditCategory.CONFIG, appSlug,
            Map.of("environment", env.slug(), "version", saved.getVersion(),
+                   "apply", apply.toLowerCase(),
                    "agentsPushed", pushResult.total(),
-                   "responded", pushResult.responded(), "timedOut", pushResult.timedOut().size()),
+                   "responded", pushResult.responded(),
+                   "timedOut", pushResult.timedOut().size()),
            AuditResult.SUCCESS, httpRequest);

    return ResponseEntity.ok(new ConfigUpdateResponse(saved, pushResult));

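The apply parameter handling, distilled: normalize once and reject unknown values before any write happens. ApplyMode is an invented enum; the controller itself keeps the raw string.

final class ApplyModeSketch {
    enum ApplyMode { LIVE, STAGED }

    static ApplyMode parse(String raw) {
        if ("live".equalsIgnoreCase(raw)) return ApplyMode.LIVE;
        if ("staged".equalsIgnoreCase(raw)) return ApplyMode.STAGED;
        // maps to HTTP 400 at the controller boundary
        throw new IllegalArgumentException(
                "Unknown apply value '" + raw + "': must be 'staged' or 'live'");
    }

    public static void main(String[] args) {
        System.out.println(parse("Staged"));   // STAGED (case-insensitive)
        System.out.println(parse("live"));     // LIVE
    }
}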
@@ -196,7 +196,16 @@ public class CatalogController {
}

Set<String> routeIds = routesByApp.getOrDefault(slug, Set.of());
-List<String> agentIds = agents.stream().map(AgentInfo::instanceId).toList();

+// Resolve the env slug for this row early so fromUri can survive
+// cross-env queries (env==null) against managed apps.
+String rowEnvSlug = envSlug;
+if (app != null && rowEnvSlug.isEmpty()) {
+    try {
+        rowEnvSlug = envService.getById(app.environmentId()).slug();
+    } catch (Exception ignored) {}
+}
+final String resolvedEnvSlug = rowEnvSlug;

// Routes
List<RouteSummary> routeSummaries = routeIds.stream()
@@ -204,7 +213,7 @@ public class CatalogController {
    String key = slug + "/" + routeId;
    long count = routeExchangeCounts.getOrDefault(key, 0L);
    Instant lastSeen = routeLastSeen.get(key);
-   String fromUri = resolveFromEndpointUri(routeId, agentIds);
+   String fromUri = resolveFromEndpointUri(slug, routeId, resolvedEnvSlug);
    String state = routeStateRegistry.getState(slug, routeId).name().toLowerCase();
    String routeState = "started".equals(state) ? null : state;
    return new RouteSummary(routeId, count, lastSeen, fromUri, routeState);
@@ -258,15 +267,9 @@ public class CatalogController {
String healthTooltip = buildHealthTooltip(app != null, deployStatus, agentHealth, agents.size());

String displayName = app != null ? app.displayName() : slug;
-String appEnvSlug = envSlug;
-if (app != null && appEnvSlug.isEmpty()) {
-    try {
-        appEnvSlug = envService.getById(app.environmentId()).slug();
-    } catch (Exception ignored) {}
-}

catalog.add(new CatalogApp(
-       slug, displayName, app != null, appEnvSlug,
+       slug, displayName, app != null, resolvedEnvSlug,
        health, healthTooltip, agents.size(), routeSummaries, agentSummaries,
        totalExchanges, deploymentSummary
));
@@ -275,8 +278,11 @@ public class CatalogController {
return ResponseEntity.ok(catalog);
}

-private String resolveFromEndpointUri(String routeId, List<String> agentIds) {
-    return diagramStore.findContentHashForRouteByAgents(routeId, agentIds)
+private String resolveFromEndpointUri(String applicationId, String routeId, String environment) {
+    if (environment == null || environment.isBlank()) {
+        return null;
+    }
+    return diagramStore.findLatestContentHashForAppRoute(applicationId, routeId, environment)
        .flatMap(diagramStore::findByContentHash)
        .map(RouteGraph::getRoot)
        .map(root -> root.getEndpointUri())

@@ -4,8 +4,6 @@ import com.cameleer.server.app.dto.ClickHousePerformanceResponse;
import com.cameleer.server.app.dto.ClickHouseQueryInfo;
import com.cameleer.server.app.dto.ClickHouseStatusResponse;
import com.cameleer.server.app.dto.ClickHouseTableInfo;
-import com.cameleer.server.app.dto.IndexerPipelineResponse;
-import com.cameleer.server.core.indexing.SearchIndexerStats;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import org.springframework.beans.factory.annotation.Qualifier;
@@ -31,15 +29,12 @@
public class ClickHouseAdminController {

    private final JdbcTemplate clickHouseJdbc;
-   private final SearchIndexerStats indexerStats;
    private final String clickHouseUrl;

    public ClickHouseAdminController(
            @Qualifier("clickHouseJdbcTemplate") JdbcTemplate clickHouseJdbc,
-           SearchIndexerStats indexerStats,
            @Value("${cameleer.server.clickhouse.url:}") String clickHouseUrl) {
        this.clickHouseJdbc = clickHouseJdbc;
-       this.indexerStats = indexerStats;
        this.clickHouseUrl = clickHouseUrl;
    }
@@ -157,16 +152,4 @@ public class ClickHouseAdminController {
    }
}

-@GetMapping("/pipeline")
-@Operation(summary = "Search indexer pipeline statistics")
-public IndexerPipelineResponse getPipeline() {
-    return new IndexerPipelineResponse(
-            indexerStats.getQueueDepth(),
-            indexerStats.getMaxQueueSize(),
-            indexerStats.getFailedCount(),
-            indexerStats.getIndexedCount(),
-            indexerStats.getDebounceMs(),
-            indexerStats.getIndexingRate(),
-            indexerStats.getLastIndexedAt());
-}
}

@@ -2,8 +2,13 @@ package com.cameleer.server.app.controller;

import com.cameleer.server.app.runtime.DeploymentExecutor;
import com.cameleer.server.app.web.EnvPath;
+import com.cameleer.server.core.admin.AuditCategory;
+import com.cameleer.server.core.admin.AuditResult;
+import com.cameleer.server.core.admin.AuditService;
import com.cameleer.server.core.runtime.App;
import com.cameleer.server.core.runtime.AppService;
+import com.cameleer.server.core.runtime.AppVersion;
+import com.cameleer.server.core.runtime.AppVersionRepository;
import com.cameleer.server.core.runtime.Deployment;
import com.cameleer.server.core.runtime.DeploymentService;
import com.cameleer.server.core.runtime.Environment;
@@ -12,14 +17,18 @@ import com.cameleer.server.core.runtime.RuntimeOrchestrator;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.responses.ApiResponse;
import io.swagger.v3.oas.annotations.tags.Tag;
+import jakarta.servlet.http.HttpServletRequest;
+import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.security.access.prepost.PreAuthorize;
+import org.springframework.security.core.context.SecurityContextHolder;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
+import org.springframework.web.server.ResponseStatusException;

import java.util.List;
+import java.util.Map;
@@ -42,17 +51,23 @@ public class DeploymentController {
    private final RuntimeOrchestrator orchestrator;
    private final AppService appService;
    private final EnvironmentService environmentService;
+   private final AuditService auditService;
+   private final AppVersionRepository appVersionRepository;

    public DeploymentController(DeploymentService deploymentService,
            DeploymentExecutor deploymentExecutor,
            RuntimeOrchestrator orchestrator,
            AppService appService,
-           EnvironmentService environmentService) {
+           EnvironmentService environmentService,
+           AuditService auditService,
+           AppVersionRepository appVersionRepository) {
        this.deploymentService = deploymentService;
        this.deploymentExecutor = deploymentExecutor;
        this.orchestrator = orchestrator;
        this.appService = appService;
        this.environmentService = environmentService;
+       this.auditService = auditService;
+       this.appVersionRepository = appVersionRepository;
    }

@GetMapping
@@ -86,13 +101,25 @@ public class DeploymentController {
@ApiResponse(responseCode = "202", description = "Deployment accepted and starting")
public ResponseEntity<Deployment> deploy(@EnvPath Environment env,
        @PathVariable String appSlug,
-       @RequestBody DeployRequest request) {
+       @RequestBody DeployRequest request,
+       HttpServletRequest httpRequest) {
    try {
        App app = appService.getByEnvironmentAndSlug(env.id(), appSlug);
-       Deployment deployment = deploymentService.createDeployment(app.id(), request.appVersionId(), env.id());
+       AppVersion appVersion = appVersionRepository.findById(request.appVersionId())
+               .orElseThrow(() -> new IllegalArgumentException("AppVersion not found: " + request.appVersionId()));
+       Deployment deployment = deploymentService.createDeployment(app.id(), request.appVersionId(), env.id(), currentUserId());
        deploymentExecutor.executeAsync(deployment);
+       auditService.log("deploy_app", AuditCategory.DEPLOYMENT, deployment.id().toString(),
+               Map.of("appSlug", appSlug, "envSlug", env.slug(),
+                       "appVersionId", request.appVersionId().toString(),
+                       "jarFilename", appVersion.jarFilename() != null ? appVersion.jarFilename() : "",
+                       "version", appVersion.version()),
+               AuditResult.SUCCESS, httpRequest);
        return ResponseEntity.accepted().body(deployment);
    } catch (IllegalArgumentException e) {
+       auditService.log("deploy_app", AuditCategory.DEPLOYMENT, null,
+               Map.of("appSlug", appSlug, "envSlug", env.slug(), "error", e.getMessage()),
+               AuditResult.FAILURE, httpRequest);
        return ResponseEntity.notFound().build();
    }
}
@@ -103,12 +130,19 @@ public class DeploymentController {
@ApiResponse(responseCode = "404", description = "Deployment not found")
public ResponseEntity<Deployment> stop(@EnvPath Environment env,
        @PathVariable String appSlug,
-       @PathVariable UUID deploymentId) {
+       @PathVariable UUID deploymentId,
+       HttpServletRequest httpRequest) {
    try {
        Deployment deployment = deploymentService.getById(deploymentId);
        deploymentExecutor.stopDeployment(deployment);
+       auditService.log("stop_deployment", AuditCategory.DEPLOYMENT, deploymentId.toString(),
+               Map.of("appSlug", appSlug, "envSlug", env.slug()),
+               AuditResult.SUCCESS, httpRequest);
        return ResponseEntity.ok(deploymentService.getById(deploymentId));
    } catch (IllegalArgumentException e) {
+       auditService.log("stop_deployment", AuditCategory.DEPLOYMENT, deploymentId.toString(),
+               Map.of("appSlug", appSlug, "envSlug", env.slug(), "error", e.getMessage()),
+               AuditResult.FAILURE, httpRequest);
        return ResponseEntity.notFound().build();
    }
}
@@ -122,18 +156,26 @@ public class DeploymentController {
public ResponseEntity<?> promote(@EnvPath Environment env,
        @PathVariable String appSlug,
        @PathVariable UUID deploymentId,
-       @RequestBody PromoteRequest request) {
+       @RequestBody PromoteRequest request,
+       HttpServletRequest httpRequest) {
    try {
        App sourceApp = appService.getByEnvironmentAndSlug(env.id(), appSlug);
        Deployment source = deploymentService.getById(deploymentId);
        Environment targetEnv = environmentService.getBySlug(request.targetEnvironment());
        // Target must also have the app with the same slug
        App targetApp = appService.getByEnvironmentAndSlug(targetEnv.id(), appSlug);
-       Deployment promoted = deploymentService.promote(targetApp.id(), source.appVersionId(), targetEnv.id());
+       Deployment promoted = deploymentService.promote(targetApp.id(), source.appVersionId(), targetEnv.id(), currentUserId());
        deploymentExecutor.executeAsync(promoted);
+       auditService.log("promote_deployment", AuditCategory.DEPLOYMENT, promoted.id().toString(),
+               Map.of("sourceEnv", env.slug(), "targetEnv", request.targetEnvironment(),
+                       "appSlug", appSlug, "appVersionId", source.appVersionId().toString()),
+               AuditResult.SUCCESS, httpRequest);
        return ResponseEntity.accepted().body(promoted);
    } catch (IllegalArgumentException e) {
-       return ResponseEntity.status(org.springframework.http.HttpStatus.NOT_FOUND)
+       auditService.log("promote_deployment", AuditCategory.DEPLOYMENT, deploymentId.toString(),
+               Map.of("sourceEnv", env.slug(), "targetEnv", request.targetEnvironment(),
+                       "appSlug", appSlug, "error", e.getMessage()),
+               AuditResult.FAILURE, httpRequest);
+       return ResponseEntity.status(HttpStatus.NOT_FOUND)
               .body(Map.of("error", e.getMessage()));
    }
}
@@ -157,6 +199,15 @@ public class DeploymentController {
    }
}

+private String currentUserId() {
+    var auth = SecurityContextHolder.getContext().getAuthentication();
+    if (auth == null || auth.getName() == null) {
+        throw new ResponseStatusException(HttpStatus.UNAUTHORIZED, "No authentication");
+    }
+    String name = auth.getName();
+    return name.startsWith("user:") ? name.substring(5) : name;
+}

public record DeployRequest(UUID appVersionId) {}
public record PromoteRequest(String targetEnvironment) {}
}

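currentUserId() strips an optional user: prefix from the principal name. The same logic as a pure function, trivially unit-testable in isolation.

final class PrincipalNames {
    static String userId(String principalName) {
        // "user:alice" and plain "alice" both resolve to "alice"
        return principalName.startsWith("user:") ? principalName.substring(5) : principalName;
    }

    public static void main(String[] args) {
        System.out.println(userId("user:alice")); // alice
        System.out.println(userId("alice"));      // alice (no prefix to strip)
    }
}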
@@ -2,8 +2,6 @@ package com.cameleer.server.app.controller;

import com.cameleer.common.graph.RouteGraph;
import com.cameleer.server.app.web.EnvPath;
-import com.cameleer.server.core.agent.AgentInfo;
-import com.cameleer.server.core.agent.AgentRegistryService;
import com.cameleer.server.core.diagram.DiagramLayout;
import com.cameleer.server.core.diagram.DiagramRenderer;
import com.cameleer.server.core.runtime.Environment;
@@ -21,7 +19,6 @@ import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

-import java.util.List;
import java.util.Optional;

/**
@@ -42,14 +39,11 @@ public class DiagramRenderController {

    private final DiagramStore diagramStore;
    private final DiagramRenderer diagramRenderer;
-   private final AgentRegistryService registryService;

    public DiagramRenderController(DiagramStore diagramStore,
-           DiagramRenderer diagramRenderer,
-           AgentRegistryService registryService) {
+           DiagramRenderer diagramRenderer) {
        this.diagramStore = diagramStore;
        this.diagramRenderer = diagramRenderer;
-       this.registryService = registryService;
    }

@GetMapping("/api/v1/diagrams/{contentHash}/render")
@@ -90,8 +84,8 @@ public class DiagramRenderController {

@GetMapping("/api/v1/environments/{envSlug}/apps/{appSlug}/routes/{routeId}/diagram")
@Operation(summary = "Find the latest diagram for this app's route in this environment",
-       description = "Resolves agents in this env for this app, then looks up the latest diagram for the route "
-               + "they reported. Env scope prevents a dev route from returning a prod diagram.")
+       description = "Returns the most recently stored diagram for (app, env, route). Independent of the "
+               + "agent registry, so routes removed from the current app version still resolve.")
@ApiResponse(responseCode = "200", description = "Diagram layout returned")
@ApiResponse(responseCode = "404", description = "No diagram found")
public ResponseEntity<DiagramLayout> findByAppAndRoute(
@@ -99,15 +93,7 @@ public class DiagramRenderController {
    @PathVariable String appSlug,
    @PathVariable String routeId,
    @RequestParam(defaultValue = "LR") String direction) {
-   List<String> agentIds = registryService.findByApplicationAndEnvironment(appSlug, env.slug()).stream()
-           .map(AgentInfo::instanceId)
-           .toList();

-   if (agentIds.isEmpty()) {
-       return ResponseEntity.notFound().build();
-   }

-   Optional<String> contentHash = diagramStore.findContentHashForRouteByAgents(routeId, agentIds);
+   Optional<String> contentHash = diagramStore.findLatestContentHashForAppRoute(appSlug, routeId, env.slug());
    if (contentHash.isEmpty()) {
        return ResponseEntity.notFound().build();
    }

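The new lookup keys diagrams by (app, route, env) and then dereferences the content hash, one Optional chain end to end. A sketch with in-memory maps standing in for the real DiagramStore.

import java.util.Map;
import java.util.Optional;

final class DiagramLookupSketch {
    record RouteKey(String app, String route, String env) {}

    static Optional<String> endpointUri(Map<RouteKey, String> latestHashByRoute,
                                        Map<String, String> endpointUriByHash,
                                        RouteKey key) {
        return Optional.ofNullable(latestHashByRoute.get(key))   // newest content hash
                .map(endpointUriByHash::get);                    // then the stored graph
    }

    public static void main(String[] args) {
        var hashes = Map.of(new RouteKey("billing", "r1", "dev"), "abc123");
        var uris = Map.of("abc123", "timer://tick");
        System.out.println(endpointUri(hashes, uris, new RouteKey("billing", "r1", "dev")));
        // Optional[timer://tick]; a miss at either step yields Optional.empty()
    }
}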
@@ -1,6 +1,7 @@
package com.cameleer.server.app.controller;

import com.cameleer.server.core.runtime.Environment;
+import com.cameleer.server.core.runtime.EnvironmentColor;
import com.cameleer.server.core.runtime.EnvironmentService;
import com.cameleer.server.core.runtime.RuntimeType;
import io.swagger.v3.oas.annotations.Operation;
@@ -58,16 +59,22 @@ public class EnvironmentAdminController {
}

@PutMapping("/{envSlug}")
-@Operation(summary = "Update an environment's mutable fields (displayName, production, enabled)",
+@Operation(summary = "Update an environment's mutable fields (displayName, production, enabled, color)",
        description = "Slug is immutable after creation and cannot be changed. "
-               + "Any slug field in the request body is ignored.")
+               + "Any slug field in the request body is ignored. "
+               + "If color is null or absent, the existing color is preserved.")
@ApiResponse(responseCode = "200", description = "Environment updated")
+@ApiResponse(responseCode = "400", description = "Unknown color value")
@ApiResponse(responseCode = "404", description = "Environment not found")
public ResponseEntity<?> updateEnvironment(@PathVariable String envSlug,
        @RequestBody UpdateEnvironmentRequest request) {
    try {
        Environment current = environmentService.getBySlug(envSlug);
-       environmentService.update(current.id(), request.displayName(), request.production(), request.enabled());
+       String nextColor = request.color() == null ? current.color() : request.color();
+       if (!EnvironmentColor.isValid(nextColor)) {
+           return ResponseEntity.badRequest().body(Map.of("error", "unknown environment color: " + request.color()));
+       }
+       environmentService.update(current.id(), request.displayName(), request.production(), request.enabled(), nextColor);
        return ResponseEntity.ok(environmentService.getBySlug(envSlug));
    } catch (IllegalArgumentException e) {
        if (e.getMessage().contains("not found")) {
@@ -149,6 +156,6 @@ public class EnvironmentAdminController {
}

public record CreateEnvironmentRequest(String slug, String displayName, boolean production) {}
-public record UpdateEnvironmentRequest(String displayName, boolean production, boolean enabled) {}
+public record UpdateEnvironmentRequest(String displayName, boolean production, boolean enabled, String color) {}
public record JarRetentionRequest(Integer jarRetentionCount) {}
}

@@ -1,87 +0,0 @@
package com.cameleer.server.app.controller;

import com.cameleer.common.model.RouteExecution;
import com.cameleer.server.core.agent.AgentInfo;
import com.cameleer.server.core.agent.AgentRegistryService;
import com.cameleer.server.core.ingestion.ChunkAccumulator;
import com.cameleer.server.core.ingestion.IngestionService;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.responses.ApiResponse;
import io.swagger.v3.oas.annotations.tags.Tag;
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
import org.springframework.http.ResponseEntity;
import org.springframework.security.core.Authentication;
import org.springframework.security.core.context.SecurityContextHolder;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.util.List;

/**
 * Legacy ingestion endpoint for route execution data (PostgreSQL path).
 * <p>
 * Accepts both single {@link RouteExecution} and arrays. Data is written
 * synchronously to PostgreSQL via {@link IngestionService}.
 * <p>
 * Only active when ClickHouse is disabled — when ClickHouse is enabled,
 * {@link ChunkIngestionController} takes over the {@code /executions} mapping.
 */
@RestController
@RequestMapping("/api/v1/data")
@ConditionalOnMissingBean(ChunkAccumulator.class)
@Tag(name = "Ingestion", description = "Data ingestion endpoints")
public class ExecutionController {

    private final IngestionService ingestionService;
    private final AgentRegistryService registryService;
    private final ObjectMapper objectMapper;

    public ExecutionController(IngestionService ingestionService,
            AgentRegistryService registryService,
            ObjectMapper objectMapper) {
        this.ingestionService = ingestionService;
        this.registryService = registryService;
        this.objectMapper = objectMapper;
    }

    @PostMapping("/executions")
    @Operation(summary = "Ingest route execution data",
            description = "Accepts a single RouteExecution or an array of RouteExecutions")
    @ApiResponse(responseCode = "202", description = "Data accepted for processing")
    public ResponseEntity<Void> ingestExecutions(@RequestBody String body) throws JsonProcessingException {
        String instanceId = extractAgentId();
        String applicationId = resolveApplicationId(instanceId);
        List<RouteExecution> executions = parsePayload(body);

        for (RouteExecution execution : executions) {
            ingestionService.ingestExecution(instanceId, applicationId, execution);
        }

        return ResponseEntity.accepted().build();
    }

    private String extractAgentId() {
        Authentication auth = SecurityContextHolder.getContext().getAuthentication();
        return auth != null ? auth.getName() : "";
    }

    private String resolveApplicationId(String instanceId) {
        AgentInfo agent = registryService.findById(instanceId);
        return agent != null ? agent.applicationId() : "";
    }

    private List<RouteExecution> parsePayload(String body) throws JsonProcessingException {
        String trimmed = body.strip();
        if (trimmed.startsWith("[")) {
            return objectMapper.readValue(trimmed, new TypeReference<>() {});
        } else {
            RouteExecution single = objectMapper.readValue(trimmed, RouteExecution.class);
            return List.of(single);
        }
    }
}
@@ -44,6 +44,7 @@ public class LogQueryController {
    @RequestParam(required = false) String exchangeId,
    @RequestParam(required = false) String logger,
    @RequestParam(required = false) String source,
+   @RequestParam(required = false) String instanceIds,
    @RequestParam(required = false) String from,
    @RequestParam(required = false) String to,
    @RequestParam(required = false) String cursor,
@@ -69,12 +70,21 @@ public class LogQueryController {
            .toList();
}

+List<String> instanceIdList = List.of();
+if (instanceIds != null && !instanceIds.isEmpty()) {
+    instanceIdList = Arrays.stream(instanceIds.split(","))
+            .map(String::trim)
+            .filter(s -> !s.isEmpty())
+            .toList();
+}

Instant fromInstant = from != null ? Instant.parse(from) : null;
Instant toInstant = to != null ? Instant.parse(to) : null;

LogSearchRequest request = new LogSearchRequest(
        searchText, levels, application, instanceId, exchangeId,
-       logger, env.slug(), sources, fromInstant, toInstant, cursor, limit, sort);
+       logger, env.slug(), sources, fromInstant, toInstant, cursor, limit, sort,
+       instanceIdList);

LogSearchResponse result = logIndex.search(request);

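The instanceIds handling is a general CSV-query-param helper: split, trim, drop empties. Extracted as a sketch so the edge cases are visible.

import java.util.Arrays;
import java.util.List;

final class CsvParams {
    static List<String> parse(String raw) {
        if (raw == null || raw.isEmpty()) return List.of();
        return Arrays.stream(raw.split(","))
                .map(String::trim)
                .filter(s -> !s.isEmpty())
                .toList();
    }

    public static void main(String[] args) {
        System.out.println(parse("agent-1, agent-2,,"));  // [agent-1, agent-2]
        System.out.println(parse(null));                  // []
    }
}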
@@ -132,13 +132,12 @@ public class RouteCatalogController {
List<AgentInfo> agents = agentsByApp.getOrDefault(appId, List.of());

Set<String> routeIds = routesByApp.getOrDefault(appId, Set.of());
-List<String> agentIds = agents.stream().map(AgentInfo::instanceId).toList();
List<RouteSummary> routeSummaries = routeIds.stream()
        .map(routeId -> {
            String key = appId + "/" + routeId;
            long count = routeExchangeCounts.getOrDefault(key, 0L);
            Instant lastSeen = routeLastSeen.get(key);
-           String fromUri = resolveFromEndpointUri(routeId, agentIds);
+           String fromUri = resolveFromEndpointUri(appId, routeId, envSlug);
            String state = routeStateRegistry.getState(appId, routeId).name().toLowerCase();
            String routeState = "started".equals(state) ? null : state;
            return new RouteSummary(routeId, count, lastSeen, fromUri, routeState);
@@ -160,8 +159,8 @@ public class RouteCatalogController {
return ResponseEntity.ok(catalog);
}

-private String resolveFromEndpointUri(String routeId, List<String> agentIds) {
-    return diagramStore.findContentHashForRouteByAgents(routeId, agentIds)
+private String resolveFromEndpointUri(String applicationId, String routeId, String environment) {
+    return diagramStore.findLatestContentHashForAppRoute(applicationId, routeId, environment)
        .flatMap(diagramStore::findByContentHash)
        .map(RouteGraph::getRoot)
        .map(root -> root.getEndpointUri())

@@ -4,6 +4,7 @@ import com.cameleer.server.app.web.EnvPath;
import com.cameleer.server.core.admin.AppSettings;
import com.cameleer.server.core.admin.AppSettingsRepository;
import com.cameleer.server.core.runtime.Environment;
import com.cameleer.server.core.search.AttributeFilter;
import com.cameleer.server.core.search.ExecutionStats;
import com.cameleer.server.core.search.ExecutionSummary;
import com.cameleer.server.core.search.SearchRequest;
@@ -14,6 +15,7 @@ import com.cameleer.server.core.search.TopError;
import com.cameleer.server.core.storage.StatsStore;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
@@ -21,8 +23,10 @@ import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.server.ResponseStatusException;

import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

@@ -57,11 +61,19 @@ public class SearchController {
@RequestParam(name = "agentId", required = false) String instanceId,
@RequestParam(required = false) String processorType,
@RequestParam(required = false) String application,
@RequestParam(name = "attr", required = false) List<String> attr,
@RequestParam(defaultValue = "0") int offset,
@RequestParam(defaultValue = "50") int limit,
@RequestParam(required = false) String sortField,
@RequestParam(required = false) String sortDir) {

List<AttributeFilter> attributeFilters;
try {
attributeFilters = parseAttrParams(attr);
} catch (IllegalArgumentException e) {
throw new ResponseStatusException(HttpStatus.BAD_REQUEST, e.getMessage(), e);
}

SearchRequest request = new SearchRequest(
status, timeFrom, timeTo,
null, null,
@@ -71,12 +83,37 @@ public class SearchController {
application, null,
offset, limit,
sortField, sortDir,
env.slug()
null,
env.slug(),
attributeFilters
);

return ResponseEntity.ok(searchService.search(request));
}

/**
* Parses {@code attr} query params of the form {@code key} (key-only) or {@code key:value}
* (exact or wildcard via {@code *}). Splits on the first {@code :}; later colons are part of
* the value. Blank / null list → empty result. Key validation is delegated to
* {@link AttributeFilter}'s compact constructor, which throws {@link IllegalArgumentException}
* on invalid keys (mapped to 400 by the caller).
*/
static List<AttributeFilter> parseAttrParams(List<String> raw) {
if (raw == null || raw.isEmpty()) return List.of();
List<AttributeFilter> out = new ArrayList<>(raw.size());
for (String entry : raw) {
if (entry == null || entry.isBlank()) continue;
int colon = entry.indexOf(':');
if (colon < 0) {
out.add(new AttributeFilter(entry.trim(), null));
} else {
out.add(new AttributeFilter(entry.substring(0, colon).trim(),
entry.substring(colon + 1)));
}
}
return out;
}
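
A quick illustration of the parsing rules above; the input values are hypothetical, not taken from this change:

// Sketch of parseAttrParams behavior per the javadoc above:
List<AttributeFilter> filters = parseAttrParams(
        List.of("env:prod", "region:eu-*", "flagged", "note:a:b"));
// -> AttributeFilter("env", "prod")     exact match
//    AttributeFilter("region", "eu-*")  wildcard via '*'
//    AttributeFilter("flagged", null)   key-only form
//    AttributeFilter("note", "a:b")     split on the first ':' only
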

@PostMapping("/executions/search")
@Operation(summary = "Advanced search with all filters",
description = "Env from the path overrides any environment field in the body.")

@@ -0,0 +1,148 @@
package com.cameleer.server.app.controller;

import com.cameleer.server.core.storage.ServerMetricsQueryStore;
import com.cameleer.server.core.storage.model.ServerInstanceInfo;
import com.cameleer.server.core.storage.model.ServerMetricCatalogEntry;
import com.cameleer.server.core.storage.model.ServerMetricQueryRequest;
import com.cameleer.server.core.storage.model.ServerMetricQueryResponse;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.http.ResponseEntity;
import org.springframework.security.access.prepost.PreAuthorize;
import org.springframework.web.bind.annotation.ExceptionHandler;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

import java.time.Instant;
import java.util.List;
import java.util.Map;

/**
* Generic read API over the ClickHouse {@code server_metrics} table. Lets
* SaaS control planes build server-health dashboards without requiring direct
* ClickHouse access.
*
* <p>Three endpoints cover all 17 panels in {@code docs/server-self-metrics.md}:
* <ul>
* <li>{@code GET /catalog} — discover available metric names, types, statistics, and tags</li>
* <li>{@code POST /query} — generic time-series query with aggregation, grouping, filtering, and counter-delta mode</li>
* <li>{@code GET /instances} — list server instances (useful for partitioning counter math)</li>
* </ul>
*
* <p>Visibility matches {@code ClickHouseAdminController} / {@code DatabaseAdminController}:
* <ul>
* <li>Conditional on {@code cameleer.server.security.infrastructureendpoints=true} (default).</li>
* <li>Class-level {@code @PreAuthorize("hasRole('ADMIN')")} on top of the
* {@code /api/v1/admin/**} catch-all in {@code SecurityConfig}.</li>
* </ul>
*/
@ConditionalOnProperty(
name = "cameleer.server.security.infrastructureendpoints",
havingValue = "true",
matchIfMissing = true
)
@RestController
@RequestMapping("/api/v1/admin/server-metrics")
@PreAuthorize("hasRole('ADMIN')")
@Tag(name = "Server Self-Metrics",
description = "Read API over the server's own Micrometer registry snapshots (ADMIN only)")
public class ServerMetricsAdminController {

/** Default lookback window for catalog/instances when from/to are omitted. */
private static final long DEFAULT_LOOKBACK_SECONDS = 3_600L;

private final ServerMetricsQueryStore store;

public ServerMetricsAdminController(ServerMetricsQueryStore store) {
this.store = store;
}

@GetMapping("/catalog")
@Operation(summary = "List metric names observed in the window",
description = "For each metric_name, returns metric_type, the set of statistics emitted, and the union of tag keys.")
public ResponseEntity<List<ServerMetricCatalogEntry>> catalog(
@RequestParam(required = false) String from,
@RequestParam(required = false) String to) {
Instant[] window = resolveWindow(from, to);
return ResponseEntity.ok(store.catalog(window[0], window[1]));
}

@GetMapping("/instances")
@Operation(summary = "List server_instance_id values observed in the window",
description = "Returns first/last seen timestamps — use to partition counter-delta computations.")
public ResponseEntity<List<ServerInstanceInfo>> instances(
@RequestParam(required = false) String from,
@RequestParam(required = false) String to) {
Instant[] window = resolveWindow(from, to);
return ResponseEntity.ok(store.listInstances(window[0], window[1]));
}

@PostMapping("/query")
@Operation(summary = "Generic time-series query",
description = "Returns bucketed series for a single metric_name. Supports aggregation (avg/sum/max/min/latest), group-by-tag, filter-by-tag, counter delta mode, and a derived 'mean' statistic for timers.")
public ResponseEntity<ServerMetricQueryResponse> query(@RequestBody QueryBody body) {
ServerMetricQueryRequest request = new ServerMetricQueryRequest(
body.metric(),
body.statistic(),
parseInstant(body.from(), "from"),
parseInstant(body.to(), "to"),
body.stepSeconds(),
body.groupByTags(),
body.filterTags(),
body.aggregation(),
body.mode(),
body.serverInstanceIds());
return ResponseEntity.ok(store.query(request));
}

@ExceptionHandler(IllegalArgumentException.class)
public ResponseEntity<Map<String, String>> handleBadRequest(IllegalArgumentException e) {
return ResponseEntity.badRequest().body(Map.of("error", e.getMessage()));
}

private static Instant[] resolveWindow(String from, String to) {
Instant toI = to != null ? parseInstant(to, "to") : Instant.now();
Instant fromI = from != null
? parseInstant(from, "from")
: toI.minusSeconds(DEFAULT_LOOKBACK_SECONDS);
if (!fromI.isBefore(toI)) {
throw new IllegalArgumentException("from must be strictly before to");
}
return new Instant[]{fromI, toI};
}

private static Instant parseInstant(String raw, String field) {
if (raw == null || raw.isBlank()) {
throw new IllegalArgumentException(field + " is required");
}
try {
return Instant.parse(raw);
} catch (Exception e) {
throw new IllegalArgumentException(
field + " must be an ISO-8601 instant (e.g. 2026-04-23T10:00:00Z)");
}
}

/**
* Request body for {@link #query(QueryBody)}. Uses ISO-8601 strings on
* the wire so the OpenAPI schema stays language-neutral.
*/
public record QueryBody(
String metric,
String statistic,
String from,
String to,
Integer stepSeconds,
List<String> groupByTags,
Map<String, String> filterTags,
String aggregation,
String mode,
List<String> serverInstanceIds
) {
}
}
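
For orientation, a sketch of a request body for POST /query built from the record above; the metric name, statistic, and tag values are illustrative assumptions, not confirmed by this diff:

QueryBody body = new QueryBody(
        "jvm.memory.used",          // metric (hypothetical name)
        "value",                    // statistic
        "2026-04-23T09:00:00Z",     // from — ISO-8601, enforced by parseInstant
        "2026-04-23T10:00:00Z",     // to
        60,                         // stepSeconds: one bucket per minute
        List.of("area"),            // groupByTags
        Map.of("id", "heap"),       // filterTags
        "avg",                      // aggregation
        null,                       // mode (counter-delta mode applies to counters)
        null);                      // serverInstanceIds: null = all instances
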
@@ -93,7 +93,9 @@ public class UserAdminController {
return ResponseEntity.badRequest()
.body(Map.of("error", "Local user creation is disabled when OIDC is enabled. Users are provisioned automatically via SSO."));
}
String userId = "user:" + request.username();
// DB key is the bare username (matches alert_rules.created_by FK shape used by
// the env-scoped read-path controllers, which strip "user:" from JWT subjects).
String userId = request.username();
UserInfo user = new UserInfo(userId, "local",
request.email() != null ? request.email() : "",
request.displayName() != null ? request.displayName() : request.username(),
@@ -215,9 +217,7 @@ public class UserAdminController {
return ResponseEntity.badRequest().build();
}
}
// Extract bare username from "user:username" format for policy check
String username = userId.startsWith("user:") ? userId.substring(5) : userId;
List<String> violations = PasswordPolicyValidator.validate(request.password(), username);
List<String> violations = PasswordPolicyValidator.validate(request.password(), userId);
if (!violations.isEmpty()) {
throw new ResponseStatusException(HttpStatus.BAD_REQUEST,
"Password policy violation: " + String.join("; ", violations));

@@ -0,0 +1,12 @@
package com.cameleer.server.app.dto;

import com.cameleer.server.core.runtime.DirtyStateResult;

import java.util.List;

public record DirtyStateResponse(
boolean dirty,
String lastSuccessfulDeploymentId,
List<DirtyStateResult.Difference> differences
) {
}
@@ -1,16 +0,0 @@
package com.cameleer.server.app.dto;

import io.swagger.v3.oas.annotations.media.Schema;

import java.time.Instant;

@Schema(description = "Search indexer pipeline statistics")
public record IndexerPipelineResponse(
int queueDepth,
int maxQueueSize,
long failedCount,
long indexedCount,
long debounceMs,
double indexingRate,
Instant lastIndexedAt
) {}
@@ -30,7 +30,7 @@ public class MetricsFlushScheduler implements SmartLifecycle {
this.batchSize = config.getBatchSize();
}

@Scheduled(fixedDelayString = "${ingestion.flush-interval-ms:1000}")
@Scheduled(fixedDelayString = "${cameleer.server.ingestion.flush-interval-ms:1000}")
public void flush() {
try {
List<MetricsSnapshot> batch = metricsBuffer.drain(batchSize);

@@ -6,8 +6,10 @@ import com.cameleer.server.core.admin.AuditService;
import jakarta.servlet.http.HttpServletRequest;
import jakarta.servlet.http.HttpServletResponse;
import org.springframework.stereotype.Component;
import org.springframework.util.AntPathMatcher;
import org.springframework.web.servlet.HandlerInterceptor;

import java.util.List;
import java.util.Map;
import java.util.Set;

@@ -22,7 +24,9 @@ import java.util.Set;
public class AuditInterceptor implements HandlerInterceptor {

private static final Set<String> AUDITABLE_METHODS = Set.of("POST", "PUT", "DELETE");
private static final Set<String> EXCLUDED_PATHS = Set.of("/api/v1/search/executions");
private static final List<String> EXCLUDED_PATH_PATTERNS = List.of(
"/api/v1/environments/*/executions/search");
private static final AntPathMatcher PATH_MATCHER = new AntPathMatcher();

private final AuditService auditService;

@@ -41,8 +45,10 @@ public class AuditInterceptor implements HandlerInterceptor {
}

String path = request.getRequestURI();
if (EXCLUDED_PATHS.contains(path)) {
return;
for (String pattern : EXCLUDED_PATH_PATTERNS) {
if (PATH_MATCHER.match(pattern, path)) {
return;
}
}
AuditResult result = response.getStatus() < 400 ? AuditResult.SUCCESS : AuditResult.FAILURE;

@@ -0,0 +1,63 @@
package com.cameleer.server.app.metrics;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.UUID;

/**
* Resolves a stable identifier for this server process, used as the
* {@code server_instance_id} on every server_metrics sample. The value is
* fixed at boot, so counters restart cleanly whenever the id rotates.
*
* <p>Precedence:
* <ol>
* <li>{@code cameleer.server.instance-id} property / {@code CAMELEER_SERVER_INSTANCE_ID} env
* <li>{@code HOSTNAME} env (populated by Docker/Kubernetes)
* <li>{@link InetAddress#getLocalHost()} hostname
* <li>Random UUID (fallback — only hit when DNS and env are both silent)
* </ol>
*/
@Configuration
public class ServerInstanceIdConfig {

private static final Logger log = LoggerFactory.getLogger(ServerInstanceIdConfig.class);

@Bean("serverInstanceId")
public String serverInstanceId(
@Value("${cameleer.server.instance-id:}") String configuredId) {
if (!isBlank(configuredId)) {
log.info("Server instance id resolved from configuration: {}", configuredId);
return configuredId;
}

String hostnameEnv = System.getenv("HOSTNAME");
if (!isBlank(hostnameEnv)) {
log.info("Server instance id resolved from HOSTNAME env: {}", hostnameEnv);
return hostnameEnv;
}

try {
String localHost = InetAddress.getLocalHost().getHostName();
if (!isBlank(localHost)) {
log.info("Server instance id resolved from localhost lookup: {}", localHost);
return localHost;
}
} catch (UnknownHostException e) {
log.debug("InetAddress.getLocalHost() failed, falling back to UUID: {}", e.getMessage());
}

String fallback = UUID.randomUUID().toString();
log.warn("Server instance id could not be resolved; using random UUID {}", fallback);
return fallback;
}

private static boolean isBlank(String s) {
return s == null || s.isBlank();
}
}
@@ -0,0 +1,106 @@
package com.cameleer.server.app.metrics;

import com.cameleer.server.core.storage.ServerMetricsStore;
import com.cameleer.server.core.storage.model.ServerMetricSample;
import io.micrometer.core.instrument.Measurement;
import io.micrometer.core.instrument.Meter;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Tag;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.time.Instant;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
* Periodically snapshots every meter in the server's {@link MeterRegistry}
* and writes the result to ClickHouse via {@link ServerMetricsStore}. This
* gives us historical server-health data (buffer depths, agent transitions,
* flush latency, JVM memory, HTTP response counts, etc.) without requiring
* an external Prometheus.
*
* <p>Each Micrometer {@link Meter#measure() measurement} becomes one row, so
* a single Timer produces rows for {@code count}, {@code total_time}, and
* {@code max} each tick. Counter values are cumulative since meter
* registration (Prometheus convention) — callers compute rate() themselves.
*
* <p>Disabled via {@code cameleer.server.self-metrics.enabled=false}.
*/
@Component
@ConditionalOnProperty(
prefix = "cameleer.server.self-metrics",
name = "enabled",
havingValue = "true",
matchIfMissing = true)
public class ServerMetricsSnapshotScheduler {

private static final Logger log = LoggerFactory.getLogger(ServerMetricsSnapshotScheduler.class);

private final MeterRegistry registry;
private final ServerMetricsStore store;
private final String tenantId;
private final String serverInstanceId;

public ServerMetricsSnapshotScheduler(
MeterRegistry registry,
ServerMetricsStore store,
@Value("${cameleer.server.tenant.id:default}") String tenantId,
@Qualifier("serverInstanceId") String serverInstanceId) {
this.registry = registry;
this.store = store;
this.tenantId = tenantId;
this.serverInstanceId = serverInstanceId;
}

@Scheduled(fixedDelayString = "${cameleer.server.self-metrics.interval-ms:60000}",
initialDelayString = "${cameleer.server.self-metrics.interval-ms:60000}")
public void snapshot() {
try {
Instant now = Instant.now();
List<ServerMetricSample> batch = new ArrayList<>();

for (Meter meter : registry.getMeters()) {
Meter.Id id = meter.getId();
Map<String, String> tags = flattenTags(id.getTagsAsIterable());
String type = id.getType().name().toLowerCase();

for (Measurement m : meter.measure()) {
double v = m.getValue();
if (!Double.isFinite(v)) continue;
batch.add(new ServerMetricSample(
tenantId,
now,
serverInstanceId,
id.getName(),
type,
m.getStatistic().getTagValueRepresentation(),
v,
tags));
}
}

if (!batch.isEmpty()) {
store.insertBatch(batch);
log.debug("Persisted {} server self-metric samples", batch.size());
}
} catch (Exception e) {
log.warn("Server self-metrics snapshot failed: {}", e.getMessage());
}
}

private static Map<String, String> flattenTags(Iterable<Tag> tags) {
Map<String, String> out = new LinkedHashMap<>();
for (Tag t : tags) {
out.put(t.getKey(), t.getValue());
}
return out;
}
}
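
Since counters are persisted cumulatively, a consumer has to difference consecutive samples itself. A minimal sketch, assuming ServerMetricSample exposes value() and timestamp() record accessors (not shown in this diff):

// Hypothetical helper: per-second rate from two cumulative counter samples
// of the same metric, same tags, same server_instance_id.
static double ratePerSecond(ServerMetricSample earlier, ServerMetricSample later) {
    double delta = later.value() - earlier.value();
    long seconds = java.time.Duration.between(earlier.timestamp(), later.timestamp()).getSeconds();
    // A negative delta means the counter reset (restart / new instance id);
    // partition by server_instance_id, as the /instances endpoint suggests.
    if (delta < 0 || seconds <= 0) return 0.0;
    return delta / seconds;
}
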
@@ -7,6 +7,8 @@ import com.cameleer.server.core.outbound.OutboundConnectionService;
import org.springframework.http.HttpStatus;
import org.springframework.web.server.ResponseStatusException;

import java.net.URI;
import java.net.URISyntaxException;
import java.time.Instant;
import java.util.List;
import java.util.UUID;
@@ -15,20 +17,24 @@ public class OutboundConnectionServiceImpl implements OutboundConnectionService

private final OutboundConnectionRepository repo;
private final AlertRuleRepository ruleRepo;
private final SsrfGuard ssrfGuard;
private final String tenantId;

public OutboundConnectionServiceImpl(
OutboundConnectionRepository repo,
AlertRuleRepository ruleRepo,
SsrfGuard ssrfGuard,
String tenantId) {
this.repo = repo;
this.ruleRepo = ruleRepo;
this.ssrfGuard = ssrfGuard;
this.tenantId = tenantId;
}

@Override
public OutboundConnection create(OutboundConnection draft, String actingUserId) {
assertNameUnique(draft.name(), null);
validateUrl(draft.url());
OutboundConnection c = new OutboundConnection(
UUID.randomUUID(), tenantId, draft.name(), draft.description(),
draft.url(), draft.method(), draft.defaultHeaders(), draft.defaultBodyTmpl(),
@@ -46,6 +52,7 @@ public class OutboundConnectionServiceImpl implements OutboundConnectionService
if (!existing.name().equals(draft.name())) {
assertNameUnique(draft.name(), id);
}
validateUrl(draft.url());

// Narrowing allowed-envs guard: if the new draft restricts to a non-empty set of envs,
// find any envs that existed before but are absent in the draft.
@@ -107,4 +114,23 @@ public class OutboundConnectionServiceImpl implements OutboundConnectionService
}
});
}

/**
* Validate the webhook URL against SSRF pitfalls. Translates the guard's
* {@link IllegalArgumentException} into a 400 Bad Request with the guard's
* message preserved, so the client sees e.g. "private or loopback".
*/
private void validateUrl(String url) {
URI uri;
try {
uri = new URI(url);
} catch (URISyntaxException e) {
throw new ResponseStatusException(HttpStatus.BAD_REQUEST, "Invalid URL: " + url);
}
try {
ssrfGuard.validate(uri);
} catch (IllegalArgumentException e) {
throw new ResponseStatusException(HttpStatus.BAD_REQUEST, e.getMessage(), e);
}
}
}

@@ -0,0 +1,69 @@
package com.cameleer.server.app.outbound;

import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

import java.net.Inet4Address;
import java.net.Inet6Address;
import java.net.InetAddress;
import java.net.URI;
import java.net.UnknownHostException;

/**
* Validates outbound webhook URLs against SSRF pitfalls: rejects hosts that resolve to
* loopback, link-local, or RFC-1918 private ranges (and IPv6 equivalents).
*
* Per spec §17. The `cameleer.server.outbound-http.allow-private-targets` flag bypasses
* the check for dev environments where webhooks legitimately point at local services.
*/
@Component
public class SsrfGuard {

private final boolean allowPrivate;

public SsrfGuard(
@Value("${cameleer.server.outbound-http.allow-private-targets:false}") boolean allowPrivate
) {
this.allowPrivate = allowPrivate;
}

public void validate(URI uri) {
if (allowPrivate) return;
String host = uri.getHost();
if (host == null || host.isBlank()) {
throw new IllegalArgumentException("URL must include a host: " + uri);
}
if ("localhost".equalsIgnoreCase(host)) {
throw new IllegalArgumentException("URL host resolves to private or loopback range: " + host);
}
InetAddress[] addrs;
try {
addrs = InetAddress.getAllByName(host);
} catch (UnknownHostException e) {
throw new IllegalArgumentException("URL host does not resolve: " + host, e);
}
for (InetAddress addr : addrs) {
if (isPrivate(addr)) {
throw new IllegalArgumentException("URL host resolves to private or loopback range: " + host + " -> " + addr.getHostAddress());
}
}
}

private static boolean isPrivate(InetAddress addr) {
if (addr.isLoopbackAddress()) return true;
if (addr.isLinkLocalAddress()) return true;
if (addr.isSiteLocalAddress()) return true; // 10/8, 172.16/12, 192.168/16
if (addr.isAnyLocalAddress()) return true; // 0.0.0.0, ::
if (addr instanceof Inet6Address ip6) {
byte[] raw = ip6.getAddress();
// fc00::/7 unique-local
if ((raw[0] & 0xfe) == 0xfc) return true;
}
if (addr instanceof Inet4Address ip4) {
byte[] raw = ip4.getAddress();
// 169.254.0.0/16 link-local (also matches isLinkLocalAddress but doubled-up for safety)
if ((raw[0] & 0xff) == 169 && (raw[1] & 0xff) == 254) return true;
}
return false;
}
}
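
Behavior sketch for the guard above (hosts and addresses illustrative):

SsrfGuard guard = new SsrfGuard(false);                      // allow-private-targets=false
guard.validate(URI.create("https://hooks.example.com/x"));   // ok, if DNS resolves publicly
guard.validate(URI.create("http://localhost:9999/hook"));    // throws: loopback short-circuit
guard.validate(URI.create("http://192.168.1.10/hook"));      // throws: site-local (RFC 1918)
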
@@ -1,6 +1,7 @@
package com.cameleer.server.app.outbound.config;

import com.cameleer.server.app.outbound.OutboundConnectionServiceImpl;
import com.cameleer.server.app.outbound.SsrfGuard;
import com.cameleer.server.app.outbound.crypto.SecretCipher;
import com.cameleer.server.app.outbound.storage.PostgresOutboundConnectionRepository;
import com.cameleer.server.core.alerting.AlertRuleRepository;
@@ -31,7 +32,8 @@ public class OutboundBeanConfig {
public OutboundConnectionService outboundConnectionService(
OutboundConnectionRepository repo,
AlertRuleRepository ruleRepo,
SsrfGuard ssrfGuard,
@Value("${cameleer.server.tenant.id:default}") String tenantId) {
return new OutboundConnectionServiceImpl(repo, ruleRepo, tenantId);
return new OutboundConnectionServiceImpl(repo, ruleRepo, ssrfGuard, tenantId);
}
}

@@ -1,6 +1,8 @@
package com.cameleer.server.app.runtime;

import com.cameleer.common.model.ApplicationConfig;
import com.cameleer.server.app.metrics.ServerMetrics;
import com.cameleer.server.app.storage.PostgresApplicationConfigRepository;
import com.cameleer.server.app.storage.PostgresDeploymentRepository;
import com.cameleer.server.core.runtime.*;
import org.slf4j.Logger;
@@ -25,6 +27,7 @@ public class DeploymentExecutor {
private final EnvironmentService envService;
private final DeploymentRepository deploymentRepository;
private final PostgresDeploymentRepository pgDeployRepo;
private final PostgresApplicationConfigRepository applicationConfigRepository;

@Autowired(required = false)
private DockerNetworkManager networkManager;
@@ -59,6 +62,9 @@ public class DeploymentExecutor {
@Value("${cameleer.server.runtime.serverurl:}")
private String globalServerUrl;

@Value("${cameleer.server.runtime.certresolver:}")
private String globalCertResolver;

@Value("${cameleer.server.runtime.jardockervolume:}")
private String jarDockerVolume;

@@ -75,15 +81,45 @@ public class DeploymentExecutor {
DeploymentService deploymentService,
AppService appService,
EnvironmentService envService,
DeploymentRepository deploymentRepository) {
DeploymentRepository deploymentRepository,
PostgresApplicationConfigRepository applicationConfigRepository) {
this.orchestrator = orchestrator;
this.deploymentService = deploymentService;
this.appService = appService;
this.envService = envService;
this.deploymentRepository = deploymentRepository;
this.pgDeployRepo = (PostgresDeploymentRepository) deploymentRepository;
this.applicationConfigRepository = applicationConfigRepository;
}

/** Deployment-scoped id suffix — distinguishes container names and
* CAMELEER_AGENT_INSTANCEID across redeploys so old + new replicas can
* coexist during a blue/green swap. First 8 chars of the deployment UUID. */
static String generationOf(Deployment deployment) {
return deployment.id().toString().substring(0, 8);
}
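
Worked example of the suffix (UUID value hypothetical):

// deployment.id() = 3f2a9c1e-8d41-4b7a-9c0e-1f2a3b4c5d6e
String generation = generationOf(deployment);          // -> "3f2a9c1e"
// startReplica() below then yields, for replica 0 of app "orders" in env "prod":
//   instanceId    = "prod-orders-0-3f2a9c1e"
//   containerName = tenantId + "-prod-orders-0-3f2a9c1e"
// so old- and new-generation containers never collide on name.
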

/**
* Per-deployment context assembled once at the top of executeAsync and passed
* into strategy handlers. Keeps the strategy methods readable instead of
* threading 12 positional args.
*/
private record DeployCtx(
Deployment deployment,
App app,
Environment env,
ResolvedContainerConfig config,
String jarPath,
String resolvedRuntimeType,
String mainClass,
String generation,
String primaryNetwork,
List<String> additionalNets,
Map<String, String> baseEnvVars,
Map<String, String> prometheusLabels,
long deployStart
) {}

@Async("deploymentTaskExecutor")
public void executeAsync(Deployment deployment) {
long deployStart = System.currentTimeMillis();
@@ -91,13 +127,15 @@ public class DeploymentExecutor {
App app = appService.getById(deployment.appId());
Environment env = envService.getById(deployment.environmentId());
String jarPath = appService.resolveJarPath(deployment.appVersionId());
String generation = generationOf(deployment);

var globalDefaults = new ConfigMerger.GlobalRuntimeDefaults(
parseMemoryLimitMb(globalMemoryLimit),
globalCpuShares,
globalRoutingMode,
globalRoutingDomain,
globalServerUrl.isBlank() ? "http://cameleer-server:8081" : globalServerUrl
globalServerUrl.isBlank() ? "http://cameleer-server:8081" : globalServerUrl,
globalCertResolver.isBlank() ? null : globalCertResolver
);
ResolvedContainerConfig config = ConfigMerger.resolve(
globalDefaults, env.defaultContainerConfig(), app.containerConfig());
@@ -139,7 +177,6 @@ public class DeploymentExecutor {
updateStage(deployment.id(), DeployStage.CREATE_NETWORK);
// Primary network: use configured CAMELEER_DOCKER_NETWORK (tenant-isolated in SaaS mode)
String primaryNetwork = dockerNetwork;
String envNet = null;
List<String> additionalNets = new ArrayList<>();
if (networkManager != null) {
networkManager.ensureNetwork(primaryNetwork);
@@ -147,7 +184,7 @@ public class DeploymentExecutor {
networkManager.ensureNetwork(DockerNetworkManager.TRAEFIK_NETWORK);
additionalNets.add(DockerNetworkManager.TRAEFIK_NETWORK);
// Per-environment network scoped to tenant to prevent cross-tenant collisions
envNet = DockerNetworkManager.envNetworkName(tenantId, env.slug());
String envNet = DockerNetworkManager.envNetworkName(tenantId, env.slug());
networkManager.ensureNetwork(envNet);
additionalNets.add(envNet);
}
@@ -162,111 +199,21 @@ public class DeploymentExecutor {
}
}

// === START REPLICAS ===
updateStage(deployment.id(), DeployStage.START_REPLICAS);
DeployCtx ctx = new DeployCtx(
deployment, app, env, config, jarPath,
resolvedRuntimeType, mainClass, generation,
primaryNetwork, additionalNets,
buildEnvVars(app, env, config),
PrometheusLabelBuilder.build(resolvedRuntimeType),
deployStart);

Map<String, String> baseEnvVars = buildEnvVars(app, env, config);
Map<String, String> prometheusLabels = PrometheusLabelBuilder.build(resolvedRuntimeType);

List<Map<String, Object>> replicaStates = new ArrayList<>();
List<String> newContainerIds = new ArrayList<>();

for (int i = 0; i < config.replicas(); i++) {
String instanceId = env.slug() + "-" + app.slug() + "-" + i;
String containerName = tenantId + "-" + instanceId;

// Per-replica labels (include replica index and instance-id)
Map<String, String> labels = TraefikLabelBuilder.build(app.slug(), env.slug(), tenantId, config, i);
labels.putAll(prometheusLabels);

// Per-replica env vars (set agent instance ID to match container log identity)
Map<String, String> replicaEnvVars = new LinkedHashMap<>(baseEnvVars);
replicaEnvVars.put("CAMELEER_AGENT_INSTANCEID", instanceId);

String volumeName = jarDockerVolume != null && !jarDockerVolume.isBlank() ? jarDockerVolume : null;
ContainerRequest request = new ContainerRequest(
containerName, baseImage, jarPath,
volumeName, jarStoragePath,
primaryNetwork,
additionalNets,
replicaEnvVars, labels,
config.memoryLimitBytes(), config.memoryReserveBytes(),
config.dockerCpuShares(), config.dockerCpuQuota(),
config.exposedPorts(), agentHealthPort,
"on-failure", 3,
resolvedRuntimeType, config.customArgs(), mainClass
);

String containerId = orchestrator.startContainer(request);
newContainerIds.add(containerId);

// Connect to additional networks after container is started
for (String net : additionalNets) {
if (networkManager != null) {
networkManager.connectContainer(containerId, net);
}
}

orchestrator.startLogCapture(containerId, instanceId, app.slug(), env.slug(), tenantId);

replicaStates.add(Map.of(
"index", i,
"containerId", containerId,
"containerName", containerName,
"status", "STARTING"
));
// Dispatch on strategy. Unknown values fall back to BLUE_GREEN via fromWire.
DeploymentStrategy strategy = DeploymentStrategy.fromWire(config.deploymentStrategy());
switch (strategy) {
case BLUE_GREEN -> deployBlueGreen(ctx);
case ROLLING -> deployRolling(ctx);
}

pgDeployRepo.updateReplicaStates(deployment.id(), replicaStates);

// === HEALTH CHECK ===
updateStage(deployment.id(), DeployStage.HEALTH_CHECK);
int healthyCount = waitForAnyHealthy(newContainerIds, healthCheckTimeout);

if (healthyCount == 0) {
for (String cid : newContainerIds) {
try { orchestrator.stopContainer(cid); orchestrator.removeContainer(cid); }
catch (Exception e) { log.warn("Cleanup failed for {}: {}", cid, e.getMessage()); }
}
pgDeployRepo.updateDeployStage(deployment.id(), null);
deploymentService.markFailed(deployment.id(), "No replicas passed health check within " + healthCheckTimeout + "s");
serverMetrics.recordDeploymentOutcome("FAILED");
serverMetrics.recordDeploymentDuration(deployStart);
return;
}

replicaStates = updateReplicaHealth(replicaStates, newContainerIds);
pgDeployRepo.updateReplicaStates(deployment.id(), replicaStates);

// === SWAP TRAFFIC ===
updateStage(deployment.id(), DeployStage.SWAP_TRAFFIC);

Optional<Deployment> existing = deploymentRepository.findActiveByAppIdAndEnvironmentId(
deployment.appId(), deployment.environmentId());
if (existing.isPresent() && !existing.get().id().equals(deployment.id())) {
stopDeploymentContainers(existing.get());
deploymentService.markStopped(existing.get().id());
log.info("Stopped previous deployment {} for replacement", existing.get().id());
}

// === COMPLETE ===
updateStage(deployment.id(), DeployStage.COMPLETE);

String primaryContainerId = newContainerIds.get(0);
DeploymentStatus finalStatus = healthyCount == config.replicas()
? DeploymentStatus.RUNNING : DeploymentStatus.DEGRADED;
deploymentService.markRunning(deployment.id(), primaryContainerId);
if (finalStatus == DeploymentStatus.DEGRADED) {
deploymentRepository.updateStatus(deployment.id(), DeploymentStatus.DEGRADED,
primaryContainerId, null);
}

pgDeployRepo.updateDeployStage(deployment.id(), null);
serverMetrics.recordDeploymentOutcome(finalStatus.name());
serverMetrics.recordDeploymentDuration(deployStart);
log.info("Deployment {} is {} ({}/{} replicas healthy)",
deployment.id(), finalStatus, healthyCount, config.replicas());

} catch (Exception e) {
log.error("Deployment {} FAILED: {}", deployment.id(), e.getMessage(), e);
pgDeployRepo.updateDeployStage(deployment.id(), null);
@@ -276,6 +223,262 @@ public class DeploymentExecutor {
}
}

/**
* Blue/green strategy: start all N new replicas (coexisting with the old
* ones thanks to the gen-suffixed container names), wait for ALL healthy,
* then stop the previous deployment. Strict all-healthy — partial failure
* preserves the previous deployment untouched.
*/
private void deployBlueGreen(DeployCtx ctx) {
ResolvedContainerConfig config = ctx.config();
Deployment deployment = ctx.deployment();

// === START REPLICAS ===
updateStage(deployment.id(), DeployStage.START_REPLICAS);
List<Map<String, Object>> replicaStates = new ArrayList<>();
List<String> newContainerIds = new ArrayList<>();
for (int i = 0; i < config.replicas(); i++) {
Map<String, Object> state = new LinkedHashMap<>();
String containerId = startReplica(ctx, i, state);
newContainerIds.add(containerId);
replicaStates.add(state);
}
pgDeployRepo.updateReplicaStates(deployment.id(), replicaStates);

// === HEALTH CHECK ===
updateStage(deployment.id(), DeployStage.HEALTH_CHECK);
int healthyCount = waitForAllHealthy(newContainerIds, healthCheckTimeout);

if (healthyCount < config.replicas()) {
// Strict abort: tear down new replicas, leave the previous deployment untouched.
for (String cid : newContainerIds) {
try { orchestrator.stopContainer(cid); orchestrator.removeContainer(cid); }
catch (Exception e) { log.warn("Cleanup failed for {}: {}", cid, e.getMessage()); }
}
pgDeployRepo.updateDeployStage(deployment.id(), null);
String reason = String.format(
"blue-green: %d/%d replicas healthy within %ds; preserving previous deployment",
healthyCount, config.replicas(), healthCheckTimeout);
deploymentService.markFailed(deployment.id(), reason);
serverMetrics.recordDeploymentOutcome("FAILED");
serverMetrics.recordDeploymentDuration(ctx.deployStart());
return;
}

replicaStates = updateReplicaHealth(replicaStates, newContainerIds);
pgDeployRepo.updateReplicaStates(deployment.id(), replicaStates);

// === SWAP TRAFFIC ===
// All new replicas are healthy; Traefik labels are already attracting
// traffic to them. Stop the previous deployment now — the swap is
// implicit in the label-driven load balancer.
updateStage(deployment.id(), DeployStage.SWAP_TRAFFIC);
Optional<Deployment> previous = deploymentRepository.findActiveByAppIdAndEnvironmentIdExcluding(
deployment.appId(), deployment.environmentId(), deployment.id());
if (previous.isPresent()) {
log.info("blue-green: stopping previous deployment {} now that new replicas are healthy",
previous.get().id());
stopDeploymentContainers(previous.get());
deploymentService.markStopped(previous.get().id());
}

// === COMPLETE ===
updateStage(deployment.id(), DeployStage.COMPLETE);
persistSnapshotAndMarkRunning(ctx, newContainerIds.get(0));
log.info("Deployment {} is RUNNING (blue-green, {}/{} replicas healthy)",
deployment.id(), healthyCount, config.replicas());
}

/**
* Rolling strategy: replace replicas one at a time — start new[i], wait
* healthy, stop old[i]. On any replica's health failure, stop the
* in-flight new container, leave remaining old replicas serving, mark
* FAILED. Already-replaced old containers are not restored (can't unring
* that bell) — user redeploys to recover.
*
* Resource peak: replicas + 1 (briefly while a new replica warms up
* before its counterpart is stopped).
*/
private void deployRolling(DeployCtx ctx) {
ResolvedContainerConfig config = ctx.config();
Deployment deployment = ctx.deployment();

// Capture previous deployment's per-index container ids up front.
Optional<Deployment> previousOpt = deploymentRepository.findActiveByAppIdAndEnvironmentIdExcluding(
deployment.appId(), deployment.environmentId(), deployment.id());
Map<Integer, String> oldContainerByIndex = new LinkedHashMap<>();
if (previousOpt.isPresent() && previousOpt.get().replicaStates() != null) {
for (Map<String, Object> r : previousOpt.get().replicaStates()) {
Object idx = r.get("index");
Object cid = r.get("containerId");
if (idx instanceof Number n && cid instanceof String s) {
oldContainerByIndex.put(n.intValue(), s);
}
}
}

// === START REPLICAS ===
updateStage(deployment.id(), DeployStage.START_REPLICAS);
List<Map<String, Object>> replicaStates = new ArrayList<>();
List<String> newContainerIds = new ArrayList<>();

for (int i = 0; i < config.replicas(); i++) {
// Start new replica i (gen-suffixed name; coexists with old[i]).
Map<String, Object> state = new LinkedHashMap<>();
String newCid = startReplica(ctx, i, state);
newContainerIds.add(newCid);
replicaStates.add(state);
pgDeployRepo.updateReplicaStates(deployment.id(), replicaStates);

// === HEALTH CHECK (per-replica) ===
updateStage(deployment.id(), DeployStage.HEALTH_CHECK);
boolean healthy = waitForOneHealthy(newCid, healthCheckTimeout);
if (!healthy) {
// Abort: stop this in-flight new replica AND any new replicas
// started so far. Already-stopped old replicas stay stopped
// (rolling is not reversible). Remaining un-replaced old
// replicas keep serving traffic.
for (String cid : newContainerIds) {
try { orchestrator.stopContainer(cid); orchestrator.removeContainer(cid); }
catch (Exception e) { log.warn("Cleanup failed for {}: {}", cid, e.getMessage()); }
}
pgDeployRepo.updateDeployStage(deployment.id(), null);
String reason = String.format(
"rolling: replica %d failed to reach healthy within %ds; %d previous replicas still running",
i, healthCheckTimeout, oldContainerByIndex.size());
deploymentService.markFailed(deployment.id(), reason);
serverMetrics.recordDeploymentOutcome("FAILED");
serverMetrics.recordDeploymentDuration(ctx.deployStart());
return;
}

// Health check passed: update replica status to RUNNING, stop the
// corresponding old[i] if present, and continue with replica i+1.
replicaStates = updateReplicaHealth(replicaStates, newContainerIds);
pgDeployRepo.updateReplicaStates(deployment.id(), replicaStates);

String oldCid = oldContainerByIndex.remove(i);
if (oldCid != null) {
try {
orchestrator.stopContainer(oldCid);
orchestrator.removeContainer(oldCid);
log.info("rolling: replaced replica {} (old={}, new={})", i, oldCid, newCid);
} catch (Exception e) {
log.warn("rolling: failed to stop old replica {} ({}): {}", i, oldCid, e.getMessage());
}
}
}

// === SWAP TRAFFIC ===
// Any old replicas with indices >= new.replicas (e.g., when replica
// count shrank) are still running; sweep them now so the old
// deployment can be marked STOPPED.
updateStage(deployment.id(), DeployStage.SWAP_TRAFFIC);
for (Map.Entry<Integer, String> e : oldContainerByIndex.entrySet()) {
try {
orchestrator.stopContainer(e.getValue());
orchestrator.removeContainer(e.getValue());
log.info("rolling: stopped leftover old replica {} ({})", e.getKey(), e.getValue());
} catch (Exception ex) {
log.warn("rolling: failed to stop leftover old replica {}: {}", e.getKey(), ex.getMessage());
}
}
if (previousOpt.isPresent()) {
deploymentService.markStopped(previousOpt.get().id());
}

// === COMPLETE ===
updateStage(deployment.id(), DeployStage.COMPLETE);
persistSnapshotAndMarkRunning(ctx, newContainerIds.get(0));
log.info("Deployment {} is RUNNING (rolling, {}/{} replicas replaced)",
deployment.id(), config.replicas(), config.replicas());
}

/** Poll a single container until healthy or the timeout expires. Returns
* true on healthy, false on timeout or thread interrupt. */
private boolean waitForOneHealthy(String containerId, int timeoutSeconds) {
long deadline = System.currentTimeMillis() + (timeoutSeconds * 1000L);
while (System.currentTimeMillis() < deadline) {
ContainerStatus status = orchestrator.getContainerStatus(containerId);
if ("healthy".equals(status.state())) return true;
try { Thread.sleep(2000); } catch (InterruptedException e) {
Thread.currentThread().interrupt();
return false;
}
}
return false;
}

/** Start one replica container with the gen-suffixed name and return its
* container id. Fills `stateOut` with the replicaStates JSONB row. */
private String startReplica(DeployCtx ctx, int i, Map<String, Object> stateOut) {
Environment env = ctx.env();
App app = ctx.app();
ResolvedContainerConfig config = ctx.config();

String instanceId = env.slug() + "-" + app.slug() + "-" + i + "-" + ctx.generation();
String containerName = tenantId + "-" + instanceId;

Map<String, String> labels = TraefikLabelBuilder.build(
app.slug(), env.slug(), tenantId, config, i, ctx.generation());
labels.putAll(ctx.prometheusLabels());

Map<String, String> replicaEnvVars = new LinkedHashMap<>(ctx.baseEnvVars());
replicaEnvVars.put("CAMELEER_AGENT_INSTANCEID", instanceId);

String volumeName = jarDockerVolume != null && !jarDockerVolume.isBlank() ? jarDockerVolume : null;
ContainerRequest request = new ContainerRequest(
containerName, baseImage, ctx.jarPath(),
volumeName, jarStoragePath,
ctx.primaryNetwork(),
ctx.additionalNets(),
replicaEnvVars, labels,
config.memoryLimitBytes(), config.memoryReserveBytes(),
config.dockerCpuShares(), config.dockerCpuQuota(),
config.exposedPorts(), agentHealthPort,
"on-failure", 3,
ctx.resolvedRuntimeType(), config.customArgs(), ctx.mainClass()
);

String containerId = orchestrator.startContainer(request);

// Connect to additional networks after container is started
for (String net : ctx.additionalNets()) {
if (networkManager != null) {
networkManager.connectContainer(containerId, net);
}
}

orchestrator.startLogCapture(containerId, instanceId, app.slug(), env.slug(), tenantId);

stateOut.put("index", i);
stateOut.put("containerId", containerId);
stateOut.put("containerName", containerName);
stateOut.put("status", "STARTING");
return containerId;
}

/** Persist the deployment snapshot and mark the deployment RUNNING.
* Finalizes the deploy in a single place shared by all strategy paths. */
private void persistSnapshotAndMarkRunning(DeployCtx ctx, String primaryContainerId) {
Deployment deployment = ctx.deployment();
ApplicationConfig agentConfig = applicationConfigRepository
.findByApplicationAndEnvironment(ctx.app().slug(), ctx.env().slug())
.orElse(null);
List<String> snapshotSensitiveKeys = agentConfig != null ? agentConfig.getSensitiveKeys() : null;
DeploymentConfigSnapshot snapshot = new DeploymentConfigSnapshot(
deployment.appVersionId(),
agentConfig,
ctx.app().containerConfig(),
snapshotSensitiveKeys);
pgDeployRepo.saveDeployedConfigSnapshot(deployment.id(), snapshot);

deploymentService.markRunning(deployment.id(), primaryContainerId);
pgDeployRepo.updateDeployStage(deployment.id(), null);
serverMetrics.recordDeploymentOutcome("RUNNING");
serverMetrics.recordDeploymentDuration(ctx.deployStart());
}

public void stopDeployment(Deployment deployment) {
pgDeployRepo.updateTargetState(deployment.id(), "STOPPED");
deploymentRepository.updateStatus(deployment.id(), DeploymentStatus.STOPPING,
@@ -341,7 +544,10 @@ public class DeploymentExecutor {
return envVars;
}

private int waitForAnyHealthy(List<String> containerIds, int timeoutSeconds) {
/** Poll until all containers are healthy or the timeout expires. Returns
* the healthy count at return time — == ids.size() on full success, less
* if the timeout won. */
private int waitForAllHealthy(List<String> containerIds, int timeoutSeconds) {
long deadline = System.currentTimeMillis() + (timeoutSeconds * 1000L);
int lastHealthy = 0;
while (System.currentTimeMillis() < deadline) {
@@ -403,6 +609,10 @@ public class DeploymentExecutor {
map.put("runtimeType", config.runtimeType());
map.put("customArgs", config.customArgs());
map.put("extraNetworks", config.extraNetworks());
map.put("externalRouting", config.externalRouting());
if (config.certResolver() != null) {
map.put("certResolver", config.certResolver());
}
return map;
}
}

@@ -10,19 +10,28 @@ public final class TraefikLabelBuilder {
private TraefikLabelBuilder() {}

public static Map<String, String> build(String appSlug, String envSlug, String tenantId,
ResolvedContainerConfig config, int replicaIndex) {
ResolvedContainerConfig config, int replicaIndex,
String generation) {
// Traefik router/service keys stay generation-agnostic so load balancing
// spans old + new replicas during a blue/green overlap. instance-id and
// the new generation label carry the per-deploy identity.
String svc = envSlug + "-" + appSlug;
String instanceId = envSlug + "-" + appSlug + "-" + replicaIndex;
String instanceId = envSlug + "-" + appSlug + "-" + replicaIndex + "-" + generation;
Map<String, String> labels = new LinkedHashMap<>();

labels.put("traefik.enable", "true");
labels.put("managed-by", "cameleer-server");
labels.put("cameleer.tenant", tenantId);
labels.put("cameleer.app", appSlug);
labels.put("cameleer.environment", envSlug);
labels.put("cameleer.replica", String.valueOf(replicaIndex));
labels.put("cameleer.generation", generation);
labels.put("cameleer.instance-id", instanceId);

if (!config.externalRouting()) {
return labels;
}

labels.put("traefik.enable", "true");
labels.put("traefik.http.services." + svc + ".loadbalancer.server.port",
String.valueOf(config.appPort()));

@@ -46,7 +55,10 @@ public final class TraefikLabelBuilder {

if (config.sslOffloading()) {
labels.put("traefik.http.routers." + svc + ".tls", "true");
labels.put("traefik.http.routers." + svc + ".tls.certresolver", "default");
if (config.certResolver() != null && !config.certResolver().isBlank()) {
labels.put("traefik.http.routers." + svc + ".tls.certresolver",
config.certResolver());
}
}

return labels;

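Sketch of the labels this now produces for replica 0 of app "orders" in env "prod", generation "3f2a9c1e" (values illustrative):

// cameleer.app          = orders
// cameleer.environment  = prod
// cameleer.replica      = 0
// cameleer.generation   = 3f2a9c1e
// cameleer.instance-id  = prod-orders-0-3f2a9c1e
// traefik.http.services.prod-orders.loadbalancer.server.port = <appPort>
//   -> the "prod-orders" service key carries no generation suffix, so Traefik
//      balances across both generations during the blue/green overlap.
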
@@ -122,6 +122,14 @@ public class ClickHouseLogStore implements LogIndex {
baseParams.add(request.instanceId());
}

if (!request.instanceIds().isEmpty()) {
String placeholders = String.join(", ", Collections.nCopies(request.instanceIds().size(), "?"));
baseConditions.add("instance_id IN (" + placeholders + ")");
for (String id : request.instanceIds()) {
baseParams.add(id);
}
}

if (request.exchangeId() != null && !request.exchangeId().isEmpty()) {
baseConditions.add("(exchange_id = ?" +
" OR (mapContains(mdc, 'cameleer.exchangeId') AND mdc['cameleer.exchangeId'] = ?)" +
@@ -281,6 +289,14 @@ public class ClickHouseLogStore implements LogIndex {
params.add(request.instanceId());
}

if (!request.instanceIds().isEmpty()) {
String placeholders = String.join(", ", Collections.nCopies(request.instanceIds().size(), "?"));
conditions.add("instance_id IN (" + placeholders + ")");
for (String id : request.instanceIds()) {
params.add(id);
}
}

if (request.exchangeId() != null && !request.exchangeId().isEmpty()) {
conditions.add("(exchange_id = ?" +
" OR (mapContains(mdc, 'cameleer.exchangeId') AND mdc['cameleer.exchangeId'] = ?)" +

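For instanceIds = ["a", "b", "c"], the blocks above contribute (sketch):

// condition: "instance_id IN (?, ?, ?)"   -- one placeholder per id via nCopies
// params:    "a", "b", "c"                -- values stay parameter-bound, never concatenated
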
@@ -1,6 +1,7 @@
 package com.cameleer.server.app.search;

+import com.cameleer.server.core.alerting.AlertMatchSpec;
 import com.cameleer.server.core.search.AttributeFilter;
 import com.cameleer.server.core.search.ExecutionSummary;
 import com.cameleer.server.core.search.SearchRequest;
 import com.cameleer.server.core.search.SearchResult;

@@ -81,13 +82,24 @@ public class ClickHouseSearchIndex implements SearchIndex {
        String sortColumn = SORT_FIELD_MAP.getOrDefault(request.sortField(), "start_time");
        String sortDir = "asc".equalsIgnoreCase(request.sortDir()) ? "ASC" : "DESC";

+        // Composite-cursor callers (afterExecutionId set) need a deterministic tiebreak inside
+        // same-millisecond groups so the client-side last-row pick matches ClickHouse's row order.
+        // Without this, a same-start_time tail >LIMIT can silently drop rows: the page ends mid-ms,
+        // the cursor advances past the returned lastRowId, and the skipped rows with smaller
+        // execution_id values never reappear. Other callers (UI/stats) keep the unchanged
+        // single-column ORDER BY — they don't use the composite cursor.
+        String orderBy = sortColumn + " " + sortDir;
+        if (request.afterExecutionId() != null) {
+            orderBy += ", execution_id " + sortDir;
+        }

        String dataSql = "SELECT execution_id, route_id, instance_id, application_id, "
                + "status, start_time, end_time, duration_ms, correlation_id, "
                + "error_message, error_stacktrace, diagram_content_hash, attributes, "
                + "has_trace_data, is_replay, "
                + "input_body, output_body, input_headers, output_headers, root_cause_message "
                + "FROM executions FINAL WHERE " + whereClause
-                + " ORDER BY " + sortColumn + " " + sortDir
+                + " ORDER BY " + orderBy
                + " LIMIT ? OFFSET ?";

        List<Object> dataParams = new ArrayList<>(params);

@@ -124,7 +136,13 @@ public class ClickHouseSearchIndex implements SearchIndex {
        conditions.add("tenant_id = ?");
        params.add(tenantId);

-        if (request.timeFrom() != null) {
+        if (request.timeFrom() != null && request.afterExecutionId() != null) {
+            // composite predicate: strictly-after in (start_time, execution_id) tuple order
+            conditions.add("(start_time > ? OR (start_time = ? AND execution_id > ?))");
+            params.add(Timestamp.from(request.timeFrom()));
+            params.add(Timestamp.from(request.timeFrom()));
+            params.add(request.afterExecutionId());
+        } else if (request.timeFrom() != null) {
            conditions.add("start_time >= ?");
            params.add(Timestamp.from(request.timeFrom()));
        }
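The composite cursor only pages correctly if the caller feeds the last row of each page back into the next request. A minimal paging sketch under assumptions — the `withTimeFrom`/`withAfterExecutionId` wither methods and an ascending start_time sort are hypothetical; only the `timeFrom()`/`afterExecutionId()` accessors appear in this diff:

    // Keyset-pagination loop (sketch): the cursor is the (start_time, execution_id)
    // pair of the last row of the previous page. The composite predicate above then
    // resumes strictly after that tuple, so a same-millisecond tail that fell past
    // the LIMIT is returned by the next page instead of being skipped.
    Instant cursorTime = windowStart;   // initial lower bound; first page uses start_time >= ?
    String cursorExecutionId = null;    // null → no composite predicate yet
    List<ExecutionSummary> page;
    do {
        SearchRequest req = baseRequest
                .withTimeFrom(cursorTime)                   // hypothetical withers — the
                .withAfterExecutionId(cursorExecutionId);   // real SearchRequest shape may differ
        page = searchIndex.search(req).results();
        if (!page.isEmpty()) {
            ExecutionSummary last = page.get(page.size() - 1);
            cursorTime = last.startTime();          // may repeat the same millisecond
            cursorExecutionId = last.executionId(); // execution_id tiebreak resumes inside it
        }
    } while (page.size() == pageSize);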
@@ -239,6 +257,23 @@ public class ClickHouseSearchIndex implements SearchIndex {
            params.add(likeTerm);
        }

+        // Structured attribute filters. Keys were validated at AttributeFilter construction
+        // time against ^[a-zA-Z0-9._-]+$ so they are safe to single-quote-inline; the JSON path
+        // argument of JSONExtractString does not accept a ? placeholder in ClickHouse JDBC
+        // (same constraint as countExecutionsForAlerting below). Values are parameter-bound.
+        for (AttributeFilter filter : request.attributeFilters()) {
+            String escapedKey = filter.key().replace("'", "\\'");
+            if (filter.isKeyOnly()) {
+                conditions.add("JSONHas(attributes, '" + escapedKey + "')");
+            } else if (filter.isWildcard()) {
+                conditions.add("JSONExtractString(attributes, '" + escapedKey + "') LIKE ?");
+                params.add(filter.toLikePattern());
+            } else {
+                conditions.add("JSONExtractString(attributes, '" + escapedKey + "') = ?");
+                params.add(filter.value());
+            }
+        }

        return String.join(" AND ", conditions);
    }
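For orientation, the three filter shapes map to predicates as follows — illustrative only, since `AttributeFilter`'s construction API is not shown in this diff and the `parse` factory below is an assumption:

    // Hypothetical AttributeFilter.parse(...) — shown only to make the mapping concrete.
    AttributeFilter.parse("env");            // key-only  → JSONHas(attributes, 'env')
    AttributeFilter.parse("region=eu-*");    // wildcard  → JSONExtractString(attributes, 'region') LIKE ?  (bound "eu-%")
    AttributeFilter.parse("team=payments");  // exact     → JSONExtractString(attributes, 'team') = ?       (bound "payments")
    // Only the key is inlined (pre-validated against ^[a-zA-Z0-9._-]+$ and quote-escaped);
    // every value still travels as a bound ? parameter.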
@@ -140,28 +140,29 @@ public class OidcAuthController {
        OidcTokenExchanger.OidcUserInfo oidcUser =
                tokenExchanger.exchange(request.code(), request.redirectUri());

-        String userId = "user:oidc:" + oidcUser.subject();
+        // DB key is unprefixed (matches alert_rules.created_by FK shape used by the
+        // env-scoped read-path controllers). JWT subject keeps the "user:" namespace
+        // so JwtAuthenticationFilter can still distinguish user vs agent tokens.
+        String userId = "oidc:" + oidcUser.subject();
+        String subject = "user:" + userId;
        String issuerHost = URI.create(config.get().issuerUri()).getHost();
        String provider = "oidc:" + issuerHost;

        // Check auto-signup gate: if disabled, user must already exist
        Optional<UserInfo> existingUser = userRepository.findById(userId);
        if (!config.get().autoSignup() && existingUser.isEmpty()) {
            throw new ResponseStatusException(HttpStatus.FORBIDDEN,
                    "Account not provisioned. Contact your administrator.");
        }

        // Upsert user (without roles -- roles are in user_roles table)
        userRepository.upsert(new UserInfo(
                userId, provider, oidcUser.email(), oidcUser.name(), Instant.now()));

        // Apply claim mapping rules to assign managed roles/groups from JWT claims
        applyClaimMappings(userId, oidcUser.allClaims(), oidcUser.roles(), config.get());

        List<String> roles = rbacService.getSystemRoleNames(userId);

-        String accessToken = jwtService.createAccessToken(userId, "user", roles);
-        String refreshToken = jwtService.createRefreshToken(userId, "user", roles);
+        String accessToken = jwtService.createAccessToken(subject, "user", roles);
+        String refreshToken = jwtService.createRefreshToken(subject, "user", roles);

        String displayName = oidcUser.name() != null && !oidcUser.name().isBlank()
                ? oidcUser.name() : oidcUser.email();
@@ -171,10 +171,15 @@ public class SecurityConfig {
        .requestMatchers(HttpMethod.POST, "/api/v1/environments/*/alerts/silences/**").hasAnyRole("OPERATOR", "ADMIN")
        .requestMatchers(HttpMethod.PUT, "/api/v1/environments/*/alerts/silences/**").hasAnyRole("OPERATOR", "ADMIN")
        .requestMatchers(HttpMethod.DELETE, "/api/v1/environments/*/alerts/silences/**").hasAnyRole("OPERATOR", "ADMIN")
-        // Alerting — ack/read (VIEWER+ self-service)
+        // Alerting — ack/read/bulk-ack (VIEWER+ self-service)
        .requestMatchers(HttpMethod.POST, "/api/v1/environments/*/alerts/*/ack").hasAnyRole("VIEWER", "OPERATOR", "ADMIN")
        .requestMatchers(HttpMethod.POST, "/api/v1/environments/*/alerts/*/read").hasAnyRole("VIEWER", "OPERATOR", "ADMIN")
        .requestMatchers(HttpMethod.POST, "/api/v1/environments/*/alerts/bulk-read").hasAnyRole("VIEWER", "OPERATOR", "ADMIN")
+        .requestMatchers(HttpMethod.POST, "/api/v1/environments/*/alerts/bulk-ack").hasAnyRole("VIEWER", "OPERATOR", "ADMIN")
        // Alerting — soft-delete / restore (OPERATOR+)
        .requestMatchers(HttpMethod.DELETE, "/api/v1/environments/*/alerts/*").hasAnyRole("OPERATOR", "ADMIN")
        .requestMatchers(HttpMethod.POST, "/api/v1/environments/*/alerts/bulk-delete").hasAnyRole("OPERATOR", "ADMIN")
        .requestMatchers(HttpMethod.POST, "/api/v1/environments/*/alerts/*/restore").hasAnyRole("OPERATOR", "ADMIN")
        // Alerting — notification retry (flat path; notification IDs globally unique)
        .requestMatchers(HttpMethod.POST, "/api/v1/alerts/notifications/*/retry").hasAnyRole("OPERATOR", "ADMIN")
@@ -77,27 +77,30 @@ public class UiAuthController {
            HttpServletRequest httpRequest) {
        String configuredUser = properties.getUiUser();
        String configuredPassword = properties.getUiPassword();
-        String subject = "user:" + request.username();
+        // The JWT subject carries a "user:" namespace prefix so the auth filter
+        // can distinguish user vs agent tokens. The DB row keys (users.user_id,
+        // user_roles.user_id, alert_rules.created_by FK, …) are the bare username:
+        // every env-scoped controller strips the prefix on the read path via
+        // stripSubjectPrefix(...), so the write path here must match.
+        String userId = request.username();
+        String subject = "user:" + userId;

        // Check account lockout before attempting authentication
-        if (userRepository.isLocked(subject)) {
+        if (userRepository.isLocked(userId)) {
            auditService.log(request.username(), "login_locked", AuditCategory.AUTH, null,
                    Map.of("reason", "Account locked"), AuditResult.FAILURE, httpRequest);
            throw new ResponseStatusException(HttpStatus.TOO_MANY_REQUESTS,
                    "Account locked due to too many failed attempts. Try again later.");
        }

        // Try env-var admin first
        boolean envMatch = configuredUser != null && !configuredUser.isBlank()
                && configuredPassword != null && !configuredPassword.isBlank()
                && configuredUser.equals(request.username())
                && configuredPassword.equals(request.password());

        if (!envMatch) {
            // Try per-user password
-            Optional<String> hash = userRepository.getPasswordHash(subject);
+            Optional<String> hash = userRepository.getPasswordHash(userId);
            if (hash.isEmpty() || !passwordEncoder.matches(request.password(), hash.get())) {
-                userRepository.recordFailedLogin(subject);
+                userRepository.recordFailedLogin(userId);
                log.debug("UI login failed for user: {}", request.username());
                auditService.log(request.username(), "login_failed", AuditCategory.AUTH, null,
                        Map.of("reason", "Invalid credentials"), AuditResult.FAILURE, httpRequest);

@@ -105,23 +108,22 @@ public class UiAuthController {
            }
        }

        // Successful login — clear any failed attempt counter
-        userRepository.clearFailedLogins(subject);
+        userRepository.clearFailedLogins(userId);

        if (envMatch) {
-            // Env-var admin: upsert and ensure ADMIN role + Admins group
+            // Env-var admin: upsert unprefixed and ensure ADMIN role + Admins group
            try {
                userRepository.upsert(new UserInfo(
-                        subject, "local", "", request.username(), Instant.now()));
-                rbacService.assignRoleToUser(subject, SystemRole.ADMIN_ID);
-                rbacService.addUserToGroup(subject, SystemRole.ADMINS_GROUP_ID);
+                        userId, "local", "", request.username(), Instant.now()));
+                rbacService.assignRoleToUser(userId, SystemRole.ADMIN_ID);
+                rbacService.addUserToGroup(userId, SystemRole.ADMINS_GROUP_ID);
            } catch (Exception e) {
                log.warn("Failed to upsert local admin to store (login continues): {}", e.getMessage());
            }
        }
        // Per-user logins: user already exists in DB (created by admin)

-        List<String> roles = rbacService.getSystemRoleNames(subject);
+        List<String> roles = rbacService.getSystemRoleNames(userId);
        if (roles.isEmpty()) {
            roles = List.of("VIEWER");
        }

@@ -152,9 +154,10 @@ public class UiAuthController {
            String accessToken = jwtService.createAccessToken(result.subject(), "user", roles);
            String refreshToken = jwtService.createRefreshToken(result.subject(), "user", roles);

-            String displayName = userRepository.findById(result.subject())
+            String userId = stripSubjectPrefix(result.subject());
+            String displayName = userRepository.findById(userId)
                    .map(UserInfo::displayName)
-                    .orElse(result.subject());
+                    .orElse(userId);
            auditService.log(result.subject(), "token_refresh", AuditCategory.AUTH, null, null, AuditResult.SUCCESS, httpRequest);
            return ResponseEntity.ok(new AuthTokenResponse(accessToken, refreshToken, displayName, null));
        } catch (ResponseStatusException e) {

@@ -173,13 +176,22 @@ public class UiAuthController {
        if (authentication == null || authentication.getName() == null) {
            throw new ResponseStatusException(HttpStatus.UNAUTHORIZED, "Not authenticated");
        }
-        UserDetail detail = rbacService.getUser(authentication.getName());
+        UserDetail detail = rbacService.getUser(stripSubjectPrefix(authentication.getName()));
        if (detail == null) {
            throw new ResponseStatusException(HttpStatus.UNAUTHORIZED, "User not found");
        }
        return ResponseEntity.ok(detail);
    }

+    /**
+     * Map a JWT subject ({@code "user:<name>"} or {@code "user:oidc:<sub>"}) to the DB key:
+     * just the bare username. FKs on {@code alert_rules.created_by},
+     * {@code outbound_connections.created_by}, etc. reference the unprefixed row.
+     */
+    private static String stripSubjectPrefix(String subject) {
+        return subject != null && subject.startsWith("user:") ? subject.substring(5) : subject;
+    }

    public record LoginRequest(String username, String password) {}
    public record RefreshRequest(String refreshToken) {}
}
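The identifier mapping that `stripSubjectPrefix` implements, spelled out with the two subject shapes named in its javadoc (outputs derived directly from the code above):

    stripSubjectPrefix("user:alice");        // → "alice"         (local login)
    stripSubjectPrefix("user:oidc:a1b2c3");  // → "oidc:a1b2c3"   (OIDC login — the DB key keeps the oidc: part)
    stripSubjectPrefix("agent:edge-7");      // → "agent:edge-7"  (unchanged: only the "user:" prefix is stripped)
    stripSubjectPrefix(null);                // → null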
@@ -106,4 +106,57 @@ public class ClickHouseAgentEventRepository implements AgentEventRepository {

        return new AgentEventPage(results, nextCursor, hasMore);
    }

    @Override
    public List<AgentEventRecord> findInWindow(String environment,
                                               String applicationId,
                                               String instanceId,
                                               List<String> eventTypes,
                                               Instant fromInclusive,
                                               Instant toExclusive,
                                               int limit) {
        if (eventTypes == null || eventTypes.isEmpty()) {
            throw new IllegalArgumentException("eventTypes must not be empty");
        }
        if (fromInclusive == null || toExclusive == null) {
            throw new IllegalArgumentException("from/to must not be null");
        }

        // `event_type IN (?, ?, …)` — one placeholder per type.
        String placeholders = String.join(",", java.util.Collections.nCopies(eventTypes.size(), "?"));
        var sql = new StringBuilder(SELECT_BASE);
        var params = new ArrayList<Object>();
        params.add(tenantId);

        if (environment != null) {
            sql.append(" AND environment = ?");
            params.add(environment);
        }
        if (applicationId != null) {
            sql.append(" AND application_id = ?");
            params.add(applicationId);
        }
        if (instanceId != null) {
            sql.append(" AND instance_id = ?");
            params.add(instanceId);
        }
        sql.append(" AND event_type IN (").append(placeholders).append(")");
        params.addAll(eventTypes);
        sql.append(" AND timestamp >= ? AND timestamp < ?");
        params.add(Timestamp.from(fromInclusive));
        params.add(Timestamp.from(toExclusive));
        sql.append(" ORDER BY timestamp ASC, insert_id ASC LIMIT ?");
        params.add(limit);

        return jdbc.query(sql.toString(),
                (rs, rowNum) -> new AgentEventRecord(
                        rs.getLong("id"),
                        rs.getString("instance_id"),
                        rs.getString("application_id"),
                        rs.getString("event_type"),
                        rs.getString("detail"),
                        rs.getTimestamp("timestamp").toInstant()
                ),
                params.toArray());
    }
}
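A minimal caller sketch for the new `findInWindow` — the event-type names and the surrounding wiring are assumptions; the signature is the one added above:

    // Scan a 5-minute half-open window [from, to) of agent events for one application.
    Instant to = Instant.now();
    Instant from = to.minus(Duration.ofMinutes(5));
    List<AgentEventRecord> events = agentEventRepository.findInWindow(
            "production",        // environment; null would skip the filter
            "orders-service",    // applicationId; null would skip the filter
            null,                // instanceId: match all instances
            List.of("ROUTE_STOPPED", "AGENT_DISCONNECTED"),  // hypothetical event types
            from, to,            // timestamp >= from AND timestamp < to
            1_000);              // LIMIT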
@@ -16,8 +16,6 @@ import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.sql.Timestamp;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HexFormat;
import java.util.List;

@@ -57,6 +55,12 @@ public class ClickHouseDiagramStore implements DiagramStore {
            ORDER BY created_at DESC LIMIT 1
            """;

+    private static final String SELECT_HASH_FOR_APP_ROUTE = """
+            SELECT content_hash FROM route_diagrams
+            WHERE tenant_id = ? AND application_id = ? AND environment = ? AND route_id = ?
+            ORDER BY created_at DESC LIMIT 1
+            """;

    private static final String SELECT_DEFINITIONS_FOR_APP = """
            SELECT DISTINCT route_id, definition FROM route_diagrams
            WHERE tenant_id = ? AND application_id = ? AND environment = ?

@@ -68,6 +72,8 @@ public class ClickHouseDiagramStore implements DiagramStore {

    // (routeId + "\0" + instanceId) → contentHash
    private final ConcurrentHashMap<String, String> hashCache = new ConcurrentHashMap<>();
+    // (applicationId + "\0" + environment + "\0" + routeId) → most recent contentHash
+    private final ConcurrentHashMap<String, String> appRouteHashCache = new ConcurrentHashMap<>();
    // contentHash → deserialized RouteGraph
    private final ConcurrentHashMap<String, RouteGraph> graphCache = new ConcurrentHashMap<>();

@@ -92,12 +98,37 @@ public class ClickHouseDiagramStore implements DiagramStore {
        } catch (Exception e) {
            log.warn("Failed to warm diagram hash cache — lookups will fall back to ClickHouse: {}", e.getMessage());
        }

        try {
            jdbc.query(
                    "SELECT application_id, environment, route_id, " +
                    "argMax(content_hash, created_at) AS content_hash " +
                    "FROM route_diagrams WHERE tenant_id = ? " +
                    "GROUP BY application_id, environment, route_id",
                    rs -> {
                        String key = appRouteCacheKey(
                                rs.getString("application_id"),
                                rs.getString("environment"),
                                rs.getString("route_id"));
                        appRouteHashCache.put(key, rs.getString("content_hash"));
                    },
                    tenantId);
            log.info("Diagram app-route cache warmed: {} entries", appRouteHashCache.size());
        } catch (Exception e) {
            log.warn("Failed to warm diagram app-route cache — lookups will fall back to ClickHouse: {}", e.getMessage());
        }
    }

    private static String cacheKey(String routeId, String instanceId) {
        return routeId + "\0" + instanceId;
    }

    private static String appRouteCacheKey(String applicationId, String environment, String routeId) {
        return (applicationId != null ? applicationId : "") + "\0"
                + (environment != null ? environment : "") + "\0"
                + (routeId != null ? routeId : "");
    }

    @Override
    public void store(TaggedDiagram diagram) {
        try {

@@ -122,6 +153,7 @@ public class ClickHouseDiagramStore implements DiagramStore {

        // Update caches
        hashCache.put(cacheKey(routeId, agentId), contentHash);
+        appRouteHashCache.put(appRouteCacheKey(applicationId, environment, routeId), contentHash);
        graphCache.put(contentHash, graph);

        log.debug("Stored diagram for route={} agent={} with hash={}", routeId, agentId, contentHash);

@@ -170,33 +202,29 @@ public class ClickHouseDiagramStore implements DiagramStore {
    }

    @Override
-    public Optional<String> findContentHashForRouteByAgents(String routeId, List<String> agentIds) {
-        if (agentIds == null || agentIds.isEmpty()) {
+    public Optional<String> findLatestContentHashForAppRoute(String applicationId,
+                                                             String routeId,
+                                                             String environment) {
+        if (applicationId == null || applicationId.isBlank()
+                || routeId == null || routeId.isBlank()
+                || environment == null || environment.isBlank()) {
            return Optional.empty();
        }

        // Try cache first — return first hit
-        for (String agentId : agentIds) {
-            String cached = hashCache.get(cacheKey(routeId, agentId));
-            if (cached != null) {
-                return Optional.of(cached);
-            }
+        String key = appRouteCacheKey(applicationId, environment, routeId);
+        String cached = appRouteHashCache.get(key);
+        if (cached != null) {
+            return Optional.of(cached);
        }

        // Fall back to ClickHouse
-        String placeholders = String.join(", ", Collections.nCopies(agentIds.size(), "?"));
-        String sql = "SELECT content_hash FROM route_diagrams " +
-                "WHERE tenant_id = ? AND route_id = ? AND instance_id IN (" + placeholders + ") " +
-                "ORDER BY created_at DESC LIMIT 1";
-        var params = new ArrayList<Object>();
-        params.add(tenantId);
-        params.add(routeId);
-        params.addAll(agentIds);
-        List<Map<String, Object>> rows = jdbc.queryForList(sql, params.toArray());
+        List<Map<String, Object>> rows = jdbc.queryForList(
+                SELECT_HASH_FOR_APP_ROUTE, tenantId, applicationId, environment, routeId);
        if (rows.isEmpty()) {
            return Optional.empty();
        }
-        return Optional.of((String) rows.get(0).get("content_hash"));
+        String hash = (String) rows.get(0).get("content_hash");
+        appRouteHashCache.put(key, hash);
+        return Optional.of(hash);
    }

    @Override
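Why the "\0" joints in `cacheKey`/`appRouteCacheKey`: plain concatenation would let distinct identifier triples collide. A small sketch, assuming (plausibly, though not stated here) that NUL cannot occur inside these identifiers:

    // Without a separator, ("app1", "x", "y") and ("app1x", "", "y") would both
    // flatten to "app1xy". With NUL joints the keys stay distinct:
    appRouteCacheKey("app1", "prod", "routeA");  // → "app1\0prod\0routeA"
    appRouteCacheKey("app1", "", "routeA");      // → "app1\0\0routeA"  — a different key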
@@ -282,20 +282,6 @@ public class ClickHouseExecutionStore implements ExecutionStore {
        return results.isEmpty() ? Optional.empty() : Optional.of(results.get(0));
    }

-    // --- ExecutionStore interface: write methods (unsupported, use chunked pipeline) ---
-
-    @Override
-    public void upsert(ExecutionRecord execution) {
-        throw new UnsupportedOperationException("ClickHouse writes use the chunked pipeline");
-    }
-
-    @Override
-    public void upsertProcessors(String executionId, Instant startTime,
-                                 String applicationId, String routeId,
-                                 List<ProcessorRecord> processors) {
-        throw new UnsupportedOperationException("ClickHouse writes use the chunked pipeline");
-    }

    // --- Row mappers ---

    private static ExecutionRecord mapExecutionRecord(ResultSet rs) throws SQLException {
@@ -0,0 +1,408 @@
package com.cameleer.server.app.storage;

import com.cameleer.server.core.storage.ServerMetricsQueryStore;
import com.cameleer.server.core.storage.model.ServerInstanceInfo;
import com.cameleer.server.core.storage.model.ServerMetricCatalogEntry;
import com.cameleer.server.core.storage.model.ServerMetricPoint;
import com.cameleer.server.core.storage.model.ServerMetricQueryRequest;
import com.cameleer.server.core.storage.model.ServerMetricQueryResponse;
import com.cameleer.server.core.storage.model.ServerMetricSeries;
import org.springframework.jdbc.core.JdbcTemplate;

import java.sql.Array;
import java.sql.Timestamp;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Pattern;

/**
 * ClickHouse-backed {@link ServerMetricsQueryStore}.
 *
 * <p>Safety rules for every query:
 * <ul>
 *   <li>tenant_id always bound as a parameter — no cross-tenant reads.</li>
 *   <li>Identifier-like inputs (metric name, statistic, tag keys,
 *       aggregation, mode) are regex-validated. Tag keys flow through the
 *       query as JDBC parameter-bound values of {@code tags[?]} map lookups,
 *       so even with a "safe" regex they cannot inject SQL.</li>
 *   <li>Literal values ({@code from}, {@code to}, tag filter values,
 *       server_instance_id allow-list) always go through {@code ?}.</li>
 *   <li>The time range is capped at {@link #MAX_RANGE}.</li>
 *   <li>Result cardinality is capped at {@link #MAX_SERIES} series.</li>
 * </ul>
 */
public class ClickHouseServerMetricsQueryStore implements ServerMetricsQueryStore {

    private static final Pattern SAFE_IDENTIFIER = Pattern.compile("^[a-zA-Z0-9._]+$");
    private static final Pattern SAFE_STATISTIC = Pattern.compile("^[a-z_]+$");

    private static final Set<String> AGGREGATIONS = Set.of("avg", "sum", "max", "min", "latest");
    private static final Set<String> MODES = Set.of("raw", "delta");

    /** Maximum {@code to - from} window accepted by the API. */
    static final Duration MAX_RANGE = Duration.ofDays(31);

    /** Clamp bounds and default for {@code stepSeconds}. */
    static final int MIN_STEP = 10;
    static final int MAX_STEP = 3600;
    static final int DEFAULT_STEP = 60;

    /** Defence against group-by explosion — limit the series count per response. */
    static final int MAX_SERIES = 500;

    private final String tenantId;
    private final JdbcTemplate jdbc;

    public ClickHouseServerMetricsQueryStore(String tenantId, JdbcTemplate jdbc) {
        this.tenantId = tenantId;
        this.jdbc = jdbc;
    }

    // ── catalog ─────────────────────────────────────────────────────────

    @Override
    public List<ServerMetricCatalogEntry> catalog(Instant from, Instant to) {
        requireRange(from, to);
        String sql = """
                SELECT
                    metric_name,
                    any(metric_type) AS metric_type,
                    arraySort(groupUniqArray(statistic)) AS statistics,
                    arraySort(arrayDistinct(arrayFlatten(groupArray(mapKeys(tags))))) AS tag_keys
                FROM server_metrics
                WHERE tenant_id = ?
                  AND collected_at >= ?
                  AND collected_at < ?
                GROUP BY metric_name
                ORDER BY metric_name
                """;
        return jdbc.query(sql, (rs, n) -> new ServerMetricCatalogEntry(
                rs.getString("metric_name"),
                rs.getString("metric_type"),
                arrayToStringList(rs.getArray("statistics")),
                arrayToStringList(rs.getArray("tag_keys"))
        ), tenantId, Timestamp.from(from), Timestamp.from(to));
    }

    // ── instances ───────────────────────────────────────────────────────

    @Override
    public List<ServerInstanceInfo> listInstances(Instant from, Instant to) {
        requireRange(from, to);
        String sql = """
                SELECT
                    server_instance_id,
                    min(collected_at) AS first_seen,
                    max(collected_at) AS last_seen
                FROM server_metrics
                WHERE tenant_id = ?
                  AND collected_at >= ?
                  AND collected_at < ?
                GROUP BY server_instance_id
                ORDER BY last_seen DESC
                """;
        return jdbc.query(sql, (rs, n) -> new ServerInstanceInfo(
                rs.getString("server_instance_id"),
                rs.getTimestamp("first_seen").toInstant(),
                rs.getTimestamp("last_seen").toInstant()
        ), tenantId, Timestamp.from(from), Timestamp.from(to));
    }

    // ── query ───────────────────────────────────────────────────────────

    @Override
    public ServerMetricQueryResponse query(ServerMetricQueryRequest request) {
        if (request == null) throw new IllegalArgumentException("request is required");
        String metric = requireSafeIdentifier(request.metric(), "metric");
        requireRange(request.from(), request.to());

        String aggregation = request.aggregation() != null ? request.aggregation().toLowerCase() : "avg";
        if (!AGGREGATIONS.contains(aggregation)) {
            throw new IllegalArgumentException("aggregation must be one of " + AGGREGATIONS);
        }

        String mode = request.mode() != null ? request.mode().toLowerCase() : "raw";
        if (!MODES.contains(mode)) {
            throw new IllegalArgumentException("mode must be one of " + MODES);
        }

        int step = request.stepSeconds() != null ? request.stepSeconds() : DEFAULT_STEP;
        if (step < MIN_STEP || step > MAX_STEP) {
            throw new IllegalArgumentException(
                    "stepSeconds must be in [" + MIN_STEP + "," + MAX_STEP + "]");
        }

        String statistic = request.statistic();
        if (statistic != null && !SAFE_STATISTIC.matcher(statistic).matches()) {
            throw new IllegalArgumentException("statistic contains unsafe characters");
        }

        List<String> groupByTags = request.groupByTags() != null
                ? request.groupByTags() : List.of();
        for (String t : groupByTags) requireSafeIdentifier(t, "groupByTag");

        Map<String, String> filterTags = request.filterTags() != null
                ? request.filterTags() : Map.of();
        for (String t : filterTags.keySet()) requireSafeIdentifier(t, "filterTag key");

        List<String> instanceAllowList = request.serverInstanceIds() != null
                ? request.serverInstanceIds() : List.of();

        boolean isDelta = "delta".equals(mode);
        boolean isMean = "mean".equals(statistic);

        String sql = isDelta
                ? buildDeltaSql(step, groupByTags, filterTags, instanceAllowList, statistic, isMean)
                : buildRawSql(step, groupByTags, filterTags, instanceAllowList,
                        statistic, aggregation, isMean);

        List<Object> params = buildParams(groupByTags, metric, statistic, isMean,
                request.from(), request.to(),
                filterTags, instanceAllowList);

        List<Row> rows = jdbc.query(sql, (rs, n) -> {
            int idx = 1;
            Instant bucket = rs.getTimestamp(idx++).toInstant();
            List<String> tagValues = new ArrayList<>(groupByTags.size());
            for (int g = 0; g < groupByTags.size(); g++) {
                tagValues.add(rs.getString(idx++));
            }
            double value = rs.getDouble(idx);
            return new Row(bucket, tagValues, value);
        }, params.toArray());

        return assembleSeries(rows, metric, statistic, aggregation, mode, step, groupByTags);
    }
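A usage sketch for `query(...)`. Loud caveat: only the accessors used above (`metric()`, `from()`, `to()`, `aggregation()`, `mode()`, `stepSeconds()`, `statistic()`, `groupByTags()`, `filterTags()`, `serverInstanceIds()`) are visible in this diff, so the constructor argument order below is an assumption:

    ServerMetricQueryResponse resp = store.query(new ServerMetricQueryRequest(
            "jvm.memory.used",                      // metric — must match ^[a-zA-Z0-9._]+$
            Instant.parse("2026-05-01T00:00:00Z"),  // from
            Instant.parse("2026-05-02T00:00:00Z"),  // to — window capped at 31 days
            "avg",                 // aggregation: avg | sum | max | min | latest
            "raw",                 // mode: raw | delta
            60,                    // stepSeconds, accepted range [10, 3600]
            null,                  // statistic (null → plain counter/gauge value)
            List.of("area"),       // groupByTags → one series per distinct tag value
            Map.of("id", "heap"),  // filterTags, bound as tags[?] = ?
            List.of()));           // serverInstanceIds allow-list (empty → all instances)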
    // ── SQL builders ────────────────────────────────────────────────────

    /**
     * Builds a single-pass SQL for raw mode:
     * <pre>{@code
     * SELECT bucket, tag0, ..., <agg>(metric_value) AS value
     * FROM server_metrics WHERE ...
     * GROUP BY bucket, tag0, ...
     * ORDER BY bucket, tag0, ...
     * }</pre>
     * For {@code statistic=mean}, replaces the aggregate with
     * {@code sumIf(metric_value, statistic IN ('total','total_time')) / nullIf(sumIf(metric_value, statistic = 'count'), 0)}.
     */
    private String buildRawSql(int step, List<String> groupByTags,
                               Map<String, String> filterTags,
                               List<String> instanceAllowList,
                               String statistic, String aggregation, boolean isMean) {
        StringBuilder s = new StringBuilder(512);
        s.append("SELECT\n toDateTime64(toStartOfInterval(collected_at, INTERVAL ")
                .append(step).append(" SECOND), 3) AS bucket");
        for (int i = 0; i < groupByTags.size(); i++) {
            s.append(",\n tags[?] AS tag").append(i);
        }
        s.append(",\n ").append(isMean ? meanExpr() : scalarAggExpr(aggregation))
                .append(" AS value\nFROM server_metrics\n");
        appendWhereClause(s, filterTags, instanceAllowList, statistic, isMean);
        s.append("GROUP BY bucket");
        for (int i = 0; i < groupByTags.size(); i++) s.append(", tag").append(i);
        s.append("\nORDER BY bucket");
        for (int i = 0; i < groupByTags.size(); i++) s.append(", tag").append(i);
        return s.toString();
    }

    /**
     * Builds a three-level SQL for delta mode. Inner fills one
     * (bucket, instance, tag-group) row via {@code max(metric_value)};
     * middle computes positive-clipped per-instance differences via a
     * window function; outer sums across instances.
     */
    private String buildDeltaSql(int step, List<String> groupByTags,
                                 Map<String, String> filterTags,
                                 List<String> instanceAllowList,
                                 String statistic, boolean isMean) {
        StringBuilder s = new StringBuilder(1024);
        s.append("SELECT bucket");
        for (int i = 0; i < groupByTags.size(); i++) s.append(", tag").append(i);
        s.append(", sum(delta) AS value FROM (\n");

        // Middle: per-instance positive-clipped delta using window.
        s.append(" SELECT bucket");
        for (int i = 0; i < groupByTags.size(); i++) s.append(", tag").append(i);
        s.append(", server_instance_id, greatest(0, value - coalesce(any(value) OVER (")
                .append("PARTITION BY server_instance_id");
        for (int i = 0; i < groupByTags.size(); i++) s.append(", tag").append(i);
        s.append(" ORDER BY bucket ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING), value)) AS delta FROM (\n");

        // Inner: one representative value per (bucket, instance, tag-group).
        s.append(" SELECT\n toDateTime64(toStartOfInterval(collected_at, INTERVAL ")
                .append(step).append(" SECOND), 3) AS bucket,\n server_instance_id");
        for (int i = 0; i < groupByTags.size(); i++) {
            s.append(",\n tags[?] AS tag").append(i);
        }
        s.append(",\n ").append(isMean ? meanExpr() : "max(metric_value)")
                .append(" AS value\n FROM server_metrics\n");
        appendWhereClause(s, filterTags, instanceAllowList, statistic, isMean);
        s.append(" GROUP BY bucket, server_instance_id");
        for (int i = 0; i < groupByTags.size(); i++) s.append(", tag").append(i);
        s.append("\n ) AS bucketed\n) AS deltas\n");

        s.append("GROUP BY bucket");
        for (int i = 0; i < groupByTags.size(); i++) s.append(", tag").append(i);
        s.append("\nORDER BY bucket");
        for (int i = 0; i < groupByTags.size(); i++) s.append(", tag").append(i);
        return s.toString();
    }
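For orientation, the raw-mode text that `buildRawSql` emits for step=60, one group-by tag, no tag filters, no instance allow-list, and aggregation "avg" — a sketch with whitespace tidied, not captured output:

    // buildRawSql(60, List.of("area"), Map.of(), List.of(), null, "avg", false) produces roughly:
    String example = """
            SELECT
              toDateTime64(toStartOfInterval(collected_at, INTERVAL 60 SECOND), 3) AS bucket,
              tags[?] AS tag0,
              avg(metric_value) AS value
            FROM server_metrics
            WHERE tenant_id = ?
              AND metric_name = ?
              AND collected_at >= ?
              AND collected_at < ?
            GROUP BY bucket, tag0
            ORDER BY bucket, tag0
            """;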
    /**
     * WHERE clause shared by both raw and delta SQL shapes. Appended at the
     * correct indent under either the single {@code FROM server_metrics}
     * (raw) or the innermost one (delta).
     */
    private void appendWhereClause(StringBuilder s, Map<String, String> filterTags,
                                   List<String> instanceAllowList,
                                   String statistic, boolean isMean) {
        s.append(" WHERE tenant_id = ?\n")
                .append(" AND metric_name = ?\n");
        if (isMean) {
            s.append(" AND statistic IN ('count', 'total', 'total_time')\n");
        } else if (statistic != null) {
            s.append(" AND statistic = ?\n");
        }
        s.append(" AND collected_at >= ?\n")
                .append(" AND collected_at < ?\n");
        for (int i = 0; i < filterTags.size(); i++) {
            s.append(" AND tags[?] = ?\n");
        }
        if (!instanceAllowList.isEmpty()) {
            s.append(" AND server_instance_id IN (")
                    .append("?,".repeat(instanceAllowList.size() - 1)).append("?)\n");
        }
    }

    /**
     * SQL-positional params for both raw and delta queries (same relative
     * order because the WHERE clause is emitted by {@link #appendWhereClause}
     * only once, with the {@code tags[?]} select-list placeholders appearing
     * earlier in the SQL text).
     */
    private List<Object> buildParams(List<String> groupByTags, String metric,
                                     String statistic, boolean isMean,
                                     Instant from, Instant to,
                                     Map<String, String> filterTags,
                                     List<String> instanceAllowList) {
        List<Object> params = new ArrayList<>();
        // SELECT-list tags[?] placeholders
        params.addAll(groupByTags);
        // WHERE
        params.add(tenantId);
        params.add(metric);
        if (!isMean && statistic != null) params.add(statistic);
        params.add(Timestamp.from(from));
        params.add(Timestamp.from(to));
        for (Map.Entry<String, String> e : filterTags.entrySet()) {
            params.add(e.getKey());
            params.add(e.getValue());
        }
        params.addAll(instanceAllowList);
        return params;
    }

    private static String scalarAggExpr(String aggregation) {
        return switch (aggregation) {
            case "avg" -> "avg(metric_value)";
            case "sum" -> "sum(metric_value)";
            case "max" -> "max(metric_value)";
            case "min" -> "min(metric_value)";
            case "latest" -> "argMax(metric_value, collected_at)";
            default -> throw new IllegalStateException("unreachable: " + aggregation);
        };
    }

    private static String meanExpr() {
        return "sumIf(metric_value, statistic IN ('total', 'total_time'))"
                + " / nullIf(sumIf(metric_value, statistic = 'count'), 0)";
    }
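Lining `buildParams` up against that SQL, the positional bindings for a query with one group-by tag, one tag filter, and a two-id instance allow-list come out as follows (derived from the code above):

    List<Object> params = buildParams(
            List.of("area"), "jvm.memory.used", null, false,
            from, to, Map.of("id", "heap"), List.of("srv-1", "srv-2"));
    // → [ "area",                       // SELECT-list: tags[?] AS tag0
    //     tenantId,                     // WHERE tenant_id = ?
    //     "jvm.memory.used",            // AND metric_name = ?
    //     Timestamp(from), Timestamp(to),
    //     "id", "heap",                 // AND tags[?] = ?
    //     "srv-1", "srv-2" ]            // AND server_instance_id IN (?,?)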
    // ── response assembly ───────────────────────────────────────────────

    private ServerMetricQueryResponse assembleSeries(
            List<Row> rows, String metric, String statistic,
            String aggregation, String mode, int step, List<String> groupByTags) {

        Map<List<String>, List<ServerMetricPoint>> bySignature = new LinkedHashMap<>();
        for (Row r : rows) {
            if (Double.isNaN(r.value) || Double.isInfinite(r.value)) continue;
            bySignature.computeIfAbsent(r.tagValues, k -> new ArrayList<>())
                    .add(new ServerMetricPoint(r.bucket, r.value));
        }

        if (bySignature.size() > MAX_SERIES) {
            throw new IllegalArgumentException(
                    "query produced " + bySignature.size()
                    + " series; reduce groupByTags or tighten filterTags (max "
                    + MAX_SERIES + ")");
        }

        List<ServerMetricSeries> series = new ArrayList<>(bySignature.size());
        for (Map.Entry<List<String>, List<ServerMetricPoint>> e : bySignature.entrySet()) {
            Map<String, String> tags = new LinkedHashMap<>();
            for (int i = 0; i < groupByTags.size(); i++) {
                tags.put(groupByTags.get(i), e.getKey().get(i));
            }
            series.add(new ServerMetricSeries(Collections.unmodifiableMap(tags), e.getValue()));
        }

        return new ServerMetricQueryResponse(metric,
                statistic != null ? statistic : "value",
                aggregation, mode, step, series);
    }

    // ── helpers ─────────────────────────────────────────────────────────

    private static void requireRange(Instant from, Instant to) {
        if (from == null || to == null) {
            throw new IllegalArgumentException("from and to are required");
        }
        if (!from.isBefore(to)) {
            throw new IllegalArgumentException("from must be strictly before to");
        }
        if (Duration.between(from, to).compareTo(MAX_RANGE) > 0) {
            throw new IllegalArgumentException(
                    "time range exceeds maximum of " + MAX_RANGE.toDays() + " days");
        }
    }

    private static String requireSafeIdentifier(String value, String field) {
        if (value == null || value.isBlank()) {
            throw new IllegalArgumentException(field + " is required");
        }
        if (!SAFE_IDENTIFIER.matcher(value).matches()) {
            throw new IllegalArgumentException(
                    field + " contains unsafe characters (allowed: [a-zA-Z0-9._])");
        }
        return value;
    }

    private static List<String> arrayToStringList(Array array) {
        if (array == null) return List.of();
        try {
            Object[] values = (Object[]) array.getArray();
            Set<String> sorted = new TreeSet<>();
            for (Object v : values) {
                if (v != null) sorted.add(v.toString());
            }
            return List.copyOf(sorted);
        } catch (Exception e) {
            return List.of();
        } finally {
            try { array.free(); } catch (Exception ignore) { }
        }
    }

    private record Row(Instant bucket, List<String> tagValues, double value) {
    }
}
@@ -0,0 +1,46 @@
package com.cameleer.server.app.storage;

import com.cameleer.server.core.storage.ServerMetricsStore;
import com.cameleer.server.core.storage.model.ServerMetricSample;
import org.springframework.jdbc.core.JdbcTemplate;

import java.sql.Timestamp;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ClickHouseServerMetricsStore implements ServerMetricsStore {

    private final JdbcTemplate jdbc;

    public ClickHouseServerMetricsStore(JdbcTemplate jdbc) {
        this.jdbc = jdbc;
    }

    @Override
    public void insertBatch(List<ServerMetricSample> samples) {
        if (samples.isEmpty()) return;

        jdbc.batchUpdate("""
                INSERT INTO server_metrics
                    (tenant_id, collected_at, server_instance_id, metric_name,
                     metric_type, statistic, metric_value, tags)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """,
                samples.stream().map(s -> new Object[]{
                        s.tenantId(),
                        Timestamp.from(s.collectedAt()),
                        s.serverInstanceId(),
                        s.metricName(),
                        s.metricType(),
                        s.statistic(),
                        s.value(),
                        tagsToClickHouseMap(s.tags())
                }).toList());
    }

    private Map<String, String> tagsToClickHouseMap(Map<String, String> tags) {
        if (tags == null || tags.isEmpty()) return new HashMap<>();
        return new HashMap<>(tags);
    }
}
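A usage sketch for `insertBatch` — the `ServerMetricSample` component order is an assumption (only the accessor names and the column order above are visible):

    store.insertBatch(List.of(new ServerMetricSample(
            "default",                        // tenant_id
            Instant.now(),                    // collected_at
            "server-1",                       // server_instance_id
            "http.server.requests",           // metric_name
            "timer",                          // metric_type
            "count",                          // statistic
            1234.0,                           // metric_value
            Map.of("uri", "/api/v1/search")   // tags
    )));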
@@ -338,15 +338,15 @@ public class ClickHouseStatsStore implements StatsStore {
    private record Filter(String column, String value) {}

    /**
-     * Format an Instant as a ClickHouse DateTime literal.
-     * Uses java.sql.Timestamp to match the JVM-ClickHouse timezone convention
-     * used by the JDBC driver, then truncates to second precision for DateTime
-     * column compatibility.
+     * Format an Instant as a ClickHouse DateTime literal explicitly typed in UTC.
+     * The explicit `toDateTime(..., 'UTC')` cast avoids depending on the session
+     * timezone matching the `bucket DateTime('UTC')` column type.
     */
    private static String lit(Instant instant) {
-        return "'" + java.time.format.DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
+        String raw = java.time.format.DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
                .withZone(java.time.ZoneOffset.UTC)
-                .format(instant.truncatedTo(ChronoUnit.SECONDS)) + "'";
+                .format(instant.truncatedTo(ChronoUnit.SECONDS));
+        return "toDateTime('" + raw + "', 'UTC')";
    }

    /** Format a string as a ClickHouse SQL literal with backslash + quote escaping. */
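What the rewritten `lit(...)` emits, worked from the code above:

    lit(Instant.parse("2026-05-01T12:34:56.789Z"));
    // before: '2026-05-01 12:34:56'                    — bare literal, parsed in the session timezone
    // after:  toDateTime('2026-05-01 12:34:56', 'UTC') — explicitly typed, matches DateTime('UTC') columns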
@@ -1,6 +1,7 @@
 package com.cameleer.server.app.storage;

 import com.cameleer.server.core.runtime.Deployment;
+import com.cameleer.server.core.runtime.DeploymentConfigSnapshot;
 import com.cameleer.server.core.runtime.DeploymentRepository;
 import com.cameleer.server.core.runtime.DeploymentStatus;
 import com.fasterxml.jackson.core.type.TypeReference;

@@ -21,7 +22,7 @@ public class PostgresDeploymentRepository implements DeploymentRepository {
    private static final String SELECT_COLS =
            "id, app_id, app_version_id, environment_id, status, target_state, deployment_strategy, " +
            "replica_states, deploy_stage, container_id, container_name, error_message, " +
-            "resolved_config, deployed_at, stopped_at, created_at";
+            "resolved_config, deployed_config_snapshot, deployed_at, stopped_at, created_at, created_by";

    private final JdbcTemplate jdbc;
    private final ObjectMapper objectMapper;

@@ -62,6 +63,16 @@ public class PostgresDeploymentRepository implements DeploymentRepository {
        return results.isEmpty() ? Optional.empty() : Optional.of(results.get(0));
    }

+    @Override
+    public Optional<Deployment> findActiveByAppIdAndEnvironmentIdExcluding(UUID appId, UUID environmentId, UUID excludeDeploymentId) {
+        var results = jdbc.query(
+                "SELECT " + SELECT_COLS + " FROM deployments WHERE app_id = ? AND environment_id = ? " +
+                "AND status IN ('STARTING', 'RUNNING', 'DEGRADED') AND id <> ? " +
+                "ORDER BY created_at DESC LIMIT 1",
+                (rs, rowNum) -> mapRow(rs), appId, environmentId, excludeDeploymentId);
+        return results.isEmpty() ? Optional.empty() : Optional.of(results.get(0));
+    }

    public List<Deployment> findByStatus(List<DeploymentStatus> statuses) {
        String placeholders = String.join(",", statuses.stream().map(s -> "'" + s.name() + "'").toList());
        return jdbc.query(

@@ -70,10 +81,10 @@ public class PostgresDeploymentRepository implements DeploymentRepository {
    }

    @Override
-    public UUID create(UUID appId, UUID appVersionId, UUID environmentId, String containerName) {
+    public UUID create(UUID appId, UUID appVersionId, UUID environmentId, String containerName, String createdBy) {
        UUID id = UUID.randomUUID();
-        jdbc.update("INSERT INTO deployments (id, app_id, app_version_id, environment_id, container_name) VALUES (?, ?, ?, ?, ?)",
-                id, appId, appVersionId, environmentId, containerName);
+        jdbc.update("INSERT INTO deployments (id, app_id, app_version_id, environment_id, container_name, created_by) VALUES (?, ?, ?, ?, ?, ?)",
+                id, appId, appVersionId, environmentId, containerName, createdBy);
        return id;
    }

@@ -115,8 +126,8 @@ public class PostgresDeploymentRepository implements DeploymentRepository {
    }

    @Override
-    public void deleteTerminalByAppAndEnvironment(UUID appId, UUID environmentId) {
-        jdbc.update("DELETE FROM deployments WHERE app_id = ? AND environment_id = ? AND status IN ('STOPPED', 'FAILED')",
+    public void deleteFailedByAppAndEnvironment(UUID appId, UUID environmentId) {
+        jdbc.update("DELETE FROM deployments WHERE app_id = ? AND environment_id = ? AND status = 'FAILED'",
                appId, environmentId);
    }

@@ -129,6 +140,27 @@ public class PostgresDeploymentRepository implements DeploymentRepository {
        }
    }

+    public void saveDeployedConfigSnapshot(UUID id, DeploymentConfigSnapshot snapshot) {
+        try {
+            String json = snapshot != null ? objectMapper.writeValueAsString(snapshot) : null;
+            jdbc.update("UPDATE deployments SET deployed_config_snapshot = ?::jsonb WHERE id = ?", json, id);
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to serialize deployed_config_snapshot", e);
+        }
+    }

+    public Optional<Deployment> findLatestSuccessfulByAppAndEnv(UUID appId, UUID envId) {
+        // DEGRADED deploys also carry a snapshot (executor writes before the RUNNING/DEGRADED
+        // split), and represent a config that reached COMPLETE stage — restorable for the user.
+        var results = jdbc.query(
+                "SELECT " + SELECT_COLS + " FROM deployments "
+                + "WHERE app_id = ? AND environment_id = ? "
+                + "AND status IN ('RUNNING', 'DEGRADED') AND deployed_config_snapshot IS NOT NULL "
+                + "ORDER BY deployed_at DESC NULLS LAST LIMIT 1",
+                (rs, rowNum) -> mapRow(rs), appId, envId);
+        return results.isEmpty() ? Optional.empty() : Optional.of(results.get(0));
+    }

    public Optional<Deployment> findByContainerId(String containerId) {
        var results = jdbc.query(
                "SELECT " + SELECT_COLS + " FROM deployments WHERE replica_states::text LIKE ? " +

@@ -158,6 +190,15 @@ public class PostgresDeploymentRepository implements DeploymentRepository {
                throw new SQLException("Failed to deserialize resolved_config", e);
            }
        }
+        DeploymentConfigSnapshot deployedConfigSnapshot = null;
+        String snapshotJson = rs.getString("deployed_config_snapshot");
+        if (snapshotJson != null) {
+            try {
+                deployedConfigSnapshot = objectMapper.readValue(snapshotJson, DeploymentConfigSnapshot.class);
+            } catch (Exception e) {
+                throw new SQLException("Failed to deserialize deployed_config_snapshot", e);
+            }
+        }
        return new Deployment(
                UUID.fromString(rs.getString("id")),
                UUID.fromString(rs.getString("app_id")),

@@ -172,9 +213,11 @@ public class PostgresDeploymentRepository implements DeploymentRepository {
                rs.getString("container_name"),
                rs.getString("error_message"),
                resolvedConfig,
+                deployedConfigSnapshot,
                deployedAt != null ? deployedAt.toInstant() : null,
                stoppedAt != null ? stoppedAt.toInstant() : null,
-                rs.getTimestamp("created_at").toInstant()
+                rs.getTimestamp("created_at").toInstant(),
+                rs.getString("created_by")
        );
    }
}
@@ -1,6 +1,7 @@
 package com.cameleer.server.app.storage;

 import com.cameleer.server.core.runtime.Environment;
+import com.cameleer.server.core.runtime.EnvironmentColor;
 import com.cameleer.server.core.runtime.EnvironmentRepository;
 import com.fasterxml.jackson.core.type.TypeReference;
 import com.fasterxml.jackson.databind.ObjectMapper;

@@ -24,7 +25,8 @@ public class PostgresEnvironmentRepository implements EnvironmentRepository {
        this.objectMapper = objectMapper;
    }

-    private static final String SELECT_COLS = "id, slug, display_name, production, enabled, default_container_config, jar_retention_count, created_at";
+    private static final String SELECT_COLS =
+            "id, slug, display_name, production, enabled, default_container_config, jar_retention_count, color, created_at";

    @Override
    public List<Environment> findAll() {

@@ -58,9 +60,9 @@ public class PostgresEnvironmentRepository implements EnvironmentRepository {
    }

    @Override
-    public void update(UUID id, String displayName, boolean production, boolean enabled) {
-        jdbc.update("UPDATE environments SET display_name = ?, production = ?, enabled = ?, updated_at = now() WHERE id = ?",
-                displayName, production, enabled, id);
+    public void update(UUID id, String displayName, boolean production, boolean enabled, String color) {
+        jdbc.update("UPDATE environments SET display_name = ?, production = ?, enabled = ?, color = ?, updated_at = now() WHERE id = ?",
+                displayName, production, enabled, color, id);
    }

    @Override

@@ -93,6 +95,10 @@ public class PostgresEnvironmentRepository implements EnvironmentRepository {
        } catch (Exception e) { /* use empty default */ }
        int retentionRaw = rs.getInt("jar_retention_count");
        Integer jarRetentionCount = rs.wasNull() ? null : retentionRaw;
+        String color = rs.getString("color");
+        if (color == null || color.isBlank()) {
+            color = EnvironmentColor.DEFAULT;
+        }
        return new Environment(
                UUID.fromString(rs.getString("id")),
                rs.getString("slug"),

@@ -101,6 +107,7 @@ public class PostgresEnvironmentRepository implements EnvironmentRepository {
                rs.getBoolean("enabled"),
                config,
                jarRetentionCount,
+                color,
                rs.getTimestamp("created_at").toInstant()
        );
    }
@@ -18,6 +18,8 @@ spring:
  mvc:
    async:
      request-timeout: -1
+  mustache:
+    check-template-location: false
  jackson:
    serialization:
      write-dates-as-timestamps: false

@@ -53,6 +55,7 @@ cameleer:
      routingmode: ${CAMELEER_SERVER_RUNTIME_ROUTINGMODE:path}
      routingdomain: ${CAMELEER_SERVER_RUNTIME_ROUTINGDOMAIN:localhost}
      serverurl: ${CAMELEER_SERVER_RUNTIME_SERVERURL:}
      certresolver: ${CAMELEER_SERVER_RUNTIME_CERTRESOLVER:}
+      jardockervolume: ${CAMELEER_SERVER_RUNTIME_JARDOCKERVOLUME:}
    indexer:
      debouncems: ${CAMELEER_SERVER_INDEXER_DEBOUNCEMS:2000}

@@ -93,6 +96,10 @@ cameleer:
      notification-retention-days: ${CAMELEER_SERVER_ALERTING_NOTIFICATIONRETENTIONDAYS:30}
      webhook-timeout-ms: ${CAMELEER_SERVER_ALERTING_WEBHOOKTIMEOUTMS:5000}
      webhook-max-attempts: ${CAMELEER_SERVER_ALERTING_WEBHOOKMAXATTEMPTS:3}
+      # PER_EXCHANGE first-run cursor clamp: on the first tick with no persisted cursor, the
+      # evaluator scans no further back than (now - this cap). Prevents a one-time backlog flood
+      # for rules whose createdAt predates a migration. Set to 0 to disable and replay from createdAt.
+      per-exchange-deploy-backlog-cap-seconds: ${CAMELEER_SERVER_ALERTING_PEREXCHANGEDEPLOYBACKLOGCAPSECONDS:86400}
    outbound-http:
      trust-all: false
      trusted-ca-pem-paths: []

@@ -105,6 +112,10 @@ cameleer:
      url: ${CAMELEER_SERVER_CLICKHOUSE_URL:jdbc:clickhouse://localhost:8123/cameleer}
      username: ${CAMELEER_SERVER_CLICKHOUSE_USERNAME:default}
      password: ${CAMELEER_SERVER_CLICKHOUSE_PASSWORD:}
+    self-metrics:
+      enabled: ${CAMELEER_SERVER_SELFMETRICS_ENABLED:true}
+      interval-ms: ${CAMELEER_SERVER_SELFMETRICS_INTERVALMS:60000}
+      instance-id: ${CAMELEER_SERVER_INSTANCE_ID:}

springdoc:
  api-docs:
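The backlog cap reduces to a clamp on the evaluator's first scan window. A sketch of the intended arithmetic — the `rule.createdAt()` accessor and the relaxed-binding field name are assumptions:

    // First tick of a PER_EXCHANGE rule with no persisted cursor:
    Duration cap = Duration.ofSeconds(perExchangeDeployBacklogCapSeconds); // default 86400
    Instant floor = Instant.now().minus(cap);
    Instant scanFrom = cap.isZero()
            ? rule.createdAt()                                        // 0 → full replay from createdAt
            : (rule.createdAt().isBefore(floor) ? floor : rule.createdAt());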
@@ -132,7 +132,7 @@ SETTINGS index_granularity = 8192;

CREATE TABLE IF NOT EXISTS stats_1m_all (
    tenant_id LowCardinality(String),
-    bucket DateTime,
+    bucket DateTime('UTC'),
    environment LowCardinality(String) DEFAULT 'default',
    total_count AggregateFunction(uniq, String),
    failed_count AggregateFunction(uniqIf, String, UInt8),

@@ -149,7 +149,7 @@ TTL bucket + INTERVAL 365 DAY DELETE;
CREATE MATERIALIZED VIEW IF NOT EXISTS stats_1m_all_mv TO stats_1m_all AS
SELECT
    tenant_id,
-    toStartOfMinute(start_time) AS bucket,
+    toDateTime(toStartOfMinute(start_time), 'UTC') AS bucket,
    environment,
    uniqState(execution_id) AS total_count,
    uniqIfState(execution_id, status = 'FAILED') AS failed_count,

@@ -165,7 +165,7 @@ GROUP BY tenant_id, bucket, environment;
CREATE TABLE IF NOT EXISTS stats_1m_app (
    tenant_id LowCardinality(String),
    application_id LowCardinality(String),
-    bucket DateTime,
+    bucket DateTime('UTC'),
    environment LowCardinality(String) DEFAULT 'default',
    total_count AggregateFunction(uniq, String),
    failed_count AggregateFunction(uniqIf, String, UInt8),

@@ -183,7 +183,7 @@ CREATE MATERIALIZED VIEW IF NOT EXISTS stats_1m_app_mv TO stats_1m_app AS
SELECT
    tenant_id,
    application_id,
-    toStartOfMinute(start_time) AS bucket,
+    toDateTime(toStartOfMinute(start_time), 'UTC') AS bucket,
    environment,
    uniqState(execution_id) AS total_count,
    uniqIfState(execution_id, status = 'FAILED') AS failed_count,

@@ -200,7 +200,7 @@ CREATE TABLE IF NOT EXISTS stats_1m_route (
    tenant_id LowCardinality(String),
    application_id LowCardinality(String),
    route_id LowCardinality(String),
-    bucket DateTime,
+    bucket DateTime('UTC'),
    environment LowCardinality(String) DEFAULT 'default',
    total_count AggregateFunction(uniq, String),
    failed_count AggregateFunction(uniqIf, String, UInt8),

@@ -219,7 +219,7 @@ SELECT
    tenant_id,
    application_id,
    route_id,
-    toStartOfMinute(start_time) AS bucket,
+    toDateTime(toStartOfMinute(start_time), 'UTC') AS bucket,
    environment,
    uniqState(execution_id) AS total_count,
    uniqIfState(execution_id, status = 'FAILED') AS failed_count,

@@ -236,7 +236,7 @@ CREATE TABLE IF NOT EXISTS stats_1m_processor (
    tenant_id LowCardinality(String),
    application_id LowCardinality(String),
    processor_type LowCardinality(String),
-    bucket DateTime,
+    bucket DateTime('UTC'),
    environment LowCardinality(String) DEFAULT 'default',
    total_count AggregateFunction(uniq, String),
    failed_count AggregateFunction(uniqIf, String, UInt8),

@@ -254,7 +254,7 @@ SELECT
    tenant_id,
    application_id,
    processor_type,
-    toStartOfMinute(start_time) AS bucket,
+    toDateTime(toStartOfMinute(start_time), 'UTC') AS bucket,
    environment,
    uniqState(concat(execution_id, toString(seq))) AS total_count,
    uniqIfState(concat(execution_id, toString(seq)), status = 'FAILED') AS failed_count,

@@ -272,7 +272,7 @@ CREATE TABLE IF NOT EXISTS stats_1m_processor_detail (
    route_id LowCardinality(String),
    processor_id String,
    processor_type LowCardinality(String),
-    bucket DateTime,
+    bucket DateTime('UTC'),
    environment LowCardinality(String) DEFAULT 'default',
    total_count AggregateFunction(uniq, String),
    failed_count AggregateFunction(uniqIf, String, UInt8),

@@ -292,7 +292,7 @@ SELECT
    route_id,
    processor_id,
    processor_type,
-    toStartOfMinute(start_time) AS bucket,
+    toDateTime(toStartOfMinute(start_time), 'UTC') AS bucket,
    environment,
    uniqState(concat(execution_id, toString(seq))) AS total_count,
    uniqIfState(concat(execution_id, toString(seq)), status = 'FAILED') AS failed_count,

@@ -401,6 +401,29 @@ CREATE TABLE IF NOT EXISTS route_catalog (
ENGINE = ReplacingMergeTree(last_seen)
ORDER BY (tenant_id, environment, application_id, route_id);

-- ── Server Self-Metrics ────────────────────────────────────────────────
-- Periodic snapshot of the server's own Micrometer registry (written by
-- ServerMetricsSnapshotScheduler). No `environment` column — the server
-- straddles environments. `statistic` distinguishes Timer/DistributionSummary
-- sub-measurements (count, total_time, max, mean) from plain counter/gauge values.

CREATE TABLE IF NOT EXISTS server_metrics (
    tenant_id LowCardinality(String) DEFAULT 'default',
    collected_at DateTime64(3),
    server_instance_id LowCardinality(String),
    metric_name LowCardinality(String),
    metric_type LowCardinality(String),
    statistic LowCardinality(String) DEFAULT 'value',
    metric_value Float64,
    tags Map(String, String) DEFAULT map(),
    server_received_at DateTime64(3) DEFAULT now64(3)
)
ENGINE = MergeTree()
PARTITION BY (tenant_id, toYYYYMM(collected_at))
ORDER BY (tenant_id, collected_at, server_instance_id, metric_name, statistic)
TTL toDateTime(collected_at) + INTERVAL 90 DAY DELETE
SETTINGS index_granularity = 8192;

-- insert_id tiebreak for keyset pagination (fixes same-millisecond cursor collision).
-- IF NOT EXISTS on ADD COLUMN is idempotent. MATERIALIZE COLUMN is a background mutation,
-- effectively a no-op once all parts are already materialized.
@@ -1,2 +0,0 @@
ALTER TABLE app_versions ADD COLUMN detected_runtime_type VARCHAR;
ALTER TABLE app_versions ADD COLUMN detected_main_class VARCHAR;

@@ -1,30 +0,0 @@
-- V11 — Outbound connections (admin-managed HTTPS destinations)
-- See: docs/superpowers/specs/2026-04-19-alerting-design.md §6

CREATE TYPE trust_mode_enum AS ENUM ('SYSTEM_DEFAULT','TRUST_ALL','TRUST_PATHS');
CREATE TYPE outbound_method_enum AS ENUM ('POST','PUT','PATCH');
CREATE TYPE outbound_auth_kind_enum AS ENUM ('NONE','BEARER','BASIC');

CREATE TABLE outbound_connections (
    id uuid PRIMARY KEY,
    tenant_id varchar(64) NOT NULL,
    name varchar(100) NOT NULL,
    description text,
    url text NOT NULL CHECK (url ~ '^https://'),
    method outbound_method_enum NOT NULL,
    default_headers jsonb NOT NULL DEFAULT '{}',
    default_body_tmpl text,
    tls_trust_mode trust_mode_enum NOT NULL DEFAULT 'SYSTEM_DEFAULT',
    tls_ca_pem_paths jsonb NOT NULL DEFAULT '[]',
    hmac_secret_ciphertext text,
    auth_kind outbound_auth_kind_enum NOT NULL DEFAULT 'NONE',
    auth_config jsonb NOT NULL DEFAULT '{}',
    allowed_environment_ids uuid[] NOT NULL DEFAULT '{}',
    created_at timestamptz NOT NULL DEFAULT now(),
    created_by text NOT NULL REFERENCES users(user_id),
    updated_at timestamptz NOT NULL DEFAULT now(),
    updated_by text NOT NULL REFERENCES users(user_id),
    CONSTRAINT outbound_connections_name_unique_per_tenant UNIQUE (tenant_id, name)
);

CREATE INDEX outbound_connections_tenant_idx ON outbound_connections (tenant_id);

@@ -1,110 +0,0 @@
-- V12 — Alerting tables
-- Enums (outbound_method_enum / outbound_auth_kind_enum / trust_mode_enum already exist from V11)
CREATE TYPE severity_enum AS ENUM ('CRITICAL','WARNING','INFO');
CREATE TYPE condition_kind_enum AS ENUM ('ROUTE_METRIC','EXCHANGE_MATCH','AGENT_STATE','DEPLOYMENT_STATE','LOG_PATTERN','JVM_METRIC');
CREATE TYPE alert_state_enum AS ENUM ('PENDING','FIRING','ACKNOWLEDGED','RESOLVED');
CREATE TYPE target_kind_enum AS ENUM ('USER','GROUP','ROLE');
CREATE TYPE notification_status_enum AS ENUM ('PENDING','DELIVERED','FAILED');

CREATE TABLE alert_rules (
    id uuid PRIMARY KEY,
    environment_id uuid NOT NULL REFERENCES environments(id) ON DELETE CASCADE,
    name varchar(200) NOT NULL,
    description text,
    severity severity_enum NOT NULL,
    enabled boolean NOT NULL DEFAULT true,
    condition_kind condition_kind_enum NOT NULL,
    condition jsonb NOT NULL,
    evaluation_interval_seconds int NOT NULL DEFAULT 60 CHECK (evaluation_interval_seconds >= 5),
    for_duration_seconds int NOT NULL DEFAULT 0 CHECK (for_duration_seconds >= 0),
    re_notify_minutes int NOT NULL DEFAULT 60 CHECK (re_notify_minutes >= 0),
    notification_title_tmpl text NOT NULL,
    notification_message_tmpl text NOT NULL,
    webhooks jsonb NOT NULL DEFAULT '[]',
    next_evaluation_at timestamptz NOT NULL DEFAULT now(),
    claimed_by varchar(64),
    claimed_until timestamptz,
    eval_state jsonb NOT NULL DEFAULT '{}',
    created_at timestamptz NOT NULL DEFAULT now(),
    created_by text NOT NULL REFERENCES users(user_id),
    updated_at timestamptz NOT NULL DEFAULT now(),
    updated_by text NOT NULL REFERENCES users(user_id)
);
CREATE INDEX alert_rules_env_idx ON alert_rules (environment_id);
CREATE INDEX alert_rules_claim_due_idx ON alert_rules (next_evaluation_at) WHERE enabled = true;

CREATE TABLE alert_rule_targets (
    id uuid PRIMARY KEY,
    rule_id uuid NOT NULL REFERENCES alert_rules(id) ON DELETE CASCADE,
    target_kind target_kind_enum NOT NULL,
    target_id varchar(128) NOT NULL,
    UNIQUE (rule_id, target_kind, target_id)
);
CREATE INDEX alert_rule_targets_lookup_idx ON alert_rule_targets (target_kind, target_id);

CREATE TABLE alert_instances (
    id uuid PRIMARY KEY,
    rule_id uuid REFERENCES alert_rules(id) ON DELETE SET NULL,
    rule_snapshot jsonb NOT NULL,
    environment_id uuid NOT NULL REFERENCES environments(id) ON DELETE CASCADE,
    state alert_state_enum NOT NULL,
    severity severity_enum NOT NULL,
    fired_at timestamptz NOT NULL,
    acked_at timestamptz,
    acked_by text REFERENCES users(user_id),
    resolved_at timestamptz,
    last_notified_at timestamptz,
    silenced boolean NOT NULL DEFAULT false,
    current_value numeric,
    threshold numeric,
    context jsonb NOT NULL,
    title text NOT NULL,
|
||||
message text NOT NULL,
|
||||
target_user_ids text[] NOT NULL DEFAULT '{}',
|
||||
target_group_ids uuid[] NOT NULL DEFAULT '{}',
|
||||
target_role_names text[] NOT NULL DEFAULT '{}'
|
||||
);
|
||||
CREATE INDEX alert_instances_inbox_idx ON alert_instances (environment_id, state, fired_at DESC);
|
||||
CREATE INDEX alert_instances_open_rule_idx ON alert_instances (rule_id, state) WHERE rule_id IS NOT NULL;
|
||||
CREATE INDEX alert_instances_resolved_idx ON alert_instances (resolved_at) WHERE state = 'RESOLVED';
|
||||
CREATE INDEX alert_instances_target_u_idx ON alert_instances USING GIN (target_user_ids);
|
||||
CREATE INDEX alert_instances_target_g_idx ON alert_instances USING GIN (target_group_ids);
|
||||
CREATE INDEX alert_instances_target_r_idx ON alert_instances USING GIN (target_role_names);
|
||||
|
||||
CREATE TABLE alert_silences (
|
||||
id uuid PRIMARY KEY,
|
||||
environment_id uuid NOT NULL REFERENCES environments(id) ON DELETE CASCADE,
|
||||
matcher jsonb NOT NULL,
|
||||
reason text,
|
||||
starts_at timestamptz NOT NULL,
|
||||
ends_at timestamptz NOT NULL CHECK (ends_at > starts_at),
|
||||
created_by text NOT NULL REFERENCES users(user_id),
|
||||
created_at timestamptz NOT NULL DEFAULT now()
|
||||
);
|
||||
CREATE INDEX alert_silences_active_idx ON alert_silences (environment_id, ends_at);
|
||||
|
||||
CREATE TABLE alert_notifications (
|
||||
id uuid PRIMARY KEY,
|
||||
alert_instance_id uuid NOT NULL REFERENCES alert_instances(id) ON DELETE CASCADE,
|
||||
webhook_id uuid,
|
||||
outbound_connection_id uuid REFERENCES outbound_connections(id) ON DELETE SET NULL,
|
||||
status notification_status_enum NOT NULL DEFAULT 'PENDING',
|
||||
attempts int NOT NULL DEFAULT 0,
|
||||
next_attempt_at timestamptz NOT NULL DEFAULT now(),
|
||||
claimed_by varchar(64),
|
||||
claimed_until timestamptz,
|
||||
last_response_status int,
|
||||
last_response_snippet text,
|
||||
payload jsonb NOT NULL,
|
||||
delivered_at timestamptz,
|
||||
created_at timestamptz NOT NULL DEFAULT now()
|
||||
);
|
||||
CREATE INDEX alert_notifications_pending_idx ON alert_notifications (next_attempt_at) WHERE status = 'PENDING';
|
||||
CREATE INDEX alert_notifications_instance_idx ON alert_notifications (alert_instance_id);
|
||||
|
||||
CREATE TABLE alert_reads (
|
||||
user_id text NOT NULL REFERENCES users(user_id) ON DELETE CASCADE,
|
||||
alert_instance_id uuid NOT NULL REFERENCES alert_instances(id) ON DELETE CASCADE,
|
||||
read_at timestamptz NOT NULL DEFAULT now(),
|
||||
PRIMARY KEY (user_id, alert_instance_id)
|
||||
);
@@ -1,7 +0,0 @@
-- V13 — Unique partial index: at most one open alert_instance per rule
-- Prevents duplicate FIRING rows in multi-replica deployments.
-- The Java save() path catches DuplicateKeyException and log-and-skips the losing insert.
CREATE UNIQUE INDEX alert_instances_open_rule_uq
  ON alert_instances (rule_id)
  WHERE rule_id IS NOT NULL
    AND state IN ('PENDING','FIRING','ACKNOWLEDGED');
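
To make the race concrete, a hedged sketch with placeholder ids (the rule and environment rows would have to exist for the FKs to pass); running the same statement a second time while the first row is still open raises the unique_violation that save() swallows:

-- Sketch: two replicas racing to open an instance for one rule.
INSERT INTO alert_instances
  (id, rule_id, rule_snapshot, environment_id, state, severity,
   fired_at, context, title, message)
VALUES
  (gen_random_uuid(), '11111111-1111-1111-1111-111111111111', '{}'::jsonb,
   '22222222-2222-2222-2222-222222222222', 'FIRING', 'WARNING',
   now(), '{}'::jsonb, 'CPU high', 'threshold breached');
-- Re-running this while the first row is PENDING/FIRING/ACKNOWLEDGED
-- violates alert_instances_open_rule_uq; the loser is logged and skipped.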

@@ -1,34 +1,61 @@
-- V1__init.sql — PostgreSQL schema for Cameleer Server
-- PostgreSQL stores RBAC, configuration, and audit data only.
-- All observability data (executions, metrics, diagrams, logs, stats) is in ClickHouse.
--
-- PostgreSQL stores RBAC, configuration, audit, runtime management,
-- outbound connections, and alerting. All observability data
-- (executions, metrics, diagrams, logs, stats) lives in ClickHouse.
--
-- This file is the consolidated baseline — the project was kept greenfield
-- and the V1..V18 evolution was collapsed before first prod deployment.
-- See commit history for the ordered migration archaeology.

-- =============================================================
-- RBAC
-- Enums
-- =============================================================

CREATE TYPE alert_state_enum AS ENUM ('PENDING', 'FIRING', 'RESOLVED');
CREATE TYPE severity_enum AS ENUM ('CRITICAL', 'WARNING', 'INFO');
CREATE TYPE target_kind_enum AS ENUM ('USER', 'GROUP', 'ROLE');
CREATE TYPE notification_status_enum AS ENUM ('PENDING', 'DELIVERED', 'FAILED');

CREATE TYPE condition_kind_enum AS ENUM (
  'ROUTE_METRIC', 'EXCHANGE_MATCH', 'AGENT_STATE', 'AGENT_LIFECYCLE',
  'DEPLOYMENT_STATE', 'LOG_PATTERN', 'JVM_METRIC'
);

CREATE TYPE outbound_method_enum AS ENUM ('POST', 'PUT', 'PATCH');
CREATE TYPE outbound_auth_kind_enum AS ENUM ('NONE', 'BEARER', 'BASIC');
CREATE TYPE trust_mode_enum AS ENUM ('SYSTEM_DEFAULT', 'TRUST_ALL', 'TRUST_PATHS');

-- =============================================================
-- RBAC — users, roles, groups, assignments
-- =============================================================

CREATE TABLE users (
  user_id TEXT PRIMARY KEY,
  provider TEXT NOT NULL,
  email TEXT,
  display_name TEXT,
  password_hash TEXT,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
  user_id TEXT PRIMARY KEY,
  provider TEXT NOT NULL,
  email TEXT,
  display_name TEXT,
  password_hash TEXT,
  failed_login_attempts INTEGER NOT NULL DEFAULT 0,
  locked_until TIMESTAMPTZ,
  token_revoked_before TIMESTAMPTZ,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

CREATE TABLE roles (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  name TEXT NOT NULL UNIQUE,
  description TEXT NOT NULL DEFAULT '',
  scope TEXT NOT NULL DEFAULT 'custom',
  system BOOLEAN NOT NULL DEFAULT false,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

INSERT INTO roles (id, name, description, scope, system) VALUES
  ('00000000-0000-0000-0000-000000000001', 'AGENT', 'Agent registration and data ingestion', 'system-wide', true),
  ('00000000-0000-0000-0000-000000000002', 'VIEWER', 'Read-only access to dashboards and data', 'system-wide', true),
  ('00000000-0000-0000-0000-000000000003', 'OPERATOR', 'Operational commands (start/stop/configure agents)', 'system-wide', true),
  ('00000000-0000-0000-0000-000000000004', 'ADMIN', 'Full administrative access', 'system-wide', true);

CREATE TABLE groups (
@@ -37,58 +64,145 @@ CREATE TABLE groups (
  parent_group_id UUID REFERENCES groups(id) ON DELETE SET NULL,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
CREATE INDEX idx_groups_parent ON groups (parent_group_id);

INSERT INTO groups (id, name) VALUES
  ('00000000-0000-0000-0000-000000000010', 'Admins');

CREATE TABLE group_roles (
  group_id UUID NOT NULL REFERENCES groups(id) ON DELETE CASCADE,
  role_id UUID NOT NULL REFERENCES roles(id) ON DELETE CASCADE,
  PRIMARY KEY (group_id, role_id)
);
CREATE INDEX idx_group_roles_group_id ON group_roles (group_id);

INSERT INTO group_roles (group_id, role_id) VALUES
  ('00000000-0000-0000-0000-000000000010', '00000000-0000-0000-0000-000000000004');

CREATE TABLE user_groups (
  user_id TEXT NOT NULL REFERENCES users(user_id) ON DELETE CASCADE,
  group_id UUID NOT NULL REFERENCES groups(id) ON DELETE CASCADE,
  PRIMARY KEY (user_id, group_id)
-- Claim-mapping rules (OIDC). Declared before user_roles/user_groups
-- because those tables carry an FK to claim_mapping_rules.id.
CREATE TABLE claim_mapping_rules (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  claim TEXT NOT NULL,
  match_type TEXT NOT NULL,
  match_value TEXT NOT NULL,
  action TEXT NOT NULL,
  target TEXT NOT NULL,
  priority INTEGER NOT NULL DEFAULT 0,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  CONSTRAINT chk_match_type CHECK (match_type IN ('equals', 'contains', 'regex')),
  CONSTRAINT chk_action CHECK (action IN ('assignRole', 'addToGroup'))
);
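
As a hedged reading of how such a rule is meant to look (values invented, and whether `target` holds the group id or the group name is an assumption here): a rule mapping an OIDC `groups` claim containing `platform-admins` into the built-in Admins group could be seeded like this:

-- Illustrative rule: any token whose 'groups' claim contains
-- 'platform-admins' is added to the Admins group (id assumed as target).
INSERT INTO claim_mapping_rules (claim, match_type, match_value, action, target, priority)
VALUES ('groups', 'contains', 'platform-admins', 'addToGroup',
        '00000000-0000-0000-0000-000000000010', 10);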

CREATE TABLE user_roles (
  user_id TEXT NOT NULL REFERENCES users(user_id) ON DELETE CASCADE,
  role_id UUID NOT NULL REFERENCES roles(id) ON DELETE CASCADE,
  PRIMARY KEY (user_id, role_id)
  user_id TEXT NOT NULL REFERENCES users(user_id) ON DELETE CASCADE,
  role_id UUID NOT NULL REFERENCES roles(id) ON DELETE CASCADE,
  origin TEXT NOT NULL DEFAULT 'direct',
  mapping_id UUID REFERENCES claim_mapping_rules(id) ON DELETE CASCADE,
  PRIMARY KEY (user_id, role_id, origin)
);
CREATE INDEX idx_user_roles_user_id ON user_roles (user_id);
CREATE INDEX idx_user_roles_origin ON user_roles (user_id, origin);

CREATE INDEX idx_user_roles_user_id ON user_roles(user_id);
CREATE INDEX idx_user_groups_user_id ON user_groups(user_id);
CREATE INDEX idx_group_roles_group_id ON group_roles(group_id);
CREATE INDEX idx_groups_parent ON groups(parent_group_id);
CREATE TABLE user_groups (
  user_id TEXT NOT NULL REFERENCES users(user_id) ON DELETE CASCADE,
  group_id UUID NOT NULL REFERENCES groups(id) ON DELETE CASCADE,
  origin TEXT NOT NULL DEFAULT 'direct',
  mapping_id UUID REFERENCES claim_mapping_rules(id) ON DELETE CASCADE,
  PRIMARY KEY (user_id, group_id, origin)
);
CREATE INDEX idx_user_groups_user_id ON user_groups (user_id);
CREATE INDEX idx_user_groups_origin ON user_groups (user_id, origin);

-- =============================================================
-- Server configuration
-- =============================================================

CREATE TABLE server_config (
  config_key TEXT PRIMARY KEY,
  config_val JSONB NOT NULL,
  updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  updated_by TEXT
);

-- =============================================================
-- Application configuration
-- Runtime management — environments, apps, versions, deployments
-- =============================================================

CREATE TABLE environments (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  slug VARCHAR(100) NOT NULL UNIQUE,
  display_name VARCHAR(255) NOT NULL,
  production BOOLEAN NOT NULL DEFAULT false,
  enabled BOOLEAN NOT NULL DEFAULT true,
  default_container_config JSONB NOT NULL DEFAULT '{}'::jsonb,
  jar_retention_count INTEGER DEFAULT 5,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Default environment — standalone mode always has at least one
INSERT INTO environments (slug, display_name) VALUES ('default', 'Default');

CREATE TABLE apps (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  environment_id UUID NOT NULL REFERENCES environments(id) ON DELETE CASCADE,
  slug VARCHAR(100) NOT NULL,
  display_name VARCHAR(255) NOT NULL,
  container_config JSONB NOT NULL DEFAULT '{}'::jsonb,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  UNIQUE (environment_id, slug)
);
CREATE INDEX idx_apps_environment_id ON apps (environment_id);

CREATE TABLE app_versions (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  app_id UUID NOT NULL REFERENCES apps(id) ON DELETE CASCADE,
  version INTEGER NOT NULL,
  jar_path VARCHAR(500) NOT NULL,
  jar_checksum VARCHAR(64) NOT NULL,
  jar_filename VARCHAR(255),
  jar_size_bytes BIGINT,
  detected_runtime_type VARCHAR,
  detected_main_class VARCHAR,
  uploaded_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  UNIQUE (app_id, version)
);
CREATE INDEX idx_app_versions_app_id ON app_versions (app_id);

CREATE TABLE deployments (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  app_id UUID NOT NULL REFERENCES apps(id) ON DELETE CASCADE,
  app_version_id UUID NOT NULL REFERENCES app_versions(id),
  environment_id UUID NOT NULL REFERENCES environments(id),
  status VARCHAR(20) NOT NULL DEFAULT 'STARTING',
  target_state VARCHAR(20) NOT NULL DEFAULT 'RUNNING',
  deployment_strategy VARCHAR(20) NOT NULL DEFAULT 'BLUE_GREEN',
  deploy_stage VARCHAR(30),
  replica_states JSONB NOT NULL DEFAULT '[]'::jsonb,
  resolved_config JSONB,
  container_id VARCHAR(100),
  container_name VARCHAR(255),
  error_message TEXT,
  deployed_at TIMESTAMPTZ,
  stopped_at TIMESTAMPTZ,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
CREATE INDEX idx_deployments_app_id ON deployments (app_id);
CREATE INDEX idx_deployments_env_id ON deployments (environment_id);

-- =============================================================
-- Application configuration (env-scoped)
-- =============================================================

CREATE TABLE application_config (
  application TEXT NOT NULL,
  environment TEXT NOT NULL,
  config_val JSONB NOT NULL,
  version INTEGER NOT NULL DEFAULT 1,
  updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  updated_by TEXT,
  PRIMARY KEY (application, environment)
);

@@ -110,20 +224,169 @@ CREATE TABLE app_settings (
-- =============================================================

CREATE TABLE audit_log (
  id BIGSERIAL PRIMARY KEY,
  timestamp TIMESTAMPTZ NOT NULL DEFAULT now(),
  username TEXT NOT NULL,
  action TEXT NOT NULL,
  category TEXT NOT NULL,
  target TEXT,
  detail JSONB,
  result TEXT NOT NULL,
  ip_address TEXT,
  user_agent TEXT
);

CREATE INDEX idx_audit_log_timestamp ON audit_log (timestamp DESC);
CREATE INDEX idx_audit_log_username ON audit_log (username);
CREATE INDEX idx_audit_log_category ON audit_log (category);
CREATE INDEX idx_audit_log_action ON audit_log (action);
CREATE INDEX idx_audit_log_target ON audit_log (target);

-- =============================================================
-- Outbound connections (admin-managed HTTPS targets for webhooks)
-- =============================================================

CREATE TABLE outbound_connections (
  id UUID PRIMARY KEY,
  tenant_id VARCHAR(64) NOT NULL,
  name VARCHAR(100) NOT NULL,
  description TEXT,
  url TEXT NOT NULL,
  method outbound_method_enum NOT NULL,
  default_headers JSONB NOT NULL DEFAULT '{}'::jsonb,
  default_body_tmpl TEXT,
  tls_trust_mode trust_mode_enum NOT NULL DEFAULT 'SYSTEM_DEFAULT',
  tls_ca_pem_paths JSONB NOT NULL DEFAULT '[]'::jsonb,
  hmac_secret_ciphertext TEXT,
  auth_kind outbound_auth_kind_enum NOT NULL DEFAULT 'NONE',
  auth_config JSONB NOT NULL DEFAULT '{}'::jsonb,
  allowed_environment_ids UUID[] NOT NULL DEFAULT '{}'::uuid[],
  created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  created_by TEXT NOT NULL REFERENCES users(user_id),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  updated_by TEXT NOT NULL REFERENCES users(user_id),
  CONSTRAINT outbound_connections_name_unique_per_tenant UNIQUE (tenant_id, name),
  CONSTRAINT outbound_connections_url_check CHECK (url ~ '^https://')
);
CREATE INDEX outbound_connections_tenant_idx ON outbound_connections (tenant_id);

-- =============================================================
-- Alerting
-- =============================================================

CREATE TABLE alert_rules (
  id UUID NOT NULL PRIMARY KEY,
  environment_id UUID NOT NULL REFERENCES environments(id) ON DELETE CASCADE,
  name VARCHAR(200) NOT NULL,
  description TEXT,
  severity severity_enum NOT NULL,
  enabled BOOLEAN NOT NULL DEFAULT true,
  condition_kind condition_kind_enum NOT NULL,
  condition JSONB NOT NULL,
  evaluation_interval_seconds INTEGER NOT NULL DEFAULT 60 CHECK (evaluation_interval_seconds >= 5),
  for_duration_seconds INTEGER NOT NULL DEFAULT 0 CHECK (for_duration_seconds >= 0),
  re_notify_minutes INTEGER NOT NULL DEFAULT 60 CHECK (re_notify_minutes >= 0),
  notification_title_tmpl TEXT NOT NULL,
  notification_message_tmpl TEXT NOT NULL,
  webhooks JSONB NOT NULL DEFAULT '[]'::jsonb,
  next_evaluation_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  claimed_by VARCHAR(64),
  claimed_until TIMESTAMPTZ,
  eval_state JSONB NOT NULL DEFAULT '{}'::jsonb,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  created_by TEXT NOT NULL REFERENCES users(user_id),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  updated_by TEXT NOT NULL REFERENCES users(user_id)
);
CREATE INDEX alert_rules_env_idx ON alert_rules (environment_id);
CREATE INDEX alert_rules_claim_due_idx ON alert_rules (next_evaluation_at) WHERE enabled = true;
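
The claim columns plus the partial index suggest a lease-style poll by the evaluator; a minimal sketch, assuming a 30-second lease and that the server instance name goes into claimed_by (both assumptions, not confirmed by the schema):

-- Sketch: claim one due rule for this evaluator instance (lease model assumed).
UPDATE alert_rules
SET claimed_by = 'server-1', claimed_until = now() + interval '30 seconds'
WHERE id = (
  SELECT id FROM alert_rules
  WHERE enabled
    AND next_evaluation_at <= now()
    AND (claimed_until IS NULL OR claimed_until < now())
  ORDER BY next_evaluation_at
  LIMIT 1
  FOR UPDATE SKIP LOCKED
)
RETURNING id;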

CREATE TABLE alert_rule_targets (
  id UUID NOT NULL PRIMARY KEY,
  rule_id UUID NOT NULL REFERENCES alert_rules(id) ON DELETE CASCADE,
  target_kind target_kind_enum NOT NULL,
  target_id VARCHAR(128) NOT NULL,
  UNIQUE (rule_id, target_kind, target_id)
);
CREATE INDEX alert_rule_targets_lookup_idx ON alert_rule_targets (target_kind, target_id);

CREATE TABLE alert_instances (
  id UUID NOT NULL PRIMARY KEY,
  rule_id UUID REFERENCES alert_rules(id) ON DELETE SET NULL,
  rule_snapshot JSONB NOT NULL,
  environment_id UUID NOT NULL REFERENCES environments(id) ON DELETE CASCADE,
  state alert_state_enum NOT NULL,
  severity severity_enum NOT NULL,
  fired_at TIMESTAMPTZ NOT NULL,
  acked_at TIMESTAMPTZ,
  acked_by TEXT REFERENCES users(user_id),
  resolved_at TIMESTAMPTZ,
  last_notified_at TIMESTAMPTZ,
  silenced BOOLEAN NOT NULL DEFAULT false,
  current_value NUMERIC,
  threshold NUMERIC,
  context JSONB NOT NULL,
  title TEXT NOT NULL,
  message TEXT NOT NULL,
  target_user_ids TEXT[] NOT NULL DEFAULT '{}'::text[],
  target_group_ids UUID[] NOT NULL DEFAULT '{}'::uuid[],
  target_role_names TEXT[] NOT NULL DEFAULT '{}'::text[],
  read_at TIMESTAMPTZ,
  deleted_at TIMESTAMPTZ
);
CREATE INDEX alert_instances_inbox_idx ON alert_instances (environment_id, state, fired_at DESC);
CREATE INDEX alert_instances_open_rule_idx ON alert_instances (rule_id, state) WHERE rule_id IS NOT NULL;
CREATE INDEX alert_instances_resolved_idx ON alert_instances (resolved_at) WHERE state = 'RESOLVED';
CREATE INDEX alert_instances_unread_idx ON alert_instances (environment_id, read_at)
  WHERE read_at IS NULL AND deleted_at IS NULL;
CREATE INDEX alert_instances_deleted_idx ON alert_instances (deleted_at) WHERE deleted_at IS NOT NULL;
CREATE INDEX alert_instances_target_u_idx ON alert_instances USING GIN (target_user_ids);
CREATE INDEX alert_instances_target_g_idx ON alert_instances USING GIN (target_group_ids);
CREATE INDEX alert_instances_target_r_idx ON alert_instances USING GIN (target_role_names);

-- Per-rule open-instance uniqueness. The discriminator prefers
-- context->>'_subjectFingerprint' (populated by the evaluator for
-- PER_EXCHANGE / PER_AGENT condition kinds), with a fall-through to
-- the legacy exchange-id path for rules predating the fingerprint.
-- Scalar kinds resolve to '' and retain strict one-open-per-rule.
-- Soft-deleted rows (deleted_at IS NOT NULL) free the slot so a
-- deleted alert can be re-raised.
CREATE UNIQUE INDEX alert_instances_open_rule_uq
  ON alert_instances (rule_id, COALESCE(
    context->>'_subjectFingerprint',
    context->'exchange'->>'id',
    ''))
  WHERE rule_id IS NOT NULL
    AND state IN ('PENDING', 'FIRING')
    AND deleted_at IS NULL;
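
The discriminator can be inspected per row; this sketch simply selects the same expression the index computes, showing which "open slot" each instance occupies:

-- Sketch: the per-row key the unique index enforces on open instances.
SELECT id,
       rule_id,
       COALESCE(context->>'_subjectFingerprint',
                context->'exchange'->>'id',
                '') AS open_slot_key
FROM alert_instances
WHERE state IN ('PENDING', 'FIRING') AND deleted_at IS NULL;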

CREATE TABLE alert_silences (
  id UUID NOT NULL PRIMARY KEY,
  environment_id UUID NOT NULL REFERENCES environments(id) ON DELETE CASCADE,
  matcher JSONB NOT NULL,
  reason TEXT,
  starts_at TIMESTAMPTZ NOT NULL,
  ends_at TIMESTAMPTZ NOT NULL,
  created_by TEXT NOT NULL REFERENCES users(user_id),
  created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  CHECK (ends_at > starts_at)
);
CREATE INDEX alert_silences_active_idx ON alert_silences (environment_id, ends_at);

CREATE TABLE alert_notifications (
  id UUID NOT NULL PRIMARY KEY,
  alert_instance_id UUID NOT NULL REFERENCES alert_instances(id) ON DELETE CASCADE,
  webhook_id UUID,
  outbound_connection_id UUID REFERENCES outbound_connections(id) ON DELETE SET NULL,
  status notification_status_enum NOT NULL DEFAULT 'PENDING',
  attempts INTEGER NOT NULL DEFAULT 0,
  next_attempt_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  claimed_by VARCHAR(64),
  claimed_until TIMESTAMPTZ,
  last_response_status INTEGER,
  last_response_snippet TEXT,
  payload JSONB NOT NULL,
  delivered_at TIMESTAMPTZ,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
CREATE INDEX alert_notifications_instance_idx ON alert_notifications (alert_instance_id);
CREATE INDEX alert_notifications_pending_idx ON alert_notifications (next_attempt_at) WHERE status = 'PENDING';
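
The pending partial index pairs naturally with a claim-and-retry dispatcher loop; a sketch under the same lease assumptions as above (batch size, lease length, and instance name are all invented):

-- Sketch: claim a batch of due PENDING notifications for one dispatch pass.
UPDATE alert_notifications
SET claimed_by = 'server-1', claimed_until = now() + interval '60 seconds'
WHERE id IN (
  SELECT id FROM alert_notifications
  WHERE status = 'PENDING'
    AND next_attempt_at <= now()
    AND (claimed_until IS NULL OR claimed_until < now())
  ORDER BY next_attempt_at
  LIMIT 10
  FOR UPDATE SKIP LOCKED
)
RETURNING id;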

@@ -0,0 +1,6 @@
-- V2: per-environment color for UI indicator
-- Added after V1 baseline (2026-04-22). 8-swatch preset palette; default 'slate'.

ALTER TABLE environments
  ADD COLUMN color VARCHAR(16) NOT NULL DEFAULT 'slate'
  CHECK (color IN ('slate','red','amber','green','teal','blue','purple','pink'));
@@ -1,39 +0,0 @@
-- V2__claim_mapping.sql
-- Add origin tracking to assignment tables

ALTER TABLE user_roles ADD COLUMN origin TEXT NOT NULL DEFAULT 'direct';
ALTER TABLE user_roles ADD COLUMN mapping_id UUID;

ALTER TABLE user_groups ADD COLUMN origin TEXT NOT NULL DEFAULT 'direct';
ALTER TABLE user_groups ADD COLUMN mapping_id UUID;

-- Drop old primary keys (they don't include origin)
ALTER TABLE user_roles DROP CONSTRAINT user_roles_pkey;
ALTER TABLE user_roles ADD PRIMARY KEY (user_id, role_id, origin);

ALTER TABLE user_groups DROP CONSTRAINT user_groups_pkey;
ALTER TABLE user_groups ADD PRIMARY KEY (user_id, group_id, origin);

-- Claim mapping rules table
CREATE TABLE claim_mapping_rules (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  claim TEXT NOT NULL,
  match_type TEXT NOT NULL,
  match_value TEXT NOT NULL,
  action TEXT NOT NULL,
  target TEXT NOT NULL,
  priority INT NOT NULL DEFAULT 0,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  CONSTRAINT chk_match_type CHECK (match_type IN ('equals', 'contains', 'regex')),
  CONSTRAINT chk_action CHECK (action IN ('assignRole', 'addToGroup'))
);

-- Foreign key from assignments to mapping rules
ALTER TABLE user_roles ADD CONSTRAINT fk_user_roles_mapping
  FOREIGN KEY (mapping_id) REFERENCES claim_mapping_rules(id) ON DELETE CASCADE;
ALTER TABLE user_groups ADD CONSTRAINT fk_user_groups_mapping
  FOREIGN KEY (mapping_id) REFERENCES claim_mapping_rules(id) ON DELETE CASCADE;

-- Index for fast managed assignment cleanup
CREATE INDEX idx_user_roles_origin ON user_roles(user_id, origin);
CREATE INDEX idx_user_groups_origin ON user_groups(user_id, origin);
@@ -0,0 +1,7 @@
-- V3: per-deployment config snapshot for "last known good" + dirty detection
-- Captures {jarVersionId, agentConfig, containerConfig} at the moment a
-- deployment transitions to RUNNING. Historical rows are NULL; dirty detection
-- treats NULL as "everything dirty" and the next successful Redeploy populates it.

ALTER TABLE deployments
  ADD COLUMN deployed_config_snapshot JSONB;
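
A hedged sketch of the capture at the RUNNING transition; the top-level key names mirror the comment above, but the source columns for the nested values and the snapshot's exact shape are assumptions:

-- Sketch: record last-known-good config when a deployment goes RUNNING.
UPDATE deployments
SET deployed_config_snapshot = jsonb_build_object(
      'jarVersionId', app_version_id,
      'agentConfig', resolved_config->'agentConfig',
      'containerConfig', resolved_config->'containerConfig')
WHERE id = '33333333-3333-3333-3333-333333333333';  -- placeholder id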

@@ -1,54 +0,0 @@
-- V3__runtime_management.sql
-- Runtime management: environments, apps, app versions, deployments

CREATE TABLE environments (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  slug VARCHAR(100) NOT NULL UNIQUE,
  display_name VARCHAR(255) NOT NULL,
  status VARCHAR(20) NOT NULL DEFAULT 'ACTIVE',
  created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

CREATE TABLE apps (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  environment_id UUID NOT NULL REFERENCES environments(id) ON DELETE CASCADE,
  slug VARCHAR(100) NOT NULL,
  display_name VARCHAR(255) NOT NULL,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  UNIQUE(environment_id, slug)
);
CREATE INDEX idx_apps_environment_id ON apps(environment_id);

CREATE TABLE app_versions (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  app_id UUID NOT NULL REFERENCES apps(id) ON DELETE CASCADE,
  version INTEGER NOT NULL,
  jar_path VARCHAR(500) NOT NULL,
  jar_checksum VARCHAR(64) NOT NULL,
  jar_filename VARCHAR(255),
  jar_size_bytes BIGINT,
  uploaded_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  UNIQUE(app_id, version)
);
CREATE INDEX idx_app_versions_app_id ON app_versions(app_id);

CREATE TABLE deployments (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  app_id UUID NOT NULL REFERENCES apps(id) ON DELETE CASCADE,
  app_version_id UUID NOT NULL REFERENCES app_versions(id),
  environment_id UUID NOT NULL REFERENCES environments(id),
  status VARCHAR(20) NOT NULL DEFAULT 'STARTING',
  container_id VARCHAR(100),
  container_name VARCHAR(255),
  error_message TEXT,
  deployed_at TIMESTAMPTZ,
  stopped_at TIMESTAMPTZ,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
CREATE INDEX idx_deployments_app_id ON deployments(app_id);
CREATE INDEX idx_deployments_env_id ON deployments(environment_id);

-- Default environment (standalone mode always has at least one)
INSERT INTO environments (slug, display_name) VALUES ('default', 'Default');
@@ -0,0 +1,8 @@
-- V4: add created_by column to deployments for audit trail
-- Captures which user initiated a deployment. Nullable for backwards compatibility;
-- pre-V4 historical deployments will have NULL.

ALTER TABLE deployments
  ADD COLUMN created_by TEXT REFERENCES users(user_id);

CREATE INDEX idx_deployments_created_by ON deployments (created_by);
@@ -1,6 +0,0 @@
-- V4__environment_config.sql
-- Add production flag and enabled flag to environments, drop unused status column

ALTER TABLE environments ADD COLUMN production BOOLEAN NOT NULL DEFAULT false;
ALTER TABLE environments ADD COLUMN enabled BOOLEAN NOT NULL DEFAULT true;
ALTER TABLE environments DROP COLUMN status;
@@ -1,4 +0,0 @@
-- Add container config to apps and environment defaults
ALTER TABLE apps ADD COLUMN container_config JSONB NOT NULL DEFAULT '{}';

ALTER TABLE environments ADD COLUMN default_container_config JSONB NOT NULL DEFAULT '{}';
@@ -1 +0,0 @@
ALTER TABLE environments ADD COLUMN jar_retention_count INTEGER DEFAULT 5;
@@ -1,12 +0,0 @@
-- Deployment orchestration: status model, replicas, strategies, progress tracking

ALTER TABLE deployments ADD COLUMN target_state VARCHAR(20) NOT NULL DEFAULT 'RUNNING';
ALTER TABLE deployments ADD COLUMN deployment_strategy VARCHAR(20) NOT NULL DEFAULT 'BLUE_GREEN';
ALTER TABLE deployments ADD COLUMN replica_states JSONB NOT NULL DEFAULT '[]';
ALTER TABLE deployments ADD COLUMN deploy_stage VARCHAR(30);

-- Backfill existing deployments
UPDATE deployments SET target_state = CASE
  WHEN status = 'STOPPED' THEN 'STOPPED'
  ELSE 'RUNNING'
END;
@@ -1 +0,0 @@
ALTER TABLE deployments ADD COLUMN resolved_config JSONB;
@@ -1,3 +0,0 @@
ALTER TABLE users ADD COLUMN IF NOT EXISTS failed_login_attempts INTEGER NOT NULL DEFAULT 0;
ALTER TABLE users ADD COLUMN IF NOT EXISTS locked_until TIMESTAMPTZ;
ALTER TABLE users ADD COLUMN IF NOT EXISTS token_revoked_before TIMESTAMPTZ;
@@ -1,10 +1,7 @@
package com.cameleer.server.app;

import com.cameleer.server.app.search.ClickHouseSearchIndex;
import com.cameleer.server.core.agent.AgentRegistryService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.test.context.ActiveProfiles;
import org.springframework.test.context.DynamicPropertyRegistry;
@@ -17,12 +14,6 @@ import org.testcontainers.containers.PostgreSQLContainer;
@ActiveProfiles("test")
public abstract class AbstractPostgresIT {

    // Mocked infrastructure beans required by the full application context.
    // ClickHouseSearchIndex is not available in test without explicit ClickHouse wiring,
    // and AgentRegistryService requires in-memory state that tests manage directly.
    @MockBean(name = "clickHouseSearchIndex") protected ClickHouseSearchIndex clickHouseSearchIndex;
    @MockBean protected AgentRegistryService agentRegistryService;

    static final PostgreSQLContainer<?> postgres;
    static final ClickHouseContainer clickhouse;

@@ -30,10 +21,12 @@ public abstract class AbstractPostgresIT {
        postgres = new PostgreSQLContainer<>("postgres:16")
                .withDatabaseName("cameleer")
                .withUsername("cameleer")
                .withPassword("test");
                .withPassword("test")
                .withReuse(true);
        postgres.start();

        clickhouse = new ClickHouseContainer("clickhouse/clickhouse-server:24.12");
        clickhouse = new ClickHouseContainer("clickhouse/clickhouse-server:24.12")
                .withReuse(true);
        clickhouse.start();
    }

@@ -0,0 +1,16 @@
package com.cameleer.server.app;

import org.junit.jupiter.api.Test;

/**
 * Asserts the full production Spring context loads without any @MockBean
 * replacements. This is the regression test for the #141 crashloop — declared
 * bean types must match autowire expectations end-to-end.
 */
class SpringContextSmokeIT extends AbstractPostgresIT {

    @Test
    void contextLoads() {
        // no-op: @SpringBootTest refresh is the assertion
    }
}
@@ -3,6 +3,7 @@ package com.cameleer.server.app.alerting;
import com.cameleer.server.app.AbstractPostgresIT;
import com.cameleer.server.app.TestSecurityHelper;
import com.cameleer.server.app.search.ClickHouseLogStore;
import com.cameleer.server.core.agent.AgentRegistryService;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -26,7 +27,7 @@ import static org.mockito.Mockito.when;
 */
class AlertingEnvIsolationIT extends AbstractPostgresIT {

    // AbstractPostgresIT already declares clickHouseSearchIndex + agentRegistryService mocks.
    @MockBean AgentRegistryService agentRegistryService;
    @MockBean(name = "clickHouseLogStore") ClickHouseLogStore clickHouseLogStore;

    @Autowired private TestRestTemplate restTemplate;

@@ -7,8 +7,10 @@ import com.cameleer.server.app.alerting.eval.AlertEvaluatorJob;
import com.cameleer.server.app.alerting.notify.NotificationDispatchJob;
import com.cameleer.server.app.outbound.crypto.SecretCipher;
import com.cameleer.server.app.search.ClickHouseLogStore;
import com.cameleer.server.app.storage.ClickHouseExecutionStore;
import com.cameleer.server.core.alerting.*;
import com.cameleer.server.core.ingestion.BufferedLogEntry;
import com.cameleer.server.core.ingestion.MergedExecution;
import com.cameleer.server.core.outbound.OutboundConnectionRepository;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -49,8 +51,6 @@ import static org.assertj.core.api.Assertions.assertThat;
@TestInstance(Lifecycle.PER_CLASS)
class AlertingFullLifecycleIT extends AbstractPostgresIT {

    // AbstractPostgresIT already declares clickHouseSearchIndex + agentRegistryService mocks.

    // Replace the alertingClock bean so we can control time in re-notify test
    @MockBean(name = "alertingClock") Clock alertingClock;

@@ -64,6 +64,7 @@ class AlertingFullLifecycleIT extends AbstractPostgresIT {
    @Autowired private AlertSilenceRepository silenceRepo;
    @Autowired private OutboundConnectionRepository outboundRepo;
    @Autowired private ClickHouseLogStore logStore;
    @Autowired private ClickHouseExecutionStore executionStore;
    @Autowired private SecretCipher secretCipher;
    @Autowired private TestRestTemplate restTemplate;
    @Autowired private TestSecurityHelper securityHelper;
@@ -245,11 +246,11 @@

        assertThat(resp.getStatusCode()).isEqualTo(HttpStatus.OK);
        JsonNode body = objectMapper.readTree(resp.getBody());
        assertThat(body.path("state").asText()).isEqualTo("ACKNOWLEDGED");
        assertThat(body.path("state").asText()).isEqualTo("FIRING");

        // DB state
        AlertInstance updated = instanceRepo.findById(instanceId).orElseThrow();
        assertThat(updated.state()).isEqualTo(AlertState.ACKNOWLEDGED);
        assertThat(updated.state()).isEqualTo(AlertState.FIRING);
    }

    @Test
@@ -401,6 +402,102 @@ class AlertingFullLifecycleIT extends AbstractPostgresIT {
        jdbcTemplate.update("DELETE FROM alert_rules WHERE id = ?", reNotifyRuleId);
    }

    /**
     * Exactly-once-per-exchange end-to-end lifecycle.
     * <p>
     * 5 FAILED exchanges across 2 evaluator ticks must produce exactly
     * 5 FIRING instances + 5 PENDING notifications (one per exchange, one webhook).
     * A third tick with no new exchanges must be a no-op. Acking one instance
     * must leave the other four untouched.
     * <p>
     * Exercises the full Phase-1+2+3 stack: evaluator cursor persistence across
     * ticks, per-tick rollback isolation, and the ack-doesn't-cascade invariant.
     * See: docs/superpowers/plans/2026-04-22-per-exchange-exactly-once.md
     */
    @Test
    @Order(7)
    void perExchange_5FailuresAcross2Ticks_exactlyOncePerExchange() {
        // Relative-to-now timestamps so they fall inside the evaluator's
        // [rule.createdAt .. ctx.now()] window. Using Instant.parse(...) would
        // require reconciling with the mocked alertingClock AND rule.createdAt,
        // which is wall-clock in createPerExchangeRuleWithWebhook.
        Instant base = Instant.now().minusSeconds(30);

        // Pin the mocked alertingClock to current wall time so ctx.now() is >
        // every seeded execution timestamp (base + 0..4s) AND > rule.createdAt
        // (now - 60s). Prior tests may have set simulatedNow far in the past
        // (step1 used wall time but step6 advanced by 61s — test ordering means
        // the last value lingers). Re-pinning here makes the window deterministic.
        setSimulatedNow(Instant.now());

        UUID perExRuleId = createPerExchangeRuleWithWebhook();

        // ── Tick 1 — seed 3, tick ────────────────────────────────────────────
        seedFailedExecution("ex1-exec-1", base);
        seedFailedExecution("ex1-exec-2", base.plusSeconds(1));
        seedFailedExecution("ex1-exec-3", base.plusSeconds(2));
        evaluatorJob.tick();

        // ── Tick 2 — seed 2 more, tick ───────────────────────────────────────
        seedFailedExecution("ex1-exec-4", base.plusSeconds(3));
        seedFailedExecution("ex1-exec-5", base.plusSeconds(4));
        // Re-open the rule claim so it's due for tick 2.
        jdbcTemplate.update(
            "UPDATE alert_rules SET next_evaluation_at = now() - interval '1 second', " +
            "claimed_by = NULL, claimed_until = NULL WHERE id = ?", perExRuleId);
        evaluatorJob.tick();

        // Assert: 5 instances, 5 PENDING notifications.
        List<UUID> instanceIds = instanceIdsForRule(perExRuleId);
        assertThat(instanceIds)
            .as("5 FAILED exchanges across 2 ticks must produce exactly 5 FIRING instances")
            .hasSize(5);
        List<AlertNotification> allNotifs = notificationsForRule(perExRuleId);
        assertThat(allNotifs)
            .as("5 instances × 1 webhook must produce exactly 5 notifications")
            .hasSize(5);
        assertThat(allNotifs.stream().allMatch(n -> n.status() == NotificationStatus.PENDING))
            .as("all notifications must be PENDING before dispatch")
            .isTrue();

        // ── Dispatch all pending, then tick 3 — expect no change ────────────
        dispatchAllPending();
        // Re-open the rule claim so it's due for tick 3.
        jdbcTemplate.update(
            "UPDATE alert_rules SET next_evaluation_at = now() - interval '1 second', " +
            "claimed_by = NULL, claimed_until = NULL WHERE id = ?", perExRuleId);
        evaluatorJob.tick();

        assertThat(instanceIdsForRule(perExRuleId))
            .as("tick 3 with no new exchanges must not create new instances")
            .hasSize(5);
        long pending = notificationsForRule(perExRuleId).stream()
            .filter(n -> n.status() == NotificationStatus.PENDING)
            .count();
        assertThat(pending)
            .as("tick 3 must not re-enqueue notifications — all prior were dispatched")
            .isZero();

        // ── Ack one — others unchanged ──────────────────────────────────────
        UUID firstInstanceId = instanceIds.get(0);
        instanceRepo.ack(firstInstanceId, "test-operator", Instant.now());

        List<AlertInstance> all = instanceIds.stream()
            .map(id -> instanceRepo.findById(id).orElseThrow())
            .toList();
        long ackedCount = all.stream().filter(i -> i.ackedBy() != null).count();
        assertThat(ackedCount)
            .as("ack on one instance must not cascade to peers")
            .isEqualTo(1);

        // Cleanup — the @AfterAll cleans by envId which covers us, but be explicit.
        jdbcTemplate.update("DELETE FROM alert_notifications WHERE alert_instance_id IN " +
            "(SELECT id FROM alert_instances WHERE rule_id = ?)", perExRuleId);
        jdbcTemplate.update("DELETE FROM alert_instances WHERE rule_id = ?", perExRuleId);
        jdbcTemplate.update("DELETE FROM alert_rule_targets WHERE rule_id = ?", perExRuleId);
        jdbcTemplate.update("DELETE FROM alert_rules WHERE id = ?", perExRuleId);
    }

    // ── Helpers ───────────────────────────────────────────────────────────────

    /** POST the main lifecycle rule via REST API. Returns the created rule ID. */
@@ -515,4 +612,96 @@ class AlertingFullLifecycleIT extends AbstractPostgresIT {
        logStore.insertBufferedBatch(List.of(
            new BufferedLogEntry(tenantId, envSlug, "lc-agent-01", "lc-app", entry)));
    }

    // ── Helpers for perExchange exactly-once test ────────────────────────────

    private static final String PER_EX_APP_SLUG = "per-ex-app";

    /**
     * Create a PER_EXCHANGE rule bound to {@link #PER_EX_APP_SLUG} that fires on
     * {@code status=FAILED} and enqueues one notification per match via the
     * pre-seeded webhook connection ({@link #connId}). Returns the new rule id.
     * <p>
     * Replicates the pattern from {@code AlertEvaluatorJobIT#createPerExchangeRuleWithWebhook}
     * but reuses this test's env + outbound connection.
     */
    private UUID createPerExchangeRuleWithWebhook() {
        UUID rid = UUID.randomUUID();
        Instant now = Instant.now();
        var condition = new ExchangeMatchCondition(
            new AlertScope(PER_EX_APP_SLUG, null, null),
            new ExchangeMatchCondition.ExchangeFilter("FAILED", Map.of()),
            FireMode.PER_EXCHANGE, null, null);
        var webhook = new WebhookBinding(connId, null, null, Map.of());
        var rule = new AlertRule(
            rid, envId, "per-ex-lc-rule-" + rid, null,
            AlertSeverity.WARNING, true, ConditionKind.EXCHANGE_MATCH, condition,
            60, 0, 60,
            "Exchange FAILED: {{exchange.id}}", "route={{exchange.routeId}}",
            List.of(webhook), List.of(),
            now.minusSeconds(5), // due now
            null, null, Map.of(),
            now.minusSeconds(60), "test-operator", // createdAt bounds first-run cursor
            now.minusSeconds(60), "test-operator");
        ruleRepo.save(rule);
        return rid;
    }

    /**
     * Seed one FAILED execution into ClickHouse, scoped to this test's tenant/env/app
     * so it's picked up by a PER_EXCHANGE rule targeting {@link #PER_EX_APP_SLUG}.
     */
    private void seedFailedExecution(String executionId, Instant startTime) {
        executionStore.insertExecutionBatch(List.of(new MergedExecution(
            tenantId, 1L, executionId, "route-a", "inst-1", PER_EX_APP_SLUG, envSlug,
            "FAILED", "", "exchange-" + executionId,
            startTime, startTime.plusMillis(100), 100L,
            "", "", "", "", "", "", // error fields
            "", "FULL", // diagramContentHash, engineLevel
            "", "", "", "", "", "", // bodies / headers / properties
            "{}", // attributes (JSON)
            "", "", // traceId, spanId
            false, false,
            null, null
        )));
    }

    /** All instance ids for a rule, ordered by fired_at ascending (deterministic). */
    private List<UUID> instanceIdsForRule(UUID rid) {
        return jdbcTemplate.queryForList(
            "SELECT id FROM alert_instances WHERE rule_id = ? ORDER BY fired_at ASC",
            UUID.class, rid);
    }

    /** All notifications across every instance of a rule. */
    private List<AlertNotification> notificationsForRule(UUID rid) {
        List<UUID> ids = instanceIdsForRule(rid);
        List<AlertNotification> out = new java.util.ArrayList<>();
        for (UUID iid : ids) {
            out.addAll(notificationRepo.listForInstance(iid));
        }
        return out;
    }

    /**
     * Simulate a dispatch pass without hitting the real webhook — marks every
     * PENDING notification for this rule as DELIVERED. Using
     * {@code dispatchJob.tick()} would round-trip through WireMock and require
     * extra plumbing; the exactly-once contract under test is about the
     * evaluator re-enqueueing behaviour, not webhook delivery.
     */
    private void dispatchAllPending() {
        Instant now = Instant.now();
        // Drain PENDING notifications across the whole env (safe because the
        // ackedBy-scoped assertions further down look at this rule only).
        List<UUID> pendingIds = jdbcTemplate.queryForList(
            "SELECT n.id FROM alert_notifications n " +
            "JOIN alert_instances i ON n.alert_instance_id = i.id " +
            "WHERE i.environment_id = ? " +
            "AND n.status = 'PENDING'::notification_status_enum",
            UUID.class, envId);
        for (UUID nid : pendingIds) {
            notificationRepo.markDelivered(nid, 200, "OK", now);
        }
    }
}

@@ -3,6 +3,7 @@ package com.cameleer.server.app.alerting;
import com.cameleer.server.app.AbstractPostgresIT;
import com.cameleer.server.app.TestSecurityHelper;
import com.cameleer.server.app.search.ClickHouseLogStore;
import com.cameleer.server.core.agent.AgentRegistryService;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.junit.jupiter.api.AfterEach;
@@ -31,7 +32,7 @@ import static org.mockito.Mockito.when;
 */
class OutboundConnectionAllowedEnvIT extends AbstractPostgresIT {

    // AbstractPostgresIT already declares clickHouseSearchIndex + agentRegistryService mocks.
    @MockBean AgentRegistryService agentRegistryService;
    @MockBean(name = "clickHouseLogStore") ClickHouseLogStore clickHouseLogStore;

    @Autowired private TestRestTemplate restTemplate;
Some files were not shown because too many files have changed in this diff.