From eda74b7339bae9ff37b034bdf42e1389eb26589a Mon Sep 17 00:00:00 2001
From: hsiegeln <37154749+hsiegeln@users.noreply.github.com>
Date: Wed, 22 Apr 2026 18:39:49 +0200
Subject: [PATCH] =?UTF-8?q?docs(alerting):=20PER=5FEXCHANGE=20exactly-once?=
 =?UTF-8?q?=20=E2=80=94=20fireMode=20reference=20+=20deploy-backlog-cap?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix stale `AGGREGATE` label (actual enum: `COUNT_IN_WINDOW`). Expand
EXCHANGE_MATCH section with both fire modes, PER_EXCHANGE config-surface
restrictions (0 for reNotifyMinutes/forDurationSeconds, at-least-one-sink
rule), exactly-once guarantee scope, and the first-run backlog-cap knob.

Surface the new config in application.yml with the 24h default and the
opt-out-to-0 semantics.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../src/main/resources/application.yml        |  4 ++
 docs/alerting.md                              | 43 +++++++++++++++++--
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/cameleer-server-app/src/main/resources/application.yml b/cameleer-server-app/src/main/resources/application.yml
index 360bb581..2d3d8866 100644
--- a/cameleer-server-app/src/main/resources/application.yml
+++ b/cameleer-server-app/src/main/resources/application.yml
@@ -93,6 +93,10 @@ cameleer:
       notification-retention-days: ${CAMELEER_SERVER_ALERTING_NOTIFICATIONRETENTIONDAYS:30}
       webhook-timeout-ms: ${CAMELEER_SERVER_ALERTING_WEBHOOKTIMEOUTMS:5000}
       webhook-max-attempts: ${CAMELEER_SERVER_ALERTING_WEBHOOKMAXATTEMPTS:3}
+      # PER_EXCHANGE first-run cursor clamp, in seconds (default 86400 = 24 h). On the first tick with no
+      # persisted cursor, the evaluator scans no further back than (now - cap), preventing a one-time backlog
+      # flood for rules whose createdAt predates a migration. Set to 0 to disable the clamp and replay from createdAt.
+      per-exchange-deploy-backlog-cap-seconds: ${CAMELEER_SERVER_ALERTING_PEREXCHANGEDEPLOYBACKLOGCAPSECONDS:86400}
     outbound-http:
       trust-all: false
       trusted-ca-pem-paths: []
diff --git a/docs/alerting.md b/docs/alerting.md
index bcc7002e..76e3718d 100644
--- a/docs/alerting.md
+++ b/docs/alerting.md
@@ -36,25 +36,60 @@ Comparators: `GT`, `GTE`, `LT`, `LTE`, `EQ`.
 
 ### EXCHANGE_MATCH
 
-Fires when the number of exchanges matching a filter exceeds a threshold.
+Fires on exchanges matching a filter. Two firing modes — pick the one that matches your operational intent.
+
+#### `fireMode: COUNT_IN_WINDOW`
+
+One alert when the count of matching exchanges in a rolling window crosses a threshold. Aggregation-style: good for "more than 3 payment failures in 10 minutes."
 
 ```json
 {
-  "name": "Failed payment exchanges",
+  "name": "Payment failures spike",
   "severity": "WARNING",
   "conditionKind": "EXCHANGE_MATCH",
   "condition": {
     "kind": "EXCHANGE_MATCH",
     "scope": { "appSlug": "payment-service", "routeId": "processPayment" },
     "filter": { "status": "FAILED", "attributes": { "payment.type": "card" } },
-    "fireMode": "AGGREGATE",
+    "fireMode": "COUNT_IN_WINDOW",
     "threshold": 3,
     "windowSeconds": 600
   }
 }
 ```
 
-`fireMode`: `AGGREGATE` (one alert for the count) or `PER_EXCHANGE` (one alert per matching exchange).
+#### `fireMode: PER_EXCHANGE`
+
+One alert per distinct matching exchange — **exactly once**. Each matching exchange produces its own `AlertInstance` and its own notification. The Inbox contains one row per matching exchange, never a duplicate, across evaluator ticks or process restarts. Good for "page me for every failed order regardless of rate."
+
+```json
+{
+  "name": "Any order failure",
+  "severity": "CRITICAL",
+  "conditionKind": "EXCHANGE_MATCH",
+  "condition": {
+    "kind": "EXCHANGE_MATCH",
+    "scope": { "appSlug": "orders-service" },
+    "filter": { "status": "FAILED" },
+    "fireMode": "PER_EXCHANGE"
+  }
+}
+```
+
+PER_EXCHANGE rules have a tighter configurable surface — the server rejects inconsistent combinations at save time with HTTP 400:
+
+| Field | PER_EXCHANGE | COUNT_IN_WINDOW |
+|---|---|---|
+| `threshold`, `windowSeconds` | must be absent / zero | required, positive |
+| `reNotifyMinutes` | must be 0 (fires once; re-notify does not apply) | optional |
+| `forDurationSeconds` | must be 0 | optional |
+| `scope`, `filter`, `severity`, notification template, `webhooks` / `targets` | standard | standard |
+
+Additionally, any rule (any `conditionKind`) with **both** empty `webhooks` and empty `targets` is rejected — a rule that notifies no one is always a misconfiguration.
+
+**Exactly-once guarantee — scope.** Each matching exchange yields exactly one `AlertInstance` and one PENDING `AlertNotification`, and this holds across evaluator ticks and process restarts. HTTP webhook delivery is still at-least-once under transient failure; for Slack and similar, include `{{alert.id}}` in the message template so the consumer can deduplicate.
+
+**First post-deploy tick — backlog cap.** A PER_EXCHANGE rule's first run (no persisted cursor yet) would otherwise scan from `rule.createdAt` forward, which can trigger a one-time notification flood for long-lived rules after a DB migration or schema reset. The server clamps the first-run scan to `max(rule.createdAt, now - deployBacklogCap)`. Default cap: 24 h (`86400` seconds). Tune via `cameleer.server.alerting.per-exchange-deploy-backlog-cap-seconds` (set to 0 to disable the clamp and replay from `createdAt`).
 
 ### AGENT_STATE