From 1ab21bc0199d36df1720442b9adff9eb6e3f927e Mon Sep 17 00:00:00 2001
From: hsiegeln <37154749+hsiegeln@users.noreply.github.com>
Date: Sun, 19 Apr 2026 22:16:21 +0200
Subject: [PATCH] feat(alerting): AlertingRetentionJob daily cleanup

Nightly @Scheduled(03:00) job deletes RESOLVED alert_instances older than
eventRetentionDays and DELIVERED/FAILED alert_notifications older than
notificationRetentionDays. Uses injected Clock for testability.

IT covers: old-resolved deleted, fresh-resolved kept, FIRING kept regardless
of age, PENDING notification never deleted.

Co-Authored-By: Claude Sonnet 4.6
---
 .../retention/AlertingRetentionJob.java       | 109 +++++++
 .../retention/AlertingRetentionJobIT.java     | 262 ++++++++++++++++++
 2 files changed, 371 insertions(+)
 create mode 100644 cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/retention/AlertingRetentionJob.java
 create mode 100644 cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/retention/AlertingRetentionJobIT.java

diff --git a/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/retention/AlertingRetentionJob.java b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/retention/AlertingRetentionJob.java
new file mode 100644
index 00000000..7fcb0154
--- /dev/null
+++ b/cameleer-server-app/src/main/java/com/cameleer/server/app/alerting/retention/AlertingRetentionJob.java
@@ -0,0 +1,109 @@
+package com.cameleer.server.app.alerting.retention;
+
+import com.cameleer.server.app.alerting.config.AlertingProperties;
+import com.cameleer.server.core.alerting.AlertInstanceRepository;
+import com.cameleer.server.core.alerting.AlertNotificationRepository;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.scheduling.annotation.Scheduled;
+import org.springframework.stereotype.Component;
+
+import java.time.Clock;
+import java.time.Instant;
+import java.time.temporal.ChronoUnit;
+
+/**
+ * Nightly retention job for alerting data.
+ * <p>
+ * Deletes RESOLVED {@link com.cameleer.server.core.alerting.AlertInstance} rows older than
+ * {@code cameleer.server.alerting.eventRetentionDays} and DELIVERED/FAILED
+ * {@link com.cameleer.server.core.alerting.AlertNotification} rows older than
+ * {@code cameleer.server.alerting.notificationRetentionDays}.
+ * <p>
+ * Duplicate runs across replicas are tolerable — the DELETEs are idempotent.
+ * <p>
+ * The two purges are independent: if one fails the other is still attempted, and the
+ * first failure is rethrown afterwards.
+ */
+@Component
+public class AlertingRetentionJob {
+
+    private static final Logger log = LoggerFactory.getLogger(AlertingRetentionJob.class);
+
+    private final AlertingProperties props;
+    private final AlertInstanceRepository alertInstanceRepo;
+    private final AlertNotificationRepository alertNotificationRepo;
+    private final Clock clock;
+
+    /**
+     * @param props                 supplies the effective retention windows, in days
+     * @param alertInstanceRepo     repository that RESOLVED instances are deleted from
+     * @param alertNotificationRepo repository that settled notifications are deleted from
+     * @param alertingClock         source of "now" used to compute both cutoffs
+     */
+    public AlertingRetentionJob(AlertingProperties props,
+                                AlertInstanceRepository alertInstanceRepo,
+                                AlertNotificationRepository alertNotificationRepo,
+                                Clock alertingClock) {
+        this.props = props;
+        this.alertInstanceRepo = alertInstanceRepo;
+        this.alertNotificationRepo = alertNotificationRepo;
+        this.clock = alertingClock;
+    }
+
+    /**
+     * Runs both purges against a single "now" read from the injected clock.
+     * <p>
+     * The cron expression sets no zone, so per Spring's {@code @Scheduled} contract 03:00 is
+     * evaluated in the scheduler's default time zone.
+     *
+     * @throws RuntimeException the first purge failure; a failure of the second purge is
+     *                          attached to it as a suppressed exception
+     */
+    @Scheduled(cron = "0 0 3 * * *") // 03:00 every day
+    public void cleanup() {
+        log.info("Alerting retention job started");
+
+        Instant now = Instant.now(clock);
+
+        RuntimeException failure = null;
+        try {
+            purgeResolvedInstances(now);
+        } catch (RuntimeException e) {
+            // Remember the failure but still give the notification purge its chance to run.
+            failure = e;
+        }
+        try {
+            purgeSettledNotifications(now);
+        } catch (RuntimeException e) {
+            if (failure == null) {
+                failure = e;
+            } else if (failure != e) {
+                failure.addSuppressed(e);
+            }
+        }
+        if (failure != null) {
+            throw failure;
+        }
+
+        log.info("Alerting retention job completed");
+    }
+
+    /** Purges RESOLVED instances that fall outside the event retention window. */
+    private void purgeResolvedInstances(Instant now) {
+        long retentionDays = props.effectiveEventRetentionDays();
+        Instant instanceCutoff = now.minus(retentionDays, ChronoUnit.DAYS);
+        alertInstanceRepo.deleteResolvedBefore(instanceCutoff);
+        log.info("Alerting retention: deleted RESOLVED instances older than {} ({} days)",
+                instanceCutoff, retentionDays);
+    }
+
+    /** Purges settled notifications that fall outside the notification retention window. */
+    private void purgeSettledNotifications(Instant now) {
+        long retentionDays = props.effectiveNotificationRetentionDays();
+        Instant notificationCutoff = now.minus(retentionDays, ChronoUnit.DAYS);
+        alertNotificationRepo.deleteSettledBefore(notificationCutoff);
+        log.info("Alerting retention: deleted settled notifications older than {} ({} days)",
+                notificationCutoff, retentionDays);
+    }
+}
diff --git a/cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/retention/AlertingRetentionJobIT.java b/cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/retention/AlertingRetentionJobIT.java
new file mode 100644
index 00000000..6639a5b9
--- /dev/null
+++ b/cameleer-server-app/src/test/java/com/cameleer/server/app/alerting/retention/AlertingRetentionJobIT.java
@@ -0,0 +1,262 @@
+package com.cameleer.server.app.alerting.retention;
+
+import com.cameleer.server.app.AbstractPostgresIT;
+import com.cameleer.server.app.search.ClickHouseLogStore;
+import com.cameleer.server.app.search.ClickHouseSearchIndex;
+import com.cameleer.server.core.agent.AgentRegistryService;
+import com.cameleer.server.core.alerting.*;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.mock.mockito.MockBean;
+import org.springframework.test.context.bean.override.mockito.MockitoBean;
+
+import java.sql.Timestamp;
+import java.time.Clock;
+import java.time.Instant;
+import java.time.ZoneOffset;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Integration tests for {@link AlertingRetentionJob}.
+ * <p>
+ * Verifies that the job deletes only the correct rows:
+ * - RESOLVED instances older than retention → deleted.
+ * - RESOLVED instances fresher than retention → kept.
+ * - FIRING instances even if very old → kept (state != RESOLVED).
+ * - DELIVERED/FAILED notifications older than retention → deleted.
+ * - PENDING notifications → always kept regardless of age.
+ * - FAILED notifications fresher than retention → kept.
+ */
+class AlertingRetentionJobIT extends AbstractPostgresIT {
+
+    @MockBean(name = "clickHouseSearchIndex") ClickHouseSearchIndex clickHouseSearchIndex;
+    @MockBean(name = "clickHouseLogStore") ClickHouseLogStore clickHouseLogStore;
+    @MockBean AgentRegistryService agentRegistryService;
+
+    // Not invoked by the tests: runJobAt builds its own instance with a fixed clock.
+    @Autowired private AlertingRetentionJob job;
+    @Autowired private AlertInstanceRepository instanceRepo;
+    @Autowired private AlertNotificationRepository notificationRepo;
+
+    private UUID envId;
+    private UUID ruleId;
+
+    /** A fixed "now" = 2025-01-15T12:00:00Z. Retention is 90 days for instances, 30 days for notifications. */
+    private static final Instant NOW = Instant.parse("2025-01-15T12:00:00Z");
+
+    @BeforeEach
+    void setUp() {
+        envId = UUID.randomUUID();
+        ruleId = UUID.randomUUID();
+
+        jdbcTemplate.update(
+                "INSERT INTO environments (id, slug, display_name) VALUES (?, ?, ?)",
+                envId, "retention-it-env-" + envId, "Retention IT Env");
+        jdbcTemplate.update(
+                "INSERT INTO users (user_id, provider, email) VALUES ('sys-retention', 'local', 'sys-retention@test.example.com') ON CONFLICT (user_id) DO NOTHING");
+        jdbcTemplate.update(
+                "INSERT INTO alert_rules (id, environment_id, name, severity, condition_kind, condition, " +
+                "notification_title_tmpl, notification_message_tmpl, created_by, updated_by) " +
+                "VALUES (?, ?, 'ret-rule', 'WARNING', 'AGENT_STATE', '{}'::jsonb, 't', 'm', 'sys-retention', 'sys-retention')",
+                ruleId, envId);
+    }
+
+    @AfterEach
+    void cleanUp() {
+        jdbcTemplate.update("DELETE FROM alert_notifications WHERE alert_instance_id IN " +
+                "(SELECT id FROM alert_instances WHERE environment_id = ?)", envId);
+        jdbcTemplate.update("DELETE FROM alert_instances WHERE environment_id = ?", envId);
+        jdbcTemplate.update("DELETE FROM alert_rules WHERE id = ?", ruleId);
+        jdbcTemplate.update("DELETE FROM environments WHERE id = ?", envId);
+    }
+
+    // -------------------------------------------------------------------------
+    // Instance retention tests
+    // -------------------------------------------------------------------------
+
+    @Test
+    void resolvedInstance_olderThanRetention_isDeleted() {
+        // Seed: RESOLVED, resolved_at = NOW - 100 days (> 90-day retention)
+        Instant oldResolved = NOW.minusSeconds(100 * 86400L);
+        UUID instanceId = seedResolvedInstance(oldResolved);
+
+        runJobAt(NOW);
+
+        assertInstanceGone(instanceId);
+    }
+
+    @Test
+    void resolvedInstance_fresherThanRetention_isKept() {
+        // Seed: RESOLVED, resolved_at = NOW - 10 days (< 90-day retention)
+        Instant recentResolved = NOW.minusSeconds(10 * 86400L);
+        UUID instanceId = seedResolvedInstance(recentResolved);
+
+        runJobAt(NOW);
+
+        assertInstancePresent(instanceId);
+    }
+
+    @Test
+    void firingInstance_veryOld_isKept() {
+        // Seed: FIRING (not RESOLVED), fired_at = NOW - 200 days
+        Instant veryOldFired = NOW.minusSeconds(200 * 86400L);
+        UUID instanceId = seedFiringInstance(veryOldFired);
+
+        runJobAt(NOW);
+
+        assertInstancePresent(instanceId);
+    }
+
+    // -------------------------------------------------------------------------
+    // Notification retention tests
+    // -------------------------------------------------------------------------
+
+    @Test
+    void deliveredNotification_olderThanRetention_isDeleted() {
+        // Seed an instance first
+        UUID instanceId = seedResolvedInstance(NOW.minusSeconds(5 * 86400L));
+        // Notification created 40 days ago (> 30-day retention), DELIVERED
+        Instant old = NOW.minusSeconds(40 * 86400L);
+        UUID notifId = seedNotification(instanceId, NotificationStatus.DELIVERED, old);
+
+        runJobAt(NOW);
+
+        assertNotificationGone(notifId);
+    }
+
+    @Test
+    void failedNotification_olderThanRetention_isDeleted() {
+        UUID instanceId = seedResolvedInstance(NOW.minusSeconds(5 * 86400L));
+        // FAILED notification created 40 days ago (> 30-day retention)
+        Instant old = NOW.minusSeconds(40 * 86400L);
+        UUID notifId = seedNotification(instanceId, NotificationStatus.FAILED, old);
+
+        runJobAt(NOW);
+
+        assertNotificationGone(notifId);
+    }
+
+    @Test
+    void pendingNotification_isNeverDeleted() {
+        // Seed an instance first
+        UUID instanceId = seedResolvedInstance(NOW.minusSeconds(5 * 86400L));
+        // PENDING notification created 100 days ago — must NOT be deleted
+        Instant veryOld = NOW.minusSeconds(100 * 86400L);
+        UUID notifId = seedNotification(instanceId, NotificationStatus.PENDING, veryOld);
+
+        runJobAt(NOW);
+
+        assertNotificationPresent(notifId);
+    }
+
+    @Test
+    void failedNotification_fresherThanRetention_isKept() {
+        UUID instanceId = seedResolvedInstance(NOW.minusSeconds(5 * 86400L));
+        // FAILED notification created 5 days ago (< 30-day retention)
+        Instant recent = NOW.minusSeconds(5 * 86400L);
+        UUID notifId = seedNotification(instanceId, NotificationStatus.FAILED, recent);
+
+        runJobAt(NOW);
+
+        assertNotificationPresent(notifId);
+    }
+
+    // -------------------------------------------------------------------------
+    // Helpers
+    // -------------------------------------------------------------------------
+
+    /**
+     * Runs the retention job with its clock pinned to {@code fixedNow}.
+     * <p>
+     * A job is built by hand from the real repositories rather than reusing the autowired
+     * bean, so that both the clock and the retention windows are under the test's control.
+     */
+    private void runJobAt(Instant fixedNow) {
+        var fixedClock = Clock.fixed(fixedNow, ZoneOffset.UTC);
+        var localJob = new AlertingRetentionJob(
+                // Only eventRetentionDays (90) and notificationRetentionDays (30) are set.
+                new com.cameleer.server.app.alerting.config.AlertingProperties(
+                        null, null, null, null, null, null, null, null, null,
+                        90, 30, null, null),
+                instanceRepo,
+                notificationRepo,
+                fixedClock);
+        localJob.cleanup();
+    }
+
+    /** Converts to a JDBC-bindable value; pgjdbc documents java.time.Instant as unsupported by setObject. */
+    private static Timestamp sqlTimestamp(Instant instant) {
+        return Timestamp.from(instant);
+    }
+
+    private UUID seedResolvedInstance(Instant resolvedAt) {
+        UUID id = UUID.randomUUID();
+        jdbcTemplate.update("""
+                INSERT INTO alert_instances
+                    (id, rule_id, rule_snapshot, environment_id, state, severity,
+                     fired_at, resolved_at, silenced, context, title, message,
+                     target_user_ids, target_group_ids, target_role_names)
+                VALUES (?, ?, '{}'::jsonb, ?, 'RESOLVED'::alert_state_enum, 'WARNING'::severity_enum,
+                        ?, ?, false, '{}'::jsonb, 'T', 'M',
+                        '{}', '{}', '{}')
+                """,
+                id, ruleId, envId, sqlTimestamp(resolvedAt), sqlTimestamp(resolvedAt));
+        return id;
+    }
+
+    private UUID seedFiringInstance(Instant firedAt) {
+        UUID id = UUID.randomUUID();
+        jdbcTemplate.update("""
+                INSERT INTO alert_instances
+                    (id, rule_id, rule_snapshot, environment_id, state, severity,
+                     fired_at, silenced, context, title, message,
+                     target_user_ids, target_group_ids, target_role_names)
+                VALUES (?, ?, '{}'::jsonb, ?, 'FIRING'::alert_state_enum, 'WARNING'::severity_enum,
+                        ?, false, '{}'::jsonb, 'T', 'M',
+                        '{}', '{}', '{}')
+                """,
+                id, ruleId, envId, sqlTimestamp(firedAt));
+        return id;
+    }
+
+    private UUID seedNotification(UUID alertInstanceId, NotificationStatus status, Instant createdAt) {
+        UUID id = UUID.randomUUID();
+        jdbcTemplate.update("""
+                INSERT INTO alert_notifications
+                    (id, alert_instance_id, status, attempts, next_attempt_at, payload, created_at)
+                VALUES (?, ?, ?::notification_status_enum, 0, ?, '{}'::jsonb, ?)
+                """,
+                id, alertInstanceId, status.name(), sqlTimestamp(createdAt), sqlTimestamp(createdAt));
+        return id;
+    }
+
+    private void assertInstanceGone(UUID id) {
+        Integer count = jdbcTemplate.queryForObject(
+                "SELECT COUNT(*) FROM alert_instances WHERE id = ?", Integer.class, id);
+        assertThat(count).as("instance %s should be deleted", id).isZero();
+    }
+
+    private void assertInstancePresent(UUID id) {
+        Integer count = jdbcTemplate.queryForObject(
+                "SELECT COUNT(*) FROM alert_instances WHERE id = ?", Integer.class, id);
+        assertThat(count).as("instance %s should be present", id).isEqualTo(1);
+    }
+
+    private void assertNotificationGone(UUID id) {
+        Integer count = jdbcTemplate.queryForObject(
+                "SELECT COUNT(*) FROM alert_notifications WHERE id = ?", Integer.class, id);
+        assertThat(count).as("notification %s should be deleted", id).isZero();
+    }
+
+    private void assertNotificationPresent(UUID id) {
+        Integer count = jdbcTemplate.queryForObject(
+                "SELECT COUNT(*) FROM alert_notifications WHERE id = ?", Integer.class, id);
+        assertThat(count).as("notification %s should be present", id).isEqualTo(1);
+    }
+}