feat(alerting): AlertingRetentionJob daily cleanup

Nightly @Scheduled(03:00) job deletes RESOLVED alert_instances older
than eventRetentionDays and DELIVERED/FAILED alert_notifications older
than notificationRetentionDays.  Uses injected Clock for testability.
IT covers: old-resolved deleted, fresh-resolved kept, FIRING kept
regardless of age, PENDING notification never deleted.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-19 22:16:21 +02:00
parent 118ace7cc3
commit 1ab21bc019
2 changed files with 310 additions and 0 deletions

View File

@@ -0,0 +1,63 @@
package com.cameleer.server.app.alerting.retention;
import com.cameleer.server.app.alerting.config.AlertingProperties;
import com.cameleer.server.core.alerting.AlertInstanceRepository;
import com.cameleer.server.core.alerting.AlertNotificationRepository;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.time.Clock;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
/**
 * Nightly retention job for alerting data.
 * <p>
 * Deletes RESOLVED {@link com.cameleer.server.core.alerting.AlertInstance} rows older than
 * {@code cameleer.server.alerting.eventRetentionDays} and DELIVERED/FAILED
 * {@link com.cameleer.server.core.alerting.AlertNotification} rows older than
 * {@code cameleer.server.alerting.notificationRetentionDays}.
 * <p>
 * The two purges are attempted independently: a runtime failure while purging instances
 * does not prevent the notification purge from running. Once both have been attempted,
 * the first failure (if any) is rethrown so the caller still sees it.
 * <p>
 * Duplicate runs across replicas are tolerable — the DELETEs are idempotent.
 */
@Component
public class AlertingRetentionJob {

    private static final Logger log = LoggerFactory.getLogger(AlertingRetentionJob.class);

    private final AlertingProperties props;
    private final AlertInstanceRepository alertInstanceRepo;
    private final AlertNotificationRepository alertNotificationRepo;
    private final Clock clock;

    /**
     * Creates the job.
     *
     * @param props                 source of the effective retention periods (in days)
     * @param alertInstanceRepo     repository used to delete resolved alert instances
     * @param alertNotificationRepo repository used to delete settled notifications
     * @param alertingClock         clock that supplies "now"; a fixed clock makes runs deterministic
     */
    public AlertingRetentionJob(AlertingProperties props,
                                AlertInstanceRepository alertInstanceRepo,
                                AlertNotificationRepository alertNotificationRepo,
                                Clock alertingClock) {
        this.props = props;
        this.alertInstanceRepo = alertInstanceRepo;
        this.alertNotificationRepo = alertNotificationRepo;
        this.clock = alertingClock;
    }

    /**
     * Runs both retention purges against a single "now" read from the injected clock.
     *
     * @throws RuntimeException the first failure raised by either purge, rethrown after both
     *                          purges have been attempted; a failure of the second purge is
     *                          attached as a suppressed exception when the first also failed
     */
    @Scheduled(cron = "0 0 3 * * *") // 03:00 every day
    public void cleanup() {
        log.info("Alerting retention job started");

        // Both cutoffs are derived from the same instant so they are mutually consistent.
        Instant now = Instant.now(clock);
        RuntimeException failure = null;

        try {
            purgeResolvedInstances(now);
        } catch (RuntimeException e) {
            // Keep going: the notification purge does not depend on this step having succeeded.
            log.error("Alerting retention: purge of RESOLVED instances failed; continuing with notifications", e);
            failure = e;
        }

        try {
            purgeSettledNotifications(now);
        } catch (RuntimeException e) {
            log.error("Alerting retention: purge of settled notifications failed", e);
            if (failure == null) {
                failure = e;
            } else if (failure != e) {
                // addSuppressed rejects self-suppression, hence the identity guard.
                failure.addSuppressed(e);
            }
        }

        if (failure != null) {
            throw failure;
        }
        log.info("Alerting retention job completed");
    }

    /** Deletes RESOLVED instances older than the configured event retention, relative to {@code now}. */
    private void purgeResolvedInstances(Instant now) {
        long retentionDays = props.effectiveEventRetentionDays();
        Instant instanceCutoff = now.minus(retentionDays, ChronoUnit.DAYS);
        alertInstanceRepo.deleteResolvedBefore(instanceCutoff);
        log.info("Alerting retention: deleted RESOLVED instances older than {} ({} days)",
                instanceCutoff, retentionDays);
    }

    /** Deletes settled notifications older than the configured notification retention, relative to {@code now}. */
    private void purgeSettledNotifications(Instant now) {
        long retentionDays = props.effectiveNotificationRetentionDays();
        Instant notificationCutoff = now.minus(retentionDays, ChronoUnit.DAYS);
        alertNotificationRepo.deleteSettledBefore(notificationCutoff);
        log.info("Alerting retention: deleted settled notifications older than {} ({} days)",
                notificationCutoff, retentionDays);
    }
}

View File

@@ -0,0 +1,247 @@
package com.cameleer.server.app.alerting.retention;
import com.cameleer.server.app.AbstractPostgresIT;
import com.cameleer.server.app.search.ClickHouseLogStore;
import com.cameleer.server.app.search.ClickHouseSearchIndex;
import com.cameleer.server.core.agent.AgentRegistryService;
import com.cameleer.server.core.alerting.*;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.test.context.bean.override.mockito.MockitoBean;
import java.time.Clock;
import java.time.Instant;
import java.time.ZoneOffset;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Integration tests for {@link AlertingRetentionJob}.
 * <p>
 * Verifies that the job deletes only the correct rows:
 * - RESOLVED instances older than retention → deleted.
 * - RESOLVED instances fresher than retention → kept.
 * - FIRING instances even if very old → kept (state != RESOLVED).
 * - DELIVERED notifications older than retention → deleted.
 * - FAILED notifications older than retention → deleted.
 * - PENDING notifications → always kept regardless of age.
 * - FAILED notifications fresher than retention → kept.
 */
class AlertingRetentionJobIT extends AbstractPostgresIT {

    @MockBean(name = "clickHouseSearchIndex") ClickHouseSearchIndex clickHouseSearchIndex;
    @MockBean(name = "clickHouseLogStore") ClickHouseLogStore clickHouseLogStore;
    @MockBean AgentRegistryService agentRegistryService;

    // Not invoked by the tests: runJobAt() builds its own instance with a fixed clock.
    @Autowired private AlertingRetentionJob job;
    @Autowired private AlertInstanceRepository instanceRepo;
    @Autowired private AlertNotificationRepository notificationRepo;

    private UUID envId;
    private UUID ruleId;

    /** A fixed "now" = 2025-01-15T12:00:00Z. */
    private static final Instant NOW = Instant.parse("2025-01-15T12:00:00Z");

    /** Instance retention (days) handed to the job under test. */
    private static final int EVENT_RETENTION_DAYS = 90;

    /** Notification retention (days) handed to the job under test. */
    private static final int NOTIFICATION_RETENTION_DAYS = 30;

    private static final long SECONDS_PER_DAY = 86400L;

    private static final String COUNT_INSTANCE_SQL =
            "SELECT COUNT(*) FROM alert_instances WHERE id = ?";
    private static final String COUNT_NOTIFICATION_SQL =
            "SELECT COUNT(*) FROM alert_notifications WHERE id = ?";

    /** Seeds the environment, system user and alert rule that the instance rows reference. */
    @BeforeEach
    void setUp() {
        envId = UUID.randomUUID();
        ruleId = UUID.randomUUID();
        jdbcTemplate.update(
                "INSERT INTO environments (id, slug, display_name) VALUES (?, ?, ?)",
                envId, "retention-it-env-" + envId, "Retention IT Env");
        jdbcTemplate.update(
                "INSERT INTO users (user_id, provider, email) VALUES ('sys-retention', 'local', 'sys-retention@test.example.com') ON CONFLICT (user_id) DO NOTHING");
        jdbcTemplate.update(
                "INSERT INTO alert_rules (id, environment_id, name, severity, condition_kind, condition, " +
                "notification_title_tmpl, notification_message_tmpl, created_by, updated_by) " +
                "VALUES (?, ?, 'ret-rule', 'WARNING', 'AGENT_STATE', '{}'::jsonb, 't', 'm', 'sys-retention', 'sys-retention')",
                ruleId, envId);
    }

    /** Removes the per-test rows, children first (notifications → instances → rule → environment). */
    @AfterEach
    void cleanUp() {
        jdbcTemplate.update("DELETE FROM alert_notifications WHERE alert_instance_id IN " +
                "(SELECT id FROM alert_instances WHERE environment_id = ?)", envId);
        jdbcTemplate.update("DELETE FROM alert_instances WHERE environment_id = ?", envId);
        jdbcTemplate.update("DELETE FROM alert_rules WHERE id = ?", ruleId);
        jdbcTemplate.update("DELETE FROM environments WHERE id = ?", envId);
    }

    // -------------------------------------------------------------------------
    // Instance retention tests
    // -------------------------------------------------------------------------

    @Test
    void resolvedInstance_olderThanRetention_isDeleted() {
        // RESOLVED 100 days ago: beyond the 90-day instance retention.
        UUID instanceId = seedResolvedInstance(daysBeforeNow(100));

        runJobAt(NOW);

        assertInstanceGone(instanceId);
    }

    @Test
    void resolvedInstance_fresherThanRetention_isKept() {
        // RESOLVED 10 days ago: within the 90-day instance retention.
        UUID instanceId = seedResolvedInstance(daysBeforeNow(10));

        runJobAt(NOW);

        assertInstancePresent(instanceId);
    }

    @Test
    void firingInstance_veryOld_isKept() {
        // FIRING (not RESOLVED), fired 200 days ago: age alone must not trigger deletion.
        UUID instanceId = seedFiringInstance(daysBeforeNow(200));

        runJobAt(NOW);

        assertInstancePresent(instanceId);
    }

    // -------------------------------------------------------------------------
    // Notification retention tests
    // -------------------------------------------------------------------------

    @Test
    void deliveredNotification_olderThanRetention_isDeleted() {
        // Parent instance is recent, so it is not purged itself.
        UUID instanceId = seedResolvedInstance(daysBeforeNow(5));
        // DELIVERED 40 days ago: beyond the 30-day notification retention.
        UUID notifId = seedNotification(instanceId, NotificationStatus.DELIVERED, daysBeforeNow(40));

        runJobAt(NOW);

        assertNotificationGone(notifId);
    }

    @Test
    void failedNotification_olderThanRetention_isDeleted() {
        // Parent instance is recent, so it is not purged itself.
        UUID instanceId = seedResolvedInstance(daysBeforeNow(5));
        // FAILED 40 days ago: beyond the 30-day notification retention.
        UUID notifId = seedNotification(instanceId, NotificationStatus.FAILED, daysBeforeNow(40));

        runJobAt(NOW);

        assertNotificationGone(notifId);
    }

    @Test
    void pendingNotification_isNeverDeleted() {
        UUID instanceId = seedResolvedInstance(daysBeforeNow(5));
        // PENDING notification created 100 days ago — must NOT be deleted.
        UUID notifId = seedNotification(instanceId, NotificationStatus.PENDING, daysBeforeNow(100));

        runJobAt(NOW);

        assertNotificationPresent(notifId);
    }

    @Test
    void failedNotification_fresherThanRetention_isKept() {
        UUID instanceId = seedResolvedInstance(daysBeforeNow(5));
        // FAILED 5 days ago: within the 30-day notification retention.
        UUID notifId = seedNotification(instanceId, NotificationStatus.FAILED, daysBeforeNow(5));

        runJobAt(NOW);

        assertNotificationPresent(notifId);
    }

    // -------------------------------------------------------------------------
    // Helpers
    // -------------------------------------------------------------------------

    /** Returns the instant {@code days} whole days (of 86400 seconds) before {@link #NOW}. */
    private static Instant daysBeforeNow(long days) {
        return NOW.minusSeconds(days * SECONDS_PER_DAY);
    }

    /**
     * Runs a cleanup pass with "now" pinned to {@code fixedNow}.
     * <p>
     * Rather than reusing the context-wired job bean, a local job is constructed from the
     * real repositories, a fixed clock and explicitly known retention values, so the
     * cutoffs are deterministic.
     */
    private void runJobAt(Instant fixedNow) {
        var fixedClock = Clock.fixed(fixedNow, ZoneOffset.UTC);
        // Only the two retention arguments are set; every other constructor argument is null.
        var retentionProps = new com.cameleer.server.app.alerting.config.AlertingProperties(
                null, null, null, null, null, null, null, null, null,
                EVENT_RETENTION_DAYS, NOTIFICATION_RETENTION_DAYS, null, null);
        var localJob = new AlertingRetentionJob(
                retentionProps,
                instanceRepo,
                notificationRepo,
                fixedClock);
        localJob.cleanup();
    }

    /** Inserts a RESOLVED instance whose fired_at and resolved_at are both {@code resolvedAt}. */
    private UUID seedResolvedInstance(Instant resolvedAt) {
        UUID id = UUID.randomUUID();
        jdbcTemplate.update("""
                INSERT INTO alert_instances
                (id, rule_id, rule_snapshot, environment_id, state, severity,
                fired_at, resolved_at, silenced, context, title, message,
                target_user_ids, target_group_ids, target_role_names)
                VALUES (?, ?, '{}'::jsonb, ?, 'RESOLVED'::alert_state_enum, 'WARNING'::severity_enum,
                ?, ?, false, '{}'::jsonb, 'T', 'M',
                '{}', '{}', '{}')
                """,
                id, ruleId, envId, resolvedAt, resolvedAt);
        return id;
    }

    /** Inserts a FIRING instance (no resolved_at) fired at {@code firedAt}. */
    private UUID seedFiringInstance(Instant firedAt) {
        UUID id = UUID.randomUUID();
        jdbcTemplate.update("""
                INSERT INTO alert_instances
                (id, rule_id, rule_snapshot, environment_id, state, severity,
                fired_at, silenced, context, title, message,
                target_user_ids, target_group_ids, target_role_names)
                VALUES (?, ?, '{}'::jsonb, ?, 'FIRING'::alert_state_enum, 'WARNING'::severity_enum,
                ?, false, '{}'::jsonb, 'T', 'M',
                '{}', '{}', '{}')
                """,
                id, ruleId, envId, firedAt);
        return id;
    }

    /**
     * Inserts a notification for {@code alertInstanceId} in the given status, using
     * {@code createdAt} for both next_attempt_at and created_at.
     */
    private UUID seedNotification(UUID alertInstanceId, NotificationStatus status, Instant createdAt) {
        UUID id = UUID.randomUUID();
        jdbcTemplate.update("""
                INSERT INTO alert_notifications
                (id, alert_instance_id, status, attempts, next_attempt_at, payload, created_at)
                VALUES (?, ?, ?::notification_status_enum, 0, ?, '{}'::jsonb, ?)
                """,
                id, alertInstanceId, status.name(), createdAt, createdAt);
        return id;
    }

    /** Runs one of the COUNT(*) queries above for the given primary key. */
    private Integer countRows(String countSql, UUID id) {
        return jdbcTemplate.queryForObject(countSql, Integer.class, id);
    }

    private void assertInstanceGone(UUID id) {
        assertThat(countRows(COUNT_INSTANCE_SQL, id))
                .as("instance %s should be deleted", id)
                .isZero();
    }

    private void assertInstancePresent(UUID id) {
        assertThat(countRows(COUNT_INSTANCE_SQL, id))
                .as("instance %s should be present", id)
                .isEqualTo(1);
    }

    private void assertNotificationGone(UUID id) {
        assertThat(countRows(COUNT_NOTIFICATION_SQL, id))
                .as("notification %s should be deleted", id)
                .isZero();
    }

    private void assertNotificationPresent(UUID id) {
        assertThat(countRows(COUNT_NOTIFICATION_SQL, id))
                .as("notification %s should be present", id)
                .isEqualTo(1);
    }
}