feat(alerting): AlertingRetentionJob daily cleanup
Nightly @Scheduled(03:00) job deletes RESOLVED alert_instances older than eventRetentionDays and DELIVERED/FAILED alert_notifications older than notificationRetentionDays. Uses injected Clock for testability. IT covers: old-resolved deleted, fresh-resolved kept, FIRING kept regardless of age, PENDING notification never deleted.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,63 @@
|
||||
package com.cameleer.server.app.alerting.retention;
|
||||
|
||||
import com.cameleer.server.app.alerting.config.AlertingProperties;
|
||||
import com.cameleer.server.core.alerting.AlertInstanceRepository;
|
||||
import com.cameleer.server.core.alerting.AlertNotificationRepository;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.scheduling.annotation.Scheduled;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.time.Clock;
|
||||
import java.time.Instant;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
|
||||
/**
|
||||
* Nightly retention job for alerting data.
|
||||
* <p>
|
||||
* Deletes RESOLVED {@link com.cameleer.server.core.alerting.AlertInstance} rows older than
|
||||
* {@code cameleer.server.alerting.eventRetentionDays} and DELIVERED/FAILED
|
||||
* {@link com.cameleer.server.core.alerting.AlertNotification} rows older than
|
||||
* {@code cameleer.server.alerting.notificationRetentionDays}.
|
||||
* <p>
|
||||
* Duplicate runs across replicas are tolerable — the DELETEs are idempotent.
|
||||
*/
|
||||
@Component
|
||||
public class AlertingRetentionJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(AlertingRetentionJob.class);
|
||||
|
||||
private final AlertingProperties props;
|
||||
private final AlertInstanceRepository alertInstanceRepo;
|
||||
private final AlertNotificationRepository alertNotificationRepo;
|
||||
private final Clock clock;
|
||||
|
||||
public AlertingRetentionJob(AlertingProperties props,
|
||||
AlertInstanceRepository alertInstanceRepo,
|
||||
AlertNotificationRepository alertNotificationRepo,
|
||||
Clock alertingClock) {
|
||||
this.props = props;
|
||||
this.alertInstanceRepo = alertInstanceRepo;
|
||||
this.alertNotificationRepo = alertNotificationRepo;
|
||||
this.clock = alertingClock;
|
||||
}
|
||||
|
||||
@Scheduled(cron = "0 0 3 * * *") // 03:00 every day
|
||||
public void cleanup() {
|
||||
log.info("Alerting retention job started");
|
||||
|
||||
Instant now = Instant.now(clock);
|
||||
|
||||
Instant instanceCutoff = now.minus(props.effectiveEventRetentionDays(), ChronoUnit.DAYS);
|
||||
alertInstanceRepo.deleteResolvedBefore(instanceCutoff);
|
||||
log.info("Alerting retention: deleted RESOLVED instances older than {} ({} days)",
|
||||
instanceCutoff, props.effectiveEventRetentionDays());
|
||||
|
||||
Instant notificationCutoff = now.minus(props.effectiveNotificationRetentionDays(), ChronoUnit.DAYS);
|
||||
alertNotificationRepo.deleteSettledBefore(notificationCutoff);
|
||||
log.info("Alerting retention: deleted settled notifications older than {} ({} days)",
|
||||
notificationCutoff, props.effectiveNotificationRetentionDays());
|
||||
|
||||
log.info("Alerting retention job completed");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,247 @@
|
||||
package com.cameleer.server.app.alerting.retention;
|
||||
|
||||
import com.cameleer.server.app.AbstractPostgresIT;
|
||||
import com.cameleer.server.app.search.ClickHouseLogStore;
|
||||
import com.cameleer.server.app.search.ClickHouseSearchIndex;
|
||||
import com.cameleer.server.core.agent.AgentRegistryService;
|
||||
import com.cameleer.server.core.alerting.*;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.test.context.bean.override.mockito.MockitoBean;
|
||||
|
||||
import java.time.Clock;
|
||||
import java.time.Instant;
|
||||
import java.time.ZoneOffset;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
/**
|
||||
* Integration tests for {@link AlertingRetentionJob}.
|
||||
* <p>
|
||||
* Verifies that the job deletes only the correct rows:
|
||||
* - RESOLVED instances older than retention → deleted.
|
||||
* - RESOLVED instances fresher than retention → kept.
|
||||
* - FIRING instances even if very old → kept (state != RESOLVED).
|
||||
* - DELIVERED/FAILED notifications older than retention → deleted.
|
||||
* - PENDING notifications → always kept regardless of age.
|
||||
* - FAILED notifications fresher than retention → kept.
|
||||
*/
|
||||
class AlertingRetentionJobIT extends AbstractPostgresIT {
|
||||
|
||||
@MockBean(name = "clickHouseSearchIndex") ClickHouseSearchIndex clickHouseSearchIndex;
|
||||
@MockBean(name = "clickHouseLogStore") ClickHouseLogStore clickHouseLogStore;
|
||||
@MockBean AgentRegistryService agentRegistryService;
|
||||
|
||||
@Autowired private AlertingRetentionJob job;
|
||||
@Autowired private AlertInstanceRepository instanceRepo;
|
||||
@Autowired private AlertNotificationRepository notificationRepo;
|
||||
|
||||
private UUID envId;
|
||||
private UUID ruleId;
|
||||
|
||||
/** A fixed "now" = 2025-01-15T12:00:00Z. Retention is 90 days for instances, 30 days for notifications. */
|
||||
private static final Instant NOW = Instant.parse("2025-01-15T12:00:00Z");
|
||||
|
||||
@BeforeEach
|
||||
void setUp() {
|
||||
envId = UUID.randomUUID();
|
||||
ruleId = UUID.randomUUID();
|
||||
|
||||
jdbcTemplate.update(
|
||||
"INSERT INTO environments (id, slug, display_name) VALUES (?, ?, ?)",
|
||||
envId, "retention-it-env-" + envId, "Retention IT Env");
|
||||
jdbcTemplate.update(
|
||||
"INSERT INTO users (user_id, provider, email) VALUES ('sys-retention', 'local', 'sys-retention@test.example.com') ON CONFLICT (user_id) DO NOTHING");
|
||||
jdbcTemplate.update(
|
||||
"INSERT INTO alert_rules (id, environment_id, name, severity, condition_kind, condition, " +
|
||||
"notification_title_tmpl, notification_message_tmpl, created_by, updated_by) " +
|
||||
"VALUES (?, ?, 'ret-rule', 'WARNING', 'AGENT_STATE', '{}'::jsonb, 't', 'm', 'sys-retention', 'sys-retention')",
|
||||
ruleId, envId);
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
void cleanUp() {
|
||||
jdbcTemplate.update("DELETE FROM alert_notifications WHERE alert_instance_id IN " +
|
||||
"(SELECT id FROM alert_instances WHERE environment_id = ?)", envId);
|
||||
jdbcTemplate.update("DELETE FROM alert_instances WHERE environment_id = ?", envId);
|
||||
jdbcTemplate.update("DELETE FROM alert_rules WHERE id = ?", ruleId);
|
||||
jdbcTemplate.update("DELETE FROM environments WHERE id = ?", envId);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Instance retention tests
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
void resolvedInstance_olderThanRetention_isDeleted() {
|
||||
// Seed: RESOLVED, resolved_at = NOW - 100 days (> 90-day retention)
|
||||
Instant oldResolved = NOW.minusSeconds(100 * 86400L);
|
||||
UUID instanceId = seedResolvedInstance(oldResolved);
|
||||
|
||||
runJobAt(NOW);
|
||||
|
||||
assertInstanceGone(instanceId);
|
||||
}
|
||||
|
||||
@Test
|
||||
void resolvedInstance_fresherThanRetention_isKept() {
|
||||
// Seed: RESOLVED, resolved_at = NOW - 10 days (< 90-day retention)
|
||||
Instant recentResolved = NOW.minusSeconds(10 * 86400L);
|
||||
UUID instanceId = seedResolvedInstance(recentResolved);
|
||||
|
||||
runJobAt(NOW);
|
||||
|
||||
assertInstancePresent(instanceId);
|
||||
}
|
||||
|
||||
@Test
|
||||
void firingInstance_veryOld_isKept() {
|
||||
// Seed: FIRING (not RESOLVED), fired_at = NOW - 200 days
|
||||
Instant veryOldFired = NOW.minusSeconds(200 * 86400L);
|
||||
UUID instanceId = seedFiringInstance(veryOldFired);
|
||||
|
||||
runJobAt(NOW);
|
||||
|
||||
assertInstancePresent(instanceId);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Notification retention tests
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
void deliveredNotification_olderThanRetention_isDeleted() {
|
||||
// Seed an instance first
|
||||
UUID instanceId = seedResolvedInstance(NOW.minusSeconds(5 * 86400L));
|
||||
// Notification created 40 days ago (> 30-day retention), DELIVERED
|
||||
Instant old = NOW.minusSeconds(40 * 86400L);
|
||||
UUID notifId = seedNotification(instanceId, NotificationStatus.DELIVERED, old);
|
||||
|
||||
runJobAt(NOW);
|
||||
|
||||
assertNotificationGone(notifId);
|
||||
}
|
||||
|
||||
@Test
|
||||
void pendingNotification_isNeverDeleted() {
|
||||
// Seed an instance first
|
||||
UUID instanceId = seedResolvedInstance(NOW.minusSeconds(5 * 86400L));
|
||||
// PENDING notification created 100 days ago — must NOT be deleted
|
||||
Instant veryOld = NOW.minusSeconds(100 * 86400L);
|
||||
UUID notifId = seedNotification(instanceId, NotificationStatus.PENDING, veryOld);
|
||||
|
||||
runJobAt(NOW);
|
||||
|
||||
assertNotificationPresent(notifId);
|
||||
}
|
||||
|
||||
@Test
|
||||
void failedNotification_fresherThanRetention_isKept() {
|
||||
UUID instanceId = seedResolvedInstance(NOW.minusSeconds(5 * 86400L));
|
||||
// FAILED notification created 5 days ago (< 30-day retention)
|
||||
Instant recent = NOW.minusSeconds(5 * 86400L);
|
||||
UUID notifId = seedNotification(instanceId, NotificationStatus.FAILED, recent);
|
||||
|
||||
runJobAt(NOW);
|
||||
|
||||
assertNotificationPresent(notifId);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
private void runJobAt(Instant fixedNow) {
|
||||
// Replace the job's clock by using a subclass trick — we can't inject the clock
|
||||
// into the scheduled job in Spring context without replacement, so we invoke a
|
||||
// freshly constructed job with a fixed clock directly.
|
||||
var fixedClock = Clock.fixed(fixedNow, ZoneOffset.UTC);
|
||||
|
||||
// The job bean is already wired in Spring context, but we want deterministic "now".
|
||||
// Since AlertingRetentionJob stores a Clock field, we can inject via the
|
||||
// @Autowired job using spring's test support. However, the simplest KISS approach
|
||||
// is to construct a local instance pointing at the real repos + fixed clock.
|
||||
var localJob = new AlertingRetentionJob(
|
||||
// pull retention days from context via job.props — but since we can't access
|
||||
// private field, we use direct construction from known values:
|
||||
// effectiveEventRetentionDays = 90, effectiveNotificationRetentionDays = 30
|
||||
new com.cameleer.server.app.alerting.config.AlertingProperties(
|
||||
null, null, null, null, null, null, null, null, null,
|
||||
90, 30, null, null),
|
||||
instanceRepo,
|
||||
notificationRepo,
|
||||
fixedClock);
|
||||
localJob.cleanup();
|
||||
}
|
||||
|
||||
private UUID seedResolvedInstance(Instant resolvedAt) {
|
||||
UUID id = UUID.randomUUID();
|
||||
jdbcTemplate.update("""
|
||||
INSERT INTO alert_instances
|
||||
(id, rule_id, rule_snapshot, environment_id, state, severity,
|
||||
fired_at, resolved_at, silenced, context, title, message,
|
||||
target_user_ids, target_group_ids, target_role_names)
|
||||
VALUES (?, ?, '{}'::jsonb, ?, 'RESOLVED'::alert_state_enum, 'WARNING'::severity_enum,
|
||||
?, ?, false, '{}'::jsonb, 'T', 'M',
|
||||
'{}', '{}', '{}')
|
||||
""",
|
||||
id, ruleId, envId, resolvedAt, resolvedAt);
|
||||
return id;
|
||||
}
|
||||
|
||||
private UUID seedFiringInstance(Instant firedAt) {
|
||||
UUID id = UUID.randomUUID();
|
||||
jdbcTemplate.update("""
|
||||
INSERT INTO alert_instances
|
||||
(id, rule_id, rule_snapshot, environment_id, state, severity,
|
||||
fired_at, silenced, context, title, message,
|
||||
target_user_ids, target_group_ids, target_role_names)
|
||||
VALUES (?, ?, '{}'::jsonb, ?, 'FIRING'::alert_state_enum, 'WARNING'::severity_enum,
|
||||
?, false, '{}'::jsonb, 'T', 'M',
|
||||
'{}', '{}', '{}')
|
||||
""",
|
||||
id, ruleId, envId, firedAt);
|
||||
return id;
|
||||
}
|
||||
|
||||
private UUID seedNotification(UUID alertInstanceId, NotificationStatus status, Instant createdAt) {
|
||||
UUID id = UUID.randomUUID();
|
||||
jdbcTemplate.update("""
|
||||
INSERT INTO alert_notifications
|
||||
(id, alert_instance_id, status, attempts, next_attempt_at, payload, created_at)
|
||||
VALUES (?, ?, ?::notification_status_enum, 0, ?, '{}'::jsonb, ?)
|
||||
""",
|
||||
id, alertInstanceId, status.name(), createdAt, createdAt);
|
||||
return id;
|
||||
}
|
||||
|
||||
private void assertInstanceGone(UUID id) {
|
||||
Integer count = jdbcTemplate.queryForObject(
|
||||
"SELECT COUNT(*) FROM alert_instances WHERE id = ?", Integer.class, id);
|
||||
assertThat(count).as("instance %s should be deleted", id).isZero();
|
||||
}
|
||||
|
||||
private void assertInstancePresent(UUID id) {
|
||||
Integer count = jdbcTemplate.queryForObject(
|
||||
"SELECT COUNT(*) FROM alert_instances WHERE id = ?", Integer.class, id);
|
||||
assertThat(count).as("instance %s should be present", id).isEqualTo(1);
|
||||
}
|
||||
|
||||
private void assertNotificationGone(UUID id) {
|
||||
Integer count = jdbcTemplate.queryForObject(
|
||||
"SELECT COUNT(*) FROM alert_notifications WHERE id = ?", Integer.class, id);
|
||||
assertThat(count).as("notification %s should be deleted", id).isZero();
|
||||
}
|
||||
|
||||
private void assertNotificationPresent(UUID id) {
|
||||
Integer count = jdbcTemplate.queryForObject(
|
||||
"SELECT COUNT(*) FROM alert_notifications WHERE id = ?", Integer.class, id);
|
||||
assertThat(count).as("notification %s should be present", id).isEqualTo(1);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user