Code Diff
diff --git a/pkg/storage/unified/resource/pending_delete_store.go b/pkg/storage/unified/resource/pending_delete_store.go
index a26fc99ef537..3231aa7435f9 100644
--- a/pkg/storage/unified/resource/pending_delete_store.go
+++ b/pkg/storage/unified/resource/pending_delete_store.go
@@ -19,13 +19,17 @@ const (
// that has been marked as pending deletion. The record is created before
// labelling begins so that cleanup can proceed even after partial failures.
//
-// When Force is true, the record cannot be removed by the tenant watcher.
+// When Orphaned is true, the record cannot be removed by the tenant watcher.
// This is used for manually-seeded records that clean up orphaned tenants
// which the tenant API considers active but are actually deleted in GCOM.
type PendingDeleteRecord struct {
DeleteAfter string `json:"deleteAfter"`
LabelingComplete bool `json:"labelingComplete"`
- Force bool `json:"force,omitempty"`
+ Orphaned bool `json:"orphaned,omitempty"`
+ // DeletedAt is set to an RFC3339 timestamp after all tenant data has been
+ // successfully deleted. Records with this field set are skipped by the
+ // tenant deleter.
+ DeletedAt string `json:"deletedAt,omitempty"`
}
// PendingDeleteStore manages pending-delete records in the KV store and keeps
diff --git a/pkg/storage/unified/resource/tenant_deleter.go b/pkg/storage/unified/resource/tenant_deleter.go
index 0e30401d546c..63570fd0feeb 100644
--- a/pkg/storage/unified/resource/tenant_deleter.go
+++ b/pkg/storage/unified/resource/tenant_deleter.go
@@ -2,6 +2,7 @@ package resource
import (
"context"
+ "fmt"
"math/rand/v2"
"strconv"
"time"
@@ -124,6 +125,11 @@ func (td *TenantDeleter) runDeletionPass(ctx context.Context) {
continue
}
+ if record.DeletedAt != "" {
+ // Already fully deleted; skip.
+ continue
+ }
+
deleteAfter, err := time.Parse(time.RFC3339, record.DeleteAfter)
if err != nil {
td.log.Warn("failed to parse delete-after time", "tenant", tenantName, "value", record.DeleteAfter)
@@ -224,6 +230,20 @@ func (td *TenantDeleter) deleteTenant(ctx context.Context, tenantName string, gr
return nil
}
- // we delete the pending-delete record at the end to ensure idempotency
- return td.pendingDeleteStore.Delete(ctx, tenantName)
+ record, err := td.pendingDeleteStore.Get(ctx, tenantName)
+ if err != nil {
+ return fmt.Errorf("reading pending delete record: %w", err)
+ }
+
+ // Orphaned records are manually seeded and won't be recreated by the
+ // tenant watcher, so we can safely remove them after cleanup.
+ if record.Orphaned {
+ return td.pendingDeleteStore.Delete(ctx, tenantName)
+ }
+
+ // Non-orphaned records must be kept with a DeletedAt timestamp to prevent
+ // the tenant watcher from recreating them while the tenant API still has
+ // the tenant with a pending-delete label.
+ record.DeletedAt = time.Now().UTC().Format(time.RFC3339)
+ return td.pendingDeleteStore.Upsert(ctx, tenantName, record)
}
diff --git a/pkg/storage/unified/resource/tenant_deleter_test.go b/pkg/storage/unified/resource/tenant_deleter_test.go
index c91671ce4066..c5676547e53b 100644
--- a/pkg/storage/unified/resource/tenant_deleter_test.go
+++ b/pkg/storage/unified/resource/tenant_deleter_test.go
@@ -107,6 +107,34 @@ func futureTime() string {
return time.Now().UTC().Add(1 * time.Hour).Format(time.RFC3339)
}
+// TestRunDeletionPass_SkipsAlreadyDeleted verifies that a tenant with DeletedAt
+// set is not processed again.
+func TestRunDeletionPass_SkipsAlreadyDeleted(t *testing.T) {
+ td, ds, pds := newTestTenantDeleter(t, false)
+
+ saveTestResource(t, ds, testStacksNS1, "apps", "dashboards", "dash1", 100, nil)
+
+ require.NoError(t, pds.Upsert(t.Context(), testStacksNS1, PendingDeleteRecord{
+ DeleteAfter: pastTime(),
+ DeletedAt: pastTime(),
+ }))
+
+ td.runDeletionPass(t.Context())
+
+ // Data should still be present because the record was already marked deleted.
+ listKey := ListRequestKey{Group: "apps", Resource: "dashboards", Namespace: testStacksNS1}
+ prefix := listKey.Prefix()
+ var count int
+ for _, err := range ds.kv.Keys(t.Context(), dataSection, ListOptions{
+ StartKey: prefix,
+ EndKey: PrefixRangeEnd(prefix),
+ }) {
+ require.NoError(t, err)
+ count++
+ }
+ assert.Equal(t, 1, count, "resource should still exist for already-deleted record")
+}
+
// TestRunDeletionPass_SkipsNotExpired verifies that a tenant with a future
// deletion time is not deleted.
func TestRunDeletionPass_SkipsNotExpired(t *testing.T) {
@@ -139,7 +167,7 @@ func TestRunDeletionPass_SkipsNotExpired(t *testing.T) {
}
// TestRunDeletionPass_DeletesExpired verifies that all resource keys for an
-// expired tenant are removed, and the pending-delete record is also removed.
+// expired tenant are removed, and the pending-delete record is marked with DeletedAt.
func TestRunDeletionPass_DeletesExpired(t *testing.T) {
td, ds, pds := newTestTenantDeleter(t, false)
@@ -165,9 +193,10 @@ func TestRunDeletionPass_DeletesExpired(t *testing.T) {
}
assert.Equal(t, 0, count, "resources should have been deleted for expired tenant")
- // Pending delete record should be gone.
- _, err := pds.Get(t.Context(), testStacksNS1)
- assert.ErrorIs(t, err, ErrNotFound)
+ // Pending delete record should have DeletedAt set.
+ record, err := pds.Get(t.Context(), testStacksNS1)
+ require.NoError(t, err)
+ assert.NotEmpty(t, record.DeletedAt, "DeletedAt should be set after successful deletion")
}
// TestRunDeletionPass_PreservesOtherTenants verifies that only the expired
@@ -214,12 +243,14 @@ func TestRunDeletionPass_PreservesOtherTenants(t *testing.T) {
}
assert.Equal(t, 1, count2, "second tenant resources should remain")
- // First tenant's pending delete record gone; second tenant's record still exists.
- _, err := pds.Get(t.Context(), testStacksNS1)
- assert.ErrorIs(t, err, ErrNotFound)
+ // First tenant's pending delete record should have DeletedAt set; second tenant's record should not.
+ record, err := pds.Get(t.Context(), testStacksNS1)
+ require.NoError(t, err)
+ assert.NotEmpty(t, record.DeletedAt, "DeletedAt should be set for deleted tenant")
- _, err = pds.Get(t.Context(), testStacksNS2)
- assert.NoError(t, err)
+ record2, err := pds.Get(t.Context(), testStacksNS2)
+ require.NoError(t, err)
+ assert.Empty(t, record2.DeletedAt, "DeletedAt should not be set for non-expired tenant")
}
// TestRunDeletionPass_DryRun verifies that dry-run mode does not delete any
@@ -248,14 +279,15 @@ func TestRunDeletionPass_DryRun(t *testing.T) {
}
assert.Equal(t, 1, count, "data should not be deleted in dry-run mode")
- // Pending delete record should still exist.
- _, err := pds.Get(t.Context(), testStacksNS1)
- assert.NoError(t, err)
+ // Pending delete record should still exist without DeletedAt.
+ record, err := pds.Get(t.Context(), testStacksNS1)
+ require.NoError(t, err)
+ assert.Empty(t, record.DeletedAt, "DeletedAt should not be set in dry-run mode")
}
-// TestRunDeletionPass_NoDataCleansUpRecord verifies that an expired tenant
-// with no resource data still has its pending-delete record removed.
-func TestRunDeletionPass_NoDataCleansUpRecord(t *testing.T) {
+// TestRunDeletionPass_NoDataSetsDeletedAt verifies that an expired tenant
+// with no resource data still has its pending-delete record marked with DeletedAt.
+func TestRunDeletionPass_NoDataSetsDeletedAt(t *testing.T) {
td, _, pds := newTestTenantDeleter(t, false)
// No resources saved — only a pending-delete record.
@@ -265,13 +297,14 @@ func TestRunDeletionPass_NoDataCleansUpRecord(t *testing.T) {
td.runDeletionPass(t.Context())
- // Pending delete record should be removed.
- _, err := pds.Get(t.Context(), testStacksNS1)
- assert.ErrorIs(t, err, ErrNotFound)
+ // Pending delete record should have DeletedAt set.
+ record, err := pds.Get(t.Context(), testStacksNS1)
+ require.NoError(t, err)
+ assert.NotEmpty(t, record.DeletedAt, "DeletedAt should be set after successful deletion")
}
// TestRunDeletionPass_IdempotentRerun verifies that running a deletion pass
-// twice is safe — the second pass is a no-op.
+// twice is safe — the second pass skips the already-deleted tenant.
func TestRunDeletionPass_IdempotentRerun(t *testing.T) {
td, ds, pds := newTestTenantDeleter(t, false)
@@ -282,13 +315,14 @@ func TestRunDeletionPass_IdempotentRerun(t *testing.T) {
DeleteAfter: pastTime(),
}))
- // First pass deletes everything.
+ // First pass deletes everything and sets DeletedAt.
td.runDeletionPass(t.Context())
- _, err := pds.Get(t.Context(), testStacksNS1)
- require.ErrorIs(t, err, ErrNotFound)
+ record, err := pds.Get(t.Context(), testStacksNS1)
+ require.NoError(t, err)
+ require.NotEmpty(t, record.DeletedAt)
- // Second pass should be a no-op (no record, no data).
+ // Second pass should be a no-op (record has DeletedAt).
td.runDeletionPass(t.Context())
// Verify data is still gone.
@@ -364,9 +398,10 @@ func TestRunDeletionPass_IdempotentAfterPartialFailure(t *testing.T) {
}
assert.Equal(t, 1, podCount, "pods should survive the failed first pass")
- // Pending-delete record must still exist (deleteTenant returned an error).
- _, err := pds.Get(t.Context(), testStacksNS1)
+ // Pending-delete record must still exist without DeletedAt (deleteTenant returned an error).
+ record, err := pds.Get(t.Context(), testStacksNS1)
require.NoError(t, err, "pending-delete record should survive after partial failure")
+ assert.Empty(t, record.DeletedAt, "DeletedAt should not be set after partial failure")
// Second pass: no more injected failures — should finish the job.
td.runDeletionPass(t.Context())
@@ -380,8 +415,9 @@ func TestRunDeletionPass_IdempotentAfterPartialFailure(t *testing.T) {
}
assert.Equal(t, 0, podCount, "pods should be deleted after retry pass")
- _, err = pds.Get(t.Context(), testStacksNS1)
- assert.ErrorIs(t, err, ErrNotFound, "pending-delete record should be removed after retry")
+ record, err = pds.Get(t.Context(), testStacksNS1)
+ require.NoError(t, err, "pending-delete record should still exist after retry")
+ assert.NotEmpty(t, record.DeletedAt, "DeletedAt should be set after successful retry")
}
// TestRunDeletionPass_DataWithoutPendingRecord verifies that tenant data is
@@ -444,14 +480,15 @@ func TestDeleteTenant_MultipleGroupResources(t *testing.T) {
assert.Equal(t, 0, count, "resources for %s/%s should have been deleted", gr.group, gr.resource)
}
- // Pending delete record should also be removed.
- _, err = pds.Get(t.Context(), testStacksNS1)
- assert.ErrorIs(t, err, ErrNotFound)
+ // Pending delete record should have DeletedAt set.
+ record, err := pds.Get(t.Context(), testStacksNS1)
+ require.NoError(t, err)
+ assert.NotEmpty(t, record.DeletedAt, "DeletedAt should be set after successful deletion")
}
-// TestRunDeletionPass_DeletesExpiredForceRecord verifies that the deleter can
-// delete tenant data and remove the pending-delete record even when Force=true.
-func TestRunDeletionPass_DeletesExpiredForceRecord(t *testing.T) {
+// TestRunDeletionPass_DeletesExpiredOrphanedRecord verifies that the deleter
+// removes both tenant data and the pending-delete record for orphaned tenants.
+func TestRunDeletionPass_DeletesExpiredOrphanedRecord(t *testing.T) {
td, ds, pds := newTestTenantDeleter(t, false)
saveTestResource(t, ds, testStacksNS1, "apps", "dashboards", "dash1", 100, nil)
@@ -459,7 +496,7 @@ func TestRunDeletionPass_DeletesExpiredForceRecord(t *testing.T) {
require.NoError(t, pds.Upsert(t.Context(), testStacksNS1, PendingDeleteRecord{
DeleteAfter: pastTime(),
LabelingComplete: true,
- Force: true,
+ Orphaned: true,
}))
td.runDeletionPass(t.Context())
@@ -474,10 +511,11 @@ func TestRunDeletionPass_DeletesExpiredForceRecord(t *testing.T) {
require.NoError(t, err)
count++
}
- assert.Equal(t, 0, count, "force record resources should be deleted when expired")
+ assert.Equal(t, 0, count, "orphaned record resources should be deleted when expired")
+ // Orphaned records are removed entirely after cleanup.
_, err := pds.Get(t.Context(), testStacksNS1)
- assert.ErrorIs(t, err, ErrNotFound)
+ assert.Error(t, err, "orphaned pending-delete record should be removed after cleanup")
}
// TestRunDeletionPass_AllowsWhenGcomReturnsDeletedStatus verifies local deletion when
@@ -515,8 +553,9 @@ func TestRunDeletionPass_AllowsWhenGcomReturnsDeletedStatus(t *testing.T) {
}
assert.Equal(t, 0, count)
- _, err := pds.Get(t.Context(), testStacksNS1)
- assert.ErrorIs(t, err, ErrNotFound)
+ record, err := pds.Get(t.Context(), testStacksNS1)
+ require.NoError(t, err)
+ assert.NotEmpty(t, record.DeletedAt, "DeletedAt should be set after successful deletion")
}
// TestRunDeletionPass_SkipsWhenGcomInstanceStillExists verifies that local data is
diff --git a/pkg/storage/unified/resource/tenant_watcher.go b/pkg/storage/unified/resource/tenant_watcher.go
index 8d60573bfcd8..a459e28907ba 100644
--- a/pkg/storage/unified/resource/tenant_watcher.go
+++ b/pkg/storage/unified/resource/tenant_watcher.go
@@ -272,7 +272,7 @@ func (tw *TenantWatcher) reconcileTenantPendingDelete(name string, deleteAfter s
record = PendingDeleteRecord{
DeleteAfter: deleteAfter,
LabelingComplete: false,
- Force: record.Force,
+ Orphaned: record.Orphaned,
}
if err := tw.pendingDeleteStore.Upsert(tw.ctx, name, record); err != nil {
tw.log.Error("failed to save pending delete record", "tenant", name, "error", err)
@@ -431,8 +431,8 @@ func (tw *TenantWatcher) clearTenantPendingDelete(name string) {
return
}
- if record.Force {
- tw.log.Warn("tenant has force pending-delete record, skipping clear", "tenant", name)
+ if record.Orphaned {
+ tw.log.Warn("tenant has orphaned pending-delete record, skipping clear", "tenant", name)
return
}
diff --git a/pkg/storage/unified/resource/tenant_watcher_test.go b/pkg/storage/unified/resource/tenant_watcher_test.go
index 89e07c22cb86..96ee52a920d6 100644
--- a/pkg/storage/unified/resource/tenant_watcher_test.go
+++ b/pkg/storage/unified/resource/tenant_watcher_test.go
@@ -135,13 +135,13 @@ func TestTenantClearPendingDelete(t *testing.T) {
assert.ErrorIs(t, err, ErrNotFound)
})
- t.Run("force record persists through reconcile and clear events", func(t *testing.T) {
+ t.Run("orphaned record persists through reconcile and clear events", func(t *testing.T) {
tw := newTestTenantWatcher(t)
require.NoError(t, tw.pendingDeleteStore.Upsert(t.Context(), "tenant-1", PendingDeleteRecord{
DeleteAfter: "2026-03-01T00:00:00Z",
LabelingComplete: true,
- Force: true,
+ Orphaned: true,
}))
tw.pendingDeleteStore.RefreshCache(t.Context())
@@ -150,7 +150,7 @@ func TestTenantClearPendingDelete(t *testing.T) {
record, err := tw.pendingDeleteStore.Get(t.Context(), "tenant-1")
require.NoError(t, err)
-
... [truncated]