Proof of Concept
Proof-of-concept (executable steps) to reproduce the vulnerability before the fix:
- DoS via oversized index file (derives from misadvertised manifest size or out-of-band growth):
1) In a Grafana deployment, configure BucketRemoteIndexStore to use a storage bucket (e.g., S3-compatible or in-memory test bucket).
2) Create a manifest meta.json that advertises a small size for a file, for example: { "GrafanaBuildVersion": "11.0.0", "Files": {"store/root.bolt": 10} }
3) Place a much larger object at the key corresponding to the manifest’s file, e.g., a 10000-byte file at the path that meta.json references.
4) Trigger the DownloadIndex operation (which reads meta.json and streams the file data to disk). In the unpatched code, Grafana could stream the large object to disk, exhausting disk space or I/O bandwidth before detection.
5) Observe unbounded data transfer or resource exhaustion before the manifest/size discrepancy is detected.
- Race condition in distributed lock heartbeat: (conceptual steps)
1) Run multiple replicas offering the same lock key with a TTL and heartbeat at the same time.
2) Without the patch, the exact boundary where a new lock could be acquired overlaps with the lease expiration, enabling race conditions or split-brain behavior.
3) With the patch, loss is declared one heartbeat interval before the TTL boundary, reducing the window for another replica to acquire the lock, mitigating the race.
A concrete, post-fix PoC is demonstrated by the unit tests added in this commit (TestRemoteIndexStore_DownloadRejectsOversizedFile and TestObjectStorageLock_HeartbeatLossDetectedBeforeTTL). To exercise the updated behavior directly in this repository, you can run:
go test ./pkg/storage/unified/search -run TestRemoteIndexStore_DownloadRejectsOversizedFile
go test ./pkg/storage/unified/search -run TestObjectStorageLock_HeartbeatLossDetectedBeforeTTL
The tests validate that oversized downloads are rejected with an explicit error and that heartbeat loss is detected before TTL, respectively.
Code Diff
diff --git a/pkg/storage/unified/search/lock_objstore.go b/pkg/storage/unified/search/lock_objstore.go
index 0e05823b7291..dc4b5510af0d 100644
--- a/pkg/storage/unified/search/lock_objstore.go
+++ b/pkg/storage/unified/search/lock_objstore.go
@@ -170,9 +170,12 @@ func (l *objectStorageLock) runHeartbeat(ctx context.Context, done chan struct{}
ticker := time.NewTicker(l.heartbeatInterval)
defer ticker.Stop()
- // Tolerate transient failures up to the lease boundary.
- // Floor ensures loss is detected no later than TTL
- maxFailures := int(l.ttl / l.heartbeatInterval)
+ // Declare loss one heartbeat interval before the lease expires server-side,
+ // so we stop work before another replica can acquire the lock. With TTL=400ms
+ // and HBI=100ms, the lease expires at t=400ms; we close lostCh after 3 failed
+ // ticks at t=300ms, leaving 100ms of margin. The -1 is the margin.
+ // The constructor enforces TTL >= 2*HeartbeatInterval, so maxFailures >= 1.
+ maxFailures := int(l.ttl/l.heartbeatInterval) - 1
consecutiveFailures := 0
for {
diff --git a/pkg/storage/unified/search/lock_objstore_test.go b/pkg/storage/unified/search/lock_objstore_test.go
index 83df5b901bf8..3e24af0616d0 100644
--- a/pkg/storage/unified/search/lock_objstore_test.go
+++ b/pkg/storage/unified/search/lock_objstore_test.go
@@ -286,6 +286,37 @@ func TestObjectStorageLock_ReleaseWaitsForInFlightUpdate(t *testing.T) {
}
}
+// TestObjectStorageLock_HeartbeatLossDetectedBeforeTTL asserts that with all
+// heartbeats failing transiently, lostCh fires before the lease expires
+// server-side. See the maxFailures comment in runHeartbeat for the rationale.
+func TestObjectStorageLock_HeartbeatLossDetectedBeforeTTL(t *testing.T) {
+ backend := &failingUpdateBackend{
+ lockBackend: newFakeBackend(newConditionalBucket()),
+ failAfterN: 0,
+ }
+
+ // 3:1 ratio mirrors the production default (180s/60s). Absolute times are
+ // scaled up so scheduler jitter and GC pauses on slow CI runners don't eat
+ // the safety margin we're trying to assert.
+ ttl := 900 * time.Millisecond
+ hbi := 300 * time.Millisecond
+ lock := newTestLock(t, backend, "test-lock", "instance-1", ttl, hbi)
+
+ ctx := context.Background()
+ require.NoError(t, lock.Acquire(ctx))
+ start := time.Now()
+
+ select {
+ case <-lock.Lost():
+ case <-time.After(ttl + 300*time.Millisecond):
+ t.Fatal("expected lock loss")
+ }
+
+ elapsed := time.Since(start)
+ // Loss should fire ~one heartbeat before TTL (~600ms here, vs the 900ms lease).
+ require.Less(t, elapsed, ttl-hbi/2, "loss should be detected with safety margin before TTL (got %s, ttl=%s)", elapsed, ttl)
+}
+
func TestObjectStorageLock_ImmediateLossOnOwnershipError(t *testing.T) {
backend := &failingUpdateBackend{
lockBackend: newFakeBackend(newConditionalBucket()),
diff --git a/pkg/storage/unified/search/remote_index_store.go b/pkg/storage/unified/search/remote_index_store.go
index d3f527bd6f05..b45111b5c0de 100644
--- a/pkg/storage/unified/search/remote_index_store.go
+++ b/pkg/storage/unified/search/remote_index_store.go
@@ -335,7 +335,7 @@ func (s *BucketRemoteIndexStore) DownloadIndex(ctx context.Context, nsResource r
if err := os.MkdirAll(filepath.Dir(localPath), 0750); err != nil {
return nil, fmt.Errorf("creating directory for %s: %w", relPath, err)
}
- if err := s.downloadFile(ctx, objectKey, localPath); err != nil {
+ if err := s.downloadFile(ctx, objectKey, localPath, expectedSize); err != nil {
return nil, fmt.Errorf("downloading %s: %w", relPath, err)
}
@@ -365,22 +365,29 @@ func validateManifestPaths(files map[string]int64) error {
if clean != relPath {
return fmt.Errorf("non-canonical path %q (canonical: %q)", relPath, clean)
}
- if clean == "." || filepath.IsAbs(clean) || strings.HasPrefix(clean, "..") {
+ if clean == "." || clean == ".." || filepath.IsAbs(clean) || strings.HasPrefix(clean, "../") {
return fmt.Errorf("invalid path %q", relPath)
}
}
return nil
}
-// downloadFile creates localPath and streams the remote object into it.
-func (s *BucketRemoteIndexStore) downloadFile(ctx context.Context, objectKey, localPath string) error {
+// downloadFile creates localPath and streams the remote object into it,
+// capping the transfer at expectedSize+1 bytes so a misadvertised manifest
+// size or a bucket object that's grown out of band fails fast before we
+// transfer unbounded data. meta.json uses the same pattern.
+func (s *BucketRemoteIndexStore) downloadFile(ctx context.Context, objectKey, localPath string, expectedSize int64) error {
f, err := os.Create(localPath) //nolint:gosec // path is under a Grafana-controlled staging directory
if err != nil {
return err
}
- if err := s.bucket.Download(ctx, objectKey, f, nil); err != nil {
+ lw := &resource.LimitedWriter{W: f, N: expectedSize + 1}
+ if err := s.bucket.Download(ctx, objectKey, lw, nil); err != nil {
_ = f.Close()
+ if errors.Is(err, resource.ErrWriteLimitExceeded) {
+ return fmt.Errorf("remote object exceeds expected size %d: %w", expectedSize, err)
+ }
return err
}
return f.Close()
diff --git a/pkg/storage/unified/search/remote_index_store_test.go b/pkg/storage/unified/search/remote_index_store_test.go
index b0e040e236dc..2aeedfd1c50d 100644
--- a/pkg/storage/unified/search/remote_index_store_test.go
+++ b/pkg/storage/unified/search/remote_index_store_test.go
@@ -1,6 +1,7 @@
package search
import (
+ "bytes"
"context"
"crypto/rand"
"encoding/json"
@@ -164,11 +165,13 @@ func TestValidateManifestPaths(t *testing.T) {
wantErr string
}{
{name: "valid paths", files: map[string]int64{"store/root.bolt": 100, "store/00001.zap": 200}},
+ {name: "leading double-dot in filename", files: map[string]int64{"..foo/bar.zap": 100}},
{name: "path traversal", files: map[string]int64{"../../../tmp/escape": 100}, wantErr: "invalid path"},
{name: "absolute path", files: map[string]int64{"/tmp/escape": 100}, wantErr: "invalid path"},
{name: "non-canonical dotslash", files: map[string]int64{"./store/root.bolt": 100}, wantErr: "non-canonical"},
{name: "non-canonical double dot", files: map[string]int64{"store/../store/root.bolt": 100}, wantErr: "non-canonical"},
{name: "dot entry", files: map[string]int64{".": 100}, wantErr: "invalid path"},
+ {name: "parent dir entry", files: map[string]int64{"..": 100}, wantErr: "invalid path"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
@@ -270,6 +273,35 @@ func TestRemoteIndexStore_DownloadRejectsCorruptMetaJSON(t *testing.T) {
})
}
+func TestRemoteIndexStore_DownloadRejectsOversizedFile(t *testing.T) {
+ // A bucket object that exceeds the size advertised in meta.json must fail
+ // fast — we should not transfer unbounded bytes to disk before noticing.
+ ctx := context.Background()
+ bucket := memblob.OpenBucket(nil)
+ defer func() { _ = bucket.Close() }()
+ store := newTestRemoteIndexStore(t, bucket)
+ ns := newTestNsResource()
+ key := ulid.Make()
+ pfx := indexPrefix(ns, key.String())
+
+ const advertised = 10
+ meta := IndexMeta{
+ GrafanaBuildVersion: "11.0.0",
+ Files: map[string]int64{"store/root.bolt": advertised},
+ }
+ metaBytes, err := json.Marshal(meta)
+ require.NoError(t, err)
+ require.NoError(t, bucket.WriteAll(ctx, pfx+"meta.json", metaBytes, nil))
+
+ // Plant a file far larger than what meta.json claims.
+ oversized := bytes.Repeat([]byte("x"), advertised*1000)
+ require.NoError(t, bucket.WriteAll(ctx, pfx+"store/root.bolt", oversized, nil))
+
+ _, err = store.DownloadIndex(ctx, ns, key, filepath.Join(t.TempDir(), "dl"))
+ require.Error(t, err)
+ require.Contains(t, err.Error(), "exceeds expected size")
+}
+
func TestRemoteIndexStore_DownloadValidatesCompleteness(t *testing.T) {
ctx := context.Background()
bucket := memblob.OpenBucket(nil)