-
Notifications
You must be signed in to change notification settings - Fork 260
perf(store): save metadata async #3298
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
f7ea4ad
4d69c22
af7e326
6c53bc4
aca2ec7
2d3dfd7
bd84902
89db6f9
c24f901
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,8 +2,11 @@ package store | |
|
|
||
| import ( | ||
| "context" | ||
| "sync" | ||
| "time" | ||
|
|
||
| lru "github.com/hashicorp/golang-lru/v2" | ||
| "github.com/rs/zerolog" | ||
|
|
||
| "github.com/evstack/ev-node/types" | ||
| ) | ||
|
|
@@ -14,15 +17,37 @@ const ( | |
|
|
||
| // DefaultBlockDataCacheSize is the default number of block data entries to cache. | ||
| DefaultBlockDataCacheSize = 200_000 | ||
|
|
||
| asyncWriteBufferSize = 8192 | ||
|
|
||
| // batchWindow is the time the write goroutine waits after receiving the first | ||
| // op before flushing. This allows bursts of metadata writes (e.g. 3-4 per | ||
| // height in the submitter) to be coalesced into a single Badger WriteBatch. | ||
| batchWindow = 100 * time.Microsecond | ||
| ) | ||
|
|
||
| type asyncWriteOp struct { | ||
| key string | ||
| value []byte | ||
| isDelete bool | ||
| } | ||
|
|
||
| // CachedStore wraps a Store with LRU caching for frequently accessed data. | ||
| // The underlying LRU cache is thread-safe, so no additional synchronization is needed. | ||
| // Metadata writes (SetMetadata, DeleteMetadata) are processed asynchronously via a | ||
| // buffered channel to avoid blocking Badger's write pipeline for critical operations | ||
| // like block production (batch commits). | ||
| type CachedStore struct { | ||
| Store | ||
|
|
||
| headerCache *lru.Cache[uint64, *types.SignedHeader] | ||
| blockDataCache *lru.Cache[uint64, *blockDataEntry] | ||
|
|
||
| writeCh chan asyncWriteOp | ||
| done chan struct{} | ||
| stopMu sync.RWMutex | ||
| stopped bool | ||
| logger zerolog.Logger | ||
| } | ||
|
|
||
| type blockDataEntry struct { | ||
|
|
@@ -73,6 +98,9 @@ func NewCachedStore(store Store, opts ...CachedStoreOption) (*CachedStore, error | |
| Store: store, | ||
| headerCache: headerCache, | ||
| blockDataCache: blockDataCache, | ||
| writeCh: make(chan asyncWriteOp, asyncWriteBufferSize), | ||
| done: make(chan struct{}), | ||
| logger: zerolog.Nop(), | ||
| } | ||
|
|
||
| for _, opt := range opts { | ||
|
|
@@ -81,9 +109,58 @@ func NewCachedStore(store Store, opts ...CachedStoreOption) (*CachedStore, error | |
| } | ||
| } | ||
|
|
||
| cs.startWriteLoop() | ||
|
|
||
| return cs, nil | ||
| } | ||
|
|
||
| func (cs *CachedStore) startWriteLoop() { | ||
| go func() { | ||
| defer close(cs.done) | ||
| for op := range cs.writeCh { | ||
| ops := []asyncWriteOp{op} | ||
|
|
||
| timer := time.NewTimer(batchWindow) | ||
| collect: | ||
| for { | ||
| select { | ||
| case op, ok := <-cs.writeCh: | ||
| if !ok { | ||
| timer.Stop() | ||
| break collect | ||
|
Comment on lines
+124
to
+130
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we replace this with a separate function to avoid using a goto? |
||
| } | ||
| ops = append(ops, op) | ||
| case <-timer.C: | ||
| break collect | ||
| } | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| } | ||
|
|
||
| last := make(map[string]asyncWriteOp, len(ops)) | ||
| for _, o := range ops { | ||
| last[o.key] = o | ||
| } | ||
|
|
||
| var puts []MetadataKV | ||
| var deletes []string | ||
| for _, o := range last { | ||
| if o.isDelete { | ||
| deletes = append(deletes, o.key) | ||
| } else { | ||
| puts = append(puts, MetadataKV{Key: o.key, Value: o.value}) | ||
| } | ||
| } | ||
|
|
||
| if err := cs.BatchMetadata(context.Background(), puts, deletes); err != nil { | ||
| for _, o := range ops { | ||
| cs.logger.Error().Err(err).Str("key", o.key). | ||
| Bool("delete", o.isDelete). | ||
| Msg("async metadata batch write failed") | ||
| } | ||
| } | ||
| } | ||
| }() | ||
| } | ||
|
|
||
| // GetHeader returns the header at the given height, using the cache if available. | ||
| func (cs *CachedStore) GetHeader(ctx context.Context, height uint64) (*types.SignedHeader, error) { | ||
| // Try cache first | ||
|
|
@@ -162,7 +239,7 @@ func (cs *CachedStore) Rollback(ctx context.Context, height uint64, aggregator b | |
| } | ||
|
|
||
| // PruneBlocks wraps the underlying store's PruneBlocks and invalidates caches | ||
| // up to the heigh that we purne | ||
| // up to the height that we prune | ||
| func (cs *CachedStore) PruneBlocks(ctx context.Context, height uint64) error { | ||
| if err := cs.Store.PruneBlocks(ctx, height); err != nil { | ||
| return err | ||
|
|
@@ -173,8 +250,42 @@ func (cs *CachedStore) PruneBlocks(ctx context.Context, height uint64) error { | |
| return nil | ||
| } | ||
|
|
||
| // Close closes the underlying store. | ||
| // SetMetadata queues an asynchronous metadata write. The write is persisted | ||
| // by the background goroutine via BatchMetadata. If the store has been stopped, | ||
| // the write falls back to synchronous execution on the underlying store. | ||
| func (cs *CachedStore) SetMetadata(ctx context.Context, key string, value []byte) error { | ||
| cs.stopMu.RLock() | ||
| defer cs.stopMu.RUnlock() | ||
|
|
||
| if cs.stopped { | ||
| return cs.Store.SetMetadata(ctx, key, value) | ||
| } | ||
| valueCopy := append([]byte(nil), value...) | ||
| cs.writeCh <- asyncWriteOp{key: key, value: valueCopy} | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we do something like this select {
case cs.writeCh <- asyncWriteOp{key: key, value: valueCopy}:
return nil
default:
return cs.Store.SetMetadata(ctx, key, valueCopy)
}
If the channel is full, this will stall and it will not be possible to call `SetMetadata` without blocking. The same applies for `DeleteMetadata`.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, see #3298 (comment), this is to keep ordering. Ordering is important to not get stale values. |
||
| return nil | ||
| } | ||
|
Comment on lines
+256
to
+266
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Blocking send contradicts PR description's backpressure fallback. The PR description states "When the buffer is full, operations fall back to synchronous writes (backpressure)". However, the current implementation uses a blocking send ( If the fallback behavior is intended, use a non-blocking send: Proposed fix for fallback behavior func (cs *CachedStore) SetMetadata(ctx context.Context, key string, value []byte) error {
cs.stopMu.RLock()
defer cs.stopMu.RUnlock()
if cs.stopped {
return cs.Store.SetMetadata(ctx, key, value)
}
valueCopy := append([]byte(nil), value...)
- cs.writeCh <- asyncWriteOp{key: key, value: valueCopy}
- return nil
+ select {
+ case cs.writeCh <- asyncWriteOp{key: key, value: valueCopy}:
+ return nil
+ default:
+ return cs.Store.SetMetadata(ctx, key, value)
+ }
}If blocking is intentional (for guaranteed ordering), update the PR description to reflect this. 🤖 Prompt for AI Agents
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is intentional.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
( ˘•ω•˘ ) ✏️ Learnings added
🧠 Learnings used |
||
|
|
||
| // DeleteMetadata queues an asynchronous metadata delete. If the store has been | ||
| // stopped, the delete falls back to synchronous execution. | ||
| func (cs *CachedStore) DeleteMetadata(ctx context.Context, key string) error { | ||
| cs.stopMu.RLock() | ||
| defer cs.stopMu.RUnlock() | ||
|
|
||
| if cs.stopped { | ||
| return cs.Store.DeleteMetadata(ctx, key) | ||
| } | ||
| cs.writeCh <- asyncWriteOp{key: key, isDelete: true} | ||
| return nil | ||
| } | ||
|
|
||
| // Close drains pending async writes, then closes the underlying store. | ||
| func (cs *CachedStore) Close() error { | ||
| cs.stopMu.Lock() | ||
| cs.stopped = true | ||
| close(cs.writeCh) | ||
| cs.stopMu.Unlock() | ||
| <-cs.done | ||
|
|
||
| cs.ClearCache() | ||
| return cs.Store.Close() | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is the performance gained through this change? Also, what is the performance difference with non-automatic fsync on the DB? We should be able to write to the DB fast: it keeps the data in cache and then flushes every n blocks. Are we already doing that?