From 377228d733e35ad75e982292cad60cf8bf9716b5 Mon Sep 17 00:00:00 2001 From: "lakshimiraman.s" Date: Fri, 29 May 2026 12:31:04 +0530 Subject: [PATCH 1/5] Add support for custom election tick --- conn/node.go | 14 +++++++++++--- conn/node_test.go | 2 +- dgraph/cmd/alpha/run.go | 3 +++ dgraph/cmd/zero/raft.go | 2 +- dgraph/cmd/zero/run.go | 6 +++++- worker/draft.go | 3 ++- worker/server_state.go | 2 +- 7 files changed, 24 insertions(+), 8 deletions(-) diff --git a/conn/node.go b/conn/node.go index 644756161b4..626d1c44bba 100644 --- a/conn/node.go +++ b/conn/node.go @@ -79,7 +79,15 @@ type Node struct { } // NewNode returns a new Node instance. -func NewNode(rc *pb.RaftContext, store *raftwal.DiskStorage, tlsConfig *tls.Config) *Node { +// electionTick controls how many ticks (each 100ms) before an election is triggered. +// If electionTick <= 0, defaults to 20 (i.e., 2s election timeout). +func NewNode(rc *pb.RaftContext, store *raftwal.DiskStorage, tlsConfig *tls.Config, + electionTick int) *Node { + + if electionTick <= 0 { + electionTick = 20 + } + snap, err := store.Snapshot() x.Check(err) @@ -90,8 +98,8 @@ func NewNode(rc *pb.RaftContext, store *raftwal.DiskStorage, tlsConfig *tls.Conf Store: store, Cfg: &raft.Config{ ID: rc.Id, - ElectionTick: 20, // 2s if we call Tick() every 100 ms. - HeartbeatTick: 1, // 100ms if we call Tick() every 100 ms. + ElectionTick: electionTick, // Default 2s if tick is 100ms. + HeartbeatTick: 1, // 100ms if we call Tick() every 100 ms. Storage: store, MaxInflightMsgs: 256, MaxSizePerMsg: 256 << 10, // 256 KB should allow more batching. 
diff --git a/conn/node_test.go b/conn/node_test.go index 66cb421c139..c59512fb970 100644 --- a/conn/node_test.go +++ b/conn/node_test.go @@ -53,7 +53,7 @@ func TestProposal(t *testing.T) { store := raftwal.Init(dir) rc := &pb.RaftContext{Id: 1} - n := NewNode(rc, store, nil) + n := NewNode(rc, store, nil, 0) peers := []raft.Peer{{ID: n.Id}} n.SetRaft(raft.StartNode(n.Cfg, peers)) diff --git a/dgraph/cmd/alpha/run.go b/dgraph/cmd/alpha/run.go index 901d7b0a808..d87f06e209e 100644 --- a/dgraph/cmd/alpha/run.go +++ b/dgraph/cmd/alpha/run.go @@ -164,6 +164,9 @@ they form a Raft group and provide synchronous replication. "to 0 to disable duration based snapshot."). Flag("pending-proposals", "Number of pending mutation proposals. Useful for rate limiting."). + Flag("election-tick", + "Number of ticks (each 100ms) before a follower starts an election. "+ + "Default 20 means 2s election timeout. Increase in high-latency networks."). String()) flag.String("security", worker.SecurityDefaults, z.NewSuperFlagHelp(worker.SecurityDefaults). diff --git a/dgraph/cmd/zero/raft.go b/dgraph/cmd/zero/raft.go index f9fea51f82b..14bf9cec25c 100644 --- a/dgraph/cmd/zero/raft.go +++ b/dgraph/cmd/zero/raft.go @@ -37,7 +37,7 @@ import ( ) const ( - raftDefaults = "idx=1; learner=false;" + raftDefaults = "idx=1; learner=false; election-tick=20;" ) var proposalKey uint64 diff --git a/dgraph/cmd/zero/run.go b/dgraph/cmd/zero/run.go index 4dc39401357..bb1eb753e5d 100644 --- a/dgraph/cmd/zero/run.go +++ b/dgraph/cmd/zero/run.go @@ -105,6 +105,9 @@ instances to achieve high-availability. Flag("learner", `Make this Zero a "learner" node. In learner mode, this Zero will not participate `+ "in Raft elections. This can be used to achieve a read-only replica."). + Flag("election-tick", + "Number of ticks (each 100ms) before a follower starts an election. "+ + "Default 20 means 2s election timeout. Increase in high-latency networks."). 
String()) flag.String("audit", worker.AuditDefaults, z.NewSuperFlagHelp(worker.AuditDefaults). @@ -160,7 +163,8 @@ func (st *state) serveGRPC(l net.Listener, store *raftwal.DiskStorage) { Group: 0, IsLearner: opts.raft.GetBool("learner"), } - m := conn.NewNode(&rc, store, opts.tlsClientConfig) + electionTick := opts.raft.GetInt64("election-tick") + m := conn.NewNode(&rc, store, opts.tlsClientConfig, int(electionTick)) // Zero followers should not be forwarding proposals to the leader, to avoid txn commits which // were calculated in a previous Zero leader. diff --git a/worker/draft.go b/worker/draft.go index c2cb9947519..2709d6d61dd 100644 --- a/worker/draft.go +++ b/worker/draft.go @@ -264,7 +264,8 @@ func newNode(store *raftwal.DiskStorage, gid uint32, id uint64, myAddr string) * IsLearner: isLearner, } glog.Infof("RaftContext: %+v\n", rc) - m := conn.NewNode(rc, store, x.WorkerConfig.TLSClientConfig) + electionTick := x.WorkerConfig.Raft.GetInt64("election-tick") + m := conn.NewNode(rc, store, x.WorkerConfig.TLSClientConfig, int(electionTick)) n := &node{ Node: m, diff --git a/worker/server_state.go b/worker/server_state.go index 0591ccb4b5b..2286c09f4b4 100644 --- a/worker/server_state.go +++ b/worker/server_state.go @@ -31,7 +31,7 @@ const ( AuditDefaults = `compress=false; days=10; size=100; dir=; output=; encrypt-file=;` BadgerDefaults = `compression=snappy; numgoroutines=8;` RaftDefaults = `learner=false; snapshot-after-entries=10000; ` + - `snapshot-after-duration=30m; pending-proposals=256; idx=; group=;` + `snapshot-after-duration=30m; pending-proposals=256; idx=; group=; election-tick=20;` SecurityDefaults = `token=; whitelist=;` CDCDefaults = `file=; kafka=; sasl_user=; sasl_password=; ca_cert=; client_cert=; ` + `client_key=; sasl-mechanism=PLAIN; tls=false;` From ee4d6ff121dcb67c463afc79f5cfc281824ca71a Mon Sep 17 00:00:00 2001 From: "lakshimiraman.s" Date: Wed, 3 Jun 2026 13:03:38 +0530 Subject: [PATCH 2/5] fix: add issues:write permission to 
labeler workflow --- .github/workflows/labeler.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 7c1d9e41dcb..11d76d3a11c 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -14,12 +14,14 @@ on: permissions: contents: read pull-requests: write + issues: write jobs: label: permissions: contents: read pull-requests: write + issues: write runs-on: blacksmith-4vcpu-ubuntu-2404 steps: - uses: actions/checkout@v5 From f26e195214606430153f1e11c7254f71cb6ff7e0 Mon Sep 17 00:00:00 2001 From: "lakshimiraman.s" Date: Tue, 9 Jun 2026 12:02:44 +0530 Subject: [PATCH 3/5] Fail fast with a clear error message instead of letting etcd raft panic with a cryptic one (election tick > heartbeat tick issue) --- .github/workflows/labeler.yml | 2 +- conn/node.go | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 11d76d3a11c..4add255e278 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -1,7 +1,7 @@ name: labeler on: - pull_request: + pull_request_target: types: - opened - reopened diff --git a/conn/node.go b/conn/node.go index 626d1c44bba..e42bdbcc29f 100644 --- a/conn/node.go +++ b/conn/node.go @@ -84,9 +84,14 @@ type Node struct { func NewNode(rc *pb.RaftContext, store *raftwal.DiskStorage, tlsConfig *tls.Config, electionTick int) *Node { + const heartbeatTick = 1 // 100ms per tick if electionTick <= 0 { electionTick = 20 } + if electionTick <= heartbeatTick { + glog.Fatalf("election-tick=%d is invalid: must be greater than heartbeat-tick (%d). 
"+ + "Recommended minimum is 10 (1s election timeout).", electionTick, heartbeatTick) + } snap, err := store.Snapshot() x.Check(err) @@ -98,8 +103,8 @@ func NewNode(rc *pb.RaftContext, store *raftwal.DiskStorage, tlsConfig *tls.Conf Store: store, Cfg: &raft.Config{ ID: rc.Id, - ElectionTick: electionTick, // Default 2s if tick is 100ms. - HeartbeatTick: 1, // 100ms if we call Tick() every 100 ms. + ElectionTick: electionTick, // Default 2s if tick is 100ms. + HeartbeatTick: heartbeatTick, // 100ms if we call Tick() every 100 ms. Storage: store, MaxInflightMsgs: 256, MaxSizePerMsg: 256 << 10, // 256 KB should allow more batching. From 6542e183ea97afc6171107a83c0479c1ce84a8be Mon Sep 17 00:00:00 2001 From: "lakshimiraman.s" Date: Mon, 15 Jun 2026 12:55:19 +0530 Subject: [PATCH 4/5] fix(raft): warn on negative and low election-tick values - Split negative and zero handling: negative values log a warning and default to 20; zero silently defaults to 20 (unset/zero-value case) - Return warning string from normalizeElectionTick instead of two bools - Warn (not reject) when election-tick < 10 per reviewer guidance --- conn/node.go | 46 ++++++++++++++++++++++++------- conn/node_test.go | 60 +++++++++++++++++++++++++++++++++++++++++ dgraph/cmd/alpha/run.go | 5 ++-- dgraph/cmd/zero/run.go | 5 ++-- 4 files changed, 102 insertions(+), 14 deletions(-) diff --git a/conn/node.go b/conn/node.go index e42bdbcc29f..8652e9e474c 100644 --- a/conn/node.go +++ b/conn/node.go @@ -40,6 +40,34 @@ var ( ErrNoNode = errors.Errorf("No node has been set up yet") ) +const ( + heartbeatTick = 1 + defaultElectionTick = 20 + recommendedElectionTick = 10 +) + +func normalizeElectionTick(electionTick int) (tick int, warning string) { + if electionTick < 0 { + return defaultElectionTick, fmt.Sprintf( + "--raft election-tick=%d is invalid; defaulting to %d. 
Use 0 or omit the flag to accept the default.", + electionTick, defaultElectionTick) + } + if electionTick == 0 { + return defaultElectionTick, "" + } + if electionTick <= heartbeatTick { + glog.Fatalf("invalid --raft election-tick=%d: must be greater than internal heartbeat tick (%d).", + electionTick, heartbeatTick) + } + if electionTick < recommendedElectionTick { + return electionTick, fmt.Sprintf( + "--raft election-tick=%d gives a %dms minimum election timeout. Values below %d (1s) "+ + "may cause spurious leader elections under GC pauses or network jitter.", + electionTick, electionTick*100, recommendedElectionTick) + } + return electionTick, "" +} + // Node represents a node participating in the RAFT protocol. type Node struct { x.SafeMutex @@ -79,18 +107,16 @@ type Node struct { } // NewNode returns a new Node instance. -// electionTick controls how many ticks (each 100ms) before an election is triggered. -// If electionTick <= 0, defaults to 20 (i.e., 2s election timeout). +// electionTick controls how many Raft Tick() calls pass before election timeout. +// In production Alpha/Zero, Tick() runs every 100ms and Raft randomizes the timeout. +// If electionTick <= 0, defaults to 20. func NewNode(rc *pb.RaftContext, store *raftwal.DiskStorage, tlsConfig *tls.Config, electionTick int) *Node { - const heartbeatTick = 1 // 100ms per tick - if electionTick <= 0 { - electionTick = 20 - } - if electionTick <= heartbeatTick { - glog.Fatalf("election-tick=%d is invalid: must be greater than heartbeat-tick (%d). "+ - "Recommended minimum is 10 (1s election timeout).", electionTick, heartbeatTick) + var warning string + electionTick, warning = normalizeElectionTick(electionTick) + if warning != "" { + glog.Warningf(warning) } snap, err := store.Snapshot() @@ -103,7 +129,7 @@ func NewNode(rc *pb.RaftContext, store *raftwal.DiskStorage, tlsConfig *tls.Conf Store: store, Cfg: &raft.Config{ ID: rc.Id, - ElectionTick: electionTick, // Default 2s if tick is 100ms. 
+ ElectionTick: electionTick, HeartbeatTick: heartbeatTick, // 100ms if we call Tick() every 100 ms. Storage: store, MaxInflightMsgs: 256, diff --git a/conn/node_test.go b/conn/node_test.go index c59512fb970..d1dfdc3d281 100644 --- a/conn/node_test.go +++ b/conn/node_test.go @@ -71,3 +71,63 @@ func TestProposal(t *testing.T) { } wg.Wait() } + +func TestNormalizeElectionTick(t *testing.T) { + tests := []struct { + name string + electionTick int + wantTick int + wantWarning string + }{ + { + name: "zero defaults silently", + electionTick: 0, + wantTick: 20, + wantWarning: "", + }, + { + name: "negative defaults with warning", + electionTick: -1, + wantTick: 20, + wantWarning: "--raft election-tick=-1 is invalid; defaulting to 20. Use 0 or omit the flag to accept the default.", + }, + { + name: "large negative defaults with warning", + electionTick: -100, + wantTick: 20, + wantWarning: "--raft election-tick=-100 is invalid; defaulting to 20. Use 0 or omit the flag to accept the default.", + }, + { + name: "low valid value warns below recommended", + electionTick: 2, + wantTick: 2, + wantWarning: "--raft election-tick=2 gives a 200ms minimum election timeout. 
Values below 10 (1s) may cause spurious leader elections under GC pauses or network jitter.", + }, + { + name: "recommended minimum no warning", + electionTick: 10, + wantTick: 10, + wantWarning: "", + }, + { + name: "default value no warning", + electionTick: 20, + wantTick: 20, + wantWarning: "", + }, + { + name: "large value accepted", + electionTick: 100, + wantTick: 100, + wantWarning: "", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + gotTick, gotWarning := normalizeElectionTick(tc.electionTick) + require.Equal(t, tc.wantTick, gotTick) + require.Equal(t, tc.wantWarning, gotWarning) + }) + } +} diff --git a/dgraph/cmd/alpha/run.go b/dgraph/cmd/alpha/run.go index d87f06e209e..2cca04d4a85 100644 --- a/dgraph/cmd/alpha/run.go +++ b/dgraph/cmd/alpha/run.go @@ -165,8 +165,9 @@ they form a Raft group and provide synchronous replication. Flag("pending-proposals", "Number of pending mutation proposals. Useful for rate limiting."). Flag("election-tick", - "Number of ticks (each 100ms) before a follower starts an election. "+ - "Default 20 means 2s election timeout. Increase in high-latency networks."). + "Number of Raft ticks before a follower starts an election. Each production tick is "+ + "100ms and Raft randomizes the timeout between N and 2N-1 ticks. Default 20 "+ + "means ~2s-4s; values below 10 may cause spurious elections under jitter."). String()) flag.String("security", worker.SecurityDefaults, z.NewSuperFlagHelp(worker.SecurityDefaults). diff --git a/dgraph/cmd/zero/run.go b/dgraph/cmd/zero/run.go index bb1eb753e5d..48a82b46544 100644 --- a/dgraph/cmd/zero/run.go +++ b/dgraph/cmd/zero/run.go @@ -106,8 +106,9 @@ instances to achieve high-availability. `Make this Zero a "learner" node. In learner mode, this Zero will not participate `+ "in Raft elections. This can be used to achieve a read-only replica."). Flag("election-tick", - "Number of ticks (each 100ms) before a follower starts an election. 
"+ - "Default 20 means 2s election timeout. Increase in high-latency networks."). + "Number of Raft ticks before a follower starts an election. Each production tick is "+ + "100ms and Raft randomizes the timeout between N and 2N-1 ticks. Default 20 "+ + "means ~2s-4s; values below 10 may cause spurious elections under jitter."). String()) flag.String("audit", worker.AuditDefaults, z.NewSuperFlagHelp(worker.AuditDefaults). From 7071d3b892d5cacd8d9d32d49623bee0f0bc7600 Mon Sep 17 00:00:00 2001 From: "lakshimiraman.s" Date: Mon, 15 Jun 2026 12:58:54 +0530 Subject: [PATCH 5/5] revert: labeler workflow changes Reverting scope-creep changes to .github/workflows/labeler.yml. Will open a separate PR with rationale if needed. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/workflows/labeler.yml | 4 +--- conn/node_test.go | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 4add255e278..7c1d9e41dcb 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -1,7 +1,7 @@ name: labeler on: - pull_request_target: + pull_request: types: - opened - reopened @@ -14,14 +14,12 @@ on: permissions: contents: read pull-requests: write - issues: write jobs: label: permissions: contents: read pull-requests: write - issues: write runs-on: blacksmith-4vcpu-ubuntu-2404 steps: - uses: actions/checkout@v5 diff --git a/conn/node_test.go b/conn/node_test.go index d1dfdc3d281..bcc9e1cec0a 100644 --- a/conn/node_test.go +++ b/conn/node_test.go @@ -74,10 +74,10 @@ func TestProposal(t *testing.T) { func TestNormalizeElectionTick(t *testing.T) { tests := []struct { - name string + name string electionTick int - wantTick int - wantWarning string + wantTick int + wantWarning string }{ { name: "zero defaults silently",