Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 42 additions & 3 deletions conn/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,34 @@ var (
ErrNoNode = errors.Errorf("No node has been set up yet")
)

const (
heartbeatTick = 1
defaultElectionTick = 20
recommendedElectionTick = 10
)

func normalizeElectionTick(electionTick int) (tick int, warning string) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The election-tick=1 handling in normalizeElectionTick is inconsistent with how the function treats every other invalid value, and I'd pull it into line.

Right now a negative tick coerces to the default 20 and logs a warning, but electionTick == 1 calls glog.Fatalf and takes the process down. Those are both equally-invalid inputs, so it's odd that -100 gets a friendly default while a single fat-fingered 1 is fatal. An operator who typos election-tick=1 shouldn't lose the node over it.

The Fatalf also blocks the test. The table covers 0, -1, -100, 2, 10, 20, 100 but skips 1, and it has to: exercising that path would os.Exit the test binary. So the one genuinely-fatal branch is the one branch with no coverage.

The intent behind it is right. Failing with a clear message beats letting etcd/raft panic with something cryptic. But coercing 1 to the default and warning, exactly like the negative case, gets you that clear message while keeping the behavior consistent, fully unit-testable, and crash-free on a typo.

Suggest dropping the glog.Fatalf branch and folding 1 into the same coerce-and-warn path as the negatives.

if electionTick < 0 {
return defaultElectionTick, fmt.Sprintf(
"--raft election-tick=%d is invalid; defaulting to %d. Use 0 or omit the flag to accept the default.",
electionTick, defaultElectionTick)
}
if electionTick == 0 {
return defaultElectionTick, ""
}
if electionTick <= heartbeatTick {
glog.Fatalf("invalid --raft election-tick=%d: must be greater than internal heartbeat tick (%d).",
electionTick, heartbeatTick)
}
if electionTick < recommendedElectionTick {
return electionTick, fmt.Sprintf(
"--raft election-tick=%d gives a %dms minimum election timeout. Values below %d (1s) "+
"may cause spurious leader elections under GC pauses or network jitter.",
electionTick, electionTick*100, recommendedElectionTick)
}
return electionTick, ""
}

// Node represents a node participating in the RAFT protocol.
type Node struct {
x.SafeMutex
Expand Down Expand Up @@ -79,7 +107,18 @@ type Node struct {
}

// NewNode returns a new Node instance.
func NewNode(rc *pb.RaftContext, store *raftwal.DiskStorage, tlsConfig *tls.Config) *Node {
// electionTick controls how many Raft Tick() calls pass before election timeout.
// In production Alpha/Zero, Tick() runs every 100ms and Raft randomizes the timeout.
// If electionTick <= 0, defaults to 20.
func NewNode(rc *pb.RaftContext, store *raftwal.DiskStorage, tlsConfig *tls.Config,
electionTick int) *Node {

var warning string
electionTick, warning = normalizeElectionTick(electionTick)
if warning != "" {
glog.Warningf(warning)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The formatted version is not needed (and should be failing govet pass)... please just use glog.Warning

}

snap, err := store.Snapshot()
x.Check(err)

Expand All @@ -90,8 +129,8 @@ func NewNode(rc *pb.RaftContext, store *raftwal.DiskStorage, tlsConfig *tls.Conf
Store: store,
Cfg: &raft.Config{
ID: rc.Id,
ElectionTick: 20, // 2s if we call Tick() every 100 ms.
HeartbeatTick: 1, // 100ms if we call Tick() every 100 ms.
ElectionTick: electionTick,
HeartbeatTick: heartbeatTick, // 100ms if we call Tick() every 100 ms.
Storage: store,
MaxInflightMsgs: 256,
MaxSizePerMsg: 256 << 10, // 256 KB should allow more batching.
Expand Down
62 changes: 61 additions & 1 deletion conn/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func TestProposal(t *testing.T) {
store := raftwal.Init(dir)

rc := &pb.RaftContext{Id: 1}
n := NewNode(rc, store, nil)
n := NewNode(rc, store, nil, 0)

peers := []raft.Peer{{ID: n.Id}}
n.SetRaft(raft.StartNode(n.Cfg, peers))
Expand All @@ -71,3 +71,63 @@ func TestProposal(t *testing.T) {
}
wg.Wait()
}

func TestNormalizeElectionTick(t *testing.T) {
tests := []struct {
name string
electionTick int
wantTick int
wantWarning string
}{
{
name: "zero defaults silently",
electionTick: 0,
wantTick: 20,
wantWarning: "",
},
{
name: "negative defaults with warning",
electionTick: -1,
wantTick: 20,
wantWarning: "--raft election-tick=-1 is invalid; defaulting to 20. Use 0 or omit the flag to accept the default.",
},
{
name: "large negative defaults with warning",
electionTick: -100,
wantTick: 20,
wantWarning: "--raft election-tick=-100 is invalid; defaulting to 20. Use 0 or omit the flag to accept the default.",
},
{
name: "low valid value warns below recommended",
electionTick: 2,
wantTick: 2,
wantWarning: "--raft election-tick=2 gives a 200ms minimum election timeout. Values below 10 (1s) may cause spurious leader elections under GC pauses or network jitter.",
},
{
name: "recommended minimum no warning",
electionTick: 10,
wantTick: 10,
wantWarning: "",
},
{
name: "default value no warning",
electionTick: 20,
wantTick: 20,
wantWarning: "",
},
{
name: "large value accepted",
electionTick: 100,
wantTick: 100,
wantWarning: "",
},
}

for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
gotTick, gotWarning := normalizeElectionTick(tc.electionTick)
require.Equal(t, tc.wantTick, gotTick)
require.Equal(t, tc.wantWarning, gotWarning)
})
}
}
4 changes: 4 additions & 0 deletions dgraph/cmd/alpha/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,10 @@ they form a Raft group and provide synchronous replication.
"to 0 to disable duration based snapshot.").
Flag("pending-proposals",
"Number of pending mutation proposals. Useful for rate limiting.").
Flag("election-tick",
"Number of Raft ticks before a follower starts an election. Each production tick is "+
"100ms and Raft randomizes the timeout between N and 2N-1 ticks. Default 20 "+
"means ~2s-4s; values below 10 may cause spurious elections under jitter.").
String())

flag.String("security", worker.SecurityDefaults, z.NewSuperFlagHelp(worker.SecurityDefaults).
Expand Down
2 changes: 1 addition & 1 deletion dgraph/cmd/zero/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ import (
)

const (
raftDefaults = "idx=1; learner=false;"
raftDefaults = "idx=1; learner=false; election-tick=20;"
)

var proposalKey uint64
Expand Down
7 changes: 6 additions & 1 deletion dgraph/cmd/zero/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ instances to achieve high-availability.
Flag("learner",
`Make this Zero a "learner" node. In learner mode, this Zero will not participate `+
"in Raft elections. This can be used to achieve a read-only replica.").
Flag("election-tick",
"Number of Raft ticks before a follower starts an election. Each production tick is "+
"100ms and Raft randomizes the timeout between N and 2N-1 ticks. Default 20 "+
"means ~2s-4s; values below 10 may cause spurious elections under jitter.").
String())

flag.String("audit", worker.AuditDefaults, z.NewSuperFlagHelp(worker.AuditDefaults).
Expand Down Expand Up @@ -160,7 +164,8 @@ func (st *state) serveGRPC(l net.Listener, store *raftwal.DiskStorage) {
Group: 0,
IsLearner: opts.raft.GetBool("learner"),
}
m := conn.NewNode(&rc, store, opts.tlsClientConfig)
electionTick := opts.raft.GetInt64("election-tick")
m := conn.NewNode(&rc, store, opts.tlsClientConfig, int(electionTick))

// Zero followers should not be forwarding proposals to the leader, to avoid txn commits which
// were calculated in a previous Zero leader.
Expand Down
3 changes: 2 additions & 1 deletion worker/draft.go
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,8 @@ func newNode(store *raftwal.DiskStorage, gid uint32, id uint64, myAddr string) *
IsLearner: isLearner,
}
glog.Infof("RaftContext: %+v\n", rc)
m := conn.NewNode(rc, store, x.WorkerConfig.TLSClientConfig)
electionTick := x.WorkerConfig.Raft.GetInt64("election-tick")
m := conn.NewNode(rc, store, x.WorkerConfig.TLSClientConfig, int(electionTick))

n := &node{
Node: m,
Expand Down
2 changes: 1 addition & 1 deletion worker/server_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ const (
AuditDefaults = `compress=false; days=10; size=100; dir=; output=; encrypt-file=;`
BadgerDefaults = `compression=snappy; numgoroutines=8;`
RaftDefaults = `learner=false; snapshot-after-entries=10000; ` +
`snapshot-after-duration=30m; pending-proposals=256; idx=; group=;`
`snapshot-after-duration=30m; pending-proposals=256; idx=; group=; election-tick=20;`
SecurityDefaults = `token=; whitelist=;`
CDCDefaults = `file=; kafka=; sasl_user=; sasl_password=; ca_cert=; client_cert=; ` +
`client_key=; sasl-mechanism=PLAIN; tls=false;`
Expand Down
Loading