From 744e08cbcd6ddf90e4120be585d46c65e67c1761 Mon Sep 17 00:00:00 2001 From: auricom <27022259+auricom@users.noreply.github.com> Date: Tue, 28 Apr 2026 10:47:49 +0200 Subject: [PATCH 1/7] docs: ev-node high availability --- docs/.vitepress/config.ts | 18 ++ docs/guides/ha/cluster-setup.md | 388 +++++++++++++++++++++++++++++++ docs/guides/ha/overview.md | 369 ++++++++++++++++++++++++++++++ docs/guides/ha/single-to-ha.md | 394 ++++++++++++++++++++++++++++++++ 4 files changed, 1169 insertions(+) create mode 100644 docs/guides/ha/cluster-setup.md create mode 100644 docs/guides/ha/overview.md create mode 100644 docs/guides/ha/single-to-ha.md diff --git a/docs/.vitepress/config.ts b/docs/.vitepress/config.ts index 0cfdf5c7ae..abb2a4824e 100644 --- a/docs/.vitepress/config.ts +++ b/docs/.vitepress/config.ts @@ -273,6 +273,24 @@ function sidebarHome() { }, ], }, + { + text: "High Availability", + collapsed: true, + items: [ + { + text: "Overview & Configuration", + link: "/guides/ha/overview", + }, + { + text: "Bootstrap a 5-Node Cluster", + link: "/guides/ha/cluster-setup", + }, + { + text: "Migrate Single → HA", + link: "/guides/ha/single-to-ha", + }, + ], + }, { text: "Run a Full Node", link: "/guides/full-node", diff --git a/docs/guides/ha/cluster-setup.md b/docs/guides/ha/cluster-setup.md new file mode 100644 index 0000000000..84deaa8794 --- /dev/null +++ b/docs/guides/ha/cluster-setup.md @@ -0,0 +1,388 @@ +# Bootstrap a 5-Node HA Cluster from Scratch + +This tutorial walks you through setting up a production-ready 5-node ev-node Raft cluster from zero. By the end you will have five sequencer nodes that automatically elect a leader, replicate block state, and survive individual node failures. 
+ +## Prerequisites + +- Five machines (VMs, bare metal, or containers) with: + - Network connectivity to each other on the Raft port (we use `5001`) and P2P port (`26656`) + - Persistent storage for the Raft data directory + - A working ev-node binary (see the [quickstart guide](../quick-start.md)) +- A private network, VPN, or encrypted mesh between all nodes (Raft transport is plain TCP — never expose the Raft port publicly) +- A shared genesis file for your chain (see [Create Genesis](../create-genesis.md)) +- A signer key on each node (all nodes must share the same signing identity so block hashes match regardless of which node is the current leader) + +### Node addresses used in this guide + +| Node | Private IP | Raft address | P2P port | +|------|------------|--------------|----------| +| node-1 | 10.0.0.1 | 10.0.0.1:5001 | 26656 | +| node-2 | 10.0.0.2 | 10.0.0.2:5001 | 26656 | +| node-3 | 10.0.0.3 | 10.0.0.3:5001 | 26656 | +| node-4 | 10.0.0.4 | 10.0.0.4:5001 | 26656 | +| node-5 | 10.0.0.5 | 10.0.0.5:5001 | 26656 | + +Replace these with your actual IP addresses throughout the guide. + +P2P peers use the libp2p multiaddr format, which includes each node's peer ID: + +``` +/ip4//tcp//p2p/ +``` + +You will collect peer IDs in Step 3 after initializing each node. + +--- + +## Step 1: Measure Network RTT + +The Raft timing parameters must be sized for your network. Run the following from each node to every other node and note the highest average RTT you observe: + +```bash +# Example: from node-1, ping all peers +for ip in 10.0.0.2 10.0.0.3 10.0.0.4 10.0.0.5; do + echo -n "$ip: " + ping -c 20 $ip | tail -1 | awk -F'/' '{print $5 "ms avg"}' +done +``` + +Repeat from each node. Take the single highest value across all measurements — this is your `RTT_MAX`. + +For nodes within the same region or data center, `RTT_MAX` is typically 5–30ms. For the configuration file below we assume `RTT_MAX ≤ 25ms`. 
If your measurement is higher, adjust the timing parameters using the formulas in the [configuration reference](./overview.md#timing-parameters). + +--- + +## Step 2: Verify Network Connectivity + +Confirm that the Raft port and P2P port are reachable between nodes before starting anything: + +```bash +# From node-2, verify node-1's Raft port is reachable +nc -zv 10.0.0.1 5001 + +# From node-2, verify node-1's P2P port is reachable +nc -zv 10.0.0.1 26656 +``` + +Do this for every node pair in both directions. If any check fails, fix your firewall rules before proceeding. + +--- + +## Step 3: Initialize Each Node + +Run this on every node. Each node gets its own home directory where the config, keys, and data live. + +```bash +# Run on every node (the binary name depends on your chain) +./evm init --evnode.node.aggregator=true --evnode.signer.passphrase +``` + +This creates the home directory structure (default `~/.evm`) with a `config/evnode.yaml` file and generates the signer key. + +After initializing each node, retrieve its peer ID — you will need all five when writing the configuration in Step 5: + +```bash +# Run on each node after init +./evm net-info +``` + +Note the `peer_id` value from each node's output. It looks like `12D3KooW...`. You will need all five peer IDs before writing the configuration files. + +> **Shared signer key:** All cluster nodes must sign blocks with the same key so that block hashes produced by any leader are identical. 
Copy the key material from node-1 to all other nodes after initialization: +> +> ```bash +> # On node-1: locate the signer key +> ls ~/.evm/config/ +> +> # Secure-copy it to each peer +> scp ~/.evm/config/signer.json user@10.0.0.2:~/.evm/config/ +> scp ~/.evm/config/signer.json user@10.0.0.3:~/.evm/config/ +> scp ~/.evm/config/signer.json user@10.0.0.4:~/.evm/config/ +> scp ~/.evm/config/signer.json user@10.0.0.5:~/.evm/config/ +> ``` + +--- + +## Step 4: Distribute the Genesis File + +Every node must start with the same genesis file. Create it on node-1 (see [Create Genesis](../create-genesis.md)) then copy it to all peers: + +```bash +scp ~/.evm/config/genesis.json user@10.0.0.2:~/.evm/config/ +scp ~/.evm/config/genesis.json user@10.0.0.3:~/.evm/config/ +scp ~/.evm/config/genesis.json user@10.0.0.4:~/.evm/config/ +scp ~/.evm/config/genesis.json user@10.0.0.5:~/.evm/config/ +``` + +--- + +## Step 5: Write the Configuration Files + +Write the following `evnode.yaml` on each node. The only field that differs per node is `raft.node_id` and `raft.raft_addr` — everything else is identical. + +### node-1 (`~/.evm/config/evnode.yaml`) + +```yaml +node: + aggregator: true + block_time: "1s" + +raft: + enable: true + node_id: "node-1" + raft_addr: "0.0.0.0:5001" + raft_dir: "/var/lib/ev-node/raft" + peers: "node-1@10.0.0.1:5001,node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" + + # Timing — tuned for RTT_MAX ≤ 25ms + heartbeat_timeout: "92ms" + election_timeout: "368ms" + leader_lease_timeout: "46ms" + send_timeout: "50ms" + + # Log retention — covers ~5 hours of absence at 1 block/s + trailing_logs: 18000 + snapshot_threshold: 5000 + snap_count: 3 + +p2p: + listen_address: "/ip4/0.0.0.0/tcp/26656" + peers: "/ip4/10.0.0.2/tcp/26656/p2p/,/ip4/10.0.0.3/tcp/26656/p2p/,/ip4/10.0.0.4/tcp/26656/p2p/,/ip4/10.0.0.5/tcp/26656/p2p/" +``` + +### node-2 (change only `node_id` and P2P peers) + +```yaml +# ... 
same as node-1 except: +raft: + node_id: "node-2" + raft_addr: "0.0.0.0:5001" + # peers list is identical + +p2p: + peers: "/ip4/10.0.0.1/tcp/26656/p2p/,/ip4/10.0.0.3/tcp/26656/p2p/,/ip4/10.0.0.4/tcp/26656/p2p/,/ip4/10.0.0.5/tcp/26656/p2p/" +``` + +Repeat for node-3 through node-5, updating `node_id` and the P2P peers list (exclude the local node from its own P2P peers). + +--- + +## Step 6: Create the Raft Data Directories + +```bash +# Run on every node +sudo mkdir -p /var/lib/ev-node/raft +sudo chown $(whoami) /var/lib/ev-node/raft +``` + +For Docker deployments, this is handled by the named volume — skip this step. + +--- + +## Step 7: Start All Nodes Simultaneously + +Raft requires a majority of configured peers to be online before it can elect a leader. For a 5-node cluster, you need at least 3 nodes to be up before a leader can be elected and blocks can be produced. + +Start all five nodes as close together as possible. The order does not matter but they should all be up within a few seconds of each other. 
+ +```bash +# Run this on each node, substituting the correct binary name and flags for your chain + +./evm start \ + --evnode.node.aggregator=true \ + --evnode.raft.enable=true \ + --evnode.raft.node_id="node-1" \ + --evnode.raft.raft_addr="0.0.0.0:5001" \ + --evnode.raft.raft_dir="/var/lib/ev-node/raft" \ + --evnode.raft.peers="node-1@10.0.0.1:5001,node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" \ + --evnode.raft.heartbeat_timeout="92ms" \ + --evnode.raft.election_timeout="368ms" \ + --evnode.raft.leader_lease_timeout="46ms" \ + --evnode.raft.send_timeout="50ms" \ + --evnode.raft.trailing_logs=18000 \ + --evnode.raft.snapshot_threshold=5000 \ + --evnode.p2p.listen_address="/ip4/0.0.0.0/tcp/26656" \ + --evnode.p2p.peers="/ip4/10.0.0.2/tcp/26656/p2p/,/ip4/10.0.0.3/tcp/26656/p2p/,/ip4/10.0.0.4/tcp/26656/p2p/,/ip4/10.0.0.5/tcp/26656/p2p/" \ + --evnode.signer.passphrase= \ + --evm.jwt-secret=$(cat /path/to/jwt.hex) \ + --evm.genesis-hash= +``` + +Adjust flags for your execution layer (e.g., remove EVM flags if you are running a Cosmos SDK chain). + +--- + +## Step 8: Verify the Cluster Is Healthy + +### Watch the logs + +Within a few seconds of starting, you should see one node win the election: + +``` +INF raft: entering candidate state node=node-1 +INF raft: election won tally=3 +INF raft: entering leader state leader=node-1 +INF block produced height=1 hash=0xabc... +``` + +The other nodes will log: + +``` +INF raft: entering follower state leader=node-1 +INF block applied from raft log height=1 hash=0xabc... +``` + +### Check block production + +Query the RPC endpoint of any node to confirm blocks are being produced: + +```bash +curl http://10.0.0.1:26657/status | jq '.result.sync_info.latest_block_height' +``` + +Wait a few seconds and check again — the height should be increasing. 
+ +### Verify all nodes are synced + +Query each node; all five should report the same `latest_block_height` (or within 1–2 blocks of each other): + +```bash +for ip in 10.0.0.1 10.0.0.2 10.0.0.3 10.0.0.4 10.0.0.5; do + echo -n "$ip: height=" + curl -s http://$ip:26657/status | jq -r '.result.sync_info.latest_block_height' +done +``` + +--- + +## Step 9: Test Failover + +With all five nodes running and producing blocks, simulate a leader failure: + +```bash +# Identify the current leader from its logs, then on that machine: +kill -SIGTERM $(pgrep evm) +``` + +Within `election_timeout` (368ms in this configuration), the remaining four nodes will elect a new leader and resume block production. Measure the actual gap in your logs: + +```bash +# Look for the last block before the kill and first block after +grep "block produced\|block applied" /var/log/ev-node/node-2.log | tail -20 +``` + +The gap should be well under 1 second in most cases (a few election cycles at most). + +--- + +## Running as a Systemd Service + +For production, manage each node with systemd. 
+ +```ini +# /etc/systemd/system/ev-node.service +[Unit] +Description=ev-node HA sequencer +After=network-online.target +Wants=network-online.target + +[Service] +User=ev-node +ExecStart=/usr/local/bin/evm start \ + --evnode.node.aggregator=true \ + --evnode.raft.enable=true \ + --evnode.raft.node_id=node-1 \ + --evnode.raft.raft_addr=0.0.0.0:5001 \ + --evnode.raft.raft_dir=/var/lib/ev-node/raft \ + --evnode.raft.peers=node-1@10.0.0.1:5001,node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001 \ + --evnode.raft.heartbeat_timeout=92ms \ + --evnode.raft.election_timeout=368ms \ + --evnode.raft.leader_lease_timeout=46ms \ + --evnode.raft.send_timeout=50ms \ + --evnode.raft.trailing_logs=18000 \ + --evnode.raft.snapshot_threshold=5000 \ + --evnode.p2p.listen_address=/ip4/0.0.0.0/tcp/26656 \ + --evnode.p2p.peers=/ip4/10.0.0.2/tcp/26656/p2p/,/ip4/10.0.0.3/tcp/26656/p2p/,/ip4/10.0.0.4/tcp/26656/p2p/,/ip4/10.0.0.5/tcp/26656/p2p/ \ + --evnode.signer.passphrase= +Restart=on-failure +RestartSec=5s + +# Give the process time to transfer leadership before systemd kills it +TimeoutStopSec=30 + +[Install] +WantedBy=multi-user.target +``` + +```bash +sudo systemctl daemon-reload +sudo systemctl enable ev-node +sudo systemctl start ev-node +sudo journalctl -u ev-node -f +``` + +`TimeoutStopSec=30` gives the node enough time to perform a graceful leadership transfer on `SIGTERM` before systemd sends `SIGKILL`. Do not set this too short. + +--- + +## Performing a Rolling Restart + +To restart nodes without taking the cluster offline (e.g., for a config change or binary upgrade): + +1. Restart one non-leader node at a time and wait for it to rejoin before touching the next. +2. For the leader node, restart it last. `ev-node` will transfer leadership to a peer before shutting down. 
+ +```bash +# Restart non-leader nodes first +ssh user@10.0.0.2 "sudo systemctl restart ev-node" +# Wait ~30 seconds for node-2 to rejoin and sync +ssh user@10.0.0.3 "sudo systemctl restart ev-node" +# ...wait... +ssh user@10.0.0.4 "sudo systemctl restart ev-node" +# ...wait... +ssh user@10.0.0.5 "sudo systemctl restart ev-node" +# ...wait... +# Restart the leader last +ssh user@10.0.0.1 "sudo systemctl restart ev-node" +``` + +Verify each node is back in the cluster (check logs for `entering follower state` or `entering leader state`) before proceeding to the next. + +--- + +## Troubleshooting + +### Cluster does not elect a leader + +Check that: +- At least 3 out of 5 nodes are running and can reach each other on port 5001. +- The `peers` list on every node is identical and all addresses are correct. +- No firewall rule is blocking TCP on port 5001. + +```bash +# Quick connectivity check from node-2 to node-1 Raft port +nc -zv 10.0.0.1 5001 +``` + +### Node panics on startup with "state divergence" + +This means the node's local block store is ahead of or behind the Raft consensus state in a way that cannot be reconciled automatically. This typically happens when a node's `raft_dir` was wiped but the block database was not (or vice versa). + +Stop the node, wipe both `raft_dir` and the node's block data directory, then restart. The node will receive a Raft snapshot and rebuild from there. + +### Spurious elections / leadership flapping + +Symptoms: frequent `election won` and `entering follower state` lines in the logs, block production pausing briefly every few seconds. + +Causes: +- `heartbeat_timeout` is too short for your network RTT — increase it. +- Network congestion or packet loss between nodes. +- Node CPU is saturated and cannot process heartbeats in time. 
+ +As a quick diagnostic, check the RTT between nodes while the cluster is running: + +```bash +ping -c 100 10.0.0.2 | tail -5 +``` + +If the max RTT is close to or above `heartbeat_timeout`, increase `heartbeat_timeout` and `election_timeout` proportionally. diff --git a/docs/guides/ha/overview.md b/docs/guides/ha/overview.md new file mode 100644 index 0000000000..b3b0ca7bda --- /dev/null +++ b/docs/guides/ha/overview.md @@ -0,0 +1,369 @@ +# High Availability Sequencer + +ev-node supports running your sequencer in a **High Availability (HA)** cluster using the [Raft consensus algorithm](https://raft.github.io/). Instead of a single aggregator node that is a point of failure, multiple nodes form a cluster that automatically elects a leader and recovers from individual node failures without manual intervention and without halting block production. + +## Why Raft HA + +A single sequencer node means that if the machine crashes, loses power, or needs maintenance, your chain stops producing blocks until the node is back online. With a Raft cluster: + +- **Automatic failover** — when the active leader fails, remaining nodes elect a new leader within seconds. +- **No double-signing** — the Raft log guarantees at most one leader at a time and synchronizes block state across all nodes before any block is committed. +- **Graceful restarts** — before shutting down, the leader transfers leadership to a healthy peer so downtime is measured in milliseconds. +- **Fault tolerance** — a 5-node cluster keeps producing blocks as long as at least 3 nodes are reachable; it can absorb 2 simultaneous failures. + +## How It Works + +Each node in the cluster runs `ev-node` in aggregator mode with Raft enabled. The nodes communicate over a private TCP transport to: + +1. **Elect a leader** — using Raft leader election. Only the elected leader produces blocks. +2. 
**Replicate state** — every block the leader produces is appended to the Raft log and replicated to all followers before it is considered committed. +3. **Apply to FSM** — each node applies committed log entries to its Finite State Machine (FSM), which tracks the latest committed block height, hash, and timestamp. +4. **Detect failure** — followers watch for heartbeats from the leader. If heartbeats stop arriving within the election timeout, a follower starts a new election. +5. **Catch up** — a node that was offline rejoins by receiving a Raft snapshot (fast-forward to the current head) and then fetching any missing historical blocks from peers via P2P. + +### Storage + +Raft state is stored in the directory specified by `raft.raft_dir`: + +| File | Purpose | +|------|---------| +| `raft-log.db` | Raft log entries (BoltDB) | +| `raft-stable.db` | Current term and vote state (BoltDB) | +| `*.snp` | Snapshots of the FSM state | + +These files represent the node's **cluster identity**. They must live on persistent storage — loss of this directory is equivalent to removing the node from the cluster. + +## Cluster Sizing + +Always run an **odd number** of nodes. Raft requires a majority (quorum) to elect a leader and commit entries. + +| Nodes | Quorum | Tolerated failures | +|-------|--------|--------------------| +| 3 | 2 | 1 | +| **5** | **3** | **2** | +| 7 | 4 | 3 | + +**5 nodes is the recommended production configuration.** It tolerates two simultaneous node failures — enough to absorb a rolling upgrade plus an unexpected crash at the same time — while keeping the cluster size manageable. + +## Network Requirements + +Raft transport is **plain TCP** with no built-in encryption. Before deploying: + +- Run all nodes inside a **private network, VPN, or encrypted mesh** (WireGuard, Tailscale, AWS VPC, etc.). 
+- **Never expose the Raft port to the public internet.** An attacker with access to the Raft port can send forged messages that disrupt or hijack cluster consensus. +- Ensure low-latency connectivity between nodes. Timeouts must be sized larger than the worst-case round-trip time (RTT) between any two nodes in the cluster. + +--- + +## Configuration Reference + +Raft is configured under the `raft` section of `evnode.yaml`, or via `--evnode.raft.*` CLI flags. + +### Required Parameters + +These must be set on every node for the cluster to form. + +#### `raft.enable` + +```yaml +raft: + enable: true +``` + +**CLI:** `--evnode.raft.enable` +**Default:** `false` + +Enables Raft consensus. Must be `true` on every cluster member. When disabled (the default), the node runs as a traditional single sequencer. Setting this to `true` also requires `node.aggregator: true`. + +--- + +#### `raft.node_id` + +```yaml +raft: + node_id: "node-1" +``` + +**CLI:** `--evnode.raft.node_id` +**Default:** _(none, required)_ + +A string that uniquely identifies this node within the cluster. Every node must have a different `node_id`. The ID is stored in the Raft log and used by other nodes to route messages — **never change it after the cluster is bootstrapped**, as doing so will break the cluster membership records. + +Convention: use stable, descriptive names like `node-1`, `node-2`, … `node-5` or names tied to the host (`sequencer-us-east-1`, `sequencer-eu-east-2`). + +--- + +#### `raft.raft_addr` + +```yaml +raft: + raft_addr: "0.0.0.0:5001" +``` + +**CLI:** `--evnode.raft.raft_addr` +**Default:** _(none, required)_ + +The TCP address this node listens on for Raft transport messages from other cluster members. The `0.0.0.0` bind address accepts connections on all interfaces; bind to a specific private IP if you want to restrict which interface is used for cluster traffic. + +The port (here `5001`) must be reachable from every other node in the cluster. 
+ +The address you advertise in `raft.peers` must resolve to this port from the perspective of other nodes. If you bind to `0.0.0.0` internally, advertise the node's actual private IP in the peers list. + +--- + +#### `raft.raft_dir` + +```yaml +raft: + raft_dir: "/var/lib/ev-node/raft" +``` + +**CLI:** `--evnode.raft.raft_dir` +**Default:** `/raft` + +The directory where Raft stores its persistent state: log database, stable store, and snapshots. This directory **must be on persistent storage** (not tmpfs, not ephemeral container storage). Losing this directory means the node loses its cluster identity — it cannot rejoin without being reconfigured as a new member. + +For Docker deployments, mount this as a named volume. For bare-metal or systemd services, ensure the directory survives reboots. + +--- + +#### `raft.peers` + +```yaml +raft: + peers: "node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" +``` + +**CLI:** `--evnode.raft.peers` +**Default:** _(none, required)_ + +A comma-separated list of **all** remote cluster members, in the format `nodeID@host:port`. The host and port must be the Raft address (`raft_addr`) of each peer as reachable from this node. + + Raft uses this list to: +- Bootstrap the cluster on first start (when no persisted state exists). +- Know which addresses to dial when sending log entries or heartbeats. + +--- + +### Timing Parameters + +These parameters control how quickly the cluster detects failures and elects a new leader. They must be sized relative to the **maximum round-trip time (RTT) between any two nodes** in the cluster. Too tight and the cluster experiences spurious leader changes; too loose and failover takes longer than necessary. + +**To measure your network RTT:** + +```bash +# Run from each node to every other node; note the maximum result +ping -c 20 | tail -1 +``` + +Take the maximum average RTT across all pairs — this is your `RTT_MAX`. 
+ +#### `raft.heartbeat_timeout` + +```yaml +raft: + heartbeat_timeout: "92ms" +``` + +**CLI:** `--evnode.raft.heartbeat_timeout` +**Default:** `350ms` + +How often the leader sends heartbeat messages to followers. Followers that do not receive a heartbeat within this interval begin a new election. + +**Tuning rule:** Set to **4–5× RTT_MAX**. This ensures followers can distinguish a slow network from a dead leader without triggering spurious elections. + +- Too low (< 2× RTT_MAX): followers time out due to normal network jitter and start unnecessary elections, causing leadership flapping and brief block production pauses. +- Too high: failover takes longer; the cluster is slower to react to a leader crash. + +| RTT_MAX | Recommended heartbeat_timeout | +|---------|-------------------------------| +| 10ms | 40–50ms | +| 23ms | 92ms | +| 50ms | 200–250ms | +| 100ms | 400–500ms | + +--- + +#### `raft.election_timeout` + +```yaml +raft: + election_timeout: "368ms" +``` + +**CLI:** `--evnode.raft.election_timeout` +**Default:** `1000ms` + +How long a follower waits without receiving a heartbeat before it concludes the leader is dead and starts a new election. Must be greater than or equal to `heartbeat_timeout`. + +**Tuning rule:** Set to **4× heartbeat_timeout** (or approximately 16–20× RTT_MAX). The factor of 4 gives the leader several missed heartbeat opportunities before a follower acts — enough to ride out transient packet loss without triggering unnecessary elections. + +A larger election timeout means a slower reaction to leader failure (failover takes longer). A smaller election timeout risks false positives: the cluster starts an election while the leader is merely experiencing a brief network delay, causing a term increment and a short pause in block production. 
+ +--- + +#### `raft.leader_lease_timeout` + +```yaml +raft: + leader_lease_timeout: "46ms" +``` + +**CLI:** `--evnode.raft.leader_lease_timeout` +**Default:** `175ms` + +The duration for which a leader considers its leadership valid after the last successful heartbeat acknowledgment. Leader lease enables local reads from the leader without a round-trip to quorum. + +**Tuning rule:** Set to approximately **half of `heartbeat_timeout`** (i.e., ~2× RTT_MAX), and always **strictly less than `election_timeout`**. If `leader_lease_timeout` is close to or exceeds `election_timeout`, a node may believe it is still the leader after followers have already elected a replacement, which can cause split-brain reads. + +--- + +#### `raft.send_timeout` + +```yaml +raft: + send_timeout: "50ms" +``` + +**CLI:** `--evnode.raft.send_timeout` +**Default:** `200ms` + +The maximum time the leader waits for a single message (log entry, heartbeat) to be delivered to a peer before marking the delivery as failed. A failed send is retried, but repeated failures trigger follower health tracking. + +**Tuning rule:** Set to **2–3× RTT_MAX**. This allows for normal network latency plus one retransmission before giving up on a delivery attempt. + +--- + +### Snapshot and Log Retention Parameters + +These parameters control how frequently Raft snapshots the FSM state and how many log entries are kept around after a snapshot. They affect both disk usage and how quickly a lagging node can catch up. + +#### `raft.snapshot_threshold` + +```yaml +raft: + snapshot_threshold: 5000 +``` + +**CLI:** `--evnode.raft.snapshot_threshold` +**Default:** `500` + +The number of committed log entries that must accumulate before Raft automatically takes a snapshot of the FSM state. After a snapshot, log entries older than the snapshot are compacted away. + +**Effect on operations:** +- **Lower values** (e.g., `500`): snapshots are taken frequently, keeping the log small. 
A restarting node receives a recent snapshot and has fewer log entries to replay, but snapshot writes happen more often, adding brief I/O bursts. +- **Higher values** (e.g., `5000`): less frequent snapshots mean less I/O overhead during normal operation, but a lagging node may have more log entries to replay when catching up. + +At 1 block/second, `snapshot_threshold: 5000` takes a snapshot roughly every 83 minutes. + +--- + +#### `raft.trailing_logs` + +```yaml +raft: + trailing_logs: 18000 +``` + +**CLI:** `--evnode.raft.trailing_logs` +**Default:** `200` + +The number of log entries to **retain after a snapshot** is taken. These entries act as a catch-up buffer: a node that missed fewer than `trailing_logs` entries since the last snapshot can replay from the log without needing to transfer the full snapshot. + +**Effect on operations:** +- **Lower values** (e.g., `200`): tighter disk usage; a node that misses even a few minutes of operation must receive a full snapshot on rejoin. +- **Higher values** (e.g., `18000`): a lagging node can catch up via log replay for up to 5 hours at 1 block/second without needing a full snapshot transfer, reducing the cost of brief outages. + +Set this high enough to cover your typical maintenance window (restart, upgrade, brief network partition). At 1 block/second, `trailing_logs: 18000` covers 5 hours of absence. + +--- + +#### `raft.snap_count` + +```yaml +raft: + snap_count: 3 +``` + +**CLI:** `--evnode.raft.snap_count` +**Default:** `3` + +The number of snapshot files to retain on disk. Older snapshots are deleted when new ones are created. Keeping 2–3 snapshots provides a rollback option in case the latest snapshot is corrupt. + +--- + +### Recommended Production Configuration + +The following configuration is recommended for a **5-node cluster on a network with RTT_MAX ≤ 25ms** (typical for nodes in the same region). 
It was validated by an extensive sweep of 10 configurations across 150 SIGTERM kill cycles and 50 latency-injection cycles, with zero undetected failures and zero split-brain events recorded. + +```yaml +# evnode.yaml — paste this raft section into every node's config +# Replace node_id, raft_addr, and peers with your actual values. + +node: + aggregator: true + +raft: + enable: true + node_id: "node-1" # unique per node + raft_addr: "0.0.0.0:5001" + raft_dir: "/var/lib/ev-node/raft" # must be persistent + + # Remote peers list — different on every node + peers: >- + node-2@10.0.0.2:5001, + node-3@10.0.0.3:5001, + node-4@10.0.0.4:5001, + node-5@10.0.0.5:5001 + + # Timing — tuned for RTT_MAX ≤ 25ms + heartbeat_timeout: "92ms" + election_timeout: "368ms" + leader_lease_timeout: "46ms" + send_timeout: "50ms" + + # Log retention + trailing_logs: 18000 + snapshot_threshold: 5000 + snap_count: 3 +``` + +**Adapting for different RTT values:** + +Measure RTT_MAX first and scale the timing parameters: + +``` +heartbeat_timeout = RTT_MAX × 4 +election_timeout = heartbeat_timeout × 4 +leader_lease_timeout = heartbeat_timeout / 2 +send_timeout = RTT_MAX × 3 +``` + +--- + +## Interaction with P2P + +Even in a Raft cluster, each node must have P2P configured. Raft handles **hot replication** — it replicates the latest block state to all followers in near real-time. But if a node falls far enough behind that the missing entries have already been compacted out of the Raft log (i.e., it missed more entries than `trailing_logs`), it receives a Raft snapshot to jump to the current head. Historical blocks between the node's last known state and the snapshot are then fetched via the **P2P network or DA layer**. + +```yaml +p2p: + listen_address: "/ip4/0.0.0.0/tcp/7676" + peers: "/ip4//tcp//p2p/,..." +``` + +Ensure P2P ports are open between nodes in addition to the Raft port. 
+ +--- + +## Monitoring + +Track these metrics (available via Prometheus if `metrics.enabled: true`) to catch problems early: + +| Signal | What it means | +|--------|---------------| +| Frequent leadership changes | Network instability, asymmetric packet loss, or overloaded nodes | +| Growing applied-index lag | FSM cannot keep up with commits; check CPU and disk I/O | +| Snapshot transfers | Node fell behind `trailing_logs` entries — check network and disk | +| Election timeouts | Heartbeats are being dropped; check MTU, firewall rules, network congestion | + +See the [Monitoring guide](../operations/monitoring.md) for the full Prometheus metric list. diff --git a/docs/guides/ha/single-to-ha.md b/docs/guides/ha/single-to-ha.md new file mode 100644 index 0000000000..ec8a605acf --- /dev/null +++ b/docs/guides/ha/single-to-ha.md @@ -0,0 +1,394 @@ +# Migrate from Single Sequencer to HA Cluster + +This guide walks through converting a live single-sequencer chain into a 5-node Raft HA cluster with only a brief, planned block-production pause during the cutover window. + +## Overview + +A single sequencer stores its block production state (latest height, hash, and timestamp) only locally. A Raft cluster shares this state across all nodes via the Raft log. To migrate, you: + +1. Prepare four new nodes with the same genesis, signer key, and chain data as the existing node. +2. Reconfigure all five nodes (existing + four new) with Raft enabled. +3. Stop the existing sequencer and start all five nodes together to bootstrap the cluster. + +The chain experiences one planned downtime window — the gap between stopping the single sequencer and the Raft cluster electing its first leader, which takes a few seconds. 
+ +## Before You Start + +### Understand what changes + +| | Single sequencer | Raft cluster | +|-|-----------------|-------------| +| Produces blocks | One node always | Elected leader | +| Block production key | Local to one node | Shared across all nodes | +| Raft data directory | Not used | Required, persistent | +| Config flags | No `raft.*` flags | All `raft.*` flags required | +| Restart behavior | Manual recovery | Automatic leader election | + +### Requirements + +- Five machines that can reach each other on the Raft port (we use `5001`) and P2P port (`26656`) +- A private network or VPN between all nodes (Raft transport is unencrypted) +- The existing sequencer's: + - Binary (`evm` or your chain binary) + - Config file (`evnode.yaml`) + - Signer key + - Genesis file + - Block data directory (optional — peers can sync from DA, but copying saves time) +- A scheduled maintenance window of ~5 minutes + +--- + +## Step 1: Measure Network RTT + +Before writing any config, measure the maximum RTT between your nodes. The Raft timing parameters must be sized for your actual network: + +```bash +# From the existing node, ping each new node +for ip in 10.0.0.2 10.0.0.3 10.0.0.4 10.0.0.5; do + echo -n "$ip: " + ping -c 20 $ip | tail -1 | awk -F'/' '{print $5 "ms avg"}' +done +``` + +For RTT_MAX ≤ 25ms (same-region nodes), use the recommended values in this guide. For higher RTT, adjust using the formulas in the [configuration reference](./overview.md#timing-parameters). + +--- + +## Step 2: Provision the Four New Nodes + +On each of the four new machines, install the same ev-node binary version as the existing sequencer. + +```bash +# Verify the binary version matches on all machines +./evm version +``` + +Initialize each new node's home directory: + +```bash +# On each new node — do NOT pass --evnode.node.aggregator here yet +./evm init +``` + +--- + +## Step 3: Copy the Signer Key to All New Nodes + +All five nodes must sign blocks with the **same key**. 
The existing sequencer's key is the one to use — do not generate new keys on the new nodes. + +```bash +# On the existing sequencer (node-1) +# Locate the signer key (exact filename depends on your chain) +ls ~/.evm/config/ + +# Copy to each new node +scp ~/.evm/config/priv_validator_key.json user@10.0.0.2:~/.evm/config/ +scp ~/.evm/config/priv_validator_key.json user@10.0.0.3:~/.evm/config/ +scp ~/.evm/config/priv_validator_key.json user@10.0.0.4:~/.evm/config/ +scp ~/.evm/config/priv_validator_key.json user@10.0.0.5:~/.evm/config/ +``` + +--- + +## Step 4: Copy the Genesis File + +```bash +scp ~/.evm/config/genesis.json user@10.0.0.2:~/.evm/config/ +scp ~/.evm/config/genesis.json user@10.0.0.3:~/.evm/config/ +scp ~/.evm/config/genesis.json user@10.0.0.4:~/.evm/config/ +scp ~/.evm/config/genesis.json user@10.0.0.5:~/.evm/config/ +``` + +--- + +## Step 5: Copy Block Data to New Nodes + +New nodes can sync their block history from the DA layer or P2P peers after the cluster is running, but copying the existing chain data speeds up the initial sync significantly for long-running chains. + +```bash +# Stop the existing sequencer temporarily to get a consistent snapshot +# (you will start it again in step 9) +systemctl stop ev-node # or kill the process + +# Copy the data directory — adjust the path to your chain +rsync -avz ~/.evm/data/ user@10.0.0.2:~/.evm/data/ +rsync -avz ~/.evm/data/ user@10.0.0.3:~/.evm/data/ +rsync -avz ~/.evm/data/ user@10.0.0.4:~/.evm/data/ +rsync -avz ~/.evm/data/ user@10.0.0.5:~/.evm/data/ +``` + +> If your chain uses an EVM execution layer (ev-reth), copy the execution layer database as well. See the [Reth backup guide](../evm/reth-backup.md) for the correct procedure. + +After the copy, note the **latest block height** — this is your reference point: + +```bash +# Note the height before shutdown +cast block --rpc-url http://: +``` + +--- + +## Step 6: Collect Peer IDs + +Before writing the configuration, collect the peer ID from each node. 
Peer IDs are needed to build the P2P peers list in multiaddr format. + +```bash +# Run on each node +./evm net-info +``` + +Note the `peer_id` value from each node's output — it looks like `12D3KooW...`. You need all five before writing the configuration files. + +--- + +## Step 7: Write the New Configuration on All Five Nodes + +Write the following `evnode.yaml` on every node. The `node_id` is the only field that differs. + +**Existing sequencer becomes node-1.** Assign `node-2` through `node-5` to the four new machines. + +### node-1 (existing sequencer — `~/.evm/config/evnode.yaml`) + +```yaml +node: + aggregator: true + block_time: "1s" # keep your existing block_time + +raft: + enable: true + node_id: "node-1" + raft_addr: "0.0.0.0:5001" + raft_dir: "/var/lib/ev-node/raft" + peers: "node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" + + # Timing — tuned for RTT_MAX ≤ 25ms + heartbeat_timeout: "92ms" + election_timeout: "368ms" + leader_lease_timeout: "46ms" + send_timeout: "50ms" + + # Log retention + trailing_logs: 18000 + snapshot_threshold: 5000 + snap_count: 3 + +p2p: + listen_address: "/ip4/0.0.0.0/tcp/26656" + peers: "/ip4/10.0.0.2/tcp/26656/p2p/,/ip4/10.0.0.3/tcp/26656/p2p/,/ip4/10.0.0.4/tcp/26656/p2p/,/ip4/10.0.0.5/tcp/26656/p2p/" +``` + +### node-2 through node-5 + +Identical except for `node_id`, raft and P2P peers (exclude self from the P2P list): + +```yaml +# node-2 +node: + aggregator: true + block_time: "1s" + +raft: + enable: true + node_id: "node-2" # change per node + raft_addr: "0.0.0.0:5001" + raft_dir: "/var/lib/ev-node/raft" + peers: "node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" + heartbeat_timeout: "92ms" + election_timeout: "368ms" + leader_lease_timeout: "46ms" + send_timeout: "50ms" + trailing_logs: 18000 + snapshot_threshold: 5000 + snap_count: 3 + +p2p: + listen_address: "/ip4/0.0.0.0/tcp/26656" + peers: 
"/ip4/10.0.0.1/tcp/26656/p2p/,/ip4/10.0.0.3/tcp/26656/p2p/,/ip4/10.0.0.4/tcp/26656/p2p/,/ip4/10.0.0.5/tcp/26656/p2p/" +``` + +--- + +## Step 8: Create Raft Data Directories + +Run on every node: + +```bash +sudo mkdir -p /var/lib/ev-node/raft +sudo chown $(whoami) /var/lib/ev-node/raft +``` + +--- + +## Step 9: The Cutover + +This is the planned maintenance window. The chain pauses block production from when you stop the existing sequencer until the new Raft cluster elects its first leader (a few seconds). + +### 9a. Stop the existing single sequencer + +```bash +# On node-1 (existing sequencer) +systemctl stop ev-node # or kill -SIGTERM $(pgrep evm) +``` + +Confirm it has stopped: + +```bash +pgrep evm || echo "stopped" +``` + +### 9b. Start all five nodes simultaneously + +The key requirement here is that all nodes must start within a short window of each other. Raft needs a majority (3 out of 5) online to elect a leader. If you start only 2 nodes and wait, the cluster will not elect a leader until the 3rd node joins. 
+ +Use a coordination mechanism — a simple approach is to open five terminals (or tmux panes) and fire the start commands in quick succession: + +```bash +# On node-1 +./evm start \ + --evnode.node.aggregator=true \ + --evnode.raft.enable=true \ + --evnode.raft.node_id="node-1" \ + --evnode.raft.raft_addr="0.0.0.0:5001" \ + --evnode.raft.raft_dir="/var/lib/ev-node/raft" \ + --evnode.raft.peers="node-1@10.0.0.1:5001,node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" \ + --evnode.raft.heartbeat_timeout="92ms" \ + --evnode.raft.election_timeout="368ms" \ + --evnode.raft.leader_lease_timeout="46ms" \ + --evnode.raft.send_timeout="50ms" \ + --evnode.raft.trailing_logs=18000 \ + --evnode.raft.snapshot_threshold=5000 \ + --evnode.p2p.listen_address="/ip4/0.0.0.0/tcp/26656" \ + --evnode.p2p.peers="/ip4/10.0.0.2/tcp/26656/p2p/,/ip4/10.0.0.3/tcp/26656/p2p/,/ip4/10.0.0.4/tcp/26656/p2p/,/ip4/10.0.0.5/tcp/26656/p2p/" \ + --evnode.signer.passphrase= \ + --evm.jwt-secret=$(cat /path/to/jwt.hex) \ + --evm.genesis-hash= +``` + +```bash +# On node-2 (at the same time, or within a few seconds) +./evm start \ + --evnode.raft.node_id="node-2" \ + # ... same flags, change node_id and p2p.peers +``` + +Repeat for node-3, node-4, node-5. + +--- + +## Step 10: Verify the Migration Succeeded + +### Check leader election + +Within seconds of starting, one node will win the election. Look for: + +``` +INF raft: election won tally=3 leader=node-1 +INF raft: entering leader state +INF block produced height= +``` + +where `N` is the last block produced by the old single sequencer. + +The followers will show: + +``` +INF raft: entering follower state leader=node-1 +INF block applied from raft log height= +``` + +### Verify block height continuity + +The new cluster must continue from exactly where the old sequencer left off. 
Query the RPC: + +```bash +# From the existing sequencer's last known height (noted in step 5) +LAST_HEIGHT=<height-noted-in-step-5> + +# Query node-1 (or any node) +NEW_HEIGHT=$(cast block-number --rpc-url http://<node-ip>:<rpc-port>) + +echo "Last old height: $LAST_HEIGHT" +echo "New cluster height: $NEW_HEIGHT" + +# New height should be LAST_HEIGHT + 1 (or a few blocks ahead if it took a moment) +``` + +### Check all nodes are synced + +```bash +for ip in 10.0.0.1 10.0.0.2 10.0.0.3 10.0.0.4 10.0.0.5; do + echo -n "$ip: height=" + curl -s http://$ip:26657/status | jq -r '.result.sync_info.latest_block_height' +done +``` + +All nodes should be at the same height (within 1–2 blocks of each other). + +--- + +## Step 11: Set Up Systemd on All Nodes + +Once you have confirmed the cluster is healthy, set up systemd for automatic restarts and service management. See the [cluster setup guide](./cluster-setup.md#running-as-a-systemd-service) for a ready-to-use unit file template. + +--- + +## Rollback Plan + +If anything goes wrong during the cutover, you can revert to the single sequencer: + +1. Stop all five nodes. +2. Wipe the Raft data directories (`/var/lib/ev-node/raft`) on all nodes to clear any bootstrapped cluster state. +3. Remove the Raft configuration from node-1's `evnode.yaml` (or revert to the pre-migration config file). +4. Start node-1 with `raft.enable: false` — it resumes as a single sequencer from the block height it was at when you stopped it. + +```bash +# Emergency rollback — revert node-1 to single sequencer +./evm start \ + --evnode.node.aggregator=true \ + --evnode.raft.enable=false \ + --evnode.signer.passphrase= \ + # ... your original flags +``` + +The chain continues from the last block committed before the cutover. No blocks are lost because the single sequencer's data was never modified. 
+ +--- + +## New Nodes Without Existing Chain Data + +If you did not copy the block data in step 5 (or if you are adding nodes long after the chain started), the new nodes will sync historical block data via P2P and the DA layer after joining the cluster. This process runs in the background and does not prevent the cluster from electing a leader or producing new blocks. + +Monitor sync progress on a new node: + +```bash +# The node will log progress as it fetches historical blocks +journalctl -u ev-node -f | grep "sync\|height" +``` + +--- + +## Troubleshooting + +### Node-1 starts but no leader is elected + +The cluster cannot elect a leader without a quorum (3 out of 5 nodes). Ensure all five nodes are running and can reach each other on port 5001. + +### New nodes report height mismatch or divergence panic + +This happens if the block data on the new nodes was copied from a different snapshot than the final state of the old sequencer, or if the copy was done while the old sequencer was still running and produced additional blocks during the copy. + +Wipe the new nodes' block data and Raft directories, re-copy from the stopped node-1, and retry. + +### Block height jumps backward or chain forks + +This should not happen if all five nodes are running the same binary version and have the same genesis file and signer key. If you see it: + +1. Stop all nodes immediately. +2. Identify which node produced the offending block. +3. Check that its genesis hash and signer key match the other nodes. + +### Old single sequencer comes back online accidentally + +If the old sequencer (without Raft) is started again after the new cluster is already producing blocks, it will attempt to produce blocks independently, creating a fork. This is why it is important to disable or remove the old single-sequencer startup scripts immediately after the cutover. 
+ +With Raft enabled on all five nodes, only the elected leader will produce blocks — there is no risk of a sixth "rogue" leader as long as the old machine is not restarted with the old non-Raft configuration. From 9e05891c74d038b647c123241d1ee6bba58c3d23 Mon Sep 17 00:00:00 2001 From: auricom <27022259+auricom@users.noreply.github.com> Date: Tue, 28 Apr 2026 10:57:16 +0200 Subject: [PATCH 2/7] docs: node placement --- docs/guides/ha/overview.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/guides/ha/overview.md b/docs/guides/ha/overview.md index b3b0ca7bda..4e159bff25 100644 --- a/docs/guides/ha/overview.md +++ b/docs/guides/ha/overview.md @@ -53,6 +53,20 @@ Raft transport is **plain TCP** with no built-in encryption. Before deploying: - **Never expose the Raft port to the public internet.** An attacker with access to the Raft port can send forged messages that disrupt or hijack cluster consensus. - Ensure low-latency connectivity between nodes. Timeouts must be sized larger than the worst-case round-trip time (RTT) between any two nodes in the cluster. +### Node Placement + +**Run all nodes in the same region, spread across different availability zones.** + +This is the single most important infrastructure decision for cluster stability. All nodes must have roughly the same RTT to each other. The timing parameters (heartbeat timeout, election timeout) are sized for a single `RTT_MAX` value — if one node has materially higher latency than its peers, it degrades the entire cluster's ability to detect failures and elect leaders reliably. + +Specifically: +- **Same region, different AZs** gives uniform 5–30ms RTT and is the validated production topology. Nodes are isolated from AZ-level failures while keeping latency uniform. +- **Cross-region nodes** introduce higher and asymmetric RTT (100ms+). Even a single high-latency node can destabilize the cluster under network stress. 
+ +This was observed directly in load testing: a 3-node cluster where one node averaged 99ms RTT (2× higher than its peers at 45–49ms) showed election times up to 284 seconds, three undetected leader elections, and one skipped cycle when 200–500ms of additional latency was injected — the same disruption level where the two lower-latency nodes recovered in under 55 seconds. Moving to a 5-node cluster with uniform ~45ms RTT across all nodes eliminated all undetected elections, reduced the worst-case election time from 284s to 66s, and reduced cascade risk from 10% of cycles to 3%. + +If your deployment requires nodes in different regions, increase `heartbeat_timeout` and `election_timeout` to at least 4–5× the worst-case inter-node RTT, and expect slower failover. See the [timing parameters](#timing-parameters) section for tuning formulas. + --- ## Configuration Reference From 409e2180919292eaf82fa58e846188f097566d4a Mon Sep 17 00:00:00 2001 From: auricom <27022259+auricom@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:15:18 +0200 Subject: [PATCH 3/7] docs(ha): address PR review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical fixes: - Fix snapshot_threshold math: 5000 ÷ 10 = 500s ≈ 8.3 min (not 83s) - Fix trailing_logs math: 18000 ÷ 10 = 1800s = 30 min (not 5 min) Medium fixes: - Fix heartbeat_timeout description: it is a follower-side election trigger, not the interval at which the leader sends heartbeats - Add explicit restart instruction after Step 5 data copy in single-to-ha.md so the chain keeps producing blocks during preparation (Steps 6-8) - Replace priv_validator_key.json with signer.json in single-to-ha.md to match cluster-setup.md and the E2E tests Minor fixes: - Exclude self from raft.peers in all examples (cluster-setup.md node-1 yaml/CLI/systemd, single-to-ha.md node-1 and node-2) - Add "exclude local node" note to raft.peers description in overview.md - Fix P2P port in overview.md 
Interaction with P2P section (7676 → 26656) - Add text language tag to all bare fenced blocks (MD040): multiaddr example, RTT equations, and all log snippets Co-Authored-By: Claude Sonnet 4.6 --- docs/guides/ha/cluster-setup.md | 22 ++++++++++++++-------- docs/guides/ha/overview.md | 16 ++++++++-------- docs/guides/ha/single-to-ha.md | 23 +++++++++++++++-------- 3 files changed, 37 insertions(+), 24 deletions(-) diff --git a/docs/guides/ha/cluster-setup.md b/docs/guides/ha/cluster-setup.md index 84deaa8794..d8f554b3b9 100644 --- a/docs/guides/ha/cluster-setup.md +++ b/docs/guides/ha/cluster-setup.md @@ -26,7 +26,7 @@ Replace these with your actual IP addresses throughout the guide. P2P peers use the libp2p multiaddr format, which includes each node's peer ID: -``` +```text /ip4//tcp//p2p/ ``` @@ -132,7 +132,7 @@ raft: node_id: "node-1" raft_addr: "0.0.0.0:5001" raft_dir: "/var/lib/ev-node/raft" - peers: "node-1@10.0.0.1:5001,node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" + peers: "node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" # Timing — tuned for RTT_MAX ≤ 25ms heartbeat_timeout: "92ms" @@ -194,7 +194,7 @@ Start all five nodes as close together as possible. 
The order does not matter bu --evnode.raft.node_id="node-1" \ --evnode.raft.raft_addr="0.0.0.0:5001" \ --evnode.raft.raft_dir="/var/lib/ev-node/raft" \ - --evnode.raft.peers="node-1@10.0.0.1:5001,node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" \ + --evnode.raft.peers="node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" \ --evnode.raft.heartbeat_timeout="92ms" \ --evnode.raft.election_timeout="368ms" \ --evnode.raft.leader_lease_timeout="46ms" \ @@ -218,7 +218,7 @@ Adjust flags for your execution layer (e.g., remove EVM flags if you are running Within a few seconds of starting, you should see one node win the election: -``` +```text INF raft: entering candidate state node=node-1 INF raft: election won tally=3 INF raft: entering leader state leader=node-1 @@ -227,7 +227,7 @@ INF block produced height=1 hash=0xabc... The other nodes will log: -``` +```text INF raft: entering follower state leader=node-1 INF block applied from raft log height=1 hash=0xabc... ``` @@ -260,8 +260,14 @@ done With all five nodes running and producing blocks, simulate a leader failure: ```bash -# Identify the current leader from its logs, then on that machine: -kill -SIGTERM $(pgrep evm) +# Identify the current leader from its logs, then on that machine. +# Preferred: use the systemd unit if ev-node runs as a service +sudo systemctl stop ev-node + +# Fallback: stop the process directly (verify exactly one PID before killing) +PID=$(pgrep -f "evm start") +echo "Stopping PID $PID" +kill -SIGTERM "$PID" ``` Within `election_timeout` (368ms in this configuration), the remaining four nodes will elect a new leader and resume block production. 
Measure the actual gap in your logs: @@ -294,7 +300,7 @@ ExecStart=/usr/local/bin/evm start \ --evnode.raft.node_id=node-1 \ --evnode.raft.raft_addr=0.0.0.0:5001 \ --evnode.raft.raft_dir=/var/lib/ev-node/raft \ - --evnode.raft.peers=node-1@10.0.0.1:5001,node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001 \ + --evnode.raft.peers=node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001 \ --evnode.raft.heartbeat_timeout=92ms \ --evnode.raft.election_timeout=368ms \ --evnode.raft.leader_lease_timeout=46ms \ diff --git a/docs/guides/ha/overview.md b/docs/guides/ha/overview.md index 4e159bff25..bcc84cd057 100644 --- a/docs/guides/ha/overview.md +++ b/docs/guides/ha/overview.md @@ -151,7 +151,7 @@ raft: **CLI:** `--evnode.raft.peers` **Default:** _(none, required)_ -A comma-separated list of **all** remote cluster members, in the format `nodeID@host:port`. The host and port must be the Raft address (`raft_addr`) of each peer as reachable from this node. +A comma-separated list of the **other** cluster members (exclude the local node), in the format `nodeID@host:port`. The host and port must be the Raft address (`raft_addr`) of each peer as reachable from this node. Do not list the node's own `node_id` in its own `peers` field. Raft uses this list to: - Bootstrap the cluster on first start (when no persisted state exists). @@ -182,7 +182,7 @@ raft: **CLI:** `--evnode.raft.heartbeat_timeout` **Default:** `350ms` -How often the leader sends heartbeat messages to followers. Followers that do not receive a heartbeat within this interval begin a new election. +The maximum time a follower will wait without receiving a heartbeat from the leader before starting a new election. The leader sends heartbeats more frequently than this value internally; this parameter is purely a follower-side timeout that triggers a new election when crossed. **Tuning rule:** Set to **4–5× RTT_MAX**. 
This ensures followers can distinguish a slow network from a dead leader without triggering spurious elections. @@ -268,7 +268,7 @@ The number of committed log entries that must accumulate before Raft automatical - **Lower values** (e.g., `500`): snapshots are taken frequently, keeping the log small. A restarting node receives a recent snapshot and has fewer log entries to replay, but snapshot writes happen more often, adding brief I/O bursts. - **Higher values** (e.g., `5000`): less frequent snapshots mean less I/O overhead during normal operation, but a lagging node may have more log entries to replay when catching up. -At 10 block/second, `snapshot_threshold: 5000` takes a snapshot roughly every 83 seconds. +At 10 block/second, `snapshot_threshold: 5000` takes a snapshot roughly every 8.3 minutes (500 seconds). --- @@ -286,9 +286,9 @@ The number of log entries to **retain after a snapshot** is taken. These entries **Effect on operations:** - **Lower values** (e.g., `200`): tighter disk usage; a node that misses even a few minutes of operation must receive a full snapshot on rejoin. -- **Higher values** (e.g., `18000`): a lagging node can catch up via log replay for up to 5 minutes at 10 block/second without needing a full snapshot transfer, reducing the cost of brief outages. +- **Higher values** (e.g., `18000`): a lagging node can catch up via log replay for up to 30 minutes at 10 block/second without needing a full snapshot transfer, reducing the cost of brief outages. -Set this high enough to cover your typical maintenance window (restart, upgrade, brief network partition). At 10 block/second, `trailing_logs: 18000` covers 5 minutes of absence. +Set this high enough to cover your typical maintenance window (restart, upgrade, brief network partition). At 10 block/second, `trailing_logs: 18000` covers 30 minutes of absence (1800 seconds). 
--- @@ -346,7 +346,7 @@ raft: Measure RTT_MAX first and scale the timing parameters: -``` +```text heartbeat_timeout = RTT_MAX × 4 election_timeout = heartbeat_timeout × 4 leader_lease_timeout = heartbeat_timeout / 2 @@ -361,8 +361,8 @@ Even in a Raft cluster, each node must have P2P configured. Raft handles **hot r ```yaml p2p: - listen_address: "/ip4/0.0.0.0/tcp/7676" - peers: "/ip4//tcp//p2p/,..." + listen_address: "/ip4/0.0.0.0/tcp/26656" + peers: "/ip4//tcp/26656/p2p/,..." ``` Ensure P2P ports are open between nodes in addition to the Raft port. diff --git a/docs/guides/ha/single-to-ha.md b/docs/guides/ha/single-to-ha.md index ec8a605acf..3c1a11156a 100644 --- a/docs/guides/ha/single-to-ha.md +++ b/docs/guides/ha/single-to-ha.md @@ -82,10 +82,10 @@ All five nodes must sign blocks with the **same key**. The existing sequencer's ls ~/.evm/config/ # Copy to each new node -scp ~/.evm/config/priv_validator_key.json user@10.0.0.2:~/.evm/config/ -scp ~/.evm/config/priv_validator_key.json user@10.0.0.3:~/.evm/config/ -scp ~/.evm/config/priv_validator_key.json user@10.0.0.4:~/.evm/config/ -scp ~/.evm/config/priv_validator_key.json user@10.0.0.5:~/.evm/config/ +scp ~/.evm/config/signer.json user@10.0.0.2:~/.evm/config/ +scp ~/.evm/config/signer.json user@10.0.0.3:~/.evm/config/ +scp ~/.evm/config/signer.json user@10.0.0.4:~/.evm/config/ +scp ~/.evm/config/signer.json user@10.0.0.5:~/.evm/config/ ``` --- @@ -126,6 +126,13 @@ After the copy, note the **latest block height** — this is your reference poin cast block --rpc-url http://: ``` +**Restart the existing sequencer now** so the chain keeps producing blocks while you prepare the remaining nodes (Steps 6–8). The chain will run uninterrupted until the planned cutover in Step 9. 
+ +```bash +# On node-1 — restart with your original single-sequencer flags +systemctl start ev-node +``` + --- ## Step 6: Collect Peer IDs @@ -192,7 +199,7 @@ raft: node_id: "node-2" # change per node raft_addr: "0.0.0.0:5001" raft_dir: "/var/lib/ev-node/raft" - peers: "node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" + peers: "node-1@10.0.0.1:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" heartbeat_timeout: "92ms" election_timeout: "368ms" leader_lease_timeout: "46ms" @@ -250,7 +257,7 @@ Use a coordination mechanism — a simple approach is to open five terminals (or --evnode.raft.node_id="node-1" \ --evnode.raft.raft_addr="0.0.0.0:5001" \ --evnode.raft.raft_dir="/var/lib/ev-node/raft" \ - --evnode.raft.peers="node-1@10.0.0.1:5001,node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" \ + --evnode.raft.peers="node-2@10.0.0.2:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" \ --evnode.raft.heartbeat_timeout="92ms" \ --evnode.raft.election_timeout="368ms" \ --evnode.raft.leader_lease_timeout="46ms" \ @@ -281,7 +288,7 @@ Repeat for node-3, node-4, node-5. Within seconds of starting, one node will win the election. Look for: -``` +```text INF raft: election won tally=3 leader=node-1 INF raft: entering leader state INF block produced height= @@ -291,7 +298,7 @@ where `N` is the last block produced by the old single sequencer. 
The followers will show: -``` +```text INF raft: entering follower state leader=node-1 INF block applied from raft log height= ``` From 4d0bb1feec8498491222767eca60d4fc9cd1c068 Mon Sep 17 00:00:00 2001 From: auricom <27022259+auricom@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:25:01 +0200 Subject: [PATCH 4/7] docs(ha): absorb raft_production.md into ha/overview.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit raft_production.md had no sidebar entry and its content was fully superseded by the new ha/ guides. Extract the three pieces that were unique to it — bootstrap flag docs, auto-detection startup mode explanation, and static-membership limitation note — into ha/overview.md, then delete the file. Co-Authored-By: Claude Sonnet 4.6 --- docs/guides/ha/overview.md | 23 +++++++- docs/guides/raft_production.md | 102 --------------------------------- 2 files changed, 22 insertions(+), 103 deletions(-) delete mode 100644 docs/guides/raft_production.md diff --git a/docs/guides/ha/overview.md b/docs/guides/ha/overview.md index bcc84cd057..4d39e22e37 100644 --- a/docs/guides/ha/overview.md +++ b/docs/guides/ha/overview.md @@ -153,10 +153,31 @@ raft: A comma-separated list of the **other** cluster members (exclude the local node), in the format `nodeID@host:port`. The host and port must be the Raft address (`raft_addr`) of each peer as reachable from this node. Do not list the node's own `node_id` in its own `peers` field. - Raft uses this list to: +Raft uses this list to: - Bootstrap the cluster on first start (when no persisted state exists). - Know which addresses to dial when sending log entries or heartbeats. +> **Limitation — static membership only.** Changing the peer set at runtime (adding or removing nodes without a full cluster restart) is not currently supported. All nodes that will ever participate in the cluster must be listed in `peers` before the cluster is first bootstrapped. 
+ +--- + +#### `raft.bootstrap` + +```yaml +raft: + bootstrap: false +``` + +**CLI:** `--evnode.raft.bootstrap` +**Default:** `false` + +Compatibility flag retained for older deployments. **You do not need to set this.** ev-node auto-detects the correct startup mode from the state of `raft_dir`: + +- If `raft_dir` contains existing Raft state → the node **rejoins** the cluster automatically. +- If `raft_dir` is empty or does not exist → the node **bootstraps** a new cluster from the `peers` list. + +Setting `bootstrap: true` explicitly has no additional effect beyond what auto-detection already does. + --- ### Timing Parameters diff --git a/docs/guides/raft_production.md b/docs/guides/raft_production.md deleted file mode 100644 index 9e02758da9..0000000000 --- a/docs/guides/raft_production.md +++ /dev/null @@ -1,102 +0,0 @@ -# Raft Implementation & Production Configuration - -This guide details the Raft consensus implementation in `ev-node`, used for High Availability (HA) of the Sequencer/Aggregator. It is targeted at experienced DevOps and developers configuring production environments. - -## Overview - -`ev-node` uses the [HashiCorp Raft](https://github.com/hashicorp/raft) implementation to manage leader election and state replication when running in **Aggregator Mode**. - -* **Role**: Ensures only one active Aggregator (Leader) produces blocks at a time. -* **Failover**: Automatically elects a new leader if the current leader fails. -* **Safety**: Synchronizes the block production state to prevent double-signing or fork divergence. - -### Architecture - -* **Transport**: TCP-based transport for inter-node communication. -* **Storage**: [BoltDB](https://github.com/etcd-io/bbolt) is used for both the Raft Log (`raft-log.db`) and Stable Store (`raft-stable.db`). Snapshots are stored as files. -* **FSM (Finite State Machine)**: The State Machine applies `RaftBlockState` (Protobuf) containing the latest block height, hash, and timestamp. 
-* **Safety Checks**: - * **Startup**: Nodes check for divergence between local block store and Raft state. - * **Leadership Transfer**: Before becoming leader, a node waits for its FSM to catch up (`waitForMsgsLanded`) to prevent proposing blocks from a stale state. - * **Shutdown**: The leader attempts to transfer leadership gracefully before shutting down to minimize downtime. - -## Configuration - -Raft is configured via CLI flags or the `config.toml` file under the `[raft]` (or `[evnode.raft]`) section. - -### Essential Flags - -| Flag | Config Key | Description | Production Value | -|------|------------|-------------|------------------| -| `--evnode.raft.enable` | `raft.enable` | Enable Raft consensus. | `true` | -| `--evnode.raft.node_id` | `raft.node_id` | **Unique** identifier for the node. | e.g., `node-01` | -| `--evnode.raft.raft_addr` | `raft.raft_addr` | TCP address for Raft transport. | `0.0.0.0:5001` (Bind to private IP) | -| `--evnode.raft.raft_dir` | `raft.raft_dir` | Directory for Raft data. | `/data/raft` (Must be persistent) | -| `--evnode.raft.peers` | `raft.peers` | Comma-separated list of peer addresses in format `nodeID@host:port`. | `node-1@10.0.0.1:5001,node-2@10.0.0.2:5001,node-3@10.0.0.3:5001` | -| `--evnode.raft.bootstrap` | `raft.bootstrap` | Compatibility flag. Startup mode is selected automatically from persisted raft configuration state. | optional | - -### Timeout Tuning - -Raft timeouts should be tuned relative to your **Block Time** (`--evnode.node.block_time`) to utilize the fast failover capabilities without causing instability. - -| Flag | Default | Recommended Tuning | -|------|---------|--------------------| -| `--evnode.raft.heartbeat_timeout` | `1s` | **10-30% of Leader Lease**. For sub-second block times, lower to `50ms-100ms`. | -| `--evnode.raft.leader_lease_timeout` | `500ms` | **Must be < Election Timeout**. Use `500ms` for 1s block times. 
For slower chains (e.g., 10s blocks), increase to `1s-2s` to tolerate network jitter. | -| `--evnode.raft.send_timeout` | `1s` | Should be `> 2x RTT`. | - -**Relation to Block Time**: -Ideally, a failover should complete within `2 * BlockTime` to minimize user impact. -* **Fast Chain (BlockTime < 1s)**: Tighten timeouts. Heartbeat `50ms`, Lease `250ms`. -* **Standard Chain (BlockTime = 1s)**: Heartbeat `100ms`, Lease `500ms`. -* **Slow Chain (BlockTime > 5s)**: Defaults are usually sufficient (`1s` heartbeat). - -> **Warning**: Setting timeouts too low (< RTT + Jitter) will cause leadership flapping and halted block production. - -## Production Deployment Principles - -### 1. Static Peering & Automatic Startup Mode -Use static peering with automatic mode selection from local raft configuration: -* If local raft configuration already exists in `--evnode.raft.raft_dir`, the node starts in rejoin mode. -* If no local raft configuration exists yet, the node bootstraps from configured peers. -* `--evnode.raft.bootstrap` is retained for compatibility but does not control mode selection. -* **All configured cluster members** should list the full set of peers in `--evnode.raft.peers`. -* The `peers` list format is strict: `NodeID@Host:Port`. -* **Limitation**: Dynamic addition of peers (run-time membership changes) via RPC/CLI is not currently exposed. -* **Not supported**: Joining an existing cluster as a brand-new node that was not part of the initial static membership. - -### 2. Infrastructure Requirements -* **Encrypted Network (CRITICAL)**: Raft traffic is **unencrypted** (plain TCP). You **MUST** run the cluster inside a private network, VPN, or encrypted mesh (e.g., WireGuard, Tailscale). **Never expose Raft ports to the public internet**; doing so allows attackers to hijack the cluster consensus. -* **Cluster Size**: Run an **odd number** of nodes (3 or 5) to tolerate failures (3 nodes tolerate 1 failure; 5 nodes tolerate 2). 
-* **Storage**: The `--evnode.raft.raft_dir` **MUST** be mounted on persistent storage. Loss of this directory will cause the node to lose its identity and commit history, effectively removing it from the cluster. -* **Network**: Raft requires low-latency, reliable connectivity. Ensure firewall rules allow TCP traffic on `raft_addr`. - -### 3. P2P Interaction & Catch-Up -Raft and P2P work in parallel to ensure reliability: -* **Hot Replication (Raft)**: New blocks produced by the leader are replicated via the Raft transport (Header + Data) to all followers. This ensures low-latency propagation of the chain tip. -* **Catch-Up (P2P)**: If a node falls behind (e.g., disconnected for longer than the Raft log retention), it will receive a **Raft Snapshot** to update its consensus state to the latest head. However, the *historical blocks* between its local state and the new head are fetched via the **P2P Network** (or DA). - * **Implication**: You must ensure P2P connectivity (`--p2p.listen_address` and `--p2p.peers`) is configured even for Raft nodes, to allow them to backfill missing data from peers. - -### 4. Lifecycle Management -* **Rolling Restarts**: You can restart nodes one by one. The `ev-node` implementation handles graceful shutdown (leadership transfer) to minimize impact. -* **State Divergence**: If a node falls too far behind or its local store conflicts with Raft (e.g., due to catastrophic disk failure), it may panic on startup to protect safety. In such cases, an extensive manual recovery (wiping state and re-syncing) may be required. - -### 5. Monitoring -Monitor the following metrics (propagated via Prometheus if enabled): -* **Leadership Changes**: Frequent changes indicate network instability or overloaded nodes. -* **Applied Index vs Commit Index**: A growing lag indicates the FSM cannot keep up. 
- -## Example Command - -```bash -./ev-node start \ - --evnode.node.aggregator=true \ - --evnode.raft.enable=true \ - --evnode.raft.node_id="node-1" \ - --evnode.raft.raft_addr="0.0.0.0:5001" \ - --evnode.raft.raft_dir="/var/lib/ev-node/raft" \ - --evnode.raft.bootstrap=true \ - --evnode.raft.peers="node-1@10.0.1.1:5001,node-2@10.0.1.2:5001,node-3@10.0.1.3:5001" \ - --evnode.p2p.listen_address="/ip4/0.0.0.0/tcp/26656" \ - ...other flags -``` From 01d753249e7fe1eb60aa4b524b132d215d512348 Mon Sep 17 00:00:00 2001 From: auricom <27022259+auricom@users.noreply.github.com> Date: Tue, 28 Apr 2026 14:54:47 +0200 Subject: [PATCH 5/7] docs(ha): use EnvironmentFile for signer passphrase Passing --evnode.signer.passphrase inline exposes the secret in ps aux, journalctl, and shell history. - Add EnvironmentFile=/etc/ev-node/env (chmod 600) to the systemd unit in cluster-setup.md with setup instructions - Replace all inline occurrences with $EV_SIGNER_PASSPHRASE sourced from /etc/ev-node/env in every evm start / evm init snippet across both guides Co-Authored-By: Claude Sonnet 4.6 --- docs/guides/ha/cluster-setup.md | 27 +++++++++++++++++++++++---- docs/guides/ha/single-to-ha.md | 9 +++++++-- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/docs/guides/ha/cluster-setup.md b/docs/guides/ha/cluster-setup.md index d8f554b3b9..95bc1d87d2 100644 --- a/docs/guides/ha/cluster-setup.md +++ b/docs/guides/ha/cluster-setup.md @@ -74,9 +74,11 @@ Run this on every node. Each node gets its own home directory where the config, ```bash # Run on every node (the binary name depends on your chain) -./evm init --evnode.node.aggregator=true --evnode.signer.passphrase +./evm init --evnode.node.aggregator=true --evnode.signer.passphrase "$EV_SIGNER_PASSPHRASE" ``` +> Set `EV_SIGNER_PASSPHRASE` in your shell session before running this command so the passphrase does not appear in `ps aux` or your shell history. 
The [EnvironmentFile setup](#running-as-a-systemd-service) later in this guide shows how to store it securely. + This creates the home directory structure (default `~/.evm`) with a `config/evnode.yaml` file and generates the signer key. After initializing each node, retrieve its peer ID — you will need all five when writing the configuration in Step 5: @@ -186,8 +188,10 @@ Raft requires a majority of configured peers to be online before it can elect a Start all five nodes as close together as possible. The order does not matter but they should all be up within a few seconds of each other. ```bash -# Run this on each node, substituting the correct binary name and flags for your chain +# Load the passphrase from the secure env file (keeps it out of your shell history and the unit file) +source /etc/ev-node/env + +# Run this on each node, substituting the correct binary name and flags for your chain ./evm start \ --evnode.node.aggregator=true \ --evnode.raft.enable=true \ @@ -203,7 +207,7 @@ Start all five nodes as close together as possible. The order does not matter bu --evnode.raft.snapshot_threshold=5000 \ --evnode.p2p.listen_address="/ip4/0.0.0.0/tcp/26656" \ --evnode.p2p.peers="/ip4/10.0.0.2/tcp/26656/p2p/,/ip4/10.0.0.3/tcp/26656/p2p/,/ip4/10.0.0.4/tcp/26656/p2p/,/ip4/10.0.0.5/tcp/26656/p2p/" \ - --evnode.signer.passphrase= \ + --evnode.signer.passphrase="$EV_SIGNER_PASSPHRASE" \ --evm.jwt-secret=$(cat /path/to/jwt.hex) \ --evm.genesis-hash= ``` @@ -285,6 +289,20 @@ The gap should be well under 1 second in most cases (a few election cycles at mo For production, manage each node with systemd. +### Create the environment file + +Store secrets in a file that only the service user can read. systemd loads it at start time, so the passphrase never appears in `journalctl` or the unit file itself. Note that because systemd expands `$EV_SIGNER_PASSPHRASE` into the command line before exec, the value can still be visible in `ps aux` to users who can inspect the process; restrict interactive access on these hosts accordingly (or have the binary read the variable from the environment directly, if supported). 
+ +```bash +# Run on every node +sudo mkdir -p /etc/ev-node +echo "EV_SIGNER_PASSPHRASE=" | sudo tee /etc/ev-node/env > /dev/null +sudo chmod 600 /etc/ev-node/env +sudo chown ev-node:ev-node /etc/ev-node/env +``` + +### Unit file + ```ini # /etc/systemd/system/ev-node.service [Unit] @@ -294,6 +312,7 @@ Wants=network-online.target [Service] User=ev-node +EnvironmentFile=/etc/ev-node/env ExecStart=/usr/local/bin/evm start \ --evnode.node.aggregator=true \ --evnode.raft.enable=true \ @@ -309,7 +328,7 @@ ExecStart=/usr/local/bin/evm start \ --evnode.raft.snapshot_threshold=5000 \ --evnode.p2p.listen_address=/ip4/0.0.0.0/tcp/26656 \ --evnode.p2p.peers=/ip4/10.0.0.2/tcp/26656/p2p/,/ip4/10.0.0.3/tcp/26656/p2p/,/ip4/10.0.0.4/tcp/26656/p2p/,/ip4/10.0.0.5/tcp/26656/p2p/ \ - --evnode.signer.passphrase= + --evnode.signer.passphrase=$EV_SIGNER_PASSPHRASE Restart=on-failure RestartSec=5s diff --git a/docs/guides/ha/single-to-ha.md b/docs/guides/ha/single-to-ha.md index 3c1a11156a..2c0151aee9 100644 --- a/docs/guides/ha/single-to-ha.md +++ b/docs/guides/ha/single-to-ha.md @@ -250,6 +250,10 @@ The key requirement here is that all nodes must start within a short window of e Use a coordination mechanism — a simple approach is to open five terminals (or tmux panes) and fire the start commands in quick succession: ```bash +# Load the passphrase from the secure env file (avoids it appearing in ps aux) +# See the cluster-setup guide for how to create /etc/ev-node/env with chmod 600 +source /etc/ev-node/env + # On node-1 ./evm start \ --evnode.node.aggregator=true \ @@ -266,7 +270,7 @@ Use a coordination mechanism — a simple approach is to open five terminals (or --evnode.raft.snapshot_threshold=5000 \ --evnode.p2p.listen_address="/ip4/0.0.0.0/tcp/26656" \ --evnode.p2p.peers="/ip4/10.0.0.2/tcp/26656/p2p/,/ip4/10.0.0.3/tcp/26656/p2p/,/ip4/10.0.0.4/tcp/26656/p2p/,/ip4/10.0.0.5/tcp/26656/p2p/" \ - --evnode.signer.passphrase= \ + --evnode.signer.passphrase="$EV_SIGNER_PASSPHRASE" \ 
--evm.jwt-secret=$(cat /path/to/jwt.hex) \ --evm.genesis-hash= ``` @@ -350,10 +354,11 @@ If anything goes wrong during the cutover, you can revert to the single sequence ```bash # Emergency rollback — revert node-1 to single sequencer +source /etc/ev-node/env ./evm start \ --evnode.node.aggregator=true \ --evnode.raft.enable=false \ - --evnode.signer.passphrase= \ + --evnode.signer.passphrase="$EV_SIGNER_PASSPHRASE" \ # ... your original flags ``` From 0b5aa74fcd5b8bc998fa0f1934d46dd971298c81 Mon Sep 17 00:00:00 2001 From: auricom <27022259+auricom@users.noreply.github.com> Date: Tue, 28 Apr 2026 17:14:43 +0200 Subject: [PATCH 6/7] docs(ha): explicit node-2 peers and action-based rolling restart - Replace "peers list is identical" stub in node-2 config with an explicit peers list that excludes node-2 itself, and add a note that each node must omit itself from raft.peers - Replace "Wait ~30 seconds" in rolling restart with journalctl one-liners that exit as soon as the node logs follower/leader state, giving a deterministic signal instead of an arbitrary timeout Co-Authored-By: Claude Sonnet 4.6 --- docs/guides/ha/cluster-setup.md | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/docs/guides/ha/cluster-setup.md b/docs/guides/ha/cluster-setup.md index 95bc1d87d2..8cc7673254 100644 --- a/docs/guides/ha/cluster-setup.md +++ b/docs/guides/ha/cluster-setup.md @@ -152,20 +152,22 @@ p2p: peers: "/ip4/10.0.0.2/tcp/26656/p2p/,/ip4/10.0.0.3/tcp/26656/p2p/,/ip4/10.0.0.4/tcp/26656/p2p/,/ip4/10.0.0.5/tcp/26656/p2p/" ``` -### node-2 (change only `node_id` and P2P peers) +### node-2 (`~/.evm/config/evnode.yaml`) + +Each node's `raft.peers` must list every **other** node — never the node itself. ```yaml -# ... same as node-1 except: +# ... 
same as node-1 except node_id and raft.peers: raft: node_id: "node-2" raft_addr: "0.0.0.0:5001" - # peers list is identical + peers: "node-1@10.0.0.1:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" p2p: peers: "/ip4/10.0.0.1/tcp/26656/p2p/,/ip4/10.0.0.3/tcp/26656/p2p/,/ip4/10.0.0.4/tcp/26656/p2p/,/ip4/10.0.0.5/tcp/26656/p2p/" ``` -Repeat for node-3 through node-5, updating `node_id` and the P2P peers list (exclude the local node from its own P2P peers). +Repeat for node-3 through node-5: increment `node_id`, remove the local node from both `raft.peers` and `p2p.peers`. --- @@ -358,20 +360,27 @@ To restart nodes without taking the cluster offline (e.g., for a config change o 2. For the leader node, restart it last. `ev-node` will transfer leadership to a peer before shutting down. ```bash -# Restart non-leader nodes first +# Restart non-leader nodes first, one at a time. +# After each restart, wait until the node confirms it has rejoined before touching the next. + ssh user@10.0.0.2 "sudo systemctl restart ev-node" -# Wait ~30 seconds for node-2 to rejoin and sync +ssh user@10.0.0.2 "sudo journalctl -u ev-node --since '1 min ago' -f | grep -m1 'follower state\|leader state'" + ssh user@10.0.0.3 "sudo systemctl restart ev-node" -# ...wait... +ssh user@10.0.0.3 "sudo journalctl -u ev-node --since '1 min ago' -f | grep -m1 'follower state\|leader state'" + ssh user@10.0.0.4 "sudo systemctl restart ev-node" -# ...wait... +ssh user@10.0.0.4 "sudo journalctl -u ev-node --since '1 min ago' -f | grep -m1 'follower state\|leader state'" + ssh user@10.0.0.5 "sudo systemctl restart ev-node" -# ...wait... 
-# Restart the leader last +ssh user@10.0.0.5 "sudo journalctl -u ev-node --since '1 min ago' -f | grep -m1 'follower state\|leader state'" + +# Restart the leader last — ev-node transfers leadership before shutting down ssh user@10.0.0.1 "sudo systemctl restart ev-node" +ssh user@10.0.0.1 "sudo journalctl -u ev-node --since '1 min ago' -f | grep -m1 'follower state\|leader state'" ``` -Verify each node is back in the cluster (check logs for `entering follower state` or `entering leader state`) before proceeding to the next. +The `grep -m1` exits as soon as the node logs `entering follower state` or `entering leader state`, confirming it has rejoined the cluster. Only then proceed to the next node. --- From 516b5332a369656774a5b96172b15c428cecba94 Mon Sep 17 00:00:00 2001 From: auricom <27022259+auricom@users.noreply.github.com> Date: Tue, 28 Apr 2026 17:16:16 +0200 Subject: [PATCH 7/7] docs(ha): fix raft.peers self-inclusion startup bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The abbreviated node-2 snippet with "# peers list is identical" caused a startup failure: with raft_addr=0.0.0.0:5001 the bootstrap code's literal address comparison does not recognise node-2@10.0.0.2:5001 as self, so node-2 is appended twice and deduplicateServers returns "duplicate peers found in config". 
- Fix intro text: "only raft.node_id and raft_addr differ" → "raft.node_id is unique; raft.peers and p2p.peers must exclude self" - Expand node-2 snippet to a full evnode.yaml with the correct peers list (node-1, node-3, node-4, node-5 — no node-2) and an inline explanation of the wildcard address pitfall - Align overview.md trailing_logs example to 1 block/s (matching block_time: "1s" used throughout) and note the 10 block/s rate too Co-Authored-By: Claude Sonnet 4.6 --- docs/guides/ha/cluster-setup.md | 25 +++++++++++++++++++++---- docs/guides/ha/overview.md | 4 ++-- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/docs/guides/ha/cluster-setup.md b/docs/guides/ha/cluster-setup.md index 8cc7673254..85f589aa2f 100644 --- a/docs/guides/ha/cluster-setup.md +++ b/docs/guides/ha/cluster-setup.md @@ -120,7 +120,7 @@ scp ~/.evm/config/genesis.json user@10.0.0.5:~/.evm/config/ ## Step 5: Write the Configuration Files -Write the following `evnode.yaml` on each node. The only field that differs per node is `raft.node_id` and `raft.raft_addr` — everything else is identical. +Write the following `evnode.yaml` on each node. `raft.node_id` is unique per node; `raft.peers` and `p2p.peers` must each exclude the local node — everything else is identical. ### node-1 (`~/.evm/config/evnode.yaml`) @@ -154,20 +154,37 @@ p2p: ### node-2 (`~/.evm/config/evnode.yaml`) -Each node's `raft.peers` must list every **other** node — never the node itself. +`raft.peers` must omit the local node. Because `raft_addr` is `0.0.0.0:5001` (a wildcard), the self-exclusion check in the bootstrap code compares addresses literally — it will not recognise `node-2@10.0.0.2:5001` as itself and will add node-2 twice, causing a startup error. Always list only the **other** nodes. ```yaml -# ... 
same as node-1 except node_id and raft.peers: +node: + aggregator: true + block_time: "1s" + raft: + enable: true node_id: "node-2" raft_addr: "0.0.0.0:5001" + raft_dir: "/var/lib/ev-node/raft" peers: "node-1@10.0.0.1:5001,node-3@10.0.0.3:5001,node-4@10.0.0.4:5001,node-5@10.0.0.5:5001" + # Timing — tuned for RTT_MAX ≤ 25ms + heartbeat_timeout: "92ms" + election_timeout: "368ms" + leader_lease_timeout: "46ms" + send_timeout: "50ms" + + # Log retention — covers ~5 hours of absence at 1 block/s + trailing_logs: 18000 + snapshot_threshold: 5000 + snap_count: 3 + p2p: + listen_address: "/ip4/0.0.0.0/tcp/26656" peers: "/ip4/10.0.0.1/tcp/26656/p2p/,/ip4/10.0.0.3/tcp/26656/p2p/,/ip4/10.0.0.4/tcp/26656/p2p/,/ip4/10.0.0.5/tcp/26656/p2p/" ``` -Repeat for node-3 through node-5: increment `node_id`, remove the local node from both `raft.peers` and `p2p.peers`. +Repeat for node-3 through node-5, updating `node_id`, `raft.peers` (exclude the local node), and `p2p.peers` (exclude the local node). --- diff --git a/docs/guides/ha/overview.md b/docs/guides/ha/overview.md index 4d39e22e37..cc5ce407dc 100644 --- a/docs/guides/ha/overview.md +++ b/docs/guides/ha/overview.md @@ -307,9 +307,9 @@ The number of log entries to **retain after a snapshot** is taken. These entries **Effect on operations:** - **Lower values** (e.g., `200`): tighter disk usage; a node that misses even a few minutes of operation must receive a full snapshot on rejoin. -- **Higher values** (e.g., `18000`): a lagging node can catch up via log replay for up to 30 minutes at 10 block/second without needing a full snapshot transfer, reducing the cost of brief outages. +- **Higher values** (e.g., `18000`): a lagging node can catch up via log replay without needing a full snapshot transfer, reducing the cost of brief outages. At 1 block/second (`block_time: "1s"`), `trailing_logs: 18000` covers ~5 hours; at 10 block/second, ~30 minutes. 
-Set this high enough to cover your typical maintenance window (restart, upgrade, brief network partition). At 10 block/second, `trailing_logs: 18000` covers 30 minutes of absence (1800 seconds). +Set this high enough to cover your typical maintenance window (restart, upgrade, brief network partition). Scale proportionally with your chain's block rate. ---