Failover Script Updated
Professional route-based Dual-WAN failover for Ethereum validator with failback.
Key properties (as implemented in the script):
1. Deployment files you need
#!/usr/bin/env bash
#
# dual-wan-failover.sh
#
# Route-based Dual-WAN failover (with failback) for an Ethereum validator.
#   * Interfaces are never brought down; only the default route is replaced,
#     pointing it at the preferred or the backup gateway.
#   * Both failover and failback are subject to hysteresis.
#   * Everything the script prints (stdout and stderr) is appended to
#     /var/log/dual-wan-failover.log.
#
# Edit the variables in the "USER CONFIG" block before running.

# Treat unset variables as errors and let a pipeline fail when any stage fails.
# errexit (-e) is not enabled, so a failing command does not stop the script.
set -u -o pipefail

LOGFILE="/var/log/dual-wan-failover.log"

# From this point on, stdout and stderr are both appended to the log file.
exec >>"$LOGFILE" 2>&1
# Print the current local time as "YYYY-MM-DD HH:MM:SS"; used as the prefix
# of every log line.
timestamp() {
  date '+%Y-%m-%d %H:%M:%S'
}
# ================== USER CONFIG ==================
# WAN interfaces
#   PRIMARY_IF   - WAN1 (static public IP)
#   SECONDARY_IF - WAN2 (dynamic)
PRIMARY_IF='eth0'
SECONDARY_IF='eth1'

# Next-hop gateway on each WAN's LAN
PRIMARY_GW='192.168.1.1'   # reached through PRIMARY_IF
SECONDARY_GW='192.168.2.1' # reached through SECONDARY_IF

# Address pinged through PRIMARY_IF to judge primary health.
# Pick something reliable: an external public IP (8.8.8.8) and/or the ISP
# gateway IP.
CHECK_TARGET='8.8.8.8'

# Ping parameters for a single health check
PING_COUNT_PER_CHECK=1 # echo requests sent per check (ping -c)
PING_TIMEOUT=2         # seconds to wait per ping (ping -W)

# Hysteresis: number of consecutive results required before acting
FAIL_THRESHOLD=3     # failed checks before failing over to the secondary
RECOVER_THRESHOLD=12 # successful checks before failing back (12 * 5s = 60s)

# Seconds to sleep between two checks
SLEEP_INTERVAL=5

# Optional extra TCP connect check (e.g. port 80/443).
# It is only performed when BOTH values are non-empty.
TCP_CHECK_HOST=''
TCP_CHECK_PORT=''

# Failback policy:
#   true      - switch back to the primary automatically once it has passed
#               RECOVER_THRESHOLD consecutive checks
#   otherwise - the script never switches back on its own; failback is manual
AUTO_FAILBACK=true
# =================================================
# Validate environment: the script cannot work without `ip` and `ping`.
# Each missing tool is reported with a timestamped error and exit status 1.
command -v ip >/dev/null 2>&1 || {
  echo "$(timestamp) ERROR: iproute2 (ip) required" >&2
  exit 1
}
command -v ping >/dev/null 2>&1 || {
  echo "$(timestamp) ERROR: ping required" >&2
  exit 1
}
# Probe a target with ping, sending the probe through a specific interface.
# Globals:   PING_COUNT_PER_CHECK, PING_TIMEOUT (read)
# Arguments: $1 - interface passed to ping -I
#            $2 - address to ping
# Outputs:   nothing (ping's stdout and stderr are discarded)
# Returns:   ping's exit status
ping_check() {
  local iface="$1" dest="$2"
  local -a probe_opts=(-c "$PING_COUNT_PER_CHECK" -W "$PING_TIMEOUT" -I "$iface")
  # If the interface has no IPv4 address yet, ping simply fails.
  ping "${probe_opts[@]}" "$dest" &>/dev/null
}
# Optional TCP connect check using bash's /dev/tcp redirection, wrapped in
# `timeout` so a silent peer cannot block the caller.
# Arguments: $1 - host to connect to
#            $2 - TCP port
#            $3 - timeout in seconds (default: 2)
# Outputs:   nothing (all output is discarded)
# Returns:   0 if the connection could be opened within the timeout,
#            non-zero otherwise (including when `timeout` itself is missing)
# NOTE: the connection is not bound to any interface, so it follows whatever
# route the system currently selects for the host.
tcp_check() {
  local host="$1" port="$2" timeout="${3:-2}"
  # Host and port are handed to the child shell as positional parameters
  # rather than spliced into the command string, so their content can never
  # be interpreted as shell syntax.
  timeout "$timeout" bash -c ': >"/dev/tcp/$1/$2"' _ "$host" "$port" >/dev/null 2>&1
}
# Print the next-hop (gateway) address of the current default route.
# Only the first default route listed by `ip route show default` is examined.
# Outputs:   the address following the "via" keyword, or nothing when there
#            is no default route or the route has no gateway
#            (e.g. "default dev ppp0 scope link")
get_current_default() {
  # Look up the token after "via" instead of assuming a fixed column, so a
  # gateway-less route is never reported as if its device were a gateway.
  ip route show default 2>/dev/null |
    awk 'NR == 1 { for (i = 1; i < NF; i++) if ($i == "via") { print $(i + 1); break } }'
}
# Point the default route at the given gateway/device.
# Uses `ip route replace`, so calling it repeatedly with the same arguments
# is harmless.
# Arguments: $1 - gateway (next-hop) address
#            $2 - outgoing interface
# Outputs:   an INFO line on stdout on success, an ERROR line on stderr on failure
# Returns:   0 if the route was installed, 1 if `ip route replace` failed
set_default_route() {
  local gw="$1" dev="$2"
  # Report success only when the kernel actually accepted the route; a silent
  # failure here would leave the log claiming a switch that never happened.
  if ! ip route replace default via "$gw" dev "$dev" proto static; then
    echo "$(timestamp) ERROR: Failed to set default route -> gw=$gw dev=$dev" >&2
    return 1
  fi
  echo "$(timestamp) INFO: Default route set -> gw=$gw dev=$dev"
}
# Write one log line: "<timestamp> - <message>".
# Arguments: the message; multiple arguments are joined with single spaces
# Outputs:   the formatted line on stdout
log() {
  printf '%s - %s\n' "$(timestamp)" "$*"
}
# Hysteresis state: consecutive failed / successful checks of the primary.
primary_fail_count=0
primary_ok_count=0

# Startup: choose the initial default route from a single probe of the
# primary. If that probe fails, the secondary is selected without being
# probed itself.
initial_primary_ok=false
if ! ping_check "$PRIMARY_IF" "$CHECK_TARGET"; then
  set_default_route "$SECONDARY_GW" "$SECONDARY_IF"
  log "Startup: primary NOT reachable, using secondary"
else
  set_default_route "$PRIMARY_GW" "$PRIMARY_IF"
  initial_primary_ok=true
  log "Startup: primary reachable, using primary"
fi
# Main loop: probe the primary WAN every SLEEP_INTERVAL seconds and move the
# default route between the two gateways with hysteresis.
#   * FAIL_THRESHOLD consecutive failed checks    -> default route to SECONDARY
#   * RECOVER_THRESHOLD consecutive passed checks -> default route back to
#     PRIMARY (only when AUTO_FAILBACK is "true")
# Log lines are written on state changes and while a threshold is being
# approached; a long outage does not add lines on every cycle.
#
# NOTE(review): the ICMP probe is bound to PRIMARY_IF (ping -I). Once the
# default route points at SECONDARY_GW, the probe can only succeed if some
# other route via PRIMARY_IF still exists (policy rule, host route for
# CHECK_TARGET, a second default with a higher metric, ...). Nothing in this
# script installs one - confirm against the host's network configuration,
# otherwise failback may never trigger.
# NOTE(review): the optional TCP check is not interface-bound, so it follows
# the current default route rather than necessarily the primary WAN.
while true; do
  # The primary is healthy when the ping succeeds and, if a TCP target is
  # fully configured (host AND port), the TCP connect succeeds as well.
  primary_ok=false
  if ping_check "$PRIMARY_IF" "$CHECK_TARGET"; then
    if [ -z "$TCP_CHECK_HOST" ] || [ -z "$TCP_CHECK_PORT" ] ||
      tcp_check "$TCP_CHECK_HOST" "$TCP_CHECK_PORT" 2; then
      primary_ok=true
    fi
  fi

  # Compare the flag as a string instead of executing its value as a command.
  if [ "$primary_ok" = true ]; then
    primary_fail_count=0
    primary_ok_count=$((primary_ok_count + 1))
    # If the default route is not on the primary and the primary has been
    # stable long enough, switch back (only if AUTO_FAILBACK is true).
    current_gw=$(get_current_default)
    if [ "$AUTO_FAILBACK" = true ] && [ "$current_gw" != "$PRIMARY_GW" ]; then
      if [ "$primary_ok_count" -ge "$RECOVER_THRESHOLD" ]; then
        log "Primary stabilized for $primary_ok_count checks -> switching default back to PRIMARY"
        set_default_route "$PRIMARY_GW" "$PRIMARY_IF"
        primary_ok_count=0
      else
        log "Primary OK (${primary_ok_count}/${RECOVER_THRESHOLD}) - waiting before failback"
      fi
    fi
  else
    primary_ok_count=0
    # Count (and log) failures only until the threshold is reached; the
    # counter then saturates so an extended outage stays quiet in the log.
    if [ "$primary_fail_count" -lt "$FAIL_THRESHOLD" ]; then
      primary_fail_count=$((primary_fail_count + 1))
      log "Primary NOT reachable (${primary_fail_count}/${FAIL_THRESHOLD})"
    fi
    # At the threshold, make sure the default route is on the secondary.
    # This is re-checked every cycle, so a failed or externally reverted
    # route change is retried (and logged) on the next pass.
    if [ "$primary_fail_count" -ge "$FAIL_THRESHOLD" ]; then
      current_gw=$(get_current_default)
      if [ "$current_gw" != "$SECONDARY_GW" ]; then
        log "Primary down for $primary_fail_count checks -> switching default to SECONDARY"
        set_default_route "$SECONDARY_GW" "$SECONDARY_IF"
      fi
    fi
  fi

  sleep "$SLEEP_INTERVAL"
done
2. What changed from the previous version of the script (why it’s safer)
3. Recommended parameter values (tweak if needed)
4. Tests and verification
5. Manual failback (if you prefer)
6. Extra hardening tips for validator reliability
7. Is manual switch-back OK to avoid missed attestations?
Last updated