Fix cross-cell ICMP routing: state-based cell DROP + e2e test
The cell catch-all DROP rule blocked all traffic from a connected cell's subnet, including ESTABLISHED/RELATED packets (ICMP replies, TCP ACKs) for connections initiated by local VPN peers. This broke ping to the remote cell's WireGuard IP even when the cell-to-cell tunnel was healthy. Change the DROP to match only NEW,INVALID connections so established reply traffic passes through to the stateful ACCEPT rule. Also adds tests/e2e/wg/test_cell_to_cell_routing.py — an end-to-end test that brings up a real WireGuard tunnel from the test runner to pic1 and verifies full cross-cell routing including ICMP ping, API /health, and Caddy. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,303 @@
|
||||
"""
|
||||
E2E test: cross-cell routing for a split-tunnel VPN peer.
|
||||
|
||||
Creates a temporary WireGuard peer on cell2 (pic1 / test), brings up a real
|
||||
WireGuard tunnel from the test-runner host, and verifies that cell1 (pic0 / dev)
|
||||
is reachable end-to-end via the cell-to-cell link.
|
||||
|
||||
Why this test is meaningful
|
||||
---------------------------
|
||||
10.0.0.1 is cell1's WireGuard server IP, reachable ONLY inside cell1's
|
||||
cell-wireguard Docker container. It is NOT reachable directly from the
|
||||
test-runner host (verified: 100% packet loss without VPN).
|
||||
|
||||
If a ping to 10.0.0.1 succeeds during the test, the full path was taken:
|
||||
|
||||
[test-runner pic-e2e-c2c] → 192.168.31.52:51821 → [pic1 cell-wireguard FORWARD]
|
||||
→ [cell-to-cell WG tunnel] → [pic0 cell-wireguard] → 10.0.0.1
|
||||
|
||||
Prerequisites
|
||||
-------------
|
||||
* SSH access to 192.168.31.52 (pic1) as 'roof' with no passphrase
|
||||
* `wg-quick` and `sudo` available on the test runner (pic0)
|
||||
* Both cells must have an active cell-to-cell WireGuard handshake
|
||||
|
||||
Prerequisites are probed once when this module is imported; if one is missing, the wg_setup fixture skips every test that depends on it. No manual flag is needed.
|
||||
"""
|
||||
import os
|
||||
import subprocess
|
||||
import secrets
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Constants
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
# LAN address of pic1; used both as the SSH target and as the WireGuard
# endpoint host in the generated wg-quick config.
PIC1_LAN = '192.168.31.52' # test cell (cell2)
PIC1_WG_PORT = 51821 # WireGuard ListenPort on pic1
# Written as the [Peer] PublicKey in the test runner's wg-quick config.
PIC1_WG_PUBKEY = 'ITl3+KfcNjsDq9ztE+1TC10rmeqaLmpGgTXEEk07BiE='

PIC1_WG_SERVER_IP = '10.0.2.1' # cell2's WireGuard server IP
PIC0_WG_SERVER_IP = '10.0.0.1' # cell1's WireGuard server IP (cross-cell target)

# Address given to the temporary peer: it is the interface address on the test
# runner, the allowed-ips entry registered on pic1, and the source address of
# the temporary FORWARD rule.
TEST_PEER_IP = '10.0.2.250' # unused IP in cell2's VPN subnet
TEST_PEER_CIDR = f'{TEST_PEER_IP}/32'
# Stem of the wg-quick config file name; the tests also pass it to
# `wg show` and `ip link delete` as the interface name.
IFACE_NAME = 'pic-e2e-c2c'

# AllowedIPs for the test peer: cell2's local subnet + cell1's subnet (cross-cell)
SPLIT_TUNNEL_ALLOWED_IPS = '10.0.2.0/24, 10.0.0.0/24'

# Comment attached to the temporary iptables FORWARD rule on pic1; the same
# text is used when the rule is deleted again.
IPTABLES_COMMENT = 'pic-e2e-c2c-test'

# Apply the `wg` marker to every test in this module.
pytestmark = pytest.mark.wg
|
||||
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def _run(cmd, **kw):
    """Execute *cmd* and hand back its ``CompletedProcess``.

    stdout and stderr are captured and decoded as text. Any additional
    keyword arguments (``timeout``, ``input``, ...) are forwarded unchanged
    to ``subprocess.run``.
    """
    completed = subprocess.run(cmd, text=True, capture_output=True, **kw)
    return completed
|
||||
|
||||
|
||||
def _ssh(cmd, timeout=15):
    """Run a command on pic1 via SSH and return the CompletedProcess.

    Args:
        cmd: Command line handed to the remote shell as a single string.
        timeout: Overall limit in seconds, forwarded to ``subprocess.run``
            through ``_run``; exceeding it raises
            ``subprocess.TimeoutExpired``.

    Returns:
        The ``CompletedProcess`` of the local ``ssh`` invocation, with
        captured text stdout/stderr.
    """
    ssh_argv = [
        'ssh',
        # Test rig only: do not stop on an unknown host key.
        '-o', 'StrictHostKeyChecking=no',
        # Never prompt for a password/passphrase; fail instead of hanging.
        '-o', 'BatchMode=yes',
        '-o', 'ConnectTimeout=5',
        f'roof@{PIC1_LAN}',
        cmd,
    ]
    return _run(ssh_argv, timeout=timeout)
|
||||
|
||||
|
||||
def _pic1_wg(args, timeout=10):
    """Execute *args* inside the ``cell-wireguard`` container on pic1.

    The command is prefixed with ``docker exec cell-wireguard`` and sent
    through ``_ssh``; the resulting CompletedProcess is returned as-is.
    """
    return _ssh('docker exec cell-wireguard ' + args, timeout=timeout)
|
||||
|
||||
|
||||
def _ping(ip, count=3, wait=2):
    """Return True when ``ping`` to *ip* exits with status 0.

    *count* is passed as ``-c`` and *wait* as ``-W``; the subprocess itself
    is limited to ``count * wait + 5`` seconds.
    """
    ping_argv = ['ping', '-c', str(count), '-W', str(wait), ip]
    budget = count * wait + 5
    outcome = _run(ping_argv, timeout=budget)
    succeeded = outcome.returncode == 0
    return succeeded
|
||||
|
||||
|
||||
def _cleanup_iface():
    """Delete the test interface on the runner via ``sudo ip link delete``.

    The exit status is ignored, so a missing interface is not an error.
    """
    delete_link = ['sudo', 'ip', 'link', 'delete', IFACE_NAME]
    _run(delete_link, timeout=5)
|
||||
|
||||
|
||||
def _cleanup_pic1_peer(pubkey):
    """Remove the peer identified by *pubkey* from wg0 on pic1.

    The result of the remote command is not inspected.
    """
    remove_peer = f'wg set wg0 peer {pubkey} remove'
    _pic1_wg(remove_peer)
|
||||
|
||||
|
||||
def _cleanup_pic1_iptables():
    """Delete the temporary FORWARD ACCEPT rule for the test peer on pic1.

    The rule is matched by source address and by the test's iptables
    comment; the result of the remote command is not inspected.
    """
    delete_rule = (
        f'iptables -D FORWARD -s {TEST_PEER_IP} -j ACCEPT '
        f'-m comment --comment {IPTABLES_COMMENT}'
    )
    _pic1_wg(delete_rule)
|
||||
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Session-level skip check
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def _check_prerequisites():
    """Return a skip reason string, or None if all prereqs are met.

    This runs while the module is being imported, so it must never raise:
    a missing binary (``OSError``) or a probe that hangs
    (``subprocess.TimeoutExpired``) is reported as a skip reason instead of
    surfacing as a collection error.
    """
    import shutil  # local import: only needed by this probe

    # Check wg-quick (PATH lookup, no dependency on an external `which`)
    if shutil.which('wg-quick') is None:
        return 'wg-quick not found on test runner'

    # Check sudo; -n makes sudo fail instead of prompting for a password
    try:
        sudo_r = _run(['sudo', '-n', 'true'], timeout=10)
    except (OSError, subprocess.TimeoutExpired) as exc:
        return f'passwordless sudo not available on test runner: {exc}'
    if sudo_r.returncode != 0:
        return 'passwordless sudo not available on test runner'

    # Check SSH to pic1
    try:
        r = _ssh('echo ok', timeout=6)
    except (OSError, subprocess.TimeoutExpired) as exc:
        return f'SSH to {PIC1_LAN} failed: {exc}'
    if r.returncode != 0 or 'ok' not in r.stdout:
        return f'SSH to {PIC1_LAN} failed: {r.stderr.strip() or r.stdout.strip()}'

    # Whether 10.0.0.1 is unreachable without the tunnel is deliberately not
    # probed here; that is asserted by
    # TestCellToCellRouting.test_prerequisites_10_0_0_1_not_reachable_directly.
    return None
|
||||
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Module-level skip
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
# Evaluated once at import time (this runs the sudo and SSH probes); the
# wg_setup fixture calls pytest.skip() with this value when it is not None.
_SKIP_REASON = _check_prerequisites()
|
||||
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
@pytest.fixture(scope='module')
def wg_setup(tmp_path_factory):
    """
    Module-scoped fixture: adds a test peer to pic1, brings up a WireGuard
    interface on the test-runner host, yields, then tears everything down.

    Each piece of external state (remote peer, remote iptables rule, config
    file, local interface) has its cleanup registered *before* it is
    created. A failure half-way through setup therefore still removes
    whatever was already put in place, and one failing cleanup step does not
    prevent the remaining ones from running.

    Yields a dict:
        {
            'peer_ip':     '10.0.2.250',
            'allowed_ips': '10.0.2.0/24, 10.0.0.0/24',
            'privkey':     '<wg private key>',
            'pubkey':      '<wg public key>',
            'conf_path':   '<path of the generated wg-quick config>',
        }
    """
    import contextlib  # local import: only this fixture needs it

    if _SKIP_REASON:
        pytest.skip(_SKIP_REASON)

    tmp_path = tmp_path_factory.mktemp('wg_e2e_c2c')
    conf_path = str(tmp_path / f'{IFACE_NAME}.conf')

    # --- Generate a WireGuard key pair (purely local, nothing to undo) ---
    priv_r = _run(['wg', 'genkey'], timeout=5)
    assert priv_r.returncode == 0, f'wg genkey failed: {priv_r.stderr}'
    privkey = priv_r.stdout.strip()

    pub_r = _run(['wg', 'pubkey'], input=privkey, timeout=5)
    assert pub_r.returncode == 0, f'wg pubkey failed: {pub_r.stderr}'
    pubkey = pub_r.stdout.strip()

    def _unlink_conf():
        # The file may not exist if setup failed before it was written.
        with contextlib.suppress(FileNotFoundError):
            os.unlink(conf_path)

    # Callbacks run in reverse registration order when the block exits:
    # wg-quick down -> unlink config -> drop iptables rule -> drop peer.
    with contextlib.ExitStack() as cleanup:
        # --- Add peer to pic1's wg0 (live, no restart needed) ---
        cleanup.callback(_cleanup_pic1_peer, pubkey)
        r = _pic1_wg(
            f'wg set wg0 peer {pubkey} allowed-ips {TEST_PEER_CIDR} '
            f'persistent-keepalive 25'
        )
        assert r.returncode == 0, f'wg set peer failed on pic1: {r.stderr}'

        # --- Add permissive iptables rule so test traffic passes FORWARD ---
        cleanup.callback(_cleanup_pic1_iptables)
        r = _pic1_wg(
            f'iptables -I FORWARD 1 -s {TEST_PEER_IP} -j ACCEPT '
            f'-m comment --comment {IPTABLES_COMMENT}'
        )
        assert r.returncode == 0, f'iptables -I FORWARD failed on pic1: {r.stderr}'

        # --- Write wg-quick config on the test runner ---
        conf = (
            f'[Interface]\n'
            f'PrivateKey = {privkey}\n'
            f'Address = {TEST_PEER_CIDR}\n'
            f'\n'
            f'[Peer]\n'
            f'PublicKey = {PIC1_WG_PUBKEY}\n'
            f'Endpoint = {PIC1_LAN}:{PIC1_WG_PORT}\n'
            f'AllowedIPs = {SPLIT_TUNNEL_ALLOWED_IPS}\n'
            f'PersistentKeepalive = 25\n'
        )
        cleanup.callback(_unlink_conf)
        # Create the file with mode 0600 from the start so the private key is
        # never readable by others, not even briefly before a chmod.
        fd = os.open(conf_path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600)
        with os.fdopen(fd, 'w') as f:
            f.write(conf)

        # --- Bring up the WireGuard interface ---
        cleanup.callback(_run, ['sudo', 'wg-quick', 'down', conf_path], timeout=15)
        up_r = _run(['sudo', 'wg-quick', 'up', conf_path], timeout=15)
        assert up_r.returncode == 0, f'wg-quick up failed: {up_r.stderr}\n{up_r.stdout}'

        # Give WireGuard a moment to establish the handshake
        time.sleep(3)

        yield {
            'peer_ip': TEST_PEER_IP,
            'allowed_ips': SPLIT_TUNNEL_ALLOWED_IPS,
            'privkey': privkey,
            'pubkey': pubkey,
            'conf_path': conf_path,
        }
|
||||
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Tests
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
class TestCellToCellRouting:
    """
    Full end-to-end: split-tunnel peer on cell2 reaches cell1 via cell-to-cell tunnel.

    The first test deliberately does not request ``wg_setup``: with pytest's
    default definition-order execution it runs before the tunnel is brought
    up by the first test that does request the fixture.
    """

    def test_prerequisites_10_0_0_1_not_reachable_directly(self):
        """Confirm 10.0.0.1 is NOT reachable from host without VPN (test validity check)."""
        assert not _ping(PIC0_WG_SERVER_IP, count=1, wait=1), (
            f'{PIC0_WG_SERVER_IP} is reachable WITHOUT the VPN — the test would be '
            f'a false positive. The test is only meaningful when this IP is unreachable '
            f'without the tunnel.'
        )

    def test_cell2_wg_ip_reachable(self, wg_setup):
        """Cell2's WireGuard server IP is reachable (basic tunnel sanity)."""
        assert _ping(PIC1_WG_SERVER_IP), (
            f'Cell2 WG server IP {PIC1_WG_SERVER_IP} not reachable. '
            f'Handshake may not have established. '
            f'Peer allowed-ips: {wg_setup["allowed_ips"]}'
        )

    def test_handshake_established(self, wg_setup):
        """A WireGuard handshake with pic1 has completed (within 30 s)."""
        # monotonic(): the deadline must not move if the wall clock is adjusted.
        deadline = time.monotonic() + 30
        shown = ''
        while time.monotonic() < deadline:
            shown = _run(['sudo', 'wg', 'show', IFACE_NAME], timeout=5).stdout
            if 'latest handshake' in shown:
                return
            time.sleep(2)
        pytest.fail(
            f'No WireGuard handshake with pic1 after 30 s.\n'
            f'wg show output:\n{shown}'
        )

    def test_cross_cell_wg_ip_reachable(self, wg_setup):
        """
        Cell1's WireGuard IP (10.0.0.1) is reachable from a peer connected to cell2.

        This is the critical cross-cell routing test. The full path is:
        test-runner → pic-e2e-c2c → pic1 cell-wireguard FORWARD → cell-to-cell tunnel → pic0 10.0.0.1
        """
        assert _ping(PIC0_WG_SERVER_IP, count=3, wait=3), (
            f'Cell1 WG IP {PIC0_WG_SERVER_IP} NOT reachable from split-tunnel peer on cell2. '
            f'\nAllowed IPs: {wg_setup["allowed_ips"]}'
            f'\nThis means the cell-to-cell routing is broken. Check:'
            f'\n  1. pic1 FORWARD chain has ESTABLISHED,RELATED ACCEPT'
            f'\n  2. pic1 wg0.conf has AllowedIPs=10.0.0.0/24 for the dev cell peer'
            f'\n  3. Cell-to-cell WireGuard handshake is recent (wg show on pic1)'
        )

    def test_cross_cell_api_reachable(self, wg_setup):
        """Cell1's API /health is reachable through the cell-to-cell tunnel."""
        import json
        import urllib.error
        import urllib.request

        url = f'http://{PIC0_WG_SERVER_IP}:3000/health'
        try:
            with urllib.request.urlopen(url, timeout=8) as resp:
                raw = resp.read()
        except urllib.error.HTTPError as e:
            # A response came back, so the tunnel works; the API itself is unhappy.
            pytest.fail(
                f'Cell1 API at {url} is reachable but answered HTTP {e.code}.'
            )
        except OSError as e:
            # URLError, connection resets and read timeouts are all OSError.
            pytest.fail(
                f'Cell1 API at {url} not reachable via cell-to-cell tunnel: {e}. '
                f'\nNote: if test_cross_cell_wg_ip_reachable passed but this fails, '
                f'the tunnel is up but port 3000 may be blocked by cell1\'s firewall.'
            )

        try:
            body = json.loads(raw)
        except ValueError as e:
            pytest.fail(f'Cell1 API at {url} returned a non-JSON body: {e}')
        assert body.get('status') == 'healthy', (
            f'Cell1 API returned unexpected health: {body}'
        )

    def test_cross_cell_web_reachable(self, wg_setup):
        """Cell1's web service (port 80 via Caddy) is reachable through the tunnel."""
        import urllib.error
        import urllib.request

        class _NoFollow(urllib.request.HTTPRedirectHandler):
            """Leave redirects unfollowed so a 3xx surfaces as HTTPError."""

            def redirect_request(self, req, fp, code, msg, headers, newurl):
                return None

        # Port 80 goes to Caddy → services. We expect any HTTP response (even a
        # redirect). Redirects must not be followed: the target may be
        # unreachable from the runner, and a 3xx already proves the tunnel
        # carried HTTP traffic.
        opener = urllib.request.build_opener(_NoFollow)
        url = f'http://{PIC0_WG_SERVER_IP}/'
        try:
            with opener.open(url, timeout=8) as resp:
                assert resp.status in (200, 301, 302, 307, 308), (
                    f'Unexpected HTTP status from cell1 Caddy: {resp.status}'
                )
        except urllib.error.HTTPError as e:
            # HTTPError means we got a response — tunnel works even if it's a 3xx/4xx
            assert e.code < 500, (
                f'Cell1 Caddy returned server error {e.code} — may indicate a Caddy issue'
            )
        except OSError as e:
            # URLError, connection resets and read timeouts are all OSError.
            pytest.fail(
                f'Cell1 web (Caddy) at {url} not reachable via tunnel: {e}'
            )
|
||||
Reference in New Issue
Block a user