Fix cross-cell ICMP routing: state-based cell DROP + e2e test

The cell catch-all DROP rule blocked all traffic from a connected cell's
subnet, including ESTABLISHED/RELATED packets (ICMP replies, TCP ACKs) for
connections initiated by local VPN peers. This broke ping to the remote
cell's WireGuard IP even when the cell-to-cell tunnel was healthy.

Change the DROP to match only NEW,INVALID connections so established reply
traffic passes through to the stateful ACCEPT rule.

Also adds tests/e2e/wg/test_cell_to_cell_routing.py — an end-to-end test
that brings up a real WireGuard tunnel from the test runner to pic1 and
verifies full cross-cell routing including ICMP ping, API /health, and Caddy.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-05 10:59:11 -04:00
parent 5a4e292440
commit 1e1bda4679
2 changed files with 308 additions and 1 deletions
+303
View File
@@ -0,0 +1,303 @@
"""
E2E test: cross-cell routing for a split-tunnel VPN peer.
Creates a temporary WireGuard peer on cell2 (pic1 / test), brings up a real
WireGuard tunnel from the test-runner host, and verifies that cell1 (pic0 / dev)
is reachable end-to-end via the cell-to-cell link.
Why this test is meaningful
---------------------------
10.0.0.1 is cell1's WireGuard server IP, reachable ONLY inside cell1's
cell-wireguard Docker container. It is NOT reachable directly from the
test-runner host (verified: 100% packet loss without VPN).
If a ping to 10.0.0.1 succeeds during the test, the full path was taken:
[test-runner pic-e2e-c2c] → 192.168.31.52:51821 → [pic1 cell-wireguard FORWARD]
→ [cell-to-cell WG tunnel] → [pic0 cell-wireguard] → 10.0.0.1
Prerequisites
-------------
* SSH access to 192.168.31.52 (pic1) as 'roof' with no passphrase
* `wg-quick` and `sudo` available on the test runner (pic0)
* Both cells must have an active cell-to-cell WireGuard handshake
Skip conditions are checked at fixture time; no manual flag needed.
"""
import os
import subprocess
import secrets
import time
import pytest
# -------------------------------------------------------------------------
# Constants
# -------------------------------------------------------------------------
PIC1_LAN = '192.168.31.52' # test cell (cell2)
PIC1_WG_PORT = 51821 # WireGuard ListenPort on pic1
# Public key written as the [Peer] PublicKey in the test runner's config,
# i.e. the key the runner expects pic1's WireGuard endpoint to present.
PIC1_WG_PUBKEY = 'ITl3+KfcNjsDq9ztE+1TC10rmeqaLmpGgTXEEk07BiE='
PIC1_WG_SERVER_IP = '10.0.2.1' # cell2's WireGuard server IP
PIC0_WG_SERVER_IP = '10.0.0.1' # cell1's WireGuard server IP (cross-cell target)
TEST_PEER_IP = '10.0.2.250' # unused IP in cell2's VPN subnet
TEST_PEER_CIDR = f'{TEST_PEER_IP}/32'
# Name of the temporary WireGuard interface on the test runner; it is also
# used as the stem of the generated wg-quick config filename.
IFACE_NAME = 'pic-e2e-c2c'
# AllowedIPs for the test peer: cell2's local subnet + cell1's subnet (cross-cell)
SPLIT_TUNNEL_ALLOWED_IPS = '10.0.2.0/24, 10.0.0.0/24'
# Comment tag attached to the temporary FORWARD rule on pic1; the cleanup
# helper passes the same tag so the delete matches that exact rule.
IPTABLES_COMMENT = 'pic-e2e-c2c-test'
# Apply the `wg` marker to every test in this module.
pytestmark = pytest.mark.wg
# -------------------------------------------------------------------------
# Helpers
# -------------------------------------------------------------------------
def _run(cmd, **kw):
    """Execute *cmd* and return its CompletedProcess.

    stdout and stderr are captured and decoded as text. Any extra keyword
    arguments (for example ``timeout`` or ``input``) are forwarded to
    ``subprocess.run`` unchanged.
    """
    completed = subprocess.run(cmd, capture_output=True, text=True, **kw)
    return completed
def _ssh(cmd, timeout=15):
    """Run a command on pic1 via SSH and return the CompletedProcess.

    *cmd* is passed to ssh as a single remote command string. *timeout* is
    the overall subprocess timeout in seconds; the SSH connection attempt
    itself is limited separately by ``ConnectTimeout=5``.
    """
    ssh_options = [
        '-o', 'StrictHostKeyChecking=no',
        '-o', 'BatchMode=yes',
        '-o', 'ConnectTimeout=5',
    ]
    target = f'roof@{PIC1_LAN}'
    return _run(['ssh', *ssh_options, target, cmd], timeout=timeout)
def _pic1_wg(args, timeout=10):
    """Run *args* inside pic1's cell-wireguard container via SSH.

    *args* is the command line appended after ``docker exec cell-wireguard``.
    Returns the CompletedProcess produced by the SSH call.
    """
    return _ssh('docker exec cell-wireguard ' + args, timeout=timeout)
def _ping(ip, count=3, wait=2):
    """Return True when ``ping`` to *ip* exits with status 0.

    *count* is passed as ``-c`` and *wait* as ``-W``.
    """
    ping_cmd = ['ping', '-c', str(count), '-W', str(wait), ip]
    # Overall budget: worst case of every probe waiting out, plus 5 s slack.
    budget = count * wait + 5
    outcome = _run(ping_cmd, timeout=budget)
    return outcome.returncode == 0
def _cleanup_iface():
    """Delete the test WireGuard interface on the test runner.

    The command's result is ignored.
    """
    delete_cmd = ['sudo', 'ip', 'link', 'delete', IFACE_NAME]
    _run(delete_cmd, timeout=5)
def _cleanup_pic1_peer(pubkey):
    """Remove the peer identified by *pubkey* from pic1's wg0 (result ignored)."""
    removal = f'wg set wg0 peer {pubkey} remove'
    _pic1_wg(removal)
def _cleanup_pic1_iptables():
    """Delete the temporary FORWARD ACCEPT rule for the test peer on pic1.

    The rule spec mirrors the one used when the rule is inserted (same
    source IP, target and comment tag). The command's result is ignored.
    """
    rule_spec = (
        f'-s {TEST_PEER_IP} -j ACCEPT '
        f'-m comment --comment {IPTABLES_COMMENT}'
    )
    _pic1_wg(f'iptables -D FORWARD {rule_spec}')
# -------------------------------------------------------------------------
# Session-level skip check
# -------------------------------------------------------------------------
def _check_prerequisites():
    """Return a skip reason string, or None if all prereqs are met.

    Checks, in order:
      1. ``wg-quick`` and ``wg`` are on the test runner's PATH.
      2. ``sudo -n true`` succeeds (passwordless sudo).
      3. ``echo ok`` can be run on pic1 over SSH.

    This function is called at module import time, so it must never raise:
    a missing binary or a hung command is converted into a skip reason
    instead of breaking test collection.

    Direct (non-VPN) reachability of cell1's WireGuard IP is deliberately not
    checked here; ``test_prerequisites_10_0_0_1_not_reachable_directly``
    covers it as a regular test.
    """
    import shutil  # local import: only this check needs it

    # Check the WireGuard tools used by the fixture
    for tool in ('wg-quick', 'wg'):
        if shutil.which(tool) is None:
            return f'{tool} not found on test runner'

    # Check sudo (-n: fail instead of prompting for a password)
    try:
        sudo_ok = _run(['sudo', '-n', 'true'], timeout=10).returncode == 0
    except (OSError, subprocess.SubprocessError):
        # sudo missing or hung — treat the same as "not available"
        sudo_ok = False
    if not sudo_ok:
        return 'passwordless sudo not available on test runner'

    # Check SSH to pic1
    try:
        r = _ssh('echo ok', timeout=6)
    except (OSError, subprocess.SubprocessError) as exc:
        # ssh binary missing, or the call exceeded its overall timeout
        return f'SSH to {PIC1_LAN} failed: {exc}'
    if r.returncode != 0 or 'ok' not in r.stdout:
        return f'SSH to {PIC1_LAN} failed: {r.stderr.strip() or r.stdout.strip()}'

    return None
# -------------------------------------------------------------------------
# Module-level skip
# -------------------------------------------------------------------------
# Evaluated once, when this module is imported. The wg_setup fixture calls
# pytest.skip() with this value when it is truthy.
_SKIP_REASON = _check_prerequisites()
# -------------------------------------------------------------------------
# Fixtures
# -------------------------------------------------------------------------
@pytest.fixture(scope='module')
def wg_setup(tmp_path_factory):
    """
    Module-scoped fixture: adds test peer to pic1, brings up wg interface on
    the test-runner host, yields, then tears everything down.

    Skips (via ``pytest.skip``) when the module-level prerequisite check
    produced a skip reason.

    Teardown runs in a ``finally`` block, so the temporary peer and the
    permissive FORWARD rule on pic1 are removed even when a later setup step
    fails. Each undo step is best-effort: a failure in one does not prevent
    the remaining ones from running.

    Yields a dict:
        {
            'peer_ip':     '10.0.2.250',
            'allowed_ips': '10.0.2.0/24, 10.0.0.0/24',
            'privkey':     '<wg private key>',
            'pubkey':      '<wg public key>',
            'conf_path':   '<path to the generated wg-quick config>',
        }
    """
    if _SKIP_REASON:
        pytest.skip(_SKIP_REASON)

    tmp_path = tmp_path_factory.mktemp('wg_e2e_c2c')
    conf_path = str(tmp_path / f'{IFACE_NAME}.conf')

    # --- Generate a WireGuard key pair (local only, nothing to undo) ---
    priv_r = _run(['wg', 'genkey'], timeout=5)
    assert priv_r.returncode == 0, f'wg genkey failed: {priv_r.stderr}'
    privkey = priv_r.stdout.strip()
    pub_r = _run(['wg', 'pubkey'], input=privkey, timeout=5)
    assert pub_r.returncode == 0, f'wg pubkey failed: {pub_r.stderr}'
    pubkey = pub_r.stdout.strip()

    def _remove_conf():
        os.unlink(conf_path)

    def _wg_quick_down():
        _run(['sudo', 'wg-quick', 'down', conf_path], timeout=15)

    # Undo actions, executed in reverse order in the finally block. Each one
    # is registered *before* the step it undoes, so a step that fails or
    # times out half-way is still cleaned up. Undoing something that was
    # never applied only makes the underlying command fail, which is ignored.
    undo_stack = []
    try:
        # --- Add peer to pic1's wg0 (live, no restart needed) ---
        undo_stack.append(lambda: _cleanup_pic1_peer(pubkey))
        r = _pic1_wg(
            f'wg set wg0 peer {pubkey} allowed-ips {TEST_PEER_CIDR} '
            f'persistent-keepalive 25'
        )
        assert r.returncode == 0, f'wg set peer failed on pic1: {r.stderr}'

        # --- Add permissive iptables rule so test traffic passes FORWARD ---
        undo_stack.append(_cleanup_pic1_iptables)
        r = _pic1_wg(
            f'iptables -I FORWARD 1 -s {TEST_PEER_IP} -j ACCEPT '
            f'-m comment --comment {IPTABLES_COMMENT}'
        )
        assert r.returncode == 0, f'iptables -I FORWARD failed on pic1: {r.stderr}'

        # --- Write wg-quick config on the test runner ---
        conf = (
            f'[Interface]\n'
            f'PrivateKey = {privkey}\n'
            f'Address = {TEST_PEER_CIDR}\n'
            f'\n'
            f'[Peer]\n'
            f'PublicKey = {PIC1_WG_PUBKEY}\n'
            f'Endpoint = {PIC1_LAN}:{PIC1_WG_PORT}\n'
            f'AllowedIPs = {SPLIT_TUNNEL_ALLOWED_IPS}\n'
            f'PersistentKeepalive = 25\n'
        )
        undo_stack.append(_remove_conf)
        # Create the file with owner-only permissions from the start so the
        # private key is never readable by others, not even briefly.
        fd = os.open(conf_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, 'w') as f:
            f.write(conf)
        os.chmod(conf_path, 0o600)

        # --- Bring up the WireGuard interface ---
        undo_stack.append(_wg_quick_down)
        up_r = _run(['sudo', 'wg-quick', 'up', conf_path], timeout=15)
        assert up_r.returncode == 0, f'wg-quick up failed: {up_r.stderr}\n{up_r.stdout}'

        # Give WireGuard a moment to establish the handshake
        time.sleep(3)

        yield {
            'peer_ip': TEST_PEER_IP,
            'allowed_ips': SPLIT_TUNNEL_ALLOWED_IPS,
            'privkey': privkey,
            'pubkey': pubkey,
            'conf_path': conf_path,
        }
    finally:
        # --- Teardown: wg-quick down, remove config, iptables rule, peer ---
        for undo in reversed(undo_stack):
            try:
                undo()
            except (OSError, subprocess.SubprocessError) as exc:
                # Best-effort: report and keep going so later steps still run.
                print(f'wg_setup teardown step failed (ignored): {exc!r}')
# -------------------------------------------------------------------------
# Tests
# -------------------------------------------------------------------------
class TestCellToCellRouting:
    """
    Full end-to-end: split-tunnel peer on cell2 reaches cell1 via cell-to-cell tunnel.

    All tests except the direct-reachability check request the module-scoped
    ``wg_setup`` fixture, which provides the live tunnel. The
    direct-reachability check does not request it and is defined first, so
    that it is meant to run before the tunnel is brought up.
    """

    def test_prerequisites_10_0_0_1_not_reachable_directly(self):
        """Confirm 10.0.0.1 is NOT reachable from host without VPN (test validity check)."""
        assert not _ping(PIC0_WG_SERVER_IP, count=1, wait=1), (
            f'{PIC0_WG_SERVER_IP} is reachable WITHOUT the VPN — the test would be '
            f'a false positive. The test is only meaningful when this IP is unreachable '
            f'without the tunnel.'
        )

    def test_cell2_wg_ip_reachable(self, wg_setup):
        """Cell2's WireGuard server IP is reachable (basic tunnel sanity)."""
        assert _ping(PIC1_WG_SERVER_IP), (
            f'Cell2 WG server IP {PIC1_WG_SERVER_IP} not reachable. '
            f'Handshake may not have established. '
            f'Peer allowed-ips: {wg_setup["allowed_ips"]}'
        )

    def test_handshake_established(self, wg_setup):
        """A WireGuard handshake with pic1 has completed (within 30 s)."""
        # Monotonic clock: the deadline must not move if the wall clock is adjusted.
        deadline = time.monotonic() + 30
        last_output = ''
        while time.monotonic() < deadline:
            r = _run(['sudo', 'wg', 'show', IFACE_NAME], timeout=5)
            last_output = r.stdout
            if 'latest handshake' in last_output:
                return
            time.sleep(2)
        pytest.fail(
            f'No WireGuard handshake with pic1 after 30 s.\n'
            f'wg show output:\n{last_output}'
        )

    def test_cross_cell_wg_ip_reachable(self, wg_setup):
        """
        Cell1's WireGuard IP (10.0.0.1) is reachable from a peer connected to cell2.

        This is the critical cross-cell routing test. The full path is:
        test-runner → pic-e2e-c2c → pic1 cell-wireguard FORWARD
        → cell-to-cell tunnel → pic0 10.0.0.1
        """
        assert _ping(PIC0_WG_SERVER_IP, count=3, wait=3), (
            f'Cell1 WG IP {PIC0_WG_SERVER_IP} NOT reachable from split-tunnel peer on cell2. '
            f'\nAllowed IPs: {wg_setup["allowed_ips"]}'
            f'\nThis means the cell-to-cell routing is broken. Check:'
            f'\n 1. pic1 FORWARD chain has ESTABLISHED,RELATED ACCEPT'
            f'\n 2. pic1 wg0.conf has AllowedIPs=10.0.0.0/24 for the dev cell peer'
            f'\n 3. Cell-to-cell WireGuard handshake is recent (wg show on pic1)'
        )

    def test_cross_cell_api_reachable(self, wg_setup):
        """Cell1's API /health is reachable through the cell-to-cell tunnel."""
        import json
        import urllib.error
        import urllib.request

        url = f'http://{PIC0_WG_SERVER_IP}:3000/health'
        try:
            with urllib.request.urlopen(url, timeout=8) as resp:
                raw = resp.read()
        except urllib.error.HTTPError as e:
            # HTTPError is a URLError subclass: the server *was* reached but
            # answered with an error status, so report that instead of
            # claiming the host is unreachable.
            pytest.fail(
                f'Cell1 API at {url} is reachable but returned HTTP {e.code}: {e.reason}'
            )
        except OSError as e:
            # URLError is an OSError subclass; catching OSError also covers
            # socket timeouts raised while reading the response body.
            pytest.fail(
                f'Cell1 API at {url} not reachable via cell-to-cell tunnel: {e}. '
                f'\nNote: if test_cross_cell_wg_ip_reachable passed but this fails, '
                f'the tunnel is up but port 3000 may be blocked by cell1\'s firewall.'
            )

        try:
            body = json.loads(raw)
        except ValueError as e:
            pytest.fail(f'Cell1 API at {url} returned a non-JSON body: {e}')

        assert isinstance(body, dict) and body.get('status') == 'healthy', (
            f'Cell1 API returned unexpected health: {body}'
        )

    def test_cross_cell_web_reachable(self, wg_setup):
        """Cell1's web service (port 80 via Caddy) is reachable through the tunnel."""
        import urllib.error
        import urllib.request

        # Port 80 goes to Caddy → services. We expect any HTTP response (even a redirect).
        url = f'http://{PIC0_WG_SERVER_IP}/'
        try:
            with urllib.request.urlopen(url, timeout=8) as resp:
                assert resp.status in (200, 301, 302, 307, 308), (
                    f'Unexpected HTTP status from cell1 Caddy: {resp.status}'
                )
        except urllib.error.HTTPError as e:
            # HTTPError means we got a response — tunnel works even if it's a 4xx/5xx
            assert e.code < 500, (
                f'Cell1 Caddy returned server error {e.code} — may indicate a Caddy issue'
            )
        except OSError as e:
            # Covers URLError (an OSError subclass) as well as read timeouts.
            pytest.fail(
                f'Cell1 web (Caddy) at {url} not reachable via tunnel: {e}'
            )