fix: complete cross-cell peer-sync push (domain SNI + source-preserving NAT)
Unit Tests / test (push) Successful in 9m45s

Finishes the transport repair (L1+L2 landed in 714fb9b). The push now works
end-to-end between linked cells — verified live: offer/permission state
propagates automatically and the cell_relay derives/reverts without manual steps.

L3 — push by domain, not bare IP (cell_link_manager): the push targeted
https://<vpn-ip>, but in DDNS/ACME mode Caddy only holds a cert for the cell's
domain, so the TLS handshake failed by IP. Target https://<remote-domain> with
`curl --resolve <domain>:443:<dns_ip>` — connect to the VPN IP over the tunnel
but present the domain as SNI/Host. remote_api_url is now domain-based; legacy
http://ip:3000 and https://ip URLs migrate on load.

L4 — preserve the real source for auth (firewall_manager): the blanket
`-o eth0 MASQUERADE` rewrote the push source, so the remote's X-Forwarded-For
source-subnet auth couldn't match. apply_cell_rules adds a tightly-scoped nat
POSTROUTING RETURN (linked-subnet → caddy:443 only) above the masquerade; the
host route returns Caddy's reply through the tunnel. Reviewed by pic-security:
WireGuard per-cell AllowedIPs + Caddy last-XFF (no trusted_proxies) keep this
un-spoofable; the API stays 127.0.0.1-only.

Also:
- validate remote-invite domain/dns_ip/endpoint/subnet at ingest (they reach a
  curl --resolve argv — block leading-dash argument-injection).
- remove the host subnet route on cell unlink (remove_cell_subnet_route); the
  route was never cleaned, leaving a stale subnet that made is_local_request
  treat it as local. Mock firewall side-effects in the affected unit tests.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-06-17 01:02:20 -04:00
parent 714fb9b1a9
commit 639fb66e5b
4 changed files with 250 additions and 46 deletions
+99 -23
View File
@@ -14,9 +14,39 @@ import json
import shutil
from unittest.mock import MagicMock, patch
import cell_link_manager
from cell_link_manager import CellLinkManager
_fw_patch = None
def setUpModule():
"""Stop cell-link unit tests from running real firewall side-effects.
add_connection/remove_connection call into firewall_manager, which shells out
to `docker exec cell-wireguard iptables` and `docker run` host-route changes.
On the dev/CI host those mutate live routes — a stale cell-subnet route once
made is_local_request treat a VPN subnet as local and broke the full suite.
Tests that assert specific firewall calls use their own local patch, which
takes precedence within its context.
"""
global _fw_patch
_fw_patch = patch.multiple(
'firewall_manager',
apply_cell_rules=MagicMock(return_value=True),
clear_cell_rules=MagicMock(),
ensure_cell_subnet_routes=MagicMock(),
remove_cell_subnet_route=MagicMock(),
)
_fw_patch.start()
def tearDownModule():
if _fw_patch is not None:
_fw_patch.stop()
def _make_wg_mock():
wg = MagicMock()
wg.get_keys.return_value = {'public_key': 'serverpubkey=', 'private_key': 'serverprivkey='}
@@ -50,6 +80,37 @@ SAMPLE_INVITE = {
}
class TestInviteFieldValidation(unittest.TestCase):
"""_validate_invite_fields rejects malformed remote-invite fields.
The domain/dns_ip flow into a `curl --resolve` argv on peer-sync push, so a
leading-dash domain (argument injection) and non-IP dns_ip must be rejected.
"""
def test_valid_invite_passes(self):
cell_link_manager._validate_invite_fields(SAMPLE_INVITE) # no raise
def test_rejects_leading_dash_domain(self):
bad = {**SAMPLE_INVITE, 'domain': '-oProxyCommand=evil'}
with self.assertRaises(ValueError):
cell_link_manager._validate_invite_fields(bad)
def test_rejects_non_ip_dns_ip(self):
bad = {**SAMPLE_INVITE, 'dns_ip': '-x'}
with self.assertRaises(ValueError):
cell_link_manager._validate_invite_fields(bad)
def test_rejects_bad_subnet(self):
bad = {**SAMPLE_INVITE, 'vpn_subnet': 'not-a-cidr'}
with self.assertRaises(ValueError):
cell_link_manager._validate_invite_fields(bad)
def test_rejects_bad_endpoint(self):
bad = {**SAMPLE_INVITE, 'endpoint': '-evil:51820'}
with self.assertRaises(ValueError):
cell_link_manager._validate_invite_fields(bad)
class TestCellLinkManagerInvite(unittest.TestCase):
def setUp(self):
@@ -146,6 +207,15 @@ class TestCellLinkManagerConnections(unittest.TestCase):
self.mgr.remove_connection('office')
self.nm.remove_cell_dns_forward.assert_called_once_with('office.cell')
def test_remove_connection_removes_host_subnet_route(self):
"""Unlinking a cell removes its host route so the subnet isn't left
blackholed / treated as locally attached."""
import firewall_manager as _fm
self.mgr.add_connection(SAMPLE_INVITE)
_fm.remove_cell_subnet_route.reset_mock()
self.mgr.remove_connection('office')
_fm.remove_cell_subnet_route.assert_called_once_with('10.1.0.0/24')
def test_remove_connection_deletes_from_list(self):
self.mgr.add_connection(SAMPLE_INVITE)
self.mgr.remove_connection('office')
@@ -190,7 +260,9 @@ class TestCellLinkManagerConnections(unittest.TestCase):
result = self.mgr.accept_invite(updated_invite)
self.assertEqual(result['dns_ip'], '10.1.0.2')
self.assertEqual(result['remote_api_url'], 'https://10.1.0.2')
# remote_api_url is domain-based (the push --resolves it to the VPN IP),
# so a dns_ip change does not alter it.
self.assertEqual(result['remote_api_url'], 'https://office.cell')
self.nm.remove_cell_dns_forward.assert_called()
self.nm.add_cell_dns_forward.assert_called_with(
domain='office.cell', dns_ip='10.1.0.2')
@@ -615,7 +687,7 @@ class TestAcceptInviteNew(unittest.TestCase):
with patch('firewall_manager.apply_cell_rules'):
result = self.mgr.accept_invite(updated)
self.assertEqual(result['dns_ip'], '10.1.0.5')
self.assertEqual(result['remote_api_url'], 'https://10.1.0.5')
self.assertEqual(result['remote_api_url'], 'https://office.cell')
self.nm.remove_cell_dns_forward.assert_called()
self.nm.add_cell_dns_forward.assert_called_with(
domain='office.cell', dns_ip='10.1.0.5')
@@ -1055,10 +1127,11 @@ class TestPermissionSync(unittest.TestCase):
self.assertIn('last_push_at', link)
self.assertIn('last_remote_update_at', link)
def test_add_connection_sets_remote_api_url_from_dns_ip(self):
def test_add_connection_sets_remote_api_url_from_domain(self):
link = self._add_office()
# Cross-cell API is reached over the tunnel via Caddy/443, not :3000.
self.assertEqual(link['remote_api_url'], 'https://10.1.0.1')
# Cross-cell API is reached via the remote's domain over Caddy/443 (the
# push --resolves the domain to the VPN IP over the tunnel).
self.assertEqual(link['remote_api_url'], 'https://office.cell')
def test_add_connection_triggers_push(self):
push_mock = MagicMock(return_value={'ok': True, 'error': None})
@@ -1332,7 +1405,7 @@ class TestPermissionSync(unittest.TestCase):
self.assertIn('last_push_status', link)
self.assertIn('last_push_at', link)
self.assertIn('last_remote_update_at', link)
self.assertEqual(link['remote_api_url'], 'https://10.1.0.1')
self.assertEqual(link['remote_api_url'], 'https://office.cell')
self.assertTrue(link['pending_push']) # pre-existing → marked pending
self.assertEqual(link['last_push_status'], 'never')
@@ -1341,23 +1414,26 @@ class TestPermissionSync(unittest.TestCase):
raw = json.load(f)
self.assertIn('pending_push', raw[0])
def test_load_migrates_legacy_http_3000_url_to_https(self):
"""An existing link with the old http://<ip>:3000 URL (unreachable across
cells) is rewritten to the HTTPS/Caddy form on load."""
legacy = [{
'cell_name': 'office',
'public_key': 'officepubkey=',
'vpn_subnet': '10.1.0.0/24',
'dns_ip': '10.1.0.9',
'domain': 'office.cell',
'permissions': {'inbound': {}, 'outbound': {}},
'remote_api_url': 'http://10.1.0.9:3000',
}]
links_file = os.path.join(self.test_dir, 'cell_links.json')
with open(links_file, 'w') as f:
json.dump(legacy, f)
link = self.mgr.list_connections()[0]
self.assertEqual(link['remote_api_url'], 'https://10.1.0.9')
def test_load_migrates_legacy_url_forms_to_https_domain(self):
"""Legacy remote_api_url forms — http://<ip>:3000 (unreachable) and
https://<ip> (no matching Caddy cert by bare IP) — are rewritten on load to
the domain-based HTTPS form."""
for legacy_url in ('http://10.1.0.9:3000', 'https://10.1.0.9'):
legacy = [{
'cell_name': 'office',
'public_key': 'officepubkey=',
'vpn_subnet': '10.1.0.0/24',
'dns_ip': '10.1.0.9',
'domain': 'office.cell',
'permissions': {'inbound': {}, 'outbound': {}},
'remote_api_url': legacy_url,
}]
links_file = os.path.join(self.test_dir, 'cell_links.json')
with open(links_file, 'w') as f:
json.dump(legacy, f)
link = self.mgr.list_connections()[0]
self.assertEqual(link['remote_api_url'], 'https://office.cell',
f'failed to migrate {legacy_url!r}')
class TestExitOffer(unittest.TestCase):
+42
View File
@@ -652,6 +652,7 @@ class TestCellRules(unittest.TestCase):
patch.object(firewall_manager, '_get_cell_api_ip', return_value=self._FAKE_API_IP), \
patch.object(firewall_manager, '_get_caddy_container_ip', return_value=self._FAKE_CADDY_IP), \
patch.object(firewall_manager, '_get_dns_container_ip', return_value=self._FAKE_DNS_IP), \
patch.object(firewall_manager, 'ensure_cell_subnet_routes', return_value=None), \
patch.object(firewall_manager, 'ensure_forward_stateful', return_value=True):
firewall_manager.apply_cell_rules(cell_name, vpn_subnet, inbound_services)
@@ -742,6 +743,20 @@ class TestCellRules(unittest.TestCase):
]
self.assertFalse(api_3000, 'Peer-sync must not target cell-api:3000')
def test_apply_cell_rules_excludes_peer_sync_from_masquerade(self):
"""Peer-sync to Caddy:443 must RETURN in nat POSTROUTING (skip the blanket
MASQUERADE) so the remote sees the linked cell's real VPN source for auth."""
calls = self._capture_apply('office', '10.0.1.0/24', [])
returns = [
c for c in calls
if '-t' in c and 'nat' in c and 'POSTROUTING' in c
and '-s' in c and '10.0.1.0/24' in c
and '-d' in c and self._FAKE_CADDY_IP in c
and '--dport' in c and '443' in c
and '-j' in c and c[c.index('-j') + 1] == 'RETURN'
]
self.assertTrue(returns, 'Expected nat POSTROUTING RETURN to preserve peer-sync source')
def test_apply_cell_rules_api_sync_accept_before_catchall_drop(self):
"""The API-sync ACCEPT must be inserted after service rules so it ends up above DROP."""
insertion_order = []
@@ -756,6 +771,7 @@ class TestCellRules(unittest.TestCase):
patch.object(firewall_manager, '_get_cell_api_ip', return_value='172.20.0.10'), \
patch.object(firewall_manager, '_get_caddy_container_ip', return_value='172.20.0.2'), \
patch.object(firewall_manager, '_get_dns_container_ip', return_value='172.20.0.3'), \
patch.object(firewall_manager, 'ensure_cell_subnet_routes', return_value=None), \
patch.object(firewall_manager, 'ensure_forward_stateful', return_value=True):
firewall_manager.apply_cell_rules('office', '10.0.1.0/24', [])
@@ -852,6 +868,31 @@ class TestCellRules(unittest.TestCase):
# peer rule for a different entity must survive
self.assertIn('pic-peer-10-0-0-2/32', content)
# ── remove_cell_subnet_route ──────────────────────────────────────────────
def test_remove_cell_subnet_route_issues_ip_route_del(self):
"""remove_cell_subnet_route deletes the host route for the cell's subnet."""
captured = {}
def fake_run(cmd, **kw):
captured['cmd'] = cmd
return MagicMock(returncode=0, stdout='', stderr='')
with patch.object(firewall_manager, '_run', side_effect=fake_run):
firewall_manager.remove_cell_subnet_route('10.1.0.0/24')
cmd = captured.get('cmd', [])
self.assertIn('ip', cmd)
self.assertIn('route', cmd)
self.assertIn('del', cmd)
self.assertIn('10.1.0.0/24', cmd)
self.assertIn('172.20.0.9', cmd)
def test_remove_cell_subnet_route_noop_on_empty(self):
"""An empty subnet is a no-op (no docker call)."""
with patch.object(firewall_manager, '_run') as run:
firewall_manager.remove_cell_subnet_route('')
run.assert_not_called()
# ── apply_all_cell_rules ──────────────────────────────────────────────────
def test_apply_all_cell_rules_calls_apply_for_each(self):
@@ -1130,6 +1171,7 @@ class TestEnsureForwardStateful(unittest.TestCase):
patch.object(firewall_manager, '_get_caddy_container_ip', return_value='172.20.0.2'), \
patch.object(firewall_manager, '_get_dns_container_ip', return_value='172.20.0.3'), \
patch.object(firewall_manager, '_get_cell_api_ip', return_value='172.20.0.10'), \
patch.object(firewall_manager, 'ensure_cell_subnet_routes', return_value=None), \
patch.object(firewall_manager, 'ensure_forward_stateful') as mock_stateful:
firewall_manager.apply_cell_rules('testcell', '10.0.0.0/24', [])
mock_stateful.assert_called_once()