Fix Phase 1 permission sync: route push via cell-wireguard + DNAT receive

cell-api has no route to remote WG tunnel IPs — only cell-wireguard does.
Fix _push_permissions_to_remote() to use 'docker exec cell-wireguard curl'
so outbound sync HTTP traverses the WG tunnel from the right namespace.

On the receive side, add ensure_cell_api_dnat() which installs three
iptables rules inside cell-wireguard on startup:
  - PREROUTING DNAT: wg0:3000 → cell-api:3000 (Docker bridge IP)
  - POSTROUTING MASQUERADE: so cell-api's reply routes back via wg0
  - FORWARD ACCEPT: allow the wg0→eth0 forwarded traffic

Called from _apply_startup_enforcement() so rules survive container restarts.
Tests updated to mock subprocess.run instead of urllib.request.urlopen.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-01 13:48:49 -04:00
parent a3d0cd5a48
commit 4ba79fd614
5 changed files with 149 additions and 28 deletions
+1
View File
@@ -264,6 +264,7 @@ def _apply_startup_enforcement():
cell_links = cell_link_manager.list_connections()
firewall_manager.apply_all_peer_rules(peers)
firewall_manager.apply_all_cell_rules(cell_links)
firewall_manager.ensure_cell_api_dnat()
firewall_manager.apply_all_dns_rules(peers, COREFILE_PATH, _configured_domain(),
cell_links=cell_links)
logger.info(f"Applied enforcement rules for {len(peers)} peers, {len(cell_links)} cells on startup")
+30 -17
View File
@@ -11,8 +11,7 @@ Each connection is stored in data/cell_links.json and manifests as:
import json
import logging
import os
import urllib.error
import urllib.request
import subprocess
from datetime import datetime
from typing import Any, Dict, List, Optional
@@ -108,6 +107,10 @@ class CellLinkManager:
The body inverts inbound/outbound: our inbound (what we share with them)
becomes their outbound (what they receive from us), and vice-versa.
Uses 'docker exec cell-wireguard curl' so the HTTP request originates
from inside cell-wireguard's network namespace, which has routes to
remote cell VPN subnets that cell-api (on the Docker bridge) lacks.
"""
url = link.get('remote_api_url')
if not url:
@@ -124,22 +127,32 @@ class CellLinkManager:
},
'sent_at': datetime.utcnow().isoformat() + 'Z',
}
payload = json.dumps(body).encode('utf-8')
req = urllib.request.Request(
url.rstrip('/') + '/api/cells/peer-sync/permissions',
data=payload,
method='POST',
headers={'Content-Type': 'application/json'},
)
payload = json.dumps(body)
endpoint = url.rstrip('/') + '/api/cells/peer-sync/permissions'
cmd = [
'docker', 'exec', 'cell-wireguard',
'curl', '-s', '-o', '/dev/null', '-w', '%{http_code}',
'-X', 'POST',
'-H', 'Content-Type: application/json',
'-d', payload,
'--max-time', str(_PUSH_TIMEOUT),
'--connect-timeout', '3',
endpoint,
]
try:
with urllib.request.urlopen(req, timeout=_PUSH_TIMEOUT) as resp:
if 200 <= resp.status < 300:
return {'ok': True, 'error': None}
return {'ok': False, 'error': f'HTTP {resp.status}'}
except urllib.error.HTTPError as e:
return {'ok': False, 'error': f'HTTP {e.code}'}
except urllib.error.URLError as e:
return {'ok': False, 'error': str(e.reason)[:200]}
result = subprocess.run(
cmd, capture_output=True, text=True, timeout=_PUSH_TIMEOUT + 5
)
if result.returncode != 0:
err = (result.stderr or result.stdout or 'curl error').strip()[:200]
return {'ok': False, 'error': err}
status = result.stdout.strip()
if status.startswith('2'):
return {'ok': True, 'error': None}
return {'ok': False, 'error': f'HTTP {status}'}
except subprocess.TimeoutExpired:
return {'ok': False, 'error': 'timeout'}
except Exception as e:
return {'ok': False, 'error': str(e)[:200]}
+45
View File
@@ -298,6 +298,51 @@ def apply_all_cell_rules(cell_links: List[Dict[str, Any]]) -> None:
apply_cell_rules(name, subnet, inbound)
def ensure_cell_api_dnat() -> bool:
"""DNAT wg0:3000 → cell-api:3000 inside cell-wireguard.
Remote cells push permission updates over the WireGuard tunnel to our
wg0 interface on port 3000. Since cell-api only listens on the Docker
bridge, we need a DNAT rule inside cell-wireguard's namespace to forward
that traffic. Called on every startup so rules survive container restarts.
"""
try:
r = _run(['docker', 'inspect', '--format',
'{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}',
'cell-api'], check=False)
api_ip = r.stdout.strip()
if not api_ip:
logger.warning('ensure_cell_api_dnat: cell-api container not found or no IP')
return False
dnat_check = ['-t', 'nat', '-C', 'PREROUTING', '-i', 'wg0', '-p', 'tcp',
'--dport', '3000', '-j', 'DNAT', '--to-destination', f'{api_ip}:3000']
dnat_add = ['-t', 'nat', '-A', 'PREROUTING', '-i', 'wg0', '-p', 'tcp',
'--dport', '3000', '-j', 'DNAT', '--to-destination', f'{api_ip}:3000']
if _wg_exec(['iptables'] + dnat_check).returncode != 0:
_wg_exec(['iptables'] + dnat_add)
masq_check = ['-t', 'nat', '-C', 'POSTROUTING', '-o', 'eth0', '-d', api_ip,
'-p', 'tcp', '--dport', '3000', '-j', 'MASQUERADE']
masq_add = ['-t', 'nat', '-A', 'POSTROUTING', '-o', 'eth0', '-d', api_ip,
'-p', 'tcp', '--dport', '3000', '-j', 'MASQUERADE']
if _wg_exec(['iptables'] + masq_check).returncode != 0:
_wg_exec(['iptables'] + masq_add)
fwd_check = ['-C', 'FORWARD', '-i', 'wg0', '-o', 'eth0',
'-p', 'tcp', '--dport', '3000', '-j', 'ACCEPT']
fwd_add = ['-I', 'FORWARD', '-i', 'wg0', '-o', 'eth0',
'-p', 'tcp', '--dport', '3000', '-j', 'ACCEPT']
if _wg_exec(['iptables'] + fwd_check).returncode != 0:
_wg_exec(['iptables'] + fwd_add)
logger.info(f'ensure_cell_api_dnat: wg0:3000 → {api_ip}:3000')
return True
except Exception as e:
logger.error(f'ensure_cell_api_dnat: {e}')
return False
# ---------------------------------------------------------------------------
# DNS ACL (CoreDNS Corefile generation)
# ---------------------------------------------------------------------------
+15 -11
View File
@@ -522,16 +522,18 @@ class TestPermissionSync(unittest.TestCase):
sent_body = {}
def fake_urlopen(req, timeout=None):
def fake_run(cmd, **kwargs):
import json as _j
sent_body.update(_j.loads(req.data))
resp = MagicMock()
resp.__enter__ = lambda s: s
resp.__exit__ = MagicMock(return_value=False)
resp.status = 200
return resp
# Extract the -d payload from curl args
d_idx = cmd.index('-d')
sent_body.update(_j.loads(cmd[d_idx + 1]))
r = MagicMock()
r.returncode = 0
r.stdout = '200'
r.stderr = ''
return r
with patch('urllib.request.urlopen', fake_urlopen):
with patch('subprocess.run', fake_run):
result = self.mgr._push_permissions_to_remote(link, 'home', 'homepubkey=')
self.assertTrue(result['ok'])
@@ -544,9 +546,11 @@ class TestPermissionSync(unittest.TestCase):
def test_push_http_error_returns_not_ok(self):
self._add_office()
link = self.mgr.list_connections()[0]
with patch('urllib.request.urlopen',
side_effect=__import__('urllib.error', fromlist=['HTTPError']).HTTPError(
url='', code=503, msg='Service Unavailable', hdrs=None, fp=None)):
mock_result = MagicMock()
mock_result.returncode = 0
mock_result.stdout = '503'
mock_result.stderr = ''
with patch('subprocess.run', return_value=mock_result):
result = self.mgr._push_permissions_to_remote(link, 'home', 'homepubkey=')
self.assertFalse(result['ok'])
self.assertIn('503', result['error'])
+58
View File
@@ -622,5 +622,63 @@ class TestCellRules(unittest.TestCase):
self.assertEqual(mock_apply.call_args.args[0], 'office')
class TestEnsureCellApiDnat(unittest.TestCase):
"""Tests for ensure_cell_api_dnat — DNAT wg0:3000 → cell-api:3000."""
def _wg_exec_no_existing_rules(self, args):
r = MagicMock()
r.returncode = 1 if '-C' in args else 0 # -C = check: fail = not present
r.stdout = ''
r.stderr = ''
return r
def _wg_exec_all_rules_exist(self, args):
r = MagicMock()
r.returncode = 0 # -C succeeds = rule already present
r.stdout = ''
return r
def _inspect_ok(self, api_ip='172.20.0.10'):
r = MagicMock()
r.returncode = 0
r.stdout = api_ip
return r
def test_dnat_rules_added_when_not_present(self):
with patch.object(firewall_manager, '_run', return_value=self._inspect_ok()), \
patch.object(firewall_manager, '_wg_exec',
side_effect=self._wg_exec_no_existing_rules) as wg_mock:
result = firewall_manager.ensure_cell_api_dnat()
self.assertTrue(result)
calls_args = [c.args[0] for c in wg_mock.call_args_list]
dnat_adds = [a for a in calls_args if 'DNAT' in a and '-A' in a]
self.assertTrue(len(dnat_adds) >= 1, 'DNAT -A rule must be added')
def test_dnat_skipped_if_already_present(self):
with patch.object(firewall_manager, '_run', return_value=self._inspect_ok()), \
patch.object(firewall_manager, '_wg_exec',
side_effect=self._wg_exec_all_rules_exist) as wg_mock:
result = firewall_manager.ensure_cell_api_dnat()
self.assertTrue(result)
calls_args = [c.args[0] for c in wg_mock.call_args_list]
add_calls = [a for a in calls_args if '-A' in a or '-I' in a]
self.assertEqual(len(add_calls), 0, 'No rules should be added when they already exist')
def test_returns_false_when_cell_api_not_found(self):
r = MagicMock()
r.returncode = 0
r.stdout = ''
with patch.object(firewall_manager, '_run', return_value=r):
result = firewall_manager.ensure_cell_api_dnat()
self.assertFalse(result)
def test_returns_false_on_exception(self):
with patch.object(firewall_manager, '_run', side_effect=RuntimeError('docker gone')):
result = firewall_manager.ensure_cell_api_dnat()
self.assertFalse(result)
if __name__ == '__main__':
unittest.main()