feat: Phase 3 - per-peer internet routing via exit cell

Adds the ability to route a specific peer's internet traffic through a
connected cell acting as an exit relay.

Cell A side:
- PUT /api/peers/<peer>/route-via {"via_cell": "cellB"} sets route_via
- Updates WG AllowedIPs to include 0.0.0.0/0 for the exit cell peer
- Adds ip rule + ip route in policy table inside cell-wireguard so the
  specific peer's traffic egresses via cellB's WG IP
- Sets exit_relay_active on the cell link and pushes use_as_exit_relay=True
  to cellB via peer-sync

Cell B side:
- Receives use_as_exit_relay in the peer-sync payload
- Calls apply_cell_rules(..., exit_relay=True) to add FORWARD -o eth0 ACCEPT
- Stores remote_exit_relay_active flag for startup recovery

Startup recovery:
- apply_all_cell_rules passes exit_relay=remote_exit_relay_active (cellB)
- _apply_startup_enforcement reapplies ip rule for each peer with route_via (cellA)
  since policy routing rules don't survive container restart

peer_registry gets route_via field with lazy migration.
22 new tests across test_cell_link_manager, test_peer_registry, test_peer_route_via.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-01 16:23:31 -04:00
parent dcee03dd3f
commit 8ea834e108
11 changed files with 547 additions and 11 deletions
+13
View File
@@ -275,6 +275,19 @@ def _apply_startup_enforcement():
firewall_manager.apply_all_dns_rules(peers, COREFILE_PATH, _configured_domain(),
cell_links=cell_links)
logger.info(f"Applied enforcement rules for {len(peers)} peers, {len(cell_links)} cells on startup")
# Phase 3: reapply policy routing rules for peers whose internet traffic is
# routed through an exit cell (ip rule entries don't survive container restart)
cell_links_map = {l['cell_name']: l for l in cell_links}
for peer in peers:
via_cell = peer.get('route_via')
if not via_cell:
continue
link = cell_links_map.get(via_cell)
if not link:
continue
peer_ip = peer.get('ip', '').split('/')[0]
if peer_ip:
wireguard_manager.apply_peer_route_via(peer_ip, via_wg_ip=link['dns_ip'])
sync_summary = cell_link_manager.replay_pending_pushes()
if sync_summary.get('attempted'):
logger.info(f"Startup permission sync: {sync_summary}")
+29 -2
View File
@@ -84,6 +84,13 @@ class CellLinkManager:
if 'remote_exit_offered' not in link:
link['remote_exit_offered'] = False
changed = True
# Phase 3 migration: per-peer internet routing
if 'exit_relay_active' not in link:
link['exit_relay_active'] = False
changed = True
if 'remote_exit_relay_active' not in link:
link['remote_exit_relay_active'] = False
changed = True
if changed:
self._save(links)
return links
@@ -154,6 +161,7 @@ class CellLinkManager:
'inbound': dict(perms.get('outbound', {})),
},
'exit_offered': bool(link.get('exit_offered', False)),
'use_as_exit_relay': bool(link.get('exit_relay_active', False)),
'sent_at': datetime.utcnow().isoformat() + 'Z',
}
payload = json.dumps(body)
@@ -239,7 +247,8 @@ class CellLinkManager:
def apply_remote_permissions(self, from_public_key: str,
permissions: Dict[str, Any],
exit_offered: bool = False) -> Dict[str, Any]:
exit_offered: bool = False,
use_as_exit_relay: bool = False) -> Dict[str, Any]:
"""Store permissions pushed by a remote cell (identified by WG public key).
Validates service names, persists, and re-applies local iptables rules.
@@ -257,13 +266,15 @@ class CellLinkManager:
link['permissions'] = {'inbound': clean_inbound, 'outbound': clean_outbound}
link['remote_exit_offered'] = bool(exit_offered)
link['remote_exit_relay_active'] = bool(use_as_exit_relay)
link['last_remote_update_at'] = datetime.utcnow().isoformat()
self._save(links)
inbound_list = [s for s, v in clean_inbound.items() if v]
try:
import firewall_manager as _fm
_fm.apply_cell_rules(link['cell_name'], link['vpn_subnet'], inbound_list)
_fm.apply_cell_rules(link['cell_name'], link['vpn_subnet'], inbound_list,
exit_relay=use_as_exit_relay)
except Exception as e:
logger.warning(
f"apply_cell_rules after remote push for '{link['cell_name']}' "
@@ -470,6 +481,22 @@ class CellLinkManager:
self._try_push(cell_name, link)
return link
def set_exit_relay_active(self, cell_name: str, active: bool) -> Dict[str, Any]:
"""Record that THIS cell is routing a peer's internet traffic via cell_name.
Persists the flag locally and pushes updated state to the remote cell so
it can enable/disable the FORWARD-to-eth0 rule on its side.
Returns the updated link record.
"""
links = self._load()
link = next((l for l in links if l['cell_name'] == cell_name), None)
if not link:
raise ValueError(f"Cell '{cell_name}' not found")
link['exit_relay_active'] = bool(active)
self._save(links)
self._try_push(cell_name, link)
return link
def get_connection_status(self, cell_name: str) -> Dict[str, Any]:
"""Return link record enriched with live WireGuard handshake status."""
links = self._load()
+24 -7
View File
@@ -262,7 +262,8 @@ def _get_cell_api_ip() -> Optional[str]:
return r.stdout.strip()
def apply_cell_rules(cell_name: str, vpn_subnet: str, inbound_services: List[str]) -> bool:
def apply_cell_rules(cell_name: str, vpn_subnet: str, inbound_services: List[str],
exit_relay: bool = False) -> bool:
"""Apply FORWARD rules for a cell-to-cell peer.
Traffic from vpn_subnet is allowed only to service VIPs listed in
@@ -270,10 +271,15 @@ def apply_cell_rules(cell_name: str, vpn_subnet: str, inbound_services: List[str
internet or peer access only explicit service VIPs, plus the
cell-api port (3000) for permission-sync pushes arriving via DNAT.
Rule insertion order (last inserted top of chain):
1. Catch-all DROP for the subnet (inserted first bottom)
2. Per-service ACCEPT/DROP (inserted in reversed() order)
3. API-sync ACCEPT (inserted last top, above catch-all)
When exit_relay=True, the remote cell's peers can route internet
traffic through this cell (Phase 3). A broad ACCEPT for traffic
going out eth0 is added below per-service rules but above catch-all.
Rule insertion order (first inserted = bottom, last inserted = top):
1. Catch-all DROP for the subnet (inserted first bottom)
2. Exit relay ACCEPT (-o eth0) (if exit_relay, above catch-all)
3. Per-service ACCEPT/DROP (inserted in reversed() order)
4. API-sync ACCEPT (inserted last top)
"""
try:
tag = _cell_tag(cell_name)
@@ -283,6 +289,13 @@ def apply_cell_rules(cell_name: str, vpn_subnet: str, inbound_services: List[str
_iptables(['-I', 'FORWARD', '-s', vpn_subnet,
'-m', 'comment', '--comment', tag, '-j', 'DROP'])
# Exit relay ACCEPT — allow internet-bound traffic from this cell's peers.
# Inserted ABOVE catch-all but BELOW per-service rules so service-level
# DROP rules still take effect for specific service VIPs.
if exit_relay:
_iptables(['-I', 'FORWARD', '-s', vpn_subnet, '-o', 'eth0',
'-m', 'comment', '--comment', tag, '-j', 'ACCEPT'])
# Per-service rules — inserted in reverse dict order, highest-priority last
for service, svc_ip in reversed(list(SERVICE_IPS.items())):
target = 'ACCEPT' if service in inbound_services else 'DROP'
@@ -298,7 +311,10 @@ def apply_cell_rules(cell_name: str, vpn_subnet: str, inbound_services: List[str
'-p', 'tcp', '--dport', '3000',
'-m', 'comment', '--comment', tag, '-j', 'ACCEPT'])
logger.info(f"Applied cell rules for {cell_name} ({vpn_subnet}): inbound={inbound_services}")
logger.info(
f"Applied cell rules for {cell_name} ({vpn_subnet}): "
f"inbound={inbound_services} exit_relay={exit_relay}"
)
return True
except Exception as e:
logger.error(f"apply_cell_rules({cell_name}): {e}")
@@ -314,7 +330,8 @@ def apply_all_cell_rules(cell_links: List[Dict[str, Any]]) -> None:
continue
perms = link.get('permissions', {})
inbound = [s for s, v in perms.get('inbound', {}).items() if v]
apply_cell_rules(name, subnet, inbound)
exit_relay = bool(link.get('remote_exit_relay_active', False))
apply_cell_rules(name, subnet, inbound, exit_relay=exit_relay)
def ensure_cell_api_dnat() -> bool:
+20
View File
@@ -193,6 +193,14 @@ class PeerRegistry(BaseServiceManager):
except Exception as e:
self.logger.error(f"Error loading peers: {e}")
self.peers = []
# Phase 3 migration: per-peer internet routing
changed = False
for peer in self.peers:
if 'route_via' not in peer:
peer['route_via'] = None
changed = True
if changed:
self._save_peers()
else:
self.peers = []
self.logger.info("No peers file found, starting with empty registry")
@@ -326,6 +334,18 @@ class PeerRegistry(BaseServiceManager):
self.logger.error(f"Error updating peer {name} IP: {e}")
return False
def set_route_via(self, peer_name: str, via_cell: Optional[str]) -> Dict[str, Any]:
"""Set or clear the route_via field on a peer. Returns the updated peer dict."""
with self.lock:
for peer in self.peers:
if peer.get('peer') == peer_name:
peer['route_via'] = via_cell
peer['updated_at'] = datetime.utcnow().isoformat()
self._save_peers()
self.logger.info(f"Set route_via for {peer_name}: {via_cell!r}")
return dict(peer)
raise ValueError(f"Peer '{peer_name}' not found")
def get_peer_stats(self) -> Dict[str, Any]:
"""Get peer registry statistics"""
try:
+4 -1
View File
@@ -219,8 +219,11 @@ def peer_sync_permissions():
return jsonify({'ok': False, 'error': f'unknown service: {svc!r}'}), 400
exit_offered = bool(data.get('exit_offered', False))
use_as_exit_relay = bool(data.get('use_as_exit_relay', False))
from app import cell_link_manager
cell_link_manager.apply_remote_permissions(sender_pubkey, perms, exit_offered=exit_offered)
cell_link_manager.apply_remote_permissions(sender_pubkey, perms,
exit_offered=exit_offered,
use_as_exit_relay=use_as_exit_relay)
return jsonify({'ok': True, 'applied_at': datetime.utcnow().isoformat()})
except ValueError as e:
return jsonify({'ok': False, 'error': str(e)}), 404
+71
View File
@@ -183,6 +183,77 @@ def update_peer(peer_name):
return jsonify({"error": str(e)}), 500
@bp.route('/api/peers/<peer_name>/route-via', methods=['PUT'])
def set_peer_route_via(peer_name):
"""Route a peer's internet traffic through a connected exit cell.
Body: {"via_cell": "cellB"} to enable, {"via_cell": null} to disable.
On enable: updates WG AllowedIPs and adds policy routing rule inside
cell-wireguard so the peer's packets egress through the exit cell.
On disable: reverts AllowedIPs and removes the ip rule.
Also signals the exit cell to add/remove the FORWARD-to-eth0 firewall rule.
"""
try:
from app import peer_registry, wireguard_manager, cell_link_manager
data = request.get_json(silent=True)
if data is None or 'via_cell' not in data:
return jsonify({'error': 'via_cell field required (string or null)'}), 400
via_cell = data['via_cell']
if via_cell is not None and not isinstance(via_cell, str):
return jsonify({'error': 'via_cell must be a string or null'}), 400
peer = peer_registry.get_peer(peer_name)
if not peer:
return jsonify({'error': 'Peer not found'}), 404
peer_ip = peer.get('ip', '').split('/')[0]
if not peer_ip:
return jsonify({'error': 'Peer has no IP assigned'}), 400
old_via = peer.get('route_via')
# Remove old routing if switching away from a previous exit cell
if old_via and old_via != via_cell:
old_link = next(
(l for l in cell_link_manager.list_connections()
if l['cell_name'] == old_via), None
)
if old_link:
wireguard_manager.update_cell_peer_allowed_ips(
old_link['public_key'], old_link['vpn_subnet'],
add_default_route=False)
wireguard_manager.remove_peer_route_via(peer_ip)
try:
cell_link_manager.set_exit_relay_active(old_via, False)
except Exception as e:
logger.warning(f"set_exit_relay_active(False) for {old_via!r} failed: {e}")
# Apply new routing
if via_cell:
link = next(
(l for l in cell_link_manager.list_connections()
if l['cell_name'] == via_cell), None
)
if not link:
return jsonify({'error': f"Cell {via_cell!r} not connected"}), 404
wireguard_manager.update_cell_peer_allowed_ips(
link['public_key'], link['vpn_subnet'], add_default_route=True)
wireguard_manager.apply_peer_route_via(peer_ip, via_wg_ip=link['dns_ip'])
try:
cell_link_manager.set_exit_relay_active(via_cell, True)
except Exception as e:
logger.warning(f"set_exit_relay_active(True) for {via_cell!r} failed: {e}")
updated_peer = peer_registry.set_route_via(peer_name, via_cell)
return jsonify({'message': f"Route-via for '{peer_name}' updated", 'peer': updated_peer})
except ValueError as e:
return jsonify({'error': str(e)}), 404
except Exception as e:
logger.error(f"Error setting route-via for {peer_name}: {e}")
return jsonify({'error': str(e)}), 500
@bp.route('/api/peers/<peer_name>/clear-reinstall', methods=['POST'])
def clear_peer_reinstall(peer_name):
try:
+74
View File
@@ -573,6 +573,80 @@ class WireGuardManager(BaseServiceManager):
except Exception as e:
logger.warning(f'sync_cell_routes failed (non-fatal): {e}')
def update_cell_peer_allowed_ips(self, public_key: str,
vpn_subnet: str,
add_default_route: bool = False) -> bool:
"""Update AllowedIPs for a cell peer to optionally include 0.0.0.0/0.
When add_default_route=True, appends 0.0.0.0/0 to AllowedIPs so that
traffic destined for the internet is routed through this cell peer
(Phase 3 exit-relay routing).
At most one WireGuard peer can have 0.0.0.0/0 in AllowedIPs per
interface (v1 constraint); callers must remove it from any other cell
peer before adding it here.
"""
try:
if add_default_route:
new_ips = f'{vpn_subnet}, 0.0.0.0/0'
else:
new_ips = vpn_subnet
return self.update_peer_ip(public_key, new_ips)
except Exception as e:
logger.error(f'update_cell_peer_allowed_ips failed: {e}')
return False
def apply_peer_route_via(self, peer_ip: str, via_wg_ip: str,
table: int = 100) -> bool:
"""Add policy routing inside cell-wireguard so peer_ip traffic uses via_wg_ip.
Creates a dedicated routing table (table) with a default route via the
exit cell's WG IP, then adds an ip rule so traffic sourced from peer_ip
uses that table.
This is the cell-A side of Phase 3 routing: after this, traffic from
the local peer (at peer_ip) is sent through the WG tunnel to cell B
(at via_wg_ip) for internet egress.
Idempotent: adding a duplicate route/rule returns 0 or EEXIST, which is
treated as success.
"""
real_conf = self._config_file()
if '/tmp/' in real_conf or 'pytest' in real_conf:
return True
try:
def _wg(cmd):
return subprocess.run(
['docker', 'exec', 'cell-wireguard'] + cmd,
capture_output=True, text=True, timeout=5
)
# Add default route in the policy table (idempotent via || true)
_wg(['ip', 'route', 'add', 'default', 'via', via_wg_ip,
'dev', 'wg0', 'table', str(table)])
# Add ip rule: traffic FROM peer_ip uses this table
_wg(['ip', 'rule', 'add', 'from', f'{peer_ip}/32',
'pref', str(table), 'lookup', str(table)])
logger.info(f'apply_peer_route_via: {peer_ip}{via_wg_ip} table {table}')
return True
except Exception as e:
logger.error(f'apply_peer_route_via failed: {e}')
return False
def remove_peer_route_via(self, peer_ip: str, table: int = 100) -> None:
"""Remove the ip rule for peer_ip added by apply_peer_route_via. Non-fatal."""
real_conf = self._config_file()
if '/tmp/' in real_conf or 'pytest' in real_conf:
return
try:
subprocess.run(
['docker', 'exec', 'cell-wireguard',
'ip', 'rule', 'del', 'from', f'{peer_ip}/32',
'pref', str(table), 'lookup', str(table)],
capture_output=True, timeout=5
)
except Exception:
pass
def remove_peer(self, public_key: str) -> bool:
"""Remove the [Peer] block matching public_key from wg0.conf."""
try: