fix(connectivity): clean up cell_relay policy routing on teardown
Unit Tests / test (push) Successful in 9m37s

A cell_relay policy-routes an assigned peer with `ip rule from <peer>
lookup <table>` plus a shared `default via <cell-ip>` route in that table
inside cell-wireguard. Two teardown bugs leaked both (confirmed on hardware,
pic0<->pic1):

- remove_peer_route_via deleted the rule with a hardcoded default table 100,
  but the v2 cell_relay path adds it with the connection's own table (1000+),
  so the rule never matched and survived peer detach/delete. It now deletes
  by source IP (table-agnostic), covering both the v2 and the legacy
  route-via (table 100) paths.
- nothing ever removed the table's shared default route: delete_connection
  explicitly skipped cell_relay and reconcile_cell_relays deletes the record
  directly. Added wireguard_manager.teardown_route_table(table) (removes any
  leftover lookup-<table> rules + flushes the table) and call it from both
  delete_connection and the reconcile removal path.

Also clear a peer's relay rule on peer deletion so a peer deleted while still
assigned doesn't leave a stale source rule that could misroute a future peer
reusing the IP.

Regression tests: detach removes the rule by source; delete_connection and
reconcile-removal each flush the relay table.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-17 11:34:41 -04:00
parent 639fb66e5b
commit 2ab6e715d8
4 changed files with 129 additions and 8 deletions
+24
View File
@@ -1437,6 +1437,18 @@ class ConnectivityManager(BaseServiceManager):
except Exception as e:
logger.warning(f"delete_connection: killswitch cleanup failed "
f"(non-fatal): {e}")
elif (record.get('type') == self.CELL_RELAY_TYPE
and isinstance(table, int)
and self.wireguard_manager is not None):
# A cell_relay policy-routes peers via a source ip rule + a
# shared default route in its table inside cell-wireguard. Per-peer
# detach removes the rules; the table's default route only goes
# away here, when the connection is gone — otherwise it leaks.
try:
self.wireguard_manager.teardown_route_table(table)
except Exception as e:
logger.warning(f"delete_connection: cell_relay route table "
f"cleanup failed (non-fatal): {e}")
for secret_ref in record.get('secret_refs', []):
if self.vault_manager is not None:
@@ -1554,6 +1566,18 @@ class ConnectivityManager(BaseServiceManager):
f"{cell_name!r} no longer offered but still "
f"referenced; keeping")
continue
# Flush the relay's policy-routing table (shared default route)
# before forgetting the record — this path deletes the config
# entry directly rather than via delete_connection, so it must
# do the same host-routing teardown or the route leaks.
rtable = rec.get('table')
if self.wireguard_manager is not None and isinstance(rtable, int):
try:
self.wireguard_manager.teardown_route_table(rtable)
except Exception as e:
logger.warning(f"reconcile_cell_relays: route table "
f"cleanup for {cell_name!r} failed "
f"(non-fatal): {e}")
try:
self.config_manager.delete_connection(rec.get('id'))
removed.append(rec.get('id'))
+7
View File
@@ -357,6 +357,13 @@ def remove_peer(peer_name):
if success:
if peer_ip:
firewall_manager.clear_peer_rules(peer_ip)
# Clear any cell_relay / route-via policy rule for this peer so a
# deleted-while-assigned peer doesn't leave a stale source ip rule
# (which could later misroute a new peer that reuses the IP).
try:
wireguard_manager.remove_peer_route_via(peer_ip)
except Exception as wg_err:
logger.warning(f"Peer {peer_name}: relay route cleanup failed (non-fatal): {wg_err}")
_dns_primary, _dns_szones = _configured_dns_params()
firewall_manager.apply_all_dns_rules(peer_registry.list_peers(), COREFILE_PATH, _dns_primary,
cell_links=cell_link_manager.list_connections(),
+47 -8
View File
@@ -786,21 +786,60 @@ class WireGuardManager(BaseServiceManager):
logger.error(f'apply_peer_route_via failed: {e}')
return False
def remove_peer_route_via(self, peer_ip: str, table: int = 100) -> None:
"""Remove the ip rule for peer_ip added by apply_peer_route_via. Non-fatal."""
def remove_peer_route_via(self, peer_ip: str) -> None:
"""Remove the policy-routing ip rule(s) for peer_ip. Non-fatal.
Deletes every `ip rule from peer_ip/32` regardless of which table it
points at: the v2 cell_relay path adds the rule with the connection's
own table (1000+) while the legacy route-via path uses table 100, so a
caller clearing a peer's exit does not reliably know the table. Matching
by source alone removes the rule in both cases (and any duplicate). The
shared routing *table* itself is torn down separately at connection
teardown — see teardown_route_table.
"""
real_conf = self._config_file()
if '/tmp/' in real_conf or 'pytest' in real_conf or 'wg_confs' not in real_conf:
return
try:
subprocess.run(
['docker', 'exec', 'cell-wireguard',
'ip', 'rule', 'del', 'from', f'{peer_ip}/32',
'pref', str(table), 'lookup', str(table)],
capture_output=True, timeout=5
)
for _ in range(32):
r = subprocess.run(
['docker', 'exec', 'cell-wireguard',
'ip', 'rule', 'del', 'from', f'{peer_ip}/32'],
capture_output=True, timeout=5
)
if r.returncode != 0:
break
except Exception:
pass
def teardown_route_table(self, table: int) -> None:
"""Tear down a relay routing table when its connection is removed. Non-fatal.
Removes any remaining `ip rule ... lookup <table>` entries (e.g. one left
by a peer deleted while still assigned) and flushes the table's routes —
notably the `default via <cell-ip>` route that apply_peer_route_via
installs. That route is shared by every peer routed through the relay, so
no per-peer detach may remove it; it can only be cleared once the
connection itself is gone, or it leaks (stale default route + a possible
blackhole if a rule survives).
"""
real_conf = self._config_file()
if '/tmp/' in real_conf or 'pytest' in real_conf or 'wg_confs' not in real_conf:
return
try:
def _wg(cmd):
return subprocess.run(
['docker', 'exec', 'cell-wireguard'] + cmd,
capture_output=True, timeout=5
)
for _ in range(64):
r = _wg(['ip', 'rule', 'del', 'lookup', str(table)])
if r.returncode != 0:
break
_wg(['ip', 'route', 'flush', 'table', str(table)])
except Exception as e:
logger.warning(f'teardown_route_table({table}) failed: {e}')
def remove_peer(self, public_key: str) -> bool:
"""Remove the [Peer] block matching public_key from wg0.conf."""
try: