fix(connectivity): clean up cell_relay policy routing on teardown
Unit Tests / test (push) Successful in 9m37s
Unit Tests / test (push) Successful in 9m37s
A cell_relay policy-routes an assigned peer with `ip rule from <peer> lookup <table>` plus a shared `default via <cell-ip>` route in that table inside cell-wireguard. Two teardown bugs leaked both (confirmed on hardware, pic0<->pic1): - remove_peer_route_via deleted the rule with a hardcoded default table 100, but the v2 cell_relay path adds it with the connection's own table (1000+), so the rule never matched and survived peer detach/delete. It now deletes by source IP (table-agnostic), covering both the v2 and the legacy route-via (table 100) paths. - nothing ever removed the table's shared default route: delete_connection explicitly skipped cell_relay and reconcile_cell_relays deletes the record directly. Added wireguard_manager.teardown_route_table(table) (removes any leftover lookup-<table> rules + flushes the table) and call it from both delete_connection and the reconcile removal path. Also clear a peer's relay rule on peer deletion so a peer deleted while still assigned doesn't leave a stale source rule that could misroute a future peer reusing the IP. Regression tests: detach removes the rule by source; delete_connection and reconcile-removal each flush the relay table. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -1437,6 +1437,18 @@ class ConnectivityManager(BaseServiceManager):
|
||||
except Exception as e:
|
||||
logger.warning(f"delete_connection: killswitch cleanup failed "
|
||||
f"(non-fatal): {e}")
|
||||
elif (record.get('type') == self.CELL_RELAY_TYPE
|
||||
and isinstance(table, int)
|
||||
and self.wireguard_manager is not None):
|
||||
# A cell_relay policy-routes peers via a source ip rule + a
|
||||
# shared default route in its table inside cell-wireguard. Per-peer
|
||||
# detach removes the rules; the table's default route only goes
|
||||
# away here, when the connection is gone — otherwise it leaks.
|
||||
try:
|
||||
self.wireguard_manager.teardown_route_table(table)
|
||||
except Exception as e:
|
||||
logger.warning(f"delete_connection: cell_relay route table "
|
||||
f"cleanup failed (non-fatal): {e}")
|
||||
|
||||
for secret_ref in record.get('secret_refs', []):
|
||||
if self.vault_manager is not None:
|
||||
@@ -1554,6 +1566,18 @@ class ConnectivityManager(BaseServiceManager):
|
||||
f"{cell_name!r} no longer offered but still "
|
||||
f"referenced; keeping")
|
||||
continue
|
||||
# Flush the relay's policy-routing table (shared default route)
|
||||
# before forgetting the record — this path deletes the config
|
||||
# entry directly rather than via delete_connection, so it must
|
||||
# do the same host-routing teardown or the route leaks.
|
||||
rtable = rec.get('table')
|
||||
if self.wireguard_manager is not None and isinstance(rtable, int):
|
||||
try:
|
||||
self.wireguard_manager.teardown_route_table(rtable)
|
||||
except Exception as e:
|
||||
logger.warning(f"reconcile_cell_relays: route table "
|
||||
f"cleanup for {cell_name!r} failed "
|
||||
f"(non-fatal): {e}")
|
||||
try:
|
||||
self.config_manager.delete_connection(rec.get('id'))
|
||||
removed.append(rec.get('id'))
|
||||
|
||||
@@ -357,6 +357,13 @@ def remove_peer(peer_name):
|
||||
if success:
|
||||
if peer_ip:
|
||||
firewall_manager.clear_peer_rules(peer_ip)
|
||||
# Clear any cell_relay / route-via policy rule for this peer so a
|
||||
# deleted-while-assigned peer doesn't leave a stale source ip rule
|
||||
# (which could later misroute a new peer that reuses the IP).
|
||||
try:
|
||||
wireguard_manager.remove_peer_route_via(peer_ip)
|
||||
except Exception as wg_err:
|
||||
logger.warning(f"Peer {peer_name}: relay route cleanup failed (non-fatal): {wg_err}")
|
||||
_dns_primary, _dns_szones = _configured_dns_params()
|
||||
firewall_manager.apply_all_dns_rules(peer_registry.list_peers(), COREFILE_PATH, _dns_primary,
|
||||
cell_links=cell_link_manager.list_connections(),
|
||||
|
||||
@@ -786,21 +786,60 @@ class WireGuardManager(BaseServiceManager):
|
||||
logger.error(f'apply_peer_route_via failed: {e}')
|
||||
return False
|
||||
|
||||
def remove_peer_route_via(self, peer_ip: str, table: int = 100) -> None:
|
||||
"""Remove the ip rule for peer_ip added by apply_peer_route_via. Non-fatal."""
|
||||
def remove_peer_route_via(self, peer_ip: str) -> None:
|
||||
"""Remove the policy-routing ip rule(s) for peer_ip. Non-fatal.
|
||||
|
||||
Deletes every `ip rule from peer_ip/32` regardless of which table it
|
||||
points at: the v2 cell_relay path adds the rule with the connection's
|
||||
own table (1000+) while the legacy route-via path uses table 100, so a
|
||||
caller clearing a peer's exit does not reliably know the table. Matching
|
||||
by source alone removes the rule in both cases (and any duplicate). The
|
||||
shared routing *table* itself is torn down separately at connection
|
||||
teardown — see teardown_route_table.
|
||||
"""
|
||||
real_conf = self._config_file()
|
||||
if '/tmp/' in real_conf or 'pytest' in real_conf or 'wg_confs' not in real_conf:
|
||||
return
|
||||
try:
|
||||
subprocess.run(
|
||||
['docker', 'exec', 'cell-wireguard',
|
||||
'ip', 'rule', 'del', 'from', f'{peer_ip}/32',
|
||||
'pref', str(table), 'lookup', str(table)],
|
||||
capture_output=True, timeout=5
|
||||
)
|
||||
for _ in range(32):
|
||||
r = subprocess.run(
|
||||
['docker', 'exec', 'cell-wireguard',
|
||||
'ip', 'rule', 'del', 'from', f'{peer_ip}/32'],
|
||||
capture_output=True, timeout=5
|
||||
)
|
||||
if r.returncode != 0:
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def teardown_route_table(self, table: int) -> None:
|
||||
"""Tear down a relay routing table when its connection is removed. Non-fatal.
|
||||
|
||||
Removes any remaining `ip rule ... lookup <table>` entries (e.g. one left
|
||||
by a peer deleted while still assigned) and flushes the table's routes —
|
||||
notably the `default via <cell-ip>` route that apply_peer_route_via
|
||||
installs. That route is shared by every peer routed through the relay, so
|
||||
no per-peer detach may remove it; it can only be cleared once the
|
||||
connection itself is gone, or it leaks (stale default route + a possible
|
||||
blackhole if a rule survives).
|
||||
"""
|
||||
real_conf = self._config_file()
|
||||
if '/tmp/' in real_conf or 'pytest' in real_conf or 'wg_confs' not in real_conf:
|
||||
return
|
||||
try:
|
||||
def _wg(cmd):
|
||||
return subprocess.run(
|
||||
['docker', 'exec', 'cell-wireguard'] + cmd,
|
||||
capture_output=True, timeout=5
|
||||
)
|
||||
for _ in range(64):
|
||||
r = _wg(['ip', 'rule', 'del', 'lookup', str(table)])
|
||||
if r.returncode != 0:
|
||||
break
|
||||
_wg(['ip', 'route', 'flush', 'table', str(table)])
|
||||
except Exception as e:
|
||||
logger.warning(f'teardown_route_table({table}) failed: {e}')
|
||||
|
||||
def remove_peer(self, public_key: str) -> bool:
|
||||
"""Remove the [Peer] block matching public_key from wg0.conf."""
|
||||
try:
|
||||
|
||||
@@ -319,5 +319,56 @@ class TestHealth(_Base):
|
||||
self.assertEqual(health, 'down')
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Teardown cleanup — regression for the confirmed cell_relay routing leak.
|
||||
#
|
||||
# A cell_relay policy-routes a peer with `ip rule from <peer> lookup <table>`
|
||||
# plus a shared `default via <cell-ip>` route in that table, inside
|
||||
# cell-wireguard. Before the fix, detaching/deleting the peer left the rule
|
||||
# (remove_peer_route_via used the wrong default table) and nothing ever flushed
|
||||
# the table's default route — both leaked, confirmed on hardware.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestTeardownCleanup(_Base):
|
||||
|
||||
def _relay(self):
|
||||
self.cell_link.list_connections.return_value = [_link('alpha')]
|
||||
self.mgr.reconcile_cell_relays()
|
||||
return self._raw_relays()[0]
|
||||
|
||||
def test_detach_removes_peer_ip_rule(self):
|
||||
relay = self._relay()
|
||||
peer = {'peer': 'laptop', 'ip': '10.0.0.5/32',
|
||||
'exit_via': relay['id'], 'route_via': 'alpha'}
|
||||
self.peer_registry.get_peer.return_value = peer
|
||||
self.peer_registry.set_peer_exit_via.return_value = True
|
||||
with patch.object(self.mgr, 'apply_routes'):
|
||||
res = self.mgr.set_peer_exit('laptop', 'default')
|
||||
self.assertTrue(res['ok'])
|
||||
# The peer's source ip rule is cleared by source (table-agnostic), so it
|
||||
# matches the relay's allocated table rather than the old default 100.
|
||||
self.wg.remove_peer_route_via.assert_called_once_with('10.0.0.5')
|
||||
|
||||
def test_delete_connection_flushes_relay_route_table(self):
|
||||
relay = self._relay()
|
||||
# Not referenced by any peer (detached) → deletable.
|
||||
self.peer_registry.list_peers.return_value = []
|
||||
res = self.mgr.delete_connection(relay['id'])
|
||||
self.assertTrue(res['ok'])
|
||||
self.wg.teardown_route_table.assert_called_once_with(relay['table'])
|
||||
|
||||
def test_reconcile_removal_flushes_relay_route_table(self):
|
||||
relay = self._relay()
|
||||
table = relay['table']
|
||||
# Offer withdrawn and not referenced → reconcile removes the relay and
|
||||
# must flush its routing table (this path bypasses delete_connection).
|
||||
self.cell_link.list_connections.return_value = [
|
||||
_link('alpha', remote_exit_offered=False)]
|
||||
self.peer_registry.list_peers.return_value = []
|
||||
out = self.mgr.reconcile_cell_relays()
|
||||
self.assertIn(relay['id'], out['removed'])
|
||||
self.wg.teardown_route_table.assert_called_once_with(table)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user