diff --git a/api/connectivity_manager.py b/api/connectivity_manager.py index 45260e1..ba372d0 100644 --- a/api/connectivity_manager.py +++ b/api/connectivity_manager.py @@ -1437,6 +1437,18 @@ class ConnectivityManager(BaseServiceManager): except Exception as e: logger.warning(f"delete_connection: killswitch cleanup failed " f"(non-fatal): {e}") + elif (record.get('type') == self.CELL_RELAY_TYPE + and isinstance(table, int) + and self.wireguard_manager is not None): + # A cell_relay policy-routes peers via a source ip rule + a + # shared default route in its table inside cell-wireguard. Per-peer + # detach removes the rules; the table's default route only goes + # away here, when the connection is gone — otherwise it leaks. + try: + self.wireguard_manager.teardown_route_table(table) + except Exception as e: + logger.warning(f"delete_connection: cell_relay route table " + f"cleanup failed (non-fatal): {e}") for secret_ref in record.get('secret_refs', []): if self.vault_manager is not None: @@ -1554,6 +1566,18 @@ class ConnectivityManager(BaseServiceManager): f"{cell_name!r} no longer offered but still " f"referenced; keeping") continue + # Flush the relay's policy-routing table (shared default route) + # before forgetting the record — this path deletes the config + # entry directly rather than via delete_connection, so it must + # do the same host-routing teardown or the route leaks. + rtable = rec.get('table') + if self.wireguard_manager is not None and isinstance(rtable, int): + try: + self.wireguard_manager.teardown_route_table(rtable) + except Exception as e: + logger.warning(f"reconcile_cell_relays: route table " + f"cleanup for {cell_name!r} failed " + f"(non-fatal): {e}") try: self.config_manager.delete_connection(rec.get('id')) removed.append(rec.get('id')) diff --git a/api/routes/peers.py b/api/routes/peers.py index 09dd485..0d09cc6 100644 --- a/api/routes/peers.py +++ b/api/routes/peers.py @@ -357,6 +357,13 @@ def remove_peer(peer_name): if success: if peer_ip: firewall_manager.clear_peer_rules(peer_ip) + # Clear any cell_relay / route-via policy rule for this peer so a + # deleted-while-assigned peer doesn't leave a stale source ip rule + # (which could later misroute a new peer that reuses the IP). + try: + wireguard_manager.remove_peer_route_via(peer_ip) + except Exception as wg_err: + logger.warning(f"Peer {peer_name}: relay route cleanup failed (non-fatal): {wg_err}") _dns_primary, _dns_szones = _configured_dns_params() firewall_manager.apply_all_dns_rules(peer_registry.list_peers(), COREFILE_PATH, _dns_primary, cell_links=cell_link_manager.list_connections(), diff --git a/api/wireguard_manager.py b/api/wireguard_manager.py index e4524e5..8c60780 100644 --- a/api/wireguard_manager.py +++ b/api/wireguard_manager.py @@ -786,21 +786,60 @@ class WireGuardManager(BaseServiceManager): logger.error(f'apply_peer_route_via failed: {e}') return False - def remove_peer_route_via(self, peer_ip: str, table: int = 100) -> None: - """Remove the ip rule for peer_ip added by apply_peer_route_via. Non-fatal.""" + def remove_peer_route_via(self, peer_ip: str) -> None: + """Remove the policy-routing ip rule(s) for peer_ip. Non-fatal. + + Deletes every `ip rule from peer_ip/32` regardless of which table it + points at: the v2 cell_relay path adds the rule with the connection's + own table (1000+) while the legacy route-via path uses table 100, so a + caller clearing a peer's exit does not reliably know the table. Matching + by source alone removes the rule in both cases (and any duplicate). The + shared routing *table* itself is torn down separately at connection + teardown — see teardown_route_table. + """ real_conf = self._config_file() if '/tmp/' in real_conf or 'pytest' in real_conf or 'wg_confs' not in real_conf: return try: - subprocess.run( - ['docker', 'exec', 'cell-wireguard', - 'ip', 'rule', 'del', 'from', f'{peer_ip}/32', - 'pref', str(table), 'lookup', str(table)], - capture_output=True, timeout=5 - ) + for _ in range(32): + r = subprocess.run( + ['docker', 'exec', 'cell-wireguard', + 'ip', 'rule', 'del', 'from', f'{peer_ip}/32'], + capture_output=True, timeout=5 + ) + if r.returncode != 0: + break except Exception: pass + def teardown_route_table(self, table: int) -> None: + """Tear down a relay routing table when its connection is removed. Non-fatal. + + Removes any remaining `ip rule ... lookup ` entries (e.g. one left + by a peer deleted while still assigned) and flushes the table's routes — + notably the `default via ` route that apply_peer_route_via + installs. That route is shared by every peer routed through the relay, so + no per-peer detach may remove it; it can only be cleared once the + connection itself is gone, or it leaks (stale default route + a possible + blackhole if a rule survives). + """ + real_conf = self._config_file() + if '/tmp/' in real_conf or 'pytest' in real_conf or 'wg_confs' not in real_conf: + return + try: + def _wg(cmd): + return subprocess.run( + ['docker', 'exec', 'cell-wireguard'] + cmd, + capture_output=True, timeout=5 + ) + for _ in range(64): + r = _wg(['ip', 'rule', 'del', 'lookup', str(table)]) + if r.returncode != 0: + break + _wg(['ip', 'route', 'flush', 'table', str(table)]) + except Exception as e: + logger.warning(f'teardown_route_table({table}) failed: {e}') + def remove_peer(self, public_key: str) -> bool: """Remove the [Peer] block matching public_key from wg0.conf.""" try: diff --git a/tests/test_connectivity_cell_relay.py b/tests/test_connectivity_cell_relay.py index 90e9058..53a54da 100644 --- a/tests/test_connectivity_cell_relay.py +++ b/tests/test_connectivity_cell_relay.py @@ -319,5 +319,56 @@ class TestHealth(_Base): self.assertEqual(health, 'down') +# --------------------------------------------------------------------------- +# Teardown cleanup — regression for the confirmed cell_relay routing leak. +# +# A cell_relay policy-routes a peer with `ip rule from lookup
` +# plus a shared `default via ` route in that table, inside +# cell-wireguard. Before the fix, detaching/deleting the peer left the rule +# (remove_peer_route_via used the wrong default table) and nothing ever flushed +# the table's default route — both leaked, confirmed on hardware. +# --------------------------------------------------------------------------- + +class TestTeardownCleanup(_Base): + + def _relay(self): + self.cell_link.list_connections.return_value = [_link('alpha')] + self.mgr.reconcile_cell_relays() + return self._raw_relays()[0] + + def test_detach_removes_peer_ip_rule(self): + relay = self._relay() + peer = {'peer': 'laptop', 'ip': '10.0.0.5/32', + 'exit_via': relay['id'], 'route_via': 'alpha'} + self.peer_registry.get_peer.return_value = peer + self.peer_registry.set_peer_exit_via.return_value = True + with patch.object(self.mgr, 'apply_routes'): + res = self.mgr.set_peer_exit('laptop', 'default') + self.assertTrue(res['ok']) + # The peer's source ip rule is cleared by source (table-agnostic), so it + # matches the relay's allocated table rather than the old default 100. + self.wg.remove_peer_route_via.assert_called_once_with('10.0.0.5') + + def test_delete_connection_flushes_relay_route_table(self): + relay = self._relay() + # Not referenced by any peer (detached) → deletable. + self.peer_registry.list_peers.return_value = [] + res = self.mgr.delete_connection(relay['id']) + self.assertTrue(res['ok']) + self.wg.teardown_route_table.assert_called_once_with(relay['table']) + + def test_reconcile_removal_flushes_relay_route_table(self): + relay = self._relay() + table = relay['table'] + # Offer withdrawn and not referenced → reconcile removes the relay and + # must flush its routing table (this path bypasses delete_connection). + self.cell_link.list_connections.return_value = [ + _link('alpha', remote_exit_offered=False)] + self.peer_registry.list_peers.return_value = [] + out = self.mgr.reconcile_cell_relays() + self.assertIn(relay['id'], out['removed']) + self.wg.teardown_route_table.assert_called_once_with(table) + + if __name__ == '__main__': unittest.main()