diff --git a/api/firewall_manager.py b/api/firewall_manager.py index 1650714..f1793eb 100644 --- a/api/firewall_manager.py +++ b/api/firewall_manager.py @@ -797,12 +797,18 @@ def generate_corefile(peers: List[Dict[str, Any]], corefile_path: str = COREFILE # local.{domain} block intentionally omitted: /data/local.zone does not exist # and CoreDNS logs errors on every reload for a missing zone file. os.makedirs(os.path.dirname(corefile_path), exist_ok=True) - tmp_path = corefile_path + '.tmp' - with open(tmp_path, 'w') as f: + # Write in place (truncate + rewrite the SAME inode) rather than + # writing a temp file and os.replace()-ing it in. The Corefile is a + # Docker FILE bind-mount (./config/dns/Corefile:/etc/coredns/Corefile); + # os.replace creates a NEW inode, but the container stays bound to the + # original inode and never sees the update — so CoreDNS silently runs + # stale config until the container restarts. CoreDNS only re-reads on + # the SIGUSR1 we send right after this completes, so a non-atomic + # write is safe here. + with open(corefile_path, 'w') as f: f.write(corefile) f.flush() os.fsync(f.fileno()) - os.replace(tmp_path, corefile_path) logger.info(f"Wrote Corefile to {corefile_path}") return True diff --git a/api/managers.py b/api/managers.py index 2049ad5..57c7951 100644 --- a/api/managers.py +++ b/api/managers.py @@ -107,7 +107,8 @@ egress_manager = EgressManager( ) service_store_manager.egress_manager = egress_manager -setup_manager = SetupManager(config_manager=config_manager, auth_manager=auth_manager) +setup_manager = SetupManager(config_manager=config_manager, auth_manager=auth_manager, + network_manager=network_manager) # Service logger configuration _service_log_configs = { diff --git a/api/setup_manager.py b/api/setup_manager.py index 6e420b5..f5fe790 100644 --- a/api/setup_manager.py +++ b/api/setup_manager.py @@ -94,9 +94,10 @@ def _build_ddns_config(domain_mode: str, cloudflare_api_token: str = '', class SetupManager: """Manages the first-run setup wizard state and completion.""" - def __init__(self, config_manager, auth_manager): + def __init__(self, config_manager, auth_manager, network_manager=None): self.config_manager = config_manager self.auth_manager = auth_manager + self.network_manager = network_manager # ── state helpers ───────────────────────────────────────────────────── @@ -270,6 +271,28 @@ class SetupManager: 'HTTPS will activate once registration succeeds.' ) + # ── write the split-horizon DNS zone for non-LAN modes ───────── + # VPN clients use the cell's CoreDNS (DNS=) and must resolve + # the effective domain to the internal Caddy IP so traffic reaches + # Caddy through the tunnel. _bootstrap_dns runs at container start + # BEFORE setup completes (domain_mode still 'lan'), so it takes the + # LAN branch and never writes this zone — leaving CoreDNS pointing + # at a missing zone file and VPN lookups returning nothing + # (dns_probe_finished_bad_config). Write it here now that the mode + # and effective domain are known. + if domain_mode != 'lan' and self.network_manager is not None: + try: + effective_domain = self.config_manager.get_effective_domain() + primary_domain = self.config_manager.get_identity().get('domain', 'cell') + if effective_domain and effective_domain != primary_domain: + caddy_ip = self.network_manager._get_wg_server_ip() + self.network_manager.update_split_horizon_zone( + effective_domain, caddy_ip, primary_domain=primary_domain) + logger.info( + f'Split-horizon zone written for {effective_domain} -> {caddy_ip}') + except Exception as exc: + logger.warning(f'Split-horizon zone setup failed (non-fatal): {exc}') + # ── mark setup complete (must be last) ───────────────────────── self.config_manager.set_identity_field('setup_complete', True) diff --git a/tests/test_firewall_manager.py b/tests/test_firewall_manager.py index 7e29ea9..f3cfd02 100644 --- a/tests/test_firewall_manager.py +++ b/tests/test_firewall_manager.py @@ -132,6 +132,18 @@ class TestGenerateCorefile(unittest.TestCase): content = open(self.path).read() self.assertIn('reload', content) + def test_rewrite_preserves_inode(self): + # Regression: the Corefile is a Docker FILE bind-mount, so it must be + # rewritten in place. os.replace() would swap the inode and the + # container would keep reading stale config forever. + firewall_manager.generate_corefile([], self.path) + first_inode = os.stat(self.path).st_ino + peers = [_make_peer('10.0.0.3', internet=False, services=['calendar'])] + firewall_manager.generate_corefile(peers, self.path) + self.assertEqual(os.stat(self.path).st_ino, first_inode) + # And the new content actually landed in that same inode. + self.assertIn('block net 10.0.0.3/32', open(self.path).read()) + def test_returns_false_on_write_error(self): with unittest.mock.patch('builtins.open', side_effect=OSError('Permission denied')): result = firewall_manager.generate_corefile([], '/any/path/Corefile') diff --git a/tests/test_setup_manager.py b/tests/test_setup_manager.py index e7548c6..e4e8bd2 100644 --- a/tests/test_setup_manager.py +++ b/tests/test_setup_manager.py @@ -37,9 +37,17 @@ def mock_auth_manager(): @pytest.fixture -def setup_manager(mock_config_manager, mock_auth_manager): - """SetupManager wired to both mocks.""" - return SetupManager(mock_config_manager, mock_auth_manager) +def mock_network_manager(): + """A MagicMock standing in for NetworkManager.""" + mgr = MagicMock() + mgr._get_wg_server_ip.return_value = '10.0.0.1' + return mgr + + +@pytest.fixture +def setup_manager(mock_config_manager, mock_auth_manager, mock_network_manager): + """SetupManager wired to all mocks.""" + return SetupManager(mock_config_manager, mock_auth_manager, mock_network_manager) # ── valid payload helper ─────────────────────────────────────────────────────── @@ -225,6 +233,28 @@ def test_complete_setup_returns_success_redirect_on_valid_payload( assert result == {'success': True, 'redirect': '/login'} +def test_complete_setup_writes_split_horizon_zone_for_pic_ngo( + setup_manager, mock_config_manager, mock_auth_manager, mock_network_manager, tmp_path): + """Non-LAN setup must write the split-horizon zone so VPN clients can + resolve the effective domain to the internal Caddy IP through the tunnel. + Regression: this was missing, causing dns_probe_finished_bad_config.""" + mock_config_manager.get_identity.return_value = {'domain': 'roof.local'} + mock_config_manager.get_effective_domain.return_value = 'roof.pic.ngo' + with patch.dict(os.environ, {'DATA_DIR': str(tmp_path)}): + setup_manager.complete_setup(_valid_payload(domain_mode='pic_ngo')) + mock_network_manager.update_split_horizon_zone.assert_called_once_with( + 'roof.pic.ngo', '10.0.0.1', primary_domain='roof.local') + + +def test_complete_setup_skips_split_horizon_zone_for_lan( + setup_manager, mock_config_manager, mock_auth_manager, mock_network_manager, tmp_path): + """LAN mode has no effective public domain — no split-horizon zone.""" + mock_config_manager.get_identity.return_value = {} + with patch.dict(os.environ, {'DATA_DIR': str(tmp_path)}): + setup_manager.complete_setup(_valid_payload(domain_mode='lan')) + mock_network_manager.update_split_horizon_zone.assert_not_called() + + def test_complete_setup_returns_error_when_already_complete( setup_manager, mock_config_manager, tmp_path): """If setup is already done when the lock-protected re-check runs, return error."""