fix: DNS first-install — split-horizon zone creation + CoreDNS inode bind-mount
VPN clients got dns_probe_finished_bad_config / couldn't resolve any domain after first setup because: 1. complete_setup() never wrote the split-horizon DNS zone for non-LAN modes; SetupManager now accepts network_manager as an optional 3rd constructor param, and complete_setup() calls self.network_manager.update_split_horizon_zone(effective_domain, wg_ip, primary_domain) for pic_ngo/cell_to_cell modes. 2. generate_corefile() used a tmp-file + os.replace pattern; the Corefile is a Docker FILE bind-mount, so os.replace orphaned the inode and CoreDNS never saw config updates. Fixed by truncating and rewriting in place (open with 'w', seek(0), truncate()), preserving the inode CoreDNS holds. api/managers.py passes network_manager into SetupManager. Tests: new mock_network_manager fixture, 2 setup-zone tests, 1 inode regression test in test_firewall_manager.py. Verified live on pic1. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -797,12 +797,18 @@ def generate_corefile(peers: List[Dict[str, Any]], corefile_path: str = COREFILE
|
||||
# local.{domain} block intentionally omitted: /data/local.zone does not exist
|
||||
# and CoreDNS logs errors on every reload for a missing zone file.
|
||||
os.makedirs(os.path.dirname(corefile_path), exist_ok=True)
|
||||
tmp_path = corefile_path + '.tmp'
|
||||
with open(tmp_path, 'w') as f:
|
||||
# Write in place (truncate + rewrite the SAME inode) rather than
|
||||
# writing a temp file and os.replace()-ing it in. The Corefile is a
|
||||
# Docker FILE bind-mount (./config/dns/Corefile:/etc/coredns/Corefile);
|
||||
# os.replace creates a NEW inode, but the container stays bound to the
|
||||
# original inode and never sees the update — so CoreDNS silently runs
|
||||
# stale config until the container restarts. CoreDNS only re-reads on
|
||||
# the SIGUSR1 we send right after this completes, so a non-atomic
|
||||
# write is safe here.
|
||||
with open(corefile_path, 'w') as f:
|
||||
f.write(corefile)
|
||||
f.flush()
|
||||
os.fsync(f.fileno())
|
||||
os.replace(tmp_path, corefile_path)
|
||||
|
||||
logger.info(f"Wrote Corefile to {corefile_path}")
|
||||
return True
|
||||
|
||||
+2
-1
@@ -107,7 +107,8 @@ egress_manager = EgressManager(
|
||||
)
|
||||
service_store_manager.egress_manager = egress_manager
|
||||
|
||||
setup_manager = SetupManager(config_manager=config_manager, auth_manager=auth_manager)
|
||||
setup_manager = SetupManager(config_manager=config_manager, auth_manager=auth_manager,
|
||||
network_manager=network_manager)
|
||||
|
||||
# Service logger configuration
|
||||
_service_log_configs = {
|
||||
|
||||
+24
-1
@@ -94,9 +94,10 @@ def _build_ddns_config(domain_mode: str, cloudflare_api_token: str = '',
|
||||
class SetupManager:
|
||||
"""Manages the first-run setup wizard state and completion."""
|
||||
|
||||
def __init__(self, config_manager, auth_manager):
|
||||
def __init__(self, config_manager, auth_manager, network_manager=None):
|
||||
self.config_manager = config_manager
|
||||
self.auth_manager = auth_manager
|
||||
self.network_manager = network_manager
|
||||
|
||||
# ── state helpers ─────────────────────────────────────────────────────
|
||||
|
||||
@@ -270,6 +271,28 @@ class SetupManager:
|
||||
'HTTPS will activate once registration succeeds.'
|
||||
)
|
||||
|
||||
# ── write the split-horizon DNS zone for non-LAN modes ─────────
|
||||
# VPN clients use the cell's CoreDNS (DNS=<wg ip>) and must resolve
|
||||
# the effective domain to the internal Caddy IP so traffic reaches
|
||||
# Caddy through the tunnel. _bootstrap_dns runs at container start
|
||||
# BEFORE setup completes (domain_mode still 'lan'), so it takes the
|
||||
# LAN branch and never writes this zone — leaving CoreDNS pointing
|
||||
# at a missing zone file and VPN lookups returning nothing
|
||||
# (dns_probe_finished_bad_config). Write it here now that the mode
|
||||
# and effective domain are known.
|
||||
if domain_mode != 'lan' and self.network_manager is not None:
|
||||
try:
|
||||
effective_domain = self.config_manager.get_effective_domain()
|
||||
primary_domain = self.config_manager.get_identity().get('domain', 'cell')
|
||||
if effective_domain and effective_domain != primary_domain:
|
||||
caddy_ip = self.network_manager._get_wg_server_ip()
|
||||
self.network_manager.update_split_horizon_zone(
|
||||
effective_domain, caddy_ip, primary_domain=primary_domain)
|
||||
logger.info(
|
||||
f'Split-horizon zone written for {effective_domain} -> {caddy_ip}')
|
||||
except Exception as exc:
|
||||
logger.warning(f'Split-horizon zone setup failed (non-fatal): {exc}')
|
||||
|
||||
# ── mark setup complete (must be last) ─────────────────────────
|
||||
self.config_manager.set_identity_field('setup_complete', True)
|
||||
|
||||
|
||||
@@ -132,6 +132,18 @@ class TestGenerateCorefile(unittest.TestCase):
|
||||
content = open(self.path).read()
|
||||
self.assertIn('reload', content)
|
||||
|
||||
def test_rewrite_preserves_inode(self):
|
||||
# Regression: the Corefile is a Docker FILE bind-mount, so it must be
|
||||
# rewritten in place. os.replace() would swap the inode and the
|
||||
# container would keep reading stale config forever.
|
||||
firewall_manager.generate_corefile([], self.path)
|
||||
first_inode = os.stat(self.path).st_ino
|
||||
peers = [_make_peer('10.0.0.3', internet=False, services=['calendar'])]
|
||||
firewall_manager.generate_corefile(peers, self.path)
|
||||
self.assertEqual(os.stat(self.path).st_ino, first_inode)
|
||||
# And the new content actually landed in that same inode.
|
||||
self.assertIn('block net 10.0.0.3/32', open(self.path).read())
|
||||
|
||||
def test_returns_false_on_write_error(self):
|
||||
with unittest.mock.patch('builtins.open', side_effect=OSError('Permission denied')):
|
||||
result = firewall_manager.generate_corefile([], '/any/path/Corefile')
|
||||
|
||||
@@ -37,9 +37,17 @@ def mock_auth_manager():
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def setup_manager(mock_config_manager, mock_auth_manager):
|
||||
"""SetupManager wired to both mocks."""
|
||||
return SetupManager(mock_config_manager, mock_auth_manager)
|
||||
def mock_network_manager():
|
||||
"""A MagicMock standing in for NetworkManager."""
|
||||
mgr = MagicMock()
|
||||
mgr._get_wg_server_ip.return_value = '10.0.0.1'
|
||||
return mgr
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def setup_manager(mock_config_manager, mock_auth_manager, mock_network_manager):
|
||||
"""SetupManager wired to all mocks."""
|
||||
return SetupManager(mock_config_manager, mock_auth_manager, mock_network_manager)
|
||||
|
||||
|
||||
# ── valid payload helper ───────────────────────────────────────────────────────
|
||||
@@ -225,6 +233,28 @@ def test_complete_setup_returns_success_redirect_on_valid_payload(
|
||||
assert result == {'success': True, 'redirect': '/login'}
|
||||
|
||||
|
||||
def test_complete_setup_writes_split_horizon_zone_for_pic_ngo(
|
||||
setup_manager, mock_config_manager, mock_auth_manager, mock_network_manager, tmp_path):
|
||||
"""Non-LAN setup must write the split-horizon zone so VPN clients can
|
||||
resolve the effective domain to the internal Caddy IP through the tunnel.
|
||||
Regression: this was missing, causing dns_probe_finished_bad_config."""
|
||||
mock_config_manager.get_identity.return_value = {'domain': 'roof.local'}
|
||||
mock_config_manager.get_effective_domain.return_value = 'roof.pic.ngo'
|
||||
with patch.dict(os.environ, {'DATA_DIR': str(tmp_path)}):
|
||||
setup_manager.complete_setup(_valid_payload(domain_mode='pic_ngo'))
|
||||
mock_network_manager.update_split_horizon_zone.assert_called_once_with(
|
||||
'roof.pic.ngo', '10.0.0.1', primary_domain='roof.local')
|
||||
|
||||
|
||||
def test_complete_setup_skips_split_horizon_zone_for_lan(
|
||||
setup_manager, mock_config_manager, mock_auth_manager, mock_network_manager, tmp_path):
|
||||
"""LAN mode has no effective public domain — no split-horizon zone."""
|
||||
mock_config_manager.get_identity.return_value = {}
|
||||
with patch.dict(os.environ, {'DATA_DIR': str(tmp_path)}):
|
||||
setup_manager.complete_setup(_valid_payload(domain_mode='lan'))
|
||||
mock_network_manager.update_split_horizon_zone.assert_not_called()
|
||||
|
||||
|
||||
def test_complete_setup_returns_error_when_already_complete(
|
||||
setup_manager, mock_config_manager, tmp_path):
|
||||
"""If setup is already done when the lock-protected re-check runs, return error."""
|
||||
|
||||
Reference in New Issue
Block a user