fix: advertise WireGuard endpoint by domain, and reach linked cells over HTTPS
Unit Tests / test (push) Successful in 9m50s

Three related cell-link/peer-config fixes (the peer and cell endpoints were
showing the raw external IP, which confused public-vs-internal addressing):

1. Peer WireGuard configs now embed the cell's effective domain (DDNS/ACME
   modes) instead of the detected external IP, via the new
   WireGuardManager.get_advertised_endpoint(). A name that resolves to the
   public IP survives IP changes and lets the datacenter forward each cell's
   WG port to the right host. LAN mode still falls back to the IP; an admin
   wireguard_endpoint override still wins.

2. Cell invites advertise <effective-domain>:<this cell's WG port> (was the
   external IP + a default/possibly-wrong port), so a remote cell pairs to the
   right host and port over the public path.

3. Cross-cell peer-sync no longer targets http://<ip>:3000 (the API binds
   127.0.0.1 and is unreachable across cells). It now targets the remote's
   Caddy on HTTPS/443, which the WireGuard server already DNATs over the
   tunnel. The initial pre-tunnel invite push goes to
   https://<endpoint-host>/..., and legacy http://<ip>:3000 link URLs migrate
   to https on load.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-06-16 04:21:16 -04:00
parent fa746a3b30
commit 1bb8a5eb59
6 changed files with 181 additions and 37 deletions
+48 -18
View File
@@ -30,6 +30,17 @@ _BACKOFF_BASE_S = 60
_BACKOFF_MAX_S = 3600
def _remote_api_url(dns_ip: Optional[str]) -> Optional[str]:
"""Base URL for a linked cell's API, reached over the WG tunnel.
Cross-cell peer-sync goes to the remote's Caddy on 443 (the WireGuard server
DNATs VPN-IP:443 → Caddy → API). The API's own :3000 is bound to 127.0.0.1
and is NOT reachable from another cell, so we must target HTTPS/443, not
http://<ip>:3000.
"""
return f"https://{dns_ip}" if dns_ip else None
def _compute_next_retry(attempts: int) -> str:
"""Return an ISO timestamp for the earliest next retry using capped exponential backoff."""
delay = min(_BACKOFF_BASE_S * (2 ** (attempts - 1)), _BACKOFF_MAX_S)
@@ -66,10 +77,12 @@ class CellLinkManager:
changed = True
# Phase 1 migration: permission-sync tracking fields
if 'remote_api_url' not in link:
link['remote_api_url'] = (
f"http://{link['dns_ip']}:3000"
if link.get('dns_ip') else None
)
link['remote_api_url'] = _remote_api_url(link.get('dns_ip'))
changed = True
# Migrate legacy http://<ip>:3000 URLs (unreachable across
# cells) to the HTTPS/Caddy form.
elif str(link.get('remote_api_url', '')).startswith('http://'):
link['remote_api_url'] = _remote_api_url(link.get('dns_ip'))
changed = True
if 'last_push_status' not in link:
link['last_push_status'] = 'never'
@@ -193,7 +206,10 @@ class CellLinkManager:
cmd = [
'docker', 'exec', 'cell-wireguard',
'curl', '-s', '-o', '/dev/null', '-w', '%{http_code}',
# -k: the request reaches Caddy by the remote's VPN IP over the
# encrypted WG tunnel, so the TLS cert (issued for the cell's domain)
# won't match the IP — the tunnel already authenticates the peer.
'curl', '-s', '-k', '-o', '/dev/null', '-w', '%{http_code}',
'-X', 'POST',
'-H', 'Content-Type: application/json',
]
@@ -371,14 +387,24 @@ class CellLinkManager:
# ── Public API ────────────────────────────────────────────────────────────
def generate_invite(self, cell_name: str, domain: str) -> Dict[str, Any]:
"""Return an invite package describing this cell for another cell to import."""
"""Return an invite package describing this cell for another cell to import.
The endpoint advertises the cell's public domain (when in a DDNS/ACME
mode) plus this cell's own WireGuard port, rather than a raw external IP —
so the remote cell reaches us by name and a NAT/router can forward each
cell's distinct WG port to the right host.
"""
keys = self.wireguard_manager.get_keys()
srv = self.wireguard_manager.get_server_config()
server_vpn_ip = self.wireguard_manager._get_configured_address().split('/')[0]
try:
from app import config_manager as _cm
except Exception:
_cm = None
endpoint = self.wireguard_manager.get_advertised_endpoint(_cm)
return {
'cell_name': cell_name,
'public_key': keys['public_key'],
'endpoint': srv.get('endpoint'),
'endpoint': endpoint,
'vpn_subnet': self.wireguard_manager._get_configured_network(),
'dns_ip': server_vpn_ip,
'domain': domain,
@@ -448,15 +474,16 @@ class CellLinkManager:
def _push_invite_to_remote(self, link: Dict[str, Any]) -> Dict[str, Any]:
"""Send OUR invite to the remote cell so it can complete mutual WG pairing.
Called immediately after adding the remote as our WG peer. Uses the
remote's endpoint IP (LAN-reachable before the WG tunnel is up) rather
than the WG-internal dns_ip. Non-fatal — one-sided pairing degrades
gracefully; the admin can pair from the other side manually.
Called immediately after adding the remote as our WG peer, before the WG
tunnel is up. Reaches the remote over the PUBLIC path at its advertised
endpoint host (a domain in DDNS/ACME modes) on Caddy/443 — the API's :3000
is 127.0.0.1-only and not reachable across cells. Non-fatal — one-sided
pairing degrades gracefully; the admin can pair from the other side.
"""
endpoint = link.get('endpoint') or ''
if not endpoint:
return {'ok': False, 'error': 'no endpoint'}
# Parse LAN IP from endpoint (e.g. "192.168.31.52:51820" → "192.168.31.52")
# Host from endpoint (e.g. "alice.pic.ngo:51821" → "alice.pic.ngo").
try:
host = endpoint.rsplit(':', 1)[0].strip('[]')
except Exception:
@@ -471,11 +498,14 @@ class CellLinkManager:
except Exception as e:
return {'ok': False, 'error': f'could not build own invite: {e}'}
url = f'http://{host}:3000/api/cells/peer-sync/accept-invite'
url = f'https://{host}/api/cells/peer-sync/accept-invite'
payload = json.dumps({'invite': own_invite})
cmd = [
'docker', 'exec', 'cell-wireguard',
'curl', '-s', '-o', '/dev/null', '-w', '%{http_code}',
# -k: endpoint may be a bare IP (LAN/fallback) whose cert won't match;
# accept-invite carries only public keys and the WG handshake is the
# real authentication.
'curl', '-s', '-k', '-o', '/dev/null', '-w', '%{http_code}',
'-X', 'POST',
'-H', 'Content-Type: application/json',
'-d', payload,
@@ -537,7 +567,7 @@ class CellLinkManager:
old_domain = existing.get('domain', '')
existing['dns_ip'] = invite['dns_ip']
existing['vpn_subnet'] = invite['vpn_subnet']
existing['remote_api_url'] = f"http://{invite['dns_ip']}:3000"
existing['remote_api_url'] = _remote_api_url(invite['dns_ip'])
if invite.get('endpoint'):
existing['endpoint'] = invite['endpoint']
if domain_changed:
@@ -599,7 +629,7 @@ class CellLinkManager:
'domain': invite['domain'],
'connected_at': datetime.utcnow().isoformat(),
'permissions': _default_perms(),
'remote_api_url': f"http://{invite['dns_ip']}:3000",
'remote_api_url': _remote_api_url(invite['dns_ip']),
'last_push_status': 'never',
'last_push_at': None,
'last_push_error': None,
@@ -659,7 +689,7 @@ class CellLinkManager:
'domain': invite['domain'],
'connected_at': datetime.utcnow().isoformat(),
'permissions': perms,
'remote_api_url': f"http://{invite['dns_ip']}:3000",
'remote_api_url': _remote_api_url(invite['dns_ip']),
'last_push_status': 'never',
'last_push_at': None,
'last_push_error': None,