fix: advertise WireGuard endpoint by domain, and reach linked cells over HTTPS
Unit Tests / test (push) Successful in 9m50s
Unit Tests / test (push) Successful in 9m50s
Three related cell-link/peer-config fixes (the peer and cell endpoints were showing the raw external IP, which confused public-vs-internal addressing):

1. Peer WireGuard configs now embed the cell's effective domain (DDNS/ACME modes) instead of the detected external IP, via the new WireGuardManager.get_advertised_endpoint(). A name that resolves to the public IP survives IP changes and lets the datacenter forward each cell's WG port to the right host. LAN mode still falls back to the IP; an admin wireguard_endpoint override still wins.

2. Cell invites advertise <effective-domain>:<this cell's WG port> (was the external IP + a default/possibly-wrong port), so a remote cell pairs to the right host and port over the public path.

3. Cross-cell peer-sync no longer targets http://<ip>:3000 (the API binds 127.0.0.1 and is unreachable across cells). It targets the remote's Caddy on HTTPS/443 — which the WireGuard server already DNATs over the tunnel — and the initial pre-tunnel invite push goes to https://<endpoint-host>/... ; legacy http://<ip>:3000 link URLs migrate to https on load.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
+48
-18
@@ -30,6 +30,17 @@ _BACKOFF_BASE_S = 60
|
||||
_BACKOFF_MAX_S = 3600
|
||||
|
||||
|
||||
def _remote_api_url(dns_ip: Optional[str]) -> Optional[str]:
|
||||
"""Base URL for a linked cell's API, reached over the WG tunnel.
|
||||
|
||||
Cross-cell peer-sync goes to the remote's Caddy on 443 (the WireGuard server
|
||||
DNATs VPN-IP:443 → Caddy → API). The API's own :3000 is bound to 127.0.0.1
|
||||
and is NOT reachable from another cell, so we must target HTTPS/443, not
|
||||
http://<ip>:3000.
|
||||
"""
|
||||
return f"https://{dns_ip}" if dns_ip else None
|
||||
|
||||
|
||||
def _compute_next_retry(attempts: int) -> str:
|
||||
"""Return an ISO timestamp for the earliest next retry using capped exponential backoff."""
|
||||
delay = min(_BACKOFF_BASE_S * (2 ** (attempts - 1)), _BACKOFF_MAX_S)
|
||||
@@ -66,10 +77,12 @@ class CellLinkManager:
|
||||
changed = True
|
||||
# Phase 1 migration: permission-sync tracking fields
|
||||
if 'remote_api_url' not in link:
|
||||
link['remote_api_url'] = (
|
||||
f"http://{link['dns_ip']}:3000"
|
||||
if link.get('dns_ip') else None
|
||||
)
|
||||
link['remote_api_url'] = _remote_api_url(link.get('dns_ip'))
|
||||
changed = True
|
||||
# Migrate legacy http://<ip>:3000 URLs (unreachable across
|
||||
# cells) to the HTTPS/Caddy form.
|
||||
elif str(link.get('remote_api_url', '')).startswith('http://'):
|
||||
link['remote_api_url'] = _remote_api_url(link.get('dns_ip'))
|
||||
changed = True
|
||||
if 'last_push_status' not in link:
|
||||
link['last_push_status'] = 'never'
|
||||
@@ -193,7 +206,10 @@ class CellLinkManager:
|
||||
|
||||
cmd = [
|
||||
'docker', 'exec', 'cell-wireguard',
|
||||
'curl', '-s', '-o', '/dev/null', '-w', '%{http_code}',
|
||||
# -k: the request reaches Caddy by the remote's VPN IP over the
|
||||
# encrypted WG tunnel, so the TLS cert (issued for the cell's domain)
|
||||
# won't match the IP — the tunnel already authenticates the peer.
|
||||
'curl', '-s', '-k', '-o', '/dev/null', '-w', '%{http_code}',
|
||||
'-X', 'POST',
|
||||
'-H', 'Content-Type: application/json',
|
||||
]
|
||||
@@ -371,14 +387,24 @@ class CellLinkManager:
|
||||
# ── Public API ────────────────────────────────────────────────────────────
|
||||
|
||||
def generate_invite(self, cell_name: str, domain: str) -> Dict[str, Any]:
|
||||
"""Return an invite package describing this cell for another cell to import."""
|
||||
"""Return an invite package describing this cell for another cell to import.
|
||||
|
||||
The endpoint advertises the cell's public domain (when in a DDNS/ACME
|
||||
mode) plus this cell's own WireGuard port, rather than a raw external IP —
|
||||
so the remote cell reaches us by name and a NAT/router can forward each
|
||||
cell's distinct WG port to the right host.
|
||||
"""
|
||||
keys = self.wireguard_manager.get_keys()
|
||||
srv = self.wireguard_manager.get_server_config()
|
||||
server_vpn_ip = self.wireguard_manager._get_configured_address().split('/')[0]
|
||||
try:
|
||||
from app import config_manager as _cm
|
||||
except Exception:
|
||||
_cm = None
|
||||
endpoint = self.wireguard_manager.get_advertised_endpoint(_cm)
|
||||
return {
|
||||
'cell_name': cell_name,
|
||||
'public_key': keys['public_key'],
|
||||
'endpoint': srv.get('endpoint'),
|
||||
'endpoint': endpoint,
|
||||
'vpn_subnet': self.wireguard_manager._get_configured_network(),
|
||||
'dns_ip': server_vpn_ip,
|
||||
'domain': domain,
|
||||
@@ -448,15 +474,16 @@ class CellLinkManager:
|
||||
def _push_invite_to_remote(self, link: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Send OUR invite to the remote cell so it can complete mutual WG pairing.
|
||||
|
||||
Called immediately after adding the remote as our WG peer. Uses the
|
||||
remote's endpoint IP (LAN-reachable before the WG tunnel is up) rather
|
||||
than the WG-internal dns_ip. Non-fatal — one-sided pairing degrades
|
||||
gracefully; the admin can pair from the other side manually.
|
||||
Called immediately after adding the remote as our WG peer, before the WG
|
||||
tunnel is up. Reaches the remote over the PUBLIC path at its advertised
|
||||
endpoint host (a domain in DDNS/ACME modes) on Caddy/443 — the API's :3000
|
||||
is 127.0.0.1-only and not reachable across cells. Non-fatal — one-sided
|
||||
pairing degrades gracefully; the admin can pair from the other side.
|
||||
"""
|
||||
endpoint = link.get('endpoint') or ''
|
||||
if not endpoint:
|
||||
return {'ok': False, 'error': 'no endpoint'}
|
||||
# Parse LAN IP from endpoint (e.g. "192.168.31.52:51820" → "192.168.31.52")
|
||||
# Host from endpoint (e.g. "alice.pic.ngo:51821" → "alice.pic.ngo").
|
||||
try:
|
||||
host = endpoint.rsplit(':', 1)[0].strip('[]')
|
||||
except Exception:
|
||||
@@ -471,11 +498,14 @@ class CellLinkManager:
|
||||
except Exception as e:
|
||||
return {'ok': False, 'error': f'could not build own invite: {e}'}
|
||||
|
||||
url = f'http://{host}:3000/api/cells/peer-sync/accept-invite'
|
||||
url = f'https://{host}/api/cells/peer-sync/accept-invite'
|
||||
payload = json.dumps({'invite': own_invite})
|
||||
cmd = [
|
||||
'docker', 'exec', 'cell-wireguard',
|
||||
'curl', '-s', '-o', '/dev/null', '-w', '%{http_code}',
|
||||
# -k: endpoint may be a bare IP (LAN/fallback) whose cert won't match;
|
||||
# accept-invite carries only public keys and the WG handshake is the
|
||||
# real authentication.
|
||||
'curl', '-s', '-k', '-o', '/dev/null', '-w', '%{http_code}',
|
||||
'-X', 'POST',
|
||||
'-H', 'Content-Type: application/json',
|
||||
'-d', payload,
|
||||
@@ -537,7 +567,7 @@ class CellLinkManager:
|
||||
old_domain = existing.get('domain', '')
|
||||
existing['dns_ip'] = invite['dns_ip']
|
||||
existing['vpn_subnet'] = invite['vpn_subnet']
|
||||
existing['remote_api_url'] = f"http://{invite['dns_ip']}:3000"
|
||||
existing['remote_api_url'] = _remote_api_url(invite['dns_ip'])
|
||||
if invite.get('endpoint'):
|
||||
existing['endpoint'] = invite['endpoint']
|
||||
if domain_changed:
|
||||
@@ -599,7 +629,7 @@ class CellLinkManager:
|
||||
'domain': invite['domain'],
|
||||
'connected_at': datetime.utcnow().isoformat(),
|
||||
'permissions': _default_perms(),
|
||||
'remote_api_url': f"http://{invite['dns_ip']}:3000",
|
||||
'remote_api_url': _remote_api_url(invite['dns_ip']),
|
||||
'last_push_status': 'never',
|
||||
'last_push_at': None,
|
||||
'last_push_error': None,
|
||||
@@ -659,7 +689,7 @@ class CellLinkManager:
|
||||
'domain': invite['domain'],
|
||||
'connected_at': datetime.utcnow().isoformat(),
|
||||
'permissions': perms,
|
||||
'remote_api_url': f"http://{invite['dns_ip']}:3000",
|
||||
'remote_api_url': _remote_api_url(invite['dns_ip']),
|
||||
'last_push_status': 'never',
|
||||
'last_push_at': None,
|
||||
'last_push_error': None,
|
||||
|
||||
Reference in New Issue
Block a user