fix: complete cross-cell peer-sync push (domain SNI + source-preserving NAT)
Unit Tests / test (push) Successful in 9m45s
Unit Tests / test (push) Successful in 9m45s
Finishes the transport repair (L1+L2 landed in 714fb9b). The push now works
end-to-end between linked cells — verified live: offer/permission state
propagates automatically and the cell_relay derives/reverts without manual steps.
L3 — push by domain, not bare IP (cell_link_manager): the push targeted
https://<vpn-ip>, but in DDNS/ACME mode Caddy only holds a cert for the cell's
domain, so the TLS handshake failed by IP. Target https://<remote-domain> with
`curl --resolve <domain>:443:<dns_ip>` — connect to the VPN IP over the tunnel
but present the domain as SNI/Host. remote_api_url is now domain-based; legacy
http://ip:3000 and https://ip URLs migrate on load.
L4 — preserve the real source for auth (firewall_manager): the blanket
`-o eth0 MASQUERADE` rewrote the push source, so the remote's X-Forwarded-For
source-subnet auth couldn't match. apply_cell_rules adds a tightly-scoped nat
POSTROUTING RETURN (linked-subnet → caddy:443 only) above the masquerade; the
host route returns Caddy's reply through the tunnel. Reviewed by pic-security:
WireGuard per-cell AllowedIPs + Caddy last-XFF (no trusted_proxies) keep this
un-spoofable; the API stays 127.0.0.1-only.
Also:
- validate remote-invite domain/dns_ip/endpoint/subnet at ingest (they reach a
curl --resolve argv — block leading-dash argument-injection).
- remove the host subnet route on cell unlink (remove_cell_subnet_route); the
route was never cleaned up, leaving a stale subnet route that made
is_local_request treat the unlinked cell's subnet as local. Mock firewall side-effects in the affected unit tests.
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
+75
-23
@@ -8,10 +8,12 @@ Each connection is stored in data/cell_links.json and manifests as:
|
||||
- An iptables FORWARD rule set (service-level access control)
|
||||
"""
|
||||
|
||||
import ipaddress
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import subprocess
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from typing import Any, Dict, List, Optional
|
||||
@@ -30,15 +32,56 @@ _BACKOFF_BASE_S = 60
|
||||
_BACKOFF_MAX_S = 3600
|
||||
|
||||
|
||||
def _remote_api_url(dns_ip: Optional[str]) -> Optional[str]:
|
||||
# Strict formats for fields imported from a remote cell's invite. The domain and
|
||||
# dns_ip flow into a `curl --resolve <domain>:443:<dns_ip>` argv (peer-sync push);
|
||||
# anchoring them — domain must start alphanumeric, dns_ip must be an IP — prevents
|
||||
# a malicious invite injecting a leading-dash value that curl reads as a flag.
|
||||
_INVITE_HOSTNAME_RE = re.compile(r'^[A-Za-z0-9]([A-Za-z0-9.-]{0,253}[A-Za-z0-9])?$')
|
||||
_INVITE_CELL_NAME_RE = re.compile(r'^[A-Za-z0-9][A-Za-z0-9 _.-]{0,63}$')
|
||||
_INVITE_ENDPOINT_RE = re.compile(r'^[A-Za-z0-9][A-Za-z0-9._-]*:\d{1,5}$')
|
||||
|
||||
|
||||
def _validate_invite_fields(invite: Dict[str, Any]) -> None:
|
||||
"""Reject a remote cell's invite whose fields aren't strictly well-formed.
|
||||
|
||||
Defence-in-depth: these values come from another cell and reach iptables,
|
||||
DNS config, and a curl argv (the peer-sync push --resolves <domain>:443:
|
||||
<dns_ip>). Anchoring domain/dns_ip/endpoint to start alphanumeric blocks a
|
||||
malicious leading-dash value that curl would read as a flag. The public_key
|
||||
is validated downstream by WireGuardManager.add_cell_peer. Raise ValueError
|
||||
on anything malformed.
|
||||
"""
|
||||
name = invite.get('cell_name', '')
|
||||
if not isinstance(name, str) or not _INVITE_CELL_NAME_RE.match(name):
|
||||
raise ValueError(f'invalid cell_name {name!r}')
|
||||
domain = invite.get('domain', '')
|
||||
if not isinstance(domain, str) or not _INVITE_HOSTNAME_RE.match(domain):
|
||||
raise ValueError(f'invalid domain {domain!r}: must be a hostname')
|
||||
try:
|
||||
ipaddress.ip_address(str(invite.get('dns_ip', '')))
|
||||
except ValueError:
|
||||
raise ValueError(f"invalid dns_ip {invite.get('dns_ip')!r}")
|
||||
try:
|
||||
ipaddress.ip_network(str(invite.get('vpn_subnet', '')), strict=False)
|
||||
except ValueError:
|
||||
raise ValueError(f"invalid vpn_subnet {invite.get('vpn_subnet')!r}")
|
||||
endpoint = invite.get('endpoint')
|
||||
if endpoint and not _INVITE_ENDPOINT_RE.match(str(endpoint)):
|
||||
raise ValueError(f'invalid endpoint {endpoint!r}')
|
||||
|
||||
|
||||
def _remote_api_url(domain: Optional[str]) -> Optional[str]:
|
||||
"""Base URL for a linked cell's API, reached over the WG tunnel.
|
||||
|
||||
Cross-cell peer-sync goes to the remote's Caddy on 443 (the WireGuard server
|
||||
DNATs VPN-IP:443 → Caddy → API). The API's own :3000 is bound to 127.0.0.1
|
||||
and is NOT reachable from another cell, so we must target HTTPS/443, not
|
||||
http://<ip>:3000.
|
||||
DNATs VPN-IP:443 → Caddy → API; the API's own :3000 binds 127.0.0.1 and is
|
||||
unreachable from another cell). The URL uses the remote cell's DOMAIN — not
|
||||
its VPN IP — because Caddy only holds a certificate for the domain (ACME) or
|
||||
the .cell name (internal CA); a request by bare IP has no matching SNI and the
|
||||
TLS handshake fails. The push connects to the VPN IP over the tunnel via
|
||||
`curl --resolve <domain>:443:<dns_ip>` (see _push_permissions_to_remote).
|
||||
"""
|
||||
return f"https://{dns_ip}" if dns_ip else None
|
||||
return f"https://{domain}" if domain else None
|
||||
|
||||
|
||||
def _compute_next_retry(attempts: int) -> str:
|
||||
@@ -76,13 +119,12 @@ class CellLinkManager:
|
||||
link['permissions'] = _default_perms()
|
||||
changed = True
|
||||
# Phase 1 migration: permission-sync tracking fields
|
||||
if 'remote_api_url' not in link:
|
||||
link['remote_api_url'] = _remote_api_url(link.get('dns_ip'))
|
||||
changed = True
|
||||
# Migrate legacy http://<ip>:3000 URLs (unreachable across
|
||||
# cells) to the HTTPS/Caddy form.
|
||||
elif str(link.get('remote_api_url', '')).startswith('http://'):
|
||||
link['remote_api_url'] = _remote_api_url(link.get('dns_ip'))
|
||||
# Domain-based HTTPS URL. Rebuild if missing, or if it's a
|
||||
# legacy form: http://<ip>:3000 (unreachable) or https://<ip>
|
||||
# (no matching Caddy cert by bare IP).
|
||||
_want_url = _remote_api_url(link.get('domain'))
|
||||
if link.get('remote_api_url') != _want_url and _want_url:
|
||||
link['remote_api_url'] = _want_url
|
||||
changed = True
|
||||
if 'last_push_status' not in link:
|
||||
link['last_push_status'] = 'never'
|
||||
@@ -197,19 +239,26 @@ class CellLinkManager:
|
||||
payload = json.dumps(body)
|
||||
endpoint = url.rstrip('/') + '/api/cells/peer-sync/permissions'
|
||||
|
||||
# Determine local WG IP so the remote can authenticate us by source subnet.
|
||||
# MASQUERADE rewrites source to cell-wireguard's eth0 IP (172.20.x.x), which
|
||||
# is NOT in the cell's vpn_subnet. Passing the true WG IP in X-Forwarded-For
|
||||
# lets _authenticate_peer_cell() find the matching cell link.
|
||||
# Determine local WG IP for X-Forwarded-For (belt-and-suspenders for the
|
||||
# remote's source-subnet auth). With the peer-sync masquerade exclusion
|
||||
# the remote's Caddy already sees our real VPN source and appends it, but
|
||||
# passing it explicitly is harmless.
|
||||
local_wg_ip = self._local_wg_ip()
|
||||
xff_header = f'X-Forwarded-For: {local_wg_ip}' if local_wg_ip else None
|
||||
|
||||
# Reach the remote over the WG tunnel by its VPN IP, but present the
|
||||
# cell's DOMAIN as SNI/Host so Caddy serves its certificate — a request
|
||||
# to a bare IP has no matching cert and the TLS handshake fails. -k still
|
||||
# covers LAN mode (internal-CA cert curl won't chain to).
|
||||
domain = link.get('domain')
|
||||
dns_ip = link.get('dns_ip')
|
||||
cmd = [
|
||||
'docker', 'exec', 'cell-wireguard',
|
||||
# -k: the request reaches Caddy by the remote's VPN IP over the
|
||||
# encrypted WG tunnel, so the TLS cert (issued for the cell's domain)
|
||||
# won't match the IP — the tunnel already authenticates the peer.
|
||||
'curl', '-s', '-k', '-o', '/dev/null', '-w', '%{http_code}',
|
||||
]
|
||||
if domain and dns_ip:
|
||||
cmd += ['--resolve', f'{domain}:443:{dns_ip}']
|
||||
cmd += [
|
||||
'-X', 'POST',
|
||||
'-H', 'Content-Type: application/json',
|
||||
]
|
||||
@@ -537,6 +586,7 @@ class CellLinkManager:
|
||||
for field in ('cell_name', 'public_key', 'vpn_subnet', 'dns_ip', 'domain'):
|
||||
if field not in invite:
|
||||
raise ValueError(f"Invite missing field: {field!r}")
|
||||
_validate_invite_fields(invite)
|
||||
|
||||
links = self._load()
|
||||
name = invite['cell_name']
|
||||
@@ -567,7 +617,7 @@ class CellLinkManager:
|
||||
old_domain = existing.get('domain', '')
|
||||
existing['dns_ip'] = invite['dns_ip']
|
||||
existing['vpn_subnet'] = invite['vpn_subnet']
|
||||
existing['remote_api_url'] = _remote_api_url(invite['dns_ip'])
|
||||
existing['remote_api_url'] = _remote_api_url(invite['domain'])
|
||||
if invite.get('endpoint'):
|
||||
existing['endpoint'] = invite['endpoint']
|
||||
if domain_changed:
|
||||
@@ -629,7 +679,7 @@ class CellLinkManager:
|
||||
'domain': invite['domain'],
|
||||
'connected_at': datetime.utcnow().isoformat(),
|
||||
'permissions': _default_perms(),
|
||||
'remote_api_url': _remote_api_url(invite['dns_ip']),
|
||||
'remote_api_url': _remote_api_url(invite['domain']),
|
||||
'last_push_status': 'never',
|
||||
'last_push_at': None,
|
||||
'last_push_error': None,
|
||||
@@ -651,6 +701,7 @@ class CellLinkManager:
|
||||
def add_connection(self, invite: Dict[str, Any],
|
||||
inbound_services: Optional[List[str]] = None) -> Dict[str, Any]:
|
||||
"""Import a remote cell's invite and establish the connection."""
|
||||
_validate_invite_fields(invite)
|
||||
links = self._load()
|
||||
name = invite['cell_name']
|
||||
if any(l['cell_name'] == name for l in links):
|
||||
@@ -689,7 +740,7 @@ class CellLinkManager:
|
||||
'domain': invite['domain'],
|
||||
'connected_at': datetime.utcnow().isoformat(),
|
||||
'permissions': perms,
|
||||
'remote_api_url': _remote_api_url(invite['dns_ip']),
|
||||
'remote_api_url': _remote_api_url(invite['domain']),
|
||||
'last_push_status': 'never',
|
||||
'last_push_at': None,
|
||||
'last_push_error': None,
|
||||
@@ -747,8 +798,9 @@ class CellLinkManager:
|
||||
try:
|
||||
import firewall_manager as _fm
|
||||
_fm.clear_cell_rules(cell_name)
|
||||
_fm.remove_cell_subnet_route(link.get('vpn_subnet', ''))
|
||||
except Exception as e:
|
||||
logger.warning(f"clear_cell_rules for {cell_name} failed (non-fatal): {e}")
|
||||
logger.warning(f"firewall teardown for {cell_name} failed (non-fatal): {e}")
|
||||
|
||||
self.wireguard_manager.remove_peer(link['public_key'])
|
||||
self.network_manager.remove_cell_dns_forward(link['domain'])
|
||||
|
||||
Reference in New Issue
Block a user