fix: complete cross-cell peer-sync push (domain SNI + source-preserving NAT)
Unit Tests / test (push) Successful in 9m45s

Finishes the transport repair (L1+L2 landed in 714fb9b). The push now works
end-to-end between linked cells — verified live: offer/permission state
propagates automatically and the cell_relay derives/reverts without manual steps.

L3 — push by domain, not bare IP (cell_link_manager): the push targeted
https://<vpn-ip>, but in DDNS/ACME mode Caddy only holds a cert for the cell's
domain, so the TLS handshake failed by IP. Target https://<remote-domain> with
`curl --resolve <domain>:443:<dns_ip>` — connect to the VPN IP over the tunnel
but present the domain as SNI/Host. remote_api_url is now domain-based; legacy
http://ip:3000 and https://ip URLs migrate on load.

L4 — preserve the real source for auth (firewall_manager): the blanket
`-o eth0 MASQUERADE` rewrote the push source, so the remote's X-Forwarded-For
source-subnet auth couldn't match. apply_cell_rules adds a tightly-scoped nat
POSTROUTING RETURN (linked-subnet → caddy:443 only) above the masquerade; the
host route returns Caddy's reply through the tunnel. Reviewed by pic-security:
WireGuard per-cell AllowedIPs + Caddy last-XFF (no trusted_proxies) keep this
un-spoofable; the API stays 127.0.0.1-only.

Also:
- validate remote-invite domain/dns_ip/endpoint/subnet at ingest (they reach a
  curl --resolve argv — block leading-dash argument-injection).
- remove the host subnet route on cell unlink (remove_cell_subnet_route); the
  route was never cleaned, leaving a stale subnet that made is_local_request
  treat it as local. Mock firewall side-effects in the affected unit tests.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-06-17 01:02:20 -04:00
parent 714fb9b1a9
commit 639fb66e5b
4 changed files with 250 additions and 46 deletions
+75 -23
View File
@@ -8,10 +8,12 @@ Each connection is stored in data/cell_links.json and manifests as:
- An iptables FORWARD rule set (service-level access control)
"""
import ipaddress
import json
import logging
import os
import random
import re
import subprocess
from datetime import datetime, timezone, timedelta
from typing import Any, Dict, List, Optional
@@ -30,15 +32,56 @@ _BACKOFF_BASE_S = 60
_BACKOFF_MAX_S = 3600
def _remote_api_url(dns_ip: Optional[str]) -> Optional[str]:
# Strict formats for fields imported from a remote cell's invite. The domain and
# dns_ip flow into a `curl --resolve <domain>:443:<dns_ip>` argv (peer-sync push);
# anchoring them — domain must start alphanumeric, dns_ip must be an IP — prevents
# a malicious invite injecting a leading-dash value that curl reads as a flag.
_INVITE_HOSTNAME_RE = re.compile(r'^[A-Za-z0-9]([A-Za-z0-9.-]{0,253}[A-Za-z0-9])?$')
_INVITE_CELL_NAME_RE = re.compile(r'^[A-Za-z0-9][A-Za-z0-9 _.-]{0,63}$')
_INVITE_ENDPOINT_RE = re.compile(r'^[A-Za-z0-9][A-Za-z0-9._-]*:\d{1,5}$')
def _validate_invite_fields(invite: Dict[str, Any]) -> None:
"""Reject a remote cell's invite whose fields aren't strictly well-formed.
Defence-in-depth: these values come from another cell and reach iptables,
DNS config, and a curl argv (the peer-sync push --resolves <domain>:443:
<dns_ip>). Anchoring domain/dns_ip/endpoint to start alphanumeric blocks a
malicious leading-dash value that curl would read as a flag. The public_key
is validated downstream by WireGuardManager.add_cell_peer. Raise ValueError
on anything malformed.
"""
name = invite.get('cell_name', '')
if not isinstance(name, str) or not _INVITE_CELL_NAME_RE.match(name):
raise ValueError(f'invalid cell_name {name!r}')
domain = invite.get('domain', '')
if not isinstance(domain, str) or not _INVITE_HOSTNAME_RE.match(domain):
raise ValueError(f'invalid domain {domain!r}: must be a hostname')
try:
ipaddress.ip_address(str(invite.get('dns_ip', '')))
except ValueError:
raise ValueError(f"invalid dns_ip {invite.get('dns_ip')!r}")
try:
ipaddress.ip_network(str(invite.get('vpn_subnet', '')), strict=False)
except ValueError:
raise ValueError(f"invalid vpn_subnet {invite.get('vpn_subnet')!r}")
endpoint = invite.get('endpoint')
if endpoint and not _INVITE_ENDPOINT_RE.match(str(endpoint)):
raise ValueError(f'invalid endpoint {endpoint!r}')
def _remote_api_url(domain: Optional[str]) -> Optional[str]:
"""Base URL for a linked cell's API, reached over the WG tunnel.
Cross-cell peer-sync goes to the remote's Caddy on 443 (the WireGuard server
DNATs VPN-IP:443 → Caddy → API). The API's own :3000 is bound to 127.0.0.1
and is NOT reachable from another cell, so we must target HTTPS/443, not
http://<ip>:3000.
DNATs VPN-IP:443 → Caddy → API; the API's own :3000 binds 127.0.0.1 and is
unreachable from another cell). The URL uses the remote cell's DOMAIN — not
its VPN IP — because Caddy only holds a certificate for the domain (ACME) or
the .cell name (internal CA); a request by bare IP has no matching SNI and the
TLS handshake fails. The push connects to the VPN IP over the tunnel via
`curl --resolve <domain>:443:<dns_ip>` (see _push_permissions_to_remote).
"""
return f"https://{dns_ip}" if dns_ip else None
return f"https://{domain}" if domain else None
def _compute_next_retry(attempts: int) -> str:
@@ -76,13 +119,12 @@ class CellLinkManager:
link['permissions'] = _default_perms()
changed = True
# Phase 1 migration: permission-sync tracking fields
if 'remote_api_url' not in link:
link['remote_api_url'] = _remote_api_url(link.get('dns_ip'))
changed = True
# Migrate legacy http://<ip>:3000 URLs (unreachable across
# cells) to the HTTPS/Caddy form.
elif str(link.get('remote_api_url', '')).startswith('http://'):
link['remote_api_url'] = _remote_api_url(link.get('dns_ip'))
# Domain-based HTTPS URL. Rebuild if missing, or if it's a
# legacy form: http://<ip>:3000 (unreachable) or https://<ip>
# (no matching Caddy cert by bare IP).
_want_url = _remote_api_url(link.get('domain'))
if link.get('remote_api_url') != _want_url and _want_url:
link['remote_api_url'] = _want_url
changed = True
if 'last_push_status' not in link:
link['last_push_status'] = 'never'
@@ -197,19 +239,26 @@ class CellLinkManager:
payload = json.dumps(body)
endpoint = url.rstrip('/') + '/api/cells/peer-sync/permissions'
# Determine local WG IP so the remote can authenticate us by source subnet.
# MASQUERADE rewrites source to cell-wireguard's eth0 IP (172.20.x.x), which
# is NOT in the cell's vpn_subnet. Passing the true WG IP in X-Forwarded-For
# lets _authenticate_peer_cell() find the matching cell link.
# Determine local WG IP for X-Forwarded-For (belt-and-suspenders for the
# remote's source-subnet auth). With the peer-sync masquerade exclusion
# the remote's Caddy already sees our real VPN source and appends it, but
# passing it explicitly is harmless.
local_wg_ip = self._local_wg_ip()
xff_header = f'X-Forwarded-For: {local_wg_ip}' if local_wg_ip else None
# Reach the remote over the WG tunnel by its VPN IP, but present the
# cell's DOMAIN as SNI/Host so Caddy serves its certificate — a request
# to a bare IP has no matching cert and the TLS handshake fails. -k still
# covers LAN mode (internal-CA cert curl won't chain to).
domain = link.get('domain')
dns_ip = link.get('dns_ip')
cmd = [
'docker', 'exec', 'cell-wireguard',
# -k: the request reaches Caddy by the remote's VPN IP over the
# encrypted WG tunnel, so the TLS cert (issued for the cell's domain)
# won't match the IP — the tunnel already authenticates the peer.
'curl', '-s', '-k', '-o', '/dev/null', '-w', '%{http_code}',
]
if domain and dns_ip:
cmd += ['--resolve', f'{domain}:443:{dns_ip}']
cmd += [
'-X', 'POST',
'-H', 'Content-Type: application/json',
]
@@ -537,6 +586,7 @@ class CellLinkManager:
for field in ('cell_name', 'public_key', 'vpn_subnet', 'dns_ip', 'domain'):
if field not in invite:
raise ValueError(f"Invite missing field: {field!r}")
_validate_invite_fields(invite)
links = self._load()
name = invite['cell_name']
@@ -567,7 +617,7 @@ class CellLinkManager:
old_domain = existing.get('domain', '')
existing['dns_ip'] = invite['dns_ip']
existing['vpn_subnet'] = invite['vpn_subnet']
existing['remote_api_url'] = _remote_api_url(invite['dns_ip'])
existing['remote_api_url'] = _remote_api_url(invite['domain'])
if invite.get('endpoint'):
existing['endpoint'] = invite['endpoint']
if domain_changed:
@@ -629,7 +679,7 @@ class CellLinkManager:
'domain': invite['domain'],
'connected_at': datetime.utcnow().isoformat(),
'permissions': _default_perms(),
'remote_api_url': _remote_api_url(invite['dns_ip']),
'remote_api_url': _remote_api_url(invite['domain']),
'last_push_status': 'never',
'last_push_at': None,
'last_push_error': None,
@@ -651,6 +701,7 @@ class CellLinkManager:
def add_connection(self, invite: Dict[str, Any],
inbound_services: Optional[List[str]] = None) -> Dict[str, Any]:
"""Import a remote cell's invite and establish the connection."""
_validate_invite_fields(invite)
links = self._load()
name = invite['cell_name']
if any(l['cell_name'] == name for l in links):
@@ -689,7 +740,7 @@ class CellLinkManager:
'domain': invite['domain'],
'connected_at': datetime.utcnow().isoformat(),
'permissions': perms,
'remote_api_url': _remote_api_url(invite['dns_ip']),
'remote_api_url': _remote_api_url(invite['domain']),
'last_push_status': 'never',
'last_push_at': None,
'last_push_error': None,
@@ -747,8 +798,9 @@ class CellLinkManager:
try:
import firewall_manager as _fm
_fm.clear_cell_rules(cell_name)
_fm.remove_cell_subnet_route(link.get('vpn_subnet', ''))
except Exception as e:
logger.warning(f"clear_cell_rules for {cell_name} failed (non-fatal): {e}")
logger.warning(f"firewall teardown for {cell_name} failed (non-fatal): {e}")
self.wireguard_manager.remove_peer(link['public_key'])
self.network_manager.remove_cell_dns_forward(link['domain'])
+34
View File
@@ -438,11 +438,26 @@ def apply_cell_rules(cell_name: str, vpn_subnet: str, inbound_services: List[str
_iptables(['-I', 'FORWARD', '-s', vpn_subnet, '-d', caddy_ip,
'-p', 'tcp', '--dport', '443',
'-m', 'comment', '--comment', tag, '-j', 'ACCEPT'])
# Preserve the linked cell's real VPN source on peer-sync traffic:
# the blanket `-o eth0 MASQUERADE` would rewrite it to cell-wireguard's
# bridge IP, and the remote side authenticates the push by matching the
# source (via X-Forwarded-For) to the cell's VPN subnet. RETURN before
# the MASQUERADE (inserted at the top of nat POSTROUTING). Caddy's reply
# to the real VPN IP routes back via the cell-subnet host route
# (ensure_cell_subnet_routes). The :80 service path keeps masquerade.
_iptables(['-t', 'nat', '-I', 'POSTROUTING', '-s', vpn_subnet,
'-d', caddy_ip, '-p', 'tcp', '--dport', '443',
'-m', 'comment', '--comment', tag, '-j', 'RETURN'])
# Ensure reply traffic (e.g. ICMP, TCP ACKs) for connections initiated
# by local peers to this cell is not dropped by the cell's catch-all DROP.
ensure_forward_stateful()
# Host route so Caddy's peer-sync reply (to the linked cell's un-masqueraded
# VPN IP) leaves via cell-wireguard rather than the default gateway. Added at
# startup for all links; ensure it on runtime link-add too. Idempotent.
ensure_cell_subnet_routes([{'vpn_subnet': vpn_subnet}])
logger.info(
f"Applied cell rules for {cell_name} ({vpn_subnet}): "
f"inbound={inbound_services} exit_relay={exit_relay}"
@@ -689,6 +704,25 @@ def ensure_cell_subnet_routes(cell_links: List[Dict[str, Any]]) -> None:
logger.warning(f'ensure_cell_subnet_routes: {subnet}: {e}')
def remove_cell_subnet_route(vpn_subnet: str) -> None:
"""Remove the host route for a disconnected cell's VPN subnet (idempotent).
Counterpart to ensure_cell_subnet_routes. Without it the route lingers after a
cell is unlinked — blackholing that subnet via cell-wireguard, and (on a host
that runs the API/tests directly, e.g. a dev box) making is_local_request /
_local_subnets treat the stale subnet as locally attached.
"""
if not vpn_subnet:
return
WG_BRIDGE_IP = '172.20.0.9'
try:
_run(['docker', 'run', '--rm', '--network', 'host', '--cap-add', 'NET_ADMIN',
'alpine', 'ip', 'route', 'del', vpn_subnet, 'via', WG_BRIDGE_IP],
check=False)
except Exception as e:
logger.warning(f'remove_cell_subnet_route: {vpn_subnet}: {e}')
# ---------------------------------------------------------------------------
# DNS ACL (CoreDNS Corefile generation)
# ---------------------------------------------------------------------------