feat: connectivity redesign phase 3+4 — per-connection health, per-peer fallback, connection CRUD API
Unit Tests / test (push) Successful in 13m15s

Health probes (probe_health/refresh_health) are type-aware: WireGuard
checks the last WG handshake timestamp, OpenVPN checks the tun/tap
interface, Tor checks the control-port GETINFO, and sshuttle/proxy
types do a TCP reachability probe to the remote endpoint. Results are
persisted via set_connection_status and wired into the health_monitor_loop
so the UI always has a current health snapshot without polling.

Per-peer fail-open semantics: VPN, SSH, and proxy connections default to
fail-closed (kill-switch stays active even when the tunnel is down).
Tor defaults to fail-open. The default can be overridden per-peer via
set_peer_failopen/effective_failopen. apply_routes skips the fwmark and
kill-switch rules for any fail-open peer whose connection health is not
"working", letting traffic fall back to direct routing transparently.

New generic admin-only connection CRUD endpoints (GET/POST/PUT/DELETE
/api/connectivity/connections, GET /<id>/health, PUT
/api/connectivity/peers/<peer>/failopen) are guarded by the existing
admin role check. connection.create, connection.update, connection.delete,
and peer.failopen are all registered in ROUTE_ACTION_MAP for the audit
hook so every change is recorded in the owner-visible change log.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-06-10 21:50:45 -04:00
parent 8b50fb1036
commit d39c091cec
6 changed files with 1249 additions and 2 deletions
+303 -2
View File
@@ -43,6 +43,7 @@ import logging
import os
import re
import secrets
import socket
import threading
import time
from typing import Any, Dict, List, Optional, Tuple
@@ -154,6 +155,13 @@ class ConnectivityManager(BaseServiceManager):
REDIRECT_PORT_BASE = 9100
REDIRECT_PORT_MAX = 9199
# WireGuard handshake older than this (seconds) means the tunnel is down.
WG_HANDSHAKE_MAX_AGE = 180
# Cached health is reused for this long so on-demand GETs don't re-probe.
HEALTH_TTL = 30
# TCP connect timeout for socket-based probes (sshuttle/proxy).
PROBE_TCP_TIMEOUT = 3
IFACE_PREFIXES = {"wireguard_ext": "wgext_", "openvpn": "ovpn_"}
CONNECTION_NAME_RE = re.compile(r'^[A-Za-z0-9][A-Za-z0-9 _.-]{0,63}$')
@@ -1374,7 +1382,16 @@ class ConnectivityManager(BaseServiceManager):
f"apply_routes: ip rule {conn.get('id')} failed: {e}")
# Per-peer marking + nat redirect, resolved through each peer's
# connection instance.
# connection instance. A peer whose connection is unhealthy AND whose
# effective fail-open is True is skipped entirely: no mark, so its
# traffic falls through to the default route (direct internet) instead
# of being blocked by the kill-switch. Fail-closed peers keep their mark
# so the kill-switch blocks them while the tunnel is down.
#
# marked_conn_ids tracks connections that actually have at least one
# marked peer, so their kill-switch stays installed; a connection whose
# only peers are all fail-open-and-down gets no kill-switch.
marked_conn_ids: set = set()
if self.peer_registry is not None:
try:
peers = self.peer_registry.list_peers()
@@ -1387,18 +1404,27 @@ class ConnectivityManager(BaseServiceManager):
conn = self._resolve_peer_connection(peer, by_id)
if conn is None:
continue
if self._peer_fails_open(peer, conn):
logger.info(
f"apply_routes: peer {peer.get('peer')!r} fails open over "
f"down connection {conn.get('id')!r}; skipping mark")
continue
src_ip = self._peer_source_ip(peer.get('peer', ''))
if not src_ip:
continue
rules_applied += self._apply_connection_for_src(src_ip, conn)
marked_conn_ids.add(conn.get('id'))
# Kill-switch: drop marked packets that would otherwise leak via the
# default route if an iface-based exit interface is down.
# default route if an iface-based exit interface is down. Installed only
# for connections that still have at least one marked peer.
for conn in connections:
iface = conn.get('iface')
mark = conn.get('mark')
if not iface or not isinstance(mark, int):
continue
if conn.get('id') not in marked_conn_ids:
continue
try:
self._add_killswitch(mark, iface)
rules_applied += 1
@@ -1408,6 +1434,20 @@ class ConnectivityManager(BaseServiceManager):
return {'ok': True, 'rules_applied': rules_applied}
def _peer_fails_open(self, peer: Dict[str, Any],
conn: Dict[str, Any]) -> bool:
"""True when this peer should fall back to the default route.
A peer falls back only when its connection is unhealthy (health is
explicitly 'down') AND its effective fail-open is True. A 'working' or
'unknown' connection always routes normally so a stale/never-probed
status never silently drops the tunnel.
"""
health = (conn.get('status') or {}).get('health')
if health != 'down':
return False
return self.effective_failopen(peer, conn)
def _routing_connections(self) -> List[Dict[str, Any]]:
"""Return the connection instances that drive routing (enabled only)."""
if self.config_manager is None:
@@ -1631,6 +1671,267 @@ class ConnectivityManager(BaseServiceManager):
except Exception:
return False
# ── Health probing ────────────────────────────────────────────────────
#
# probe_health(connection) returns 'working' | 'down' | 'unknown' per type.
# All real subprocess/socket calls are factored behind small helpers
# (_exec_in_container, _tcp_reachable) so tests patch those, never raw
# subprocess/socket. refresh_health() persists results via
# config_manager.set_connection_status and caches them for HEALTH_TTL.
# Per-type fallback default: tor fails open (direct internet on outage),
# all interface/tunnel/proxy types fail closed (traffic blocked on outage).
FAILOPEN_DEFAULTS = {
"wireguard_ext": False,
"openvpn": False,
"sshuttle": False,
"proxy": False,
"tor": True,
}
def probe_health(self, connection: Dict[str, Any]) -> Tuple[str, Optional[str]]:
"""Probe a connection's liveness. Returns (health, detail).
health is 'working' | 'down' | 'unknown'. detail is a short human
string (or None). Never raises — probe errors map to ('unknown', msg).
"""
conn_type = connection.get('type')
try:
if conn_type == 'wireguard_ext':
return self._probe_wireguard_ext(connection)
if conn_type == 'openvpn':
return self._probe_openvpn(connection)
if conn_type == 'tor':
return self._probe_tor(connection)
if conn_type == 'sshuttle':
return self._probe_sshuttle(connection)
if conn_type == 'proxy':
return self._probe_proxy(connection)
except Exception as e:
logger.warning(f"probe_health({connection.get('id')}): {e}")
return 'unknown', str(e)
return 'unknown', f'no probe for type {conn_type!r}'
def _probe_wireguard_ext(
self, conn: Dict[str, Any],
) -> Tuple[str, Optional[str]]:
"""A WireGuard exit is working when its latest handshake is recent."""
iface = conn.get('iface')
if not iface:
return 'unknown', 'no interface assigned'
container = self.EXIT_CONTAINERS.get('wireguard_ext')
r = self._exec_in_container(
container, ['wg', 'show', iface, 'latest-handshakes'])
if r is None or r.returncode != 0:
return 'down', 'wg show failed or interface absent'
newest = 0
for line in (r.stdout or '').splitlines():
parts = line.split()
if len(parts) >= 2:
try:
newest = max(newest, int(parts[-1]))
except ValueError:
continue
if newest == 0:
return 'down', 'no handshake yet'
age = int(time.time()) - newest
if age <= self.WG_HANDSHAKE_MAX_AGE:
return 'working', f'handshake {age}s ago'
return 'down', f'handshake stale ({age}s ago)'
def _probe_openvpn(self, conn: Dict[str, Any]) -> Tuple[str, Optional[str]]:
"""An OpenVPN exit is working when its tun iface exists and is UP."""
iface = conn.get('iface')
container = self.EXIT_CONTAINERS.get('openvpn')
# The tun device lives in the openvpn container's net namespace.
r = self._exec_in_container(container, ['ip', 'link', 'show', iface]) \
if iface else None
if r is None or r.returncode != 0:
# Fall back to the legacy fixed tun0 iface check in the WG container.
r2 = self._wg_ip(['link', 'show', self.IFACES.get('openvpn', 'tun0')])
if r2.returncode == 0 and 'UP' in (r2.stdout or ''):
return 'working', 'tun up'
return 'down', 'tun interface absent or down'
if 'UP' in (r.stdout or ''):
return 'working', 'tun up'
return 'down', 'tun interface down'
def _probe_tor(self, conn: Dict[str, Any]) -> Tuple[str, Optional[str]]:
"""Tor is working when its container is running and bootstrapped."""
container = self.EXIT_CONTAINERS.get('tor')
if not self._container_running(container):
return 'down', 'tor container not running'
r = self._exec_in_container(
container, ['sh', '-c',
'grep -m1 "Bootstrapped 100" /var/log/tor/notices.log '
'2>/dev/null || true'])
if r is not None and 'Bootstrapped 100' in (r.stdout or ''):
return 'working', 'bootstrapped'
# Container is up but bootstrap state is unknown — treat as working
# (tor fails open by default; a running container is the cheap signal).
return 'working', 'running'
def _probe_sshuttle(self, conn: Dict[str, Any]) -> Tuple[str, Optional[str]]:
"""sshuttle is working when the SSH host AND local listener are reachable."""
cfg = conn.get('config', {})
host = cfg.get('host')
port = cfg.get('port', 22)
if not host:
return 'unknown', 'no host configured'
if not self._tcp_reachable(host, int(port)):
return 'down', f'ssh host {host}:{port} unreachable'
listen_port = conn.get('redirect_port')
container = self.EXIT_CONTAINERS.get('sshuttle')
if isinstance(listen_port, int) and not self._listener_reachable(
container, listen_port):
return 'down', f'sshuttle listener :{listen_port} down'
return 'working', 'ssh host + listener reachable'
def _probe_proxy(self, conn: Dict[str, Any]) -> Tuple[str, Optional[str]]:
"""A proxy exit is working when the upstream proxy host:port accepts TCP."""
cfg = conn.get('config', {})
host = cfg.get('host')
port = cfg.get('port')
if not host or not port:
return 'unknown', 'no upstream host/port configured'
if self._tcp_reachable(host, int(port)):
return 'working', f'{host}:{port} reachable'
return 'down', f'upstream {host}:{port} unreachable'
def _listener_reachable(self, container: Optional[str], port: int) -> bool:
"""True when a local TCP listener on `port` is up inside the exit container."""
r = self._exec_in_container(
container, ['sh', '-c',
f'nc -z -w2 127.0.0.1 {port} 2>/dev/null && echo ok || true'])
if r is not None and 'ok' in (r.stdout or ''):
return True
# Fall back to a host-side probe (container may lack nc).
return self._tcp_reachable('127.0.0.1', port)
def _container_running(self, container: Optional[str]) -> bool:
"""True when the named container is running (cheap docker inspect)."""
if not container:
return False
try:
r = subprocess.run(
['docker', 'inspect', '-f', '{{.State.Running}}', container],
capture_output=True, text=True, timeout=5)
return r.returncode == 0 and r.stdout.strip() == 'true'
except Exception:
return False
def _exec_in_container(
self, container: Optional[str], args: List[str], timeout: int = 8,
) -> Optional[subprocess.CompletedProcess]:
"""Run a command inside a container; None on failure. Mock target for tests."""
if not container:
return None
try:
return subprocess.run(
['docker', 'exec', container] + args,
capture_output=True, text=True, timeout=timeout)
except Exception as e:
logger.debug(f"_exec_in_container({container}): {e}")
return None
def _tcp_reachable(self, host: str, port: int) -> bool:
"""True when a TCP connection to host:port succeeds. Mock target for tests."""
try:
with socket.create_connection(
(host, port), timeout=self.PROBE_TCP_TIMEOUT):
return True
except Exception:
return False
def refresh_health(
self, connection_id: Optional[str] = None, force: bool = False,
) -> Dict[str, Any]:
"""Probe one or all connections and persist health into their status.
With a TTL cache: a connection whose last_check is younger than
HEALTH_TTL is skipped unless force=True. Returns {id: health}.
"""
if self.config_manager is None:
return {}
try:
conns = self.config_manager.list_connections()
except Exception as e:
logger.warning(f"refresh_health: list_connections failed: {e}")
return {}
if connection_id is not None:
conns = [c for c in conns if c.get('id') == connection_id]
results: Dict[str, Any] = {}
now = int(time.time())
for conn in conns:
cid = conn.get('id')
if not cid or not conn.get('enabled', True):
continue
status = dict(conn.get('status', {}))
if not force and self._health_is_fresh(status, now):
results[cid] = status.get('health', 'unknown')
continue
health, detail = self.probe_health(conn)
status['health'] = health
status['detail'] = detail
status['last_check'] = self._now_iso()
try:
self.config_manager.set_connection_status(cid, status)
except Exception as e:
logger.warning(f"refresh_health: persist {cid} failed: {e}")
results[cid] = health
return results
def _health_is_fresh(self, status: Dict[str, Any], now: int) -> bool:
"""True when status.last_check is within HEALTH_TTL of `now`.
last_check is a UTC ISO string (from _now_iso); calendar.timegm parses
it back as UTC so the comparison is timezone-consistent with `now`
(int(time.time()), also epoch/UTC).
"""
import calendar
last = status.get('last_check')
if not last:
return False
try:
ts = calendar.timegm(time.strptime(last, '%Y-%m-%dT%H:%M:%SZ'))
return (now - int(ts)) < self.HEALTH_TTL
except (ValueError, OverflowError):
return False
def effective_failopen(self, peer: Dict[str, Any],
conn: Optional[Dict[str, Any]]) -> bool:
"""Resolve a peer's effective fail-open for its connection.
peer.exit_failopen overrides when set (bool); otherwise the per-type
default applies. An unknown/missing connection type fails closed.
"""
override = peer.get('exit_failopen')
if isinstance(override, bool):
return override
conn_type = conn.get('type') if conn else None
return self.FAILOPEN_DEFAULTS.get(conn_type, False)
def set_peer_failopen(self, peer_name: str,
failopen: Optional[bool]) -> Dict[str, Any]:
"""Set or clear a peer's fail-open override (None = use type default)."""
if failopen is not None and not isinstance(failopen, bool):
return {'ok': False, 'error': 'failopen must be a boolean or null'}
if self.peer_registry is None:
return {'ok': False, 'error': 'peer_registry not available'}
if not self._peer_exists(peer_name):
return {'ok': False, 'error': f'peer {peer_name!r} not found'}
try:
self.peer_registry.update_peer(peer_name, {'exit_failopen': failopen})
except Exception as e:
logger.error(f"set_peer_failopen({peer_name}): {e}")
return {'ok': False, 'error': str(e)}
try:
self.apply_routes()
except Exception as e:
logger.warning(f"set_peer_failopen: apply_routes failed (non-fatal): {e}")
return {'ok': True, 'peer': peer_name, 'exit_failopen': failopen}
def _peer_source_ip(self, peer_name: str) -> Optional[str]:
"""Return a peer's WireGuard IP (no /CIDR suffix)."""
if not peer_name or self.peer_registry is None: