feat: connectivity redesign phase 3+4 — per-connection health, per-peer fallback, connection CRUD API
Unit Tests / test (push) Successful in 13m15s

Health probes (probe_health/refresh_health) are type-aware: WireGuard
checks the last WG handshake timestamp, OpenVPN checks the tun/tap
interface, Tor checks the control-port GETINFO, and sshuttle/proxy
types do a TCP reachability probe to the remote endpoint. Results are
persisted via set_connection_status and wired into the health_monitor_loop
so the UI always has a current health snapshot without polling.

Per-peer fail-open semantics: VPN, SSH, and proxy connections default to
fail-closed (kill-switch stays active even when the tunnel is down).
Tor defaults to fail-open. The default can be overridden per-peer via
set_peer_failopen/effective_failopen. apply_routes skips the fwmark and
kill-switch rules for any fail-open peer whose connection health is not
"working", letting traffic fall back to direct routing transparently.

New generic admin-only connection CRUD endpoints (GET/POST/PUT/DELETE
/api/connectivity/connections, GET /<id>/health, PUT
/api/connectivity/peers/<peer>/failopen) are guarded by the existing
admin role check. connection.create, connection.update, connection.delete,
and peer.failopen are all registered in ROUTE_ACTION_MAP for the audit
hook so every change is recorded in the owner-visible change log.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-06-10 21:50:45 -04:00
parent 8b50fb1036
commit d39c091cec
6 changed files with 1249 additions and 2 deletions
+120
View File
@@ -409,6 +409,10 @@ ROUTE_ACTION_MAP = {
('POST', 'connectivity_configure_sshuttle'): ('connection.exit_sshuttle', 'connection', None),
('POST', 'connectivity_configure_proxy'): ('connection.exit_proxy', 'connection', None),
('PUT', 'connectivity_set_peer_exit'): ('connection.peer_exit_set', 'peer', 'peer_name'),
('POST', 'connectivity_create_connection'): ('connection.create', 'connection', None),
('PUT', 'connectivity_update_connection'): ('connection.update', 'connection', 'conn_id'),
('DELETE', 'connectivity_delete_connection'): ('connection.delete', 'connection', 'conn_id'),
('PUT', 'connectivity_set_peer_failopen'): ('peer.failopen', 'peer', 'peer_name'),
# egress
('PUT', 'egress_set_service_exit'): ('egress.service_exit_set', 'service', 'service_id'),
# cells
@@ -867,6 +871,7 @@ def perform_health_check():
def health_monitor_loop():
_cert_check_cycle = 0
_conn_health_cycle = 0
while health_monitor_running:
with app.app_context():
health_result = perform_health_check()
@@ -898,6 +903,15 @@ def health_monitor_loop():
caddy_manager.refresh_cert_status()
except Exception as _cert_err:
logger.warning("Cert status refresh failed (non-fatal): %s", _cert_err)
# Refresh connection health every 2 cycles (≈ every 2 min) so the
# connections list and per-peer fallback decisions stay current.
_conn_health_cycle += 1
if _conn_health_cycle >= 2:
_conn_health_cycle = 0
try:
connectivity_manager.refresh_health()
except Exception as _ch_err:
logger.warning("Connection health refresh failed (non-fatal): %s", _ch_err)
time.sleep(60) # Check every 60 seconds
# Start health monitor thread
@@ -1172,6 +1186,112 @@ def connectivity_get_peer_exits():
return jsonify({'error': str(e)}), 500
# Connectivity v2 — generic connection CRUD (going-forward API; admin-only via
# enforce_auth which restricts all non-peer /api/* routes to the admin role).
@app.route('/api/connectivity/connections', methods=['GET'])
def connectivity_list_connections():
    """Return every connection instance reported by the connectivity manager.

    Responds with ``{"connections": [...]}`` on success. If the manager call
    (or response serialisation) raises, the error is logged and a 500 with
    the exception text is returned.

    NOTE(review): the manager is expected to omit secret values from the
    listing; that cannot be verified from this handler.
    """
    try:
        connections = connectivity_manager.list_connections()
        payload = {'connections': connections}
        return jsonify(payload)
    except Exception as e:
        logger.error(f"connectivity_list_connections: {e}")
        error_body = {'error': str(e)}
        return jsonify(error_body), 500
@app.route('/api/connectivity/connections', methods=['POST'])
def connectivity_create_connection():
    """Create a connection instance from a JSON request body.

    Expected body: ``{"type": str, "name": str, "config": {...},
    "secrets": {...}}``. ``type`` and ``name`` are required; ``config`` and
    ``secrets`` fall back to empty dicts when absent or falsy.

    Returns:
        201 with the manager's result when it reports ``ok``.
        400 when the body is not a JSON object, when ``type``/``name`` is
        missing or not a non-empty string, or when the manager rejects the
        request.
        500 with a generic message on unexpected errors; the exception text
        is logged but not echoed to the client.
    """
    try:
        data = request.get_json(silent=True) or {}
        # A truthy non-object body (e.g. a JSON array or string) has no
        # .get(); report it as a client error rather than letting the
        # AttributeError fall through to the generic 500 handler.
        if not isinstance(data, dict):
            return jsonify({'ok': False, 'error': 'request body must be a JSON object'}), 400
        conn_type = data.get('type')
        name = data.get('name')
        config = data.get('config') or {}
        conn_secrets = data.get('secrets') or {}
        if not isinstance(conn_type, str) or not conn_type:
            return jsonify({'ok': False, 'error': 'type is required'}), 400
        if not isinstance(name, str) or not name.strip():
            return jsonify({'ok': False, 'error': 'name is required'}), 400
        result = connectivity_manager.create_connection(
            conn_type, name, config=config, secrets=conn_secrets)
        if result.get('ok'):
            return jsonify(result), 201
        return jsonify(result), 400
    except Exception as e:
        logger.error(f"connectivity_create_connection: {e}")
        return jsonify({'error': 'internal error'}), 500
@app.route('/api/connectivity/connections/<conn_id>', methods=['PUT'])
def connectivity_update_connection(conn_id: str):
    """Update a connection's name, config and/or secrets.

    The ``name``, ``config`` and ``secrets`` fields of the JSON body are
    passed straight to the manager; a field that is absent is passed as
    ``None``.

    Args:
        conn_id: Identifier of the connection, taken from the URL path.

    Returns:
        200 with the manager's result when it reports ``ok``.
        404 when the manager's error text contains ``"not found"``.
        400 when the body is not a JSON object or for any other manager
        error.
        500 with a generic message on unexpected errors; the exception text
        is logged but not echoed to the client.
    """
    try:
        data = request.get_json(silent=True) or {}
        # A truthy non-object body (e.g. a JSON array or string) has no
        # .get(); report it as a client error rather than letting the
        # AttributeError fall through to the generic 500 handler.
        if not isinstance(data, dict):
            return jsonify({'ok': False, 'error': 'request body must be a JSON object'}), 400
        result = connectivity_manager.update_connection(
            conn_id,
            name=data.get('name'),
            config=data.get('config'),
            secrets=data.get('secrets'),
        )
        if result.get('ok'):
            return jsonify(result)
        # The manager signals a missing connection through its error text.
        status = 404 if 'not found' in result.get('error', '') else 400
        return jsonify(result), status
    except Exception as e:
        logger.error(f"connectivity_update_connection({conn_id}): {e}")
        return jsonify({'error': 'internal error'}), 500
@app.route('/api/connectivity/connections/<conn_id>', methods=['DELETE'])
def connectivity_delete_connection(conn_id: str):
    """Delete the connection identified by ``conn_id``.

    The HTTP status is derived from the manager's result: 200 when it
    reports ``ok``; otherwise 404 if the error text contains "not found",
    409 if it contains "in use by", and 400 for anything else. Unexpected
    exceptions are logged and returned as a 500 carrying the exception text.
    """
    try:
        outcome = connectivity_manager.delete_connection(conn_id)
        if not outcome.get('ok'):
            reason = outcome.get('error', '')
            # Map the manager's error text onto the matching HTTP status.
            if 'not found' in reason:
                status = 404
            elif 'in use by' in reason:
                status = 409
            else:
                status = 400
            return jsonify(outcome), status
        return jsonify(outcome)
    except Exception as e:
        logger.error(f"connectivity_delete_connection({conn_id}): {e}")
        return jsonify({'error': str(e)}), 500
@app.route('/api/connectivity/connections/<conn_id>/health', methods=['GET'])
def connectivity_connection_health(conn_id: str):
    """Probe a single connection's health on demand.

    Looks the connection up via the manager and, when it exists, runs
    ``probe_health`` on it and returns ``{"id", "health", "detail"}``.
    An unknown ``conn_id`` yields a 404; unexpected exceptions are logged
    and returned as a 500 carrying the exception text.
    """
    try:
        conn = connectivity_manager.get_connection(conn_id)
        if conn is not None:
            health, detail = connectivity_manager.probe_health(conn)
            report = {'id': conn_id, 'health': health, 'detail': detail}
            return jsonify(report)
        return jsonify({'error': f'connection {conn_id!r} not found'}), 404
    except Exception as e:
        logger.error(f"connectivity_connection_health({conn_id}): {e}")
        return jsonify({'error': str(e)}), 500
@app.route('/api/connectivity/peers/<peer_name>/failopen', methods=['PUT'])
def connectivity_set_peer_failopen(peer_name: str):
    """Set or clear a peer's fail-open override.

    Body: ``{"failopen": bool|null}``. The value is forwarded to
    ``connectivity_manager.set_peer_failopen``; a missing ``failopen`` key is
    read as ``None`` and therefore handled exactly like an explicit ``null``.

    Args:
        peer_name: Name of the peer, taken from the URL path.

    Returns:
        200 with the manager's result when it reports ``ok``.
        404 when the manager's error text contains ``"not found"``.
        400 when the body is not a JSON object, when ``failopen`` is neither
        a boolean nor null, or for any other manager error.
        500 carrying the exception text on unexpected errors.
    """
    try:
        data = request.get_json(silent=True) or {}
        # A truthy non-object body (e.g. a JSON array or string) has no
        # .get(); report it as a client error rather than letting the
        # AttributeError fall through to the generic 500 handler.
        if not isinstance(data, dict):
            return jsonify({'ok': False, 'error': 'request body must be a JSON object'}), 400
        failopen = data.get('failopen')
        if failopen is not None and not isinstance(failopen, bool):
            return jsonify({'ok': False, 'error': 'failopen must be a boolean or null'}), 400
        result = connectivity_manager.set_peer_failopen(peer_name, failopen)
        if result.get('ok'):
            return jsonify(result)
        # The manager signals an unknown peer through its error text.
        status = 404 if 'not found' in result.get('error', '') else 400
        return jsonify(result), status
    except Exception as e:
        logger.error(f"connectivity_set_peer_failopen({peer_name}): {e}")
        return jsonify({'error': str(e)}), 500
@app.route('/api/caddy/cert-status', methods=['GET'])
def caddy_cert_status():
"""Return TLS certificate status (expiry, days remaining, domain, mode).