feat: Phase 4 hardening — retry/backoff, loop detection, sync status UI + tests
Phase 4.1 — Retry/backoff for failed permission pushes: - _compute_next_retry(): capped exponential backoff with jitter (60s–1h) - _record_push_result(): tracks push_attempts and next_retry_at per link - replay_pending_pushes(): skips links still in backoff window, logs deferred count - _load() migration: adds push_attempts/next_retry_at to existing records Phase 4.2 — Loop detection (A→B→A routing cycle): - set_peer_route_via(): returns 409 if target cell already routes peers through us - apply_remote_permissions(): soft warning when accepting exit-relay that would cycle Phase 4.3 — Sync staleness indicator in Cell Network UI: - SyncBadge component: green (synced), amber (pending/failed), gray (never) - Shows relativeTime of last sync + error message + next retry estimate - Injected into CellPanel header alongside tunnel online/handshake status Tests (54 new): - TestCheckInviteConflicts: subnet overlap, domain conflict, exclude_cell (9 tests) - TestPushInviteToRemote: success, 4xx, no endpoint, subprocess errors (7 tests) - TestAcceptInviteNew: new cell, idempotent, healing dns/subnet changes (16 tests) - TestAddConnectionMutualPairing: push-invite call, non-fatal failure (5 tests) - TestPeerSyncAcceptInvite endpoint: happy path, field validation, error propagation (16 tests) - Fixed 2 existing replay tests to clear backoff gate (simulates elapsed window) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -11,8 +11,9 @@ Each connection is stored in data/cell_links.json and manifests as:
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -25,6 +26,15 @@ _DEFAULT_PERMISSIONS = {
|
||||
}
|
||||
|
||||
_PUSH_TIMEOUT = 5  # seconds
# Retry backoff bounds for failed permission pushes, in seconds.
_BACKOFF_BASE_S = 60    # delay applied after the first failed attempt
_BACKOFF_MAX_S = 3600   # ceiling for the exponential part of the delay


def _compute_next_retry(attempts: int) -> str:
    """Return an ISO timestamp for the earliest next retry.

    The delay is a capped exponential backoff: ``_BACKOFF_BASE_S`` doubled
    for every failure after the first, limited to ``_BACKOFF_MAX_S``, plus a
    random jitter of up to half the base delay. The jitter is added after the
    cap, so the total delay can exceed ``_BACKOFF_MAX_S`` by at most
    ``_BACKOFF_BASE_S / 2`` seconds.

    Args:
        attempts: Number of consecutive failed attempts so far. Values below
            1 are treated as 1 so the delay never drops under the base.

    Returns:
        A naive (offset-free) UTC timestamp in ``datetime.isoformat()`` form.
    """
    # A zero or negative count would give a fractional multiplier and a delay
    # shorter than the base; treat it as the first failure instead.
    exponent = max(attempts, 1) - 1
    # Past this exponent the cap always wins, so bound it rather than building
    # an arbitrarily large integer only for min() to throw it away.
    cap_exponent = (_BACKOFF_MAX_S // _BACKOFF_BASE_S).bit_length()
    delay = min(_BACKOFF_BASE_S * (2 ** min(exponent, cap_exponent)), _BACKOFF_MAX_S)
    delay += random.uniform(0, _BACKOFF_BASE_S / 2)
    # datetime.utcnow() is deprecated since Python 3.12. Drop the tzinfo so
    # the string stays offset-free, matching the naive UTC timestamps that
    # other code compares it against.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    return (now_utc + timedelta(seconds=delay)).isoformat()
|
||||
|
||||
|
||||
def _default_perms() -> Dict[str, Any]:
|
||||
@@ -91,6 +101,13 @@ class CellLinkManager:
|
||||
if 'remote_exit_relay_active' not in link:
|
||||
link['remote_exit_relay_active'] = False
|
||||
changed = True
|
||||
# Phase 4 migration: retry/backoff state
|
||||
if 'push_attempts' not in link:
|
||||
link['push_attempts'] = 0
|
||||
changed = True
|
||||
if 'next_retry_at' not in link:
|
||||
link['next_retry_at'] = None
|
||||
changed = True
|
||||
if changed:
|
||||
self._save(links)
|
||||
return links
|
||||
@@ -214,10 +231,15 @@ class CellLinkManager:
|
||||
link['last_push_at'] = datetime.utcnow().isoformat()
|
||||
link['last_push_error'] = None
|
||||
link['pending_push'] = False
|
||||
link['push_attempts'] = 0
|
||||
link['next_retry_at'] = None
|
||||
else:
|
||||
link['last_push_status'] = 'failed'
|
||||
link['last_push_error'] = result.get('error')
|
||||
link['pending_push'] = True
|
||||
attempts = link.get('push_attempts', 0) + 1
|
||||
link['push_attempts'] = attempts
|
||||
link['next_retry_at'] = _compute_next_retry(attempts)
|
||||
break
|
||||
self._save(links)
|
||||
|
||||
@@ -270,6 +292,24 @@ class CellLinkManager:
|
||||
link['last_remote_update_at'] = datetime.utcnow().isoformat()
|
||||
self._save(links)
|
||||
|
||||
# Soft loop-detection warning: if the remote is asking us to act as exit
|
||||
# AND we already have a peer routing via that cell, it's a potential cycle.
|
||||
if use_as_exit_relay:
|
||||
try:
|
||||
from peer_registry import PeerRegistry
|
||||
import os as _os
|
||||
pr = PeerRegistry(_os.environ.get('DATA_DIR', '/app/data'))
|
||||
loop_peers = [p['name'] for p in pr.list_peers()
|
||||
if p.get('route_via') == link['cell_name']]
|
||||
if loop_peers:
|
||||
logger.warning(
|
||||
f"apply_remote_permissions: '{link['cell_name']}' asked us to act as "
|
||||
f"its exit relay, but we already route peers {loop_peers} via it — "
|
||||
f"potential routing loop detected"
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
inbound_list = [s for s, v in clean_inbound.items() if v]
|
||||
try:
|
||||
import firewall_manager as _fm
|
||||
@@ -295,9 +335,18 @@ class CellLinkManager:
|
||||
logger.warning(f"replay_pending_pushes: cannot resolve identity ({e})")
|
||||
return summary
|
||||
|
||||
summary['deferred'] = 0
|
||||
now_iso = datetime.utcnow().isoformat()
|
||||
for link in self._load():
|
||||
if not link.get('pending_push'):
|
||||
continue
|
||||
next_retry = link.get('next_retry_at')
|
||||
if next_retry and next_retry > now_iso:
|
||||
summary['deferred'] += 1
|
||||
logger.info(
|
||||
f"replay: skipping '{link['cell_name']}' — backoff until {next_retry}"
|
||||
)
|
||||
continue
|
||||
summary['attempted'] += 1
|
||||
result = self._push_permissions_to_remote(
|
||||
link, identity['cell_name'], identity['public_key']
|
||||
@@ -311,10 +360,11 @@ class CellLinkManager:
|
||||
logger.warning(
|
||||
f"replay: push to '{link['cell_name']}' failed: {result.get('error')}"
|
||||
)
|
||||
if summary['attempted']:
|
||||
if summary['attempted'] or summary.get('deferred'):
|
||||
logger.info(
|
||||
f"replay_pending_pushes: {summary['attempted']} attempted, "
|
||||
f"{summary['ok']} ok, {summary['failed']} failed"
|
||||
f"{summary['ok']} ok, {summary['failed']} failed, "
|
||||
f"{summary.get('deferred', 0)} deferred (backoff)"
|
||||
)
|
||||
return summary
|
||||
|
||||
|
||||
@@ -237,6 +237,13 @@ def set_peer_route_via(peer_name):
|
||||
)
|
||||
if not link:
|
||||
return jsonify({'error': f"Cell {via_cell!r} not connected"}), 404
|
||||
if link.get('remote_exit_relay_active'):
|
||||
return jsonify({
|
||||
'error': (
|
||||
f"Cannot route via '{via_cell}': it is already routing peers "
|
||||
f"through this cell — enabling both directions would create a loop"
|
||||
)
|
||||
}), 409
|
||||
wireguard_manager.update_cell_peer_allowed_ips(
|
||||
link['public_key'], link['vpn_subnet'], add_default_route=True)
|
||||
wireguard_manager.apply_peer_route_via(peer_ip, via_wg_ip=link['dns_ip'])
|
||||
|
||||
Reference in New Issue
Block a user