feat: Phase 4 hardening — retry/backoff, loop detection, sync status UI + tests

Phase 4.1 — Retry/backoff for failed permission pushes:
- _compute_next_retry(): capped exponential backoff (60s–1h) plus up to 30s of random jitter
- _record_push_result(): tracks push_attempts and next_retry_at per link
- replay_pending_pushes(): skips links still in backoff window, logs deferred count
- _load() migration: adds push_attempts/next_retry_at to existing records

Phase 4.2 — Loop detection (A→B→A routing cycle):
- set_peer_route_via(): returns 409 if target cell already routes peers through us
- apply_remote_permissions(): soft warning when accepting exit-relay that would cycle

Phase 4.3 — Sync staleness indicator in Cell Network UI:
- SyncBadge component: green (synced), amber (pending/failed), gray (never)
- Shows relativeTime of last sync + error message + next retry estimate
- Injected into CellPanel header alongside tunnel online/handshake status

Tests (54 new):
- TestCheckInviteConflicts: subnet overlap, domain conflict, exclude_cell (9 tests)
- TestPushInviteToRemote: success, 4xx, no endpoint, subprocess errors (7 tests)
- TestAcceptInviteNew: new cell, idempotent, healing dns/subnet changes (16 tests)
- TestAddConnectionMutualPairing: push-invite call, non-fatal failure (5 tests)
- TestPeerSyncAcceptInvite endpoint: happy path, field validation, error propagation (16 tests)
- Fixed 2 existing replay tests to clear backoff gate (simulates elapsed window)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-04 04:18:36 -04:00
parent 960a4ecc51
commit dc2606541c
5 changed files with 765 additions and 4 deletions
+53 -3
View File
@@ -11,8 +11,9 @@ Each connection is stored in data/cell_links.json and manifests as:
import json
import logging
import os
import random
import subprocess
from datetime import datetime
from datetime import datetime, timezone, timedelta
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
@@ -25,6 +26,15 @@ _DEFAULT_PERMISSIONS = {
}
_PUSH_TIMEOUT = 5 # seconds
_BACKOFF_BASE_S = 60    # delay before the first retry, in seconds
_BACKOFF_MAX_S = 3600   # ceiling for the exponential part of the delay, in seconds


def _compute_next_retry(attempts: int) -> str:
    """Return the earliest next-retry time as a naive-UTC ISO 8601 string.

    The delay doubles with each failed attempt, starting at
    ``_BACKOFF_BASE_S`` for the first failure and capped at
    ``_BACKOFF_MAX_S``. A random jitter of up to half the base delay is then
    added on top, so links that failed together do not all retry at the same
    instant (the jitter is applied after the cap on purpose, otherwise every
    capped link would share one identical retry time).

    Args:
        attempts: Number of consecutive failed pushes so far. Values below 1
            are treated as 1 so the delay never drops under the base.

    Returns:
        ``datetime.isoformat()`` text of a timezone-naive UTC datetime. The
        naive form is kept deliberately: replay_pending_pushes() compares
        this string against ``datetime.utcnow().isoformat()``.
    """
    # Treat 0/negative counts as the first failure; otherwise 2 ** (attempts - 1)
    # becomes fractional and the delay falls below the documented minimum.
    doublings = max(attempts, 1) - 1
    # Bound the exponent: once base * 2**k reaches the cap, further doublings
    # change nothing, so avoid building enormous integers for large counts.
    # 2 ** n.bit_length() is always > n, so this bound still reaches the cap.
    max_doublings = max(_BACKOFF_MAX_S // _BACKOFF_BASE_S, 1).bit_length()
    delay = min(_BACKOFF_BASE_S * (2 ** min(doublings, max_doublings)), _BACKOFF_MAX_S)
    delay += random.uniform(0, _BACKOFF_BASE_S / 2)
    # Same value as the deprecated datetime.utcnow(): naive, in UTC.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    return (now_utc + timedelta(seconds=delay)).isoformat()
def _default_perms() -> Dict[str, Any]:
@@ -91,6 +101,13 @@ class CellLinkManager:
if 'remote_exit_relay_active' not in link:
link['remote_exit_relay_active'] = False
changed = True
# Phase 4 migration: retry/backoff state
if 'push_attempts' not in link:
link['push_attempts'] = 0
changed = True
if 'next_retry_at' not in link:
link['next_retry_at'] = None
changed = True
if changed:
self._save(links)
return links
@@ -214,10 +231,15 @@ class CellLinkManager:
link['last_push_at'] = datetime.utcnow().isoformat()
link['last_push_error'] = None
link['pending_push'] = False
link['push_attempts'] = 0
link['next_retry_at'] = None
else:
link['last_push_status'] = 'failed'
link['last_push_error'] = result.get('error')
link['pending_push'] = True
attempts = link.get('push_attempts', 0) + 1
link['push_attempts'] = attempts
link['next_retry_at'] = _compute_next_retry(attempts)
break
self._save(links)
@@ -270,6 +292,24 @@ class CellLinkManager:
link['last_remote_update_at'] = datetime.utcnow().isoformat()
self._save(links)
# Soft loop-detection warning: if the remote is asking us to act as exit
# AND we already have a peer routing via that cell, it's a potential cycle.
if use_as_exit_relay:
try:
from peer_registry import PeerRegistry
import os as _os
pr = PeerRegistry(_os.environ.get('DATA_DIR', '/app/data'))
loop_peers = [p['name'] for p in pr.list_peers()
if p.get('route_via') == link['cell_name']]
if loop_peers:
logger.warning(
f"apply_remote_permissions: '{link['cell_name']}' asked us to act as "
f"its exit relay, but we already route peers {loop_peers} via it — "
f"potential routing loop detected"
)
except Exception:
pass
inbound_list = [s for s, v in clean_inbound.items() if v]
try:
import firewall_manager as _fm
@@ -295,9 +335,18 @@ class CellLinkManager:
logger.warning(f"replay_pending_pushes: cannot resolve identity ({e})")
return summary
summary['deferred'] = 0
now_iso = datetime.utcnow().isoformat()
for link in self._load():
if not link.get('pending_push'):
continue
next_retry = link.get('next_retry_at')
if next_retry and next_retry > now_iso:
summary['deferred'] += 1
logger.info(
f"replay: skipping '{link['cell_name']}' — backoff until {next_retry}"
)
continue
summary['attempted'] += 1
result = self._push_permissions_to_remote(
link, identity['cell_name'], identity['public_key']
@@ -311,10 +360,11 @@ class CellLinkManager:
logger.warning(
f"replay: push to '{link['cell_name']}' failed: {result.get('error')}"
)
if summary['attempted']:
if summary['attempted'] or summary.get('deferred'):
logger.info(
f"replay_pending_pushes: {summary['attempted']} attempted, "
f"{summary['ok']} ok, {summary['failed']} failed"
f"{summary['ok']} ok, {summary['failed']} failed, "
f"{summary.get('deferred', 0)} deferred (backoff)"
)
return summary
+7
View File
@@ -237,6 +237,13 @@ def set_peer_route_via(peer_name):
)
if not link:
return jsonify({'error': f"Cell {via_cell!r} not connected"}), 404
if link.get('remote_exit_relay_active'):
return jsonify({
'error': (
f"Cannot route via '{via_cell}': it is already routing peers "
f"through this cell — enabling both directions would create a loop"
)
}), 409
wireguard_manager.update_cell_peer_allowed_ips(
link['public_key'], link['vpn_subnet'], add_default_route=True)
wireguard_manager.apply_peer_route_via(peer_ip, via_wg_ip=link['dns_ip'])