feat: replace hardcoded service names with ServiceRegistry-driven Caddy and CoreDNS config
Unit Tests / test (push) Failing after 11s

Previously, CaddyManager and NetworkManager contained hardcoded lists of
service names (calendar, files, mail, webdav, etc.), meaning every new
service required a code change to appear in Caddy routes and DNS records.
Now both managers accept a service_registry parameter and derive their
service lists dynamically from the registry at runtime.

- CaddyManager: new _build_registry_service_routes() and
  _http01_service_pairs() methods pull routes from the registry
- NetworkManager: new _get_service_subdomains() method returns registry
  subdomains with a hardcoded fallback when no registry is wired in;
  _build_dns_records, stale-record detection, and service name sets all
  use the registry
- managers.py: service_registry constructed before network_manager so it
  can be injected into both CaddyManager and NetworkManager
- service_registry.py: validation chokepoint in get_caddy_routes() rejects
  invalid subdomain/backend values and reserved service names
- service_store_manager.py: _validate_manifest now validates top-level
  subdomain, backend, extra_subdomains, and extra_backends fields
- tests: 24 new tests covering registry-driven routing and DNS subdomain
  generation (test_caddy_registry_integration.py)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-28 18:27:52 -04:00
parent 63c0dfb9d9
commit 16fb362df7
12 changed files with 1312 additions and 46 deletions
+139 -4
View File
@@ -6,6 +6,8 @@ Centralized configuration management for all services
import os
import json
import re
import subprocess
import yaml
import shutil
import hashlib
@@ -14,6 +16,9 @@ from typing import Dict, List, Optional, Any
from pathlib import Path
import logging
_SAFE_CONTAINER_RE = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9_.-]{0,63}$')
_SAFE_VOL_NAME_RE = re.compile(r'^[a-zA-Z0-9_.-]{1,64}$')
# The Caddyfile lives on a separate volume mount from the rest of config
LIVE_CADDYFILE = os.environ.get('CADDYFILE_PATH', '/app/config-caddy/Caddyfile')
@@ -228,8 +233,128 @@ class ConfigManager:
"warnings": warnings
}
def backup_config(self) -> str:
"""Create a backup of cell_config.json, secrets, Caddyfile, .env, Corefile, and DNS zones."""
@staticmethod
def _validate_vol_entry(service_id: str, vol: dict) -> bool:
"""Return True if a backup volume entry is safe to use; log and return False otherwise."""
container = vol.get('container', '')
path = vol.get('path', '')
name = vol.get('name', '')
if not _SAFE_CONTAINER_RE.match(container):
logger.warning('Backup: unsafe container name %r for %s — skipping', container, service_id)
return False
if not path.startswith('/') or '..' in path.split('/') or '\x00' in path:
logger.warning('Backup: unsafe volume path %r for %s — skipping', path, service_id)
return False
if not _SAFE_VOL_NAME_RE.match(name):
logger.warning('Backup: unsafe volume name %r for %s — skipping', name, service_id)
return False
return True
def _backup_service_volumes(self, backup_path: Path, service_registry) -> None:
"""Stream service data out of each container via 'docker exec tar'.
Archives are relative (created with -C <path> .) so they can be safely
restored with -C <path> without risk of path traversal outside the volume.
Writes to a .partial temp file then renames atomically on success.
"""
try:
plan = service_registry.get_backup_plan()
except Exception as e:
logger.warning('_backup_service_volumes: could not get backup plan: %s', e)
return
for entry in plan:
service_id = entry['service_id']
volumes = entry.get('volumes') or []
if not volumes:
continue
svc_dir = backup_path / 'service_data' / service_id
svc_dir.mkdir(parents=True, exist_ok=True)
for vol in volumes:
if not self._validate_vol_entry(service_id, vol):
continue
container = vol['container']
path = vol['path']
name = vol['name']
archive_path = svc_dir / f'{name}.tar.gz'
tmp_path = svc_dir / f'{name}.tar.gz.partial'
try:
with open(tmp_path, 'wb') as af:
result = subprocess.run(
# -C path; then '.' archives the whole dir with relative entries.
# '--' prevents path/container from being parsed as options.
['docker', 'exec', '--', container,
'tar', '-C', path, '-czf', '-', '.'],
stdout=af,
stderr=subprocess.PIPE,
timeout=300,
)
if result.returncode != 0:
logger.warning(
'Backup: docker exec tar failed for %s/%s: %s',
service_id, name, result.stderr.decode(errors='replace'),
)
tmp_path.unlink(missing_ok=True)
else:
os.replace(tmp_path, archive_path)
logger.info('Backup: archived %s/%s', service_id, name)
except subprocess.TimeoutExpired:
logger.warning('Backup: timed out streaming %s/%s', service_id, name)
tmp_path.unlink(missing_ok=True)
except Exception as e:
logger.warning('Backup: failed to archive %s/%s: %s', service_id, name, e)
tmp_path.unlink(missing_ok=True)
def _restore_service_volumes(self, backup_path: Path, service_registry) -> None:
"""Pipe archived service data back into containers via 'docker exec -i tar'.
Extracts with -C <path>, matching how archives were created (relative paths).
This bounds extraction to within the declared volume directory.
"""
svc_data_dir = backup_path / 'service_data'
if not svc_data_dir.is_dir():
return
for svc_dir in svc_data_dir.iterdir():
if not svc_dir.is_dir():
continue
service_id = svc_dir.name
svc = service_registry.get(service_id)
if not svc:
logger.warning('Restore: unknown service %s in backup, skipping', service_id)
continue
volumes = (svc.get('backup') or {}).get('volumes') or []
for vol in volumes:
if not self._validate_vol_entry(service_id, vol):
continue
container = vol['container']
path = vol['path']
name = vol['name']
archive_path = svc_dir / f'{name}.tar.gz'
if not archive_path.exists():
continue
try:
with open(archive_path, 'rb') as af:
result = subprocess.run(
['docker', 'exec', '-i', '--', container,
'tar', '-C', path, '-xzf', '-'],
stdin=af,
stderr=subprocess.PIPE,
timeout=300,
)
if result.returncode != 0:
logger.warning(
'Restore: docker exec tar failed for %s/%s: %s',
service_id, name, result.stderr.decode(errors='replace'),
)
else:
logger.info('Restore: restored %s/%s', service_id, name)
except subprocess.TimeoutExpired:
logger.warning('Restore: timed out restoring %s/%s', service_id, name)
except Exception as e:
logger.warning('Restore: failed to restore %s/%s: %s', service_id, name, e)
def backup_config(self, service_registry=None) -> str:
"""Create a backup of cell_config.json, secrets, Caddyfile, .env, Corefile, DNS zones,
and (when service_registry is provided) live service data volumes."""
try:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
backup_id = f"backup_{timestamp}"
@@ -278,12 +403,17 @@ class ConfigManager:
except (PermissionError, OSError) as e:
logger.warning(f"Could not back up {src.name}: {e} (skipping)")
# Live service data volumes (streamed via docker exec)
if service_registry is not None:
self._backup_service_volumes(backup_path, service_registry)
services = ['identity'] + list(self.service_schemas.keys())
manifest = {
"backup_id": backup_id,
"timestamp": datetime.now().isoformat(),
"services": services,
"files": [f.name for f in backup_path.iterdir()],
"includes_service_data": service_registry is not None,
}
with open(backup_path / 'manifest.json', 'w') as f:
json.dump(manifest, f, indent=2)
@@ -294,8 +424,9 @@ class ConfigManager:
except Exception as e:
logger.error(f"Error creating backup: {e}")
raise
def restore_config(self, backup_id: str, services: list = None) -> bool:
def restore_config(self, backup_id: str, services: list = None,
service_registry=None) -> bool:
"""Restore from backup. If services list given, only restore those service configs (selective)."""
try:
backup_path = self.backup_dir / backup_id
@@ -373,6 +504,10 @@ class ConfigManager:
except (PermissionError, OSError) as e:
logger.warning(f"Could not restore {dest.name}: {e} (skipping)")
# Live service data volumes
if service_registry is not None:
self._restore_service_volumes(backup_path, service_registry)
self.configs = self._load_all_configs()
logger.info(f"Restored configuration from backup: {backup_id}")
return True