Files
pic/api/config_manager.py
T
roof 2ab3d2d5ac feat: secure build phase 2 — enforce image verification by default
All store images are now digest-pinned and cosign-signed by the publish
pipeline, so the warn-by-default training-wheels period is over: an
unsigned or undigested image must not install unless the admin
explicitly opts out. The service_composer fallback used when the config
manager is unavailable or corrupt also flips to enforce — config
corruption must fail closed rather than silently weaken verification.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-11 14:12:58 -04:00

1343 lines
60 KiB
Python

#!/usr/bin/env python3
"""
Configuration Manager for Personal Internet Cell
Centralized configuration management for all services
"""
import os
import json
import re
import subprocess
import tarfile
import io
import fnmatch
import yaml
import shutil
import hashlib
import threading
from datetime import datetime
from typing import Dict, List, Optional, Any
from pathlib import Path
import logging
import backup_crypto
# Allowed container names for backup volume entries: a leading alphanumeric
# followed by up to 63 characters from [a-zA-Z0-9_.-].
_SAFE_CONTAINER_RE = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9_.-]{0,63}$')
# Allowed backup volume names: 1-64 characters from [a-zA-Z0-9_.-]. The name
# is used to build the per-volume archive file name (<name>.tar.gz).
_SAFE_VOL_NAME_RE = re.compile(r'^[a-zA-Z0-9_.-]{1,64}$')
# The Caddyfile lives on a separate volume mount from the rest of config.
# The location can be overridden with the CADDYFILE_PATH environment variable.
LIVE_CADDYFILE = os.environ.get('CADDYFILE_PATH', '/app/config-caddy/Caddyfile')
# Trash that must never end up inside a backup. Matched against each file's
# path relative to the data dir (posix-style), and bare filenames.
_BACKUP_EXCLUDE_GLOBS = (
'logs/*', 'logs/**',
'api/config_backups/*', 'api/config_backups/**',
'*.tmp', '*.partial',
'__pycache__/*', '**/__pycache__/**',
)
# Specific files (by path relative to data dir) to never copy.
_BACKUP_EXCLUDE_FILES = (
'api/.test_admin_pass',
'api/.gitkeep',
)
logger = logging.getLogger(__name__)
# Valid Python logging levels for the `logging` config section.
_VALID_LOG_LEVELS = ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL')
# Image signature verification modes (see get/set_image_verification).
_IMAGE_VERIFY_MODES = ('off', 'warn', 'enforce')
# Per-service Python loggers exposed in the verbosity panel.
_LOGGING_PYTHON_SERVICES = (
'network', 'wireguard', 'email', 'calendar',
'files', 'routing', 'vault', 'api',
)
# Container services whose log level we can influence (hot for caddy/coredns,
# pending_restart for env-driven containers).
_LOGGING_CONTAINERS = ('caddy', 'coredns', 'wireguard', 'mailserver', 'api')
def _default_logging_config() -> Dict[str, Any]:
"""Return the default `logging` section for cell_config."""
return {
'python': {
'root': 'INFO',
'services': {svc: 'INFO' for svc in _LOGGING_PYTHON_SERVICES},
},
'containers': {c: 'INFO' for c in _LOGGING_CONTAINERS},
}
class ConfigManager:
"""Centralized configuration management for all services (unified config)"""
def __init__(self, config_file: str = '/app/config/cell_config.json', data_dir: str = '/app/data'):
    """Load (or initialise) the unified configuration.

    Args:
        config_file: Path to the unified JSON config. If it names an existing
            directory, ``cell_config.json`` inside that directory is used.
        data_dir: Root of the data directory; backups are kept under
            ``<data_dir>/config_backups``.

    Side effects: tries to create the backup directory, writes the config
    file when it does not exist yet, and may rewrite ``_identity.domain``
    (see the migration step at the end).
    """
    config_file = Path(config_file)
    if config_file.is_dir():
        config_file = config_file / 'cell_config.json'
    # Diagnostics go through the module logger rather than stdout.
    logger.debug("ConfigManager.__init__: config_file = %s", config_file)
    self.config_file = config_file
    self.data_dir = Path(data_dir)
    self.backup_dir = self.data_dir / 'config_backups'
    self.secrets_file = self.config_file.parent / 'secrets.yaml'
    try:
        self.backup_dir.mkdir(parents=True, exist_ok=True)
    except (PermissionError, OSError) as e:
        # Still best-effort (construction must not fail), but no longer silent.
        logger.warning("ConfigManager: could not create backup dir %s: %s",
                       self.backup_dir, e)
    self.service_schemas = self._load_service_schemas()
    self.configs = self._load_all_configs()
    # Guards concurrent reads/writes of the connectivity v2 section.
    self._connectivity_lock = threading.RLock()
    # Optional callback invoked to migrate the legacy connectivity section
    # to v2 on first access. Wired by ConnectivityManager (which owns the
    # resource-allocation logic). Until set, get_connectivity() returns the
    # raw (possibly legacy) section without migrating.
    self._connectivity_migrator = None
    # Ensure _identity key always exists
    if '_identity' not in self.configs:
        self.configs['_identity'] = {}
    # Phase 5: ensure connectivity section exists with empty defaults.
    if 'connectivity' not in self.configs:
        self.configs['connectivity'] = {'exits': {}, 'peer_exit_map': {}}
    # Defined elsewhere in this class (not shown in this part of the file).
    self._ensure_logging_config()
    if not self.config_file.exists():
        self._save_all_configs()
    # Migration: when DDNS is active but the internal domain is still
    # the generic "cell" default, give CoreDNS a unique zone name so multiple
    # cells on the same LAN don't collide.
    try:
        _ident = self.configs.get('_identity', {})
        _mode = _ident.get('domain_mode', 'lan')
        _domain = _ident.get('domain', '')
        _cell_name = _ident.get('cell_name', '')
        if (_mode != 'lan' and _cell_name
                and (_domain in ('cell', '', None))):
            self.configs['_identity']['domain'] = f'{_cell_name}.local'
            self._save_all_configs()
    except Exception as e:
        # A failed migration must not prevent start-up, but leave a trace.
        logger.warning("ConfigManager: internal-domain migration skipped: %s", e)
def _load_service_schemas(self) -> Dict[str, Dict]:
"""Load configuration schemas for all services"""
return {
'network': {
'required': ['dns_port', 'ntp_servers'],
'optional': ['dns_zones'],
'types': {
'dns_port': int,
'ntp_servers': list
}
},
'wireguard': {
'required': ['port', 'private_key', 'address'],
'optional': ['peers', 'allowed_ips'],
'types': {
'port': int,
'private_key': str,
'address': str
}
},
'email': {
'required': ['domain', 'smtp_port', 'imap_port'],
'optional': ['users', 'ssl_cert', 'ssl_key', 'submission_port', 'webmail_port'],
'types': {
'smtp_port': int,
'submission_port': int,
'imap_port': int,
'webmail_port': int,
'domain': str
}
},
'calendar': {
'required': ['port', 'data_dir'],
'optional': ['users', 'calendars'],
'types': {
'port': int,
'data_dir': str
}
},
'files': {
'required': ['port', 'data_dir'],
'optional': ['users', 'quota', 'manager_port'],
'types': {
'port': int,
'manager_port': int,
'data_dir': str,
'quota': int
}
},
'routing': {
'required': ['nat_enabled', 'firewall_enabled'],
'optional': ['nat_rules', 'firewall_rules', 'peer_routes'],
'types': {
'nat_enabled': bool,
'firewall_enabled': bool
}
},
'vault': {
'required': ['ca_configured', 'fernet_configured'],
'optional': ['certificates', 'trusted_keys'],
'types': {
'ca_configured': bool,
'fernet_configured': bool
}
},
'connectivity': {
'required': [],
'optional': ['exits', 'peer_exit_map'],
'types': {
'exits': dict,
'peer_exit_map': dict,
}
}
}
def _load_all_configs(self) -> Dict[str, Dict]:
"""Load all existing service configurations"""
if self.config_file.exists():
try:
with open(self.config_file, 'r') as f:
return json.load(f)
except Exception as e:
logger.error(f"Error loading unified config: {e}")
return {}
return {}
def _save_all_configs(self):
"""Save all service configurations to the unified config file (atomic write)."""
try:
self.config_file.parent.mkdir(parents=True, exist_ok=True)
tmp = self.config_file.with_suffix('.tmp')
with open(tmp, 'w') as f:
json.dump(self.configs, f, indent=2)
f.flush()
os.fsync(f.fileno())
os.replace(tmp, self.config_file)
except (PermissionError, OSError) as e:
logger.error('_save_all_configs: write failed — config NOT persisted to disk: %s', e)
def get_service_config(self, service: str) -> Dict[str, Any]:
"""Get configuration for a specific service"""
if service not in self.service_schemas:
raise ValueError(f"Unknown service: {service}")
return self.configs.get(service, {})
def update_service_config(self, service: str, config: Dict[str, Any]) -> bool:
"""Update configuration for a specific service"""
if service not in self.service_schemas:
raise ValueError(f"Unknown service: {service}")
try:
# Validate types only (required fields are checked by validate_config, not here)
schema = self.service_schemas[service]
for field, expected_type in schema['types'].items():
if field in config and not isinstance(config[field], expected_type):
logger.error(f"Invalid type for {field}: expected {expected_type.__name__}")
return False
# Backup current config
self._backup_service_config(service)
# Update configuration
self.configs[service] = config
self._save_all_configs()
logger.info(f"Updated configuration for {service}")
return True
except Exception as e:
logger.error(f"Error updating config for {service}: {e}")
return False
def validate_config(self, service: str, config: Dict[str, Any]) -> Dict[str, Any]:
"""Validate configuration for a service"""
if service not in self.service_schemas:
return {
"valid": False,
"errors": [f"Unknown service: {service}"],
"warnings": []
}
schema = self.service_schemas[service]
errors = []
warnings = []
# Check required fields (missing = error, wrong type = error)
for field in schema['required']:
if field not in config:
errors.append(f"Missing required field: {field}")
elif field in schema['types']:
expected_type = schema['types'][field]
if not isinstance(config[field], expected_type):
errors.append(f"Field {field} must be of type {expected_type.__name__}")
# Check optional fields
for field in schema['optional']:
if field in config and field in schema['types']:
expected_type = schema['types'][field]
if not isinstance(config[field], expected_type):
warnings.append(f"Field {field} should be of type {expected_type.__name__}")
return {
"valid": len(errors) == 0,
"errors": errors,
"warnings": warnings
}
@staticmethod
def _validate_vol_entry(service_id: str, vol: dict) -> bool:
"""Return True if a backup volume entry is safe to use; log and return False otherwise."""
container = vol.get('container', '')
path = vol.get('path', '')
name = vol.get('name', '')
if not _SAFE_CONTAINER_RE.match(container):
logger.warning('Backup: unsafe container name %r for %s — skipping', container, service_id)
return False
if not path.startswith('/') or '..' in path.split('/') or '\x00' in path:
logger.warning('Backup: unsafe volume path %r for %s — skipping', path, service_id)
return False
if not _SAFE_VOL_NAME_RE.match(name):
logger.warning('Backup: unsafe volume name %r for %s — skipping', name, service_id)
return False
return True
@staticmethod
def _is_excluded(rel_path: str) -> bool:
"""Return True if a data-relative path should be excluded from backups."""
rel_path = rel_path.replace(os.sep, '/')
name = rel_path.rsplit('/', 1)[-1]
if rel_path in _BACKUP_EXCLUDE_FILES:
return True
for pat in _BACKUP_EXCLUDE_GLOBS:
if fnmatch.fnmatch(rel_path, pat) or fnmatch.fnmatch(name, pat):
return True
# '**' segments: also match any path that has the prefix dir
if pat.endswith('/**') and rel_path.startswith(pat[:-3] + '/'):
return True
return False
def _copy_data_path(self, rel_src: str, backup_path: Path) -> None:
"""Copy a file or directory tree from data_dir/<rel_src> into the backup
under data/<rel_src>, honouring the exclude list. Skips silently if the
source does not exist or cannot be read."""
src = self.data_dir / rel_src
if not src.exists():
return
try:
if src.is_file():
if self._is_excluded(rel_src):
return
dest = backup_path / 'data' / rel_src
dest.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(src, dest)
return
for root, dirs, files in os.walk(src):
root_p = Path(root)
rel_root = (Path(rel_src) / root_p.relative_to(src)).as_posix()
dirs[:] = [d for d in dirs
if not self._is_excluded(f'{rel_root}/{d}'.lstrip('./'))]
for fname in files:
rel_file = f'{rel_root}/{fname}'.lstrip('./')
rel_file = rel_file.replace('//', '/')
if self._is_excluded(rel_file):
continue
dest = backup_path / 'data' / rel_file
dest.parent.mkdir(parents=True, exist_ok=True)
try:
shutil.copy2(root_p / fname, dest)
except (PermissionError, OSError) as e:
logger.warning('Backup: could not copy %s: %s (skipping)', rel_file, e)
except (PermissionError, OSError) as e:
logger.warning('Backup: could not copy %s: %s (skipping)', rel_src, e)
def _backup_service_volumes(self, backup_path: Path, service_registry) -> None:
"""Stream service data out of each container via 'docker exec tar'.
Archives are relative (created with -C <path> .) so they can be safely
restored with -C <path> without risk of path traversal outside the volume.
Writes to a .partial temp file then renames atomically on success.
"""
try:
plan = service_registry.get_backup_plan()
except Exception as e:
logger.warning('_backup_service_volumes: could not get backup plan: %s', e)
return
for entry in plan:
service_id = entry['service_id']
volumes = entry.get('volumes') or []
if not volumes:
continue
svc_dir = backup_path / 'service_data' / service_id
svc_dir.mkdir(parents=True, exist_ok=True)
for vol in volumes:
if not self._validate_vol_entry(service_id, vol):
continue
container = vol['container']
path = vol['path']
name = vol['name']
archive_path = svc_dir / f'{name}.tar.gz'
tmp_path = svc_dir / f'{name}.tar.gz.partial'
try:
with open(tmp_path, 'wb') as af:
result = subprocess.run(
# -C path; then '.' archives the whole dir with relative entries.
# '--' prevents path/container from being parsed as options.
['docker', 'exec', '--', container,
'tar', '-C', path, '-czf', '-', '.'],
stdout=af,
stderr=subprocess.PIPE,
timeout=300,
)
if result.returncode != 0:
logger.warning(
'Backup: docker exec tar failed for %s/%s: %s',
service_id, name, result.stderr.decode(errors='replace'),
)
tmp_path.unlink(missing_ok=True)
else:
os.replace(tmp_path, archive_path)
logger.info('Backup: archived %s/%s', service_id, name)
except subprocess.TimeoutExpired:
logger.warning('Backup: timed out streaming %s/%s', service_id, name)
tmp_path.unlink(missing_ok=True)
except Exception as e:
logger.warning('Backup: failed to archive %s/%s: %s', service_id, name, e)
tmp_path.unlink(missing_ok=True)
def _restore_service_volumes(self, backup_path: Path, service_registry) -> None:
"""Pipe archived service data back into containers via 'docker exec -i tar'.
Extracts with -C <path>, matching how archives were created (relative paths).
This bounds extraction to within the declared volume directory.
"""
svc_data_dir = backup_path / 'service_data'
if not svc_data_dir.is_dir():
return
for svc_dir in svc_data_dir.iterdir():
if not svc_dir.is_dir():
continue
service_id = svc_dir.name
svc = service_registry.get(service_id)
if not svc:
logger.warning('Restore: unknown service %s in backup, skipping', service_id)
continue
volumes = (svc.get('backup') or {}).get('volumes') or []
for vol in volumes:
if not self._validate_vol_entry(service_id, vol):
continue
container = vol['container']
path = vol['path']
name = vol['name']
archive_path = svc_dir / f'{name}.tar.gz'
if not archive_path.exists():
continue
try:
with open(archive_path, 'rb') as af:
result = subprocess.run(
['docker', 'exec', '-i', '--', container,
'tar', '-C', path, '-xzf', '-'],
stdin=af,
stderr=subprocess.PIPE,
timeout=300,
)
if result.returncode != 0:
logger.warning(
'Restore: docker exec tar failed for %s/%s: %s',
service_id, name, result.stderr.decode(errors='replace'),
)
else:
logger.info('Restore: restored %s/%s', service_id, name)
except subprocess.TimeoutExpired:
logger.warning('Restore: timed out restoring %s/%s', service_id, name)
except Exception as e:
logger.warning('Restore: failed to restore %s/%s: %s', service_id, name, e)
def backup_config(self, service_registry=None, passphrase: Optional[str] = None) -> str:
"""Create a backup of cell_config.json, all critical secrets/keys, runtime
config and (when service_registry is provided) live service data volumes.
When *passphrase* is supplied the staged backup directory is packed into an
encrypted archive (<backup_id>.tar.gz.age) and the plaintext staging dir is
removed. The archive contains key material; it is written mode 0600.
"""
try:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
backup_id = f"backup_{timestamp}"
backup_path = self.backup_dir / backup_id
backup_path.mkdir(parents=True, exist_ok=True)
# Primary config and secrets
if self.config_file.exists():
shutil.copy2(self.config_file, backup_path / 'cell_config.json')
if self.secrets_file.exists():
shutil.copy2(self.secrets_file, backup_path / 'secrets.yaml')
# Runtime-generated files that must match cell_config.json after restore
config_dir = Path(os.environ.get('CONFIG_DIR', '/app/config'))
env_file = Path(os.environ.get('ENV_FILE', '/app/.env'))
extra = [
(Path(LIVE_CADDYFILE), 'Caddyfile'),
(config_dir / 'dns' / 'Corefile', 'Corefile'),
(env_file, '.env'),
]
for src, dest_name in extra:
if src.exists():
shutil.copy2(src, backup_path / dest_name)
# DNS zone files
dns_data = self.data_dir / 'dns'
if dns_data.is_dir():
zones_dir = backup_path / 'dns_zones'
zones_dir.mkdir(exist_ok=True)
for zone_file in dns_data.glob('*.zone'):
shutil.copy2(zone_file, zones_dir / zone_file.name)
# Service-specific user account files (authoritative source of truth —
# cell_config.json only carries a best-effort sync of these).
svc_user_files = [
(self.data_dir / 'email' / 'users.json', 'email_users.json'),
(self.data_dir / 'calendar' / 'users.json', 'calendar_users.json'),
(self.data_dir / 'calendar' / 'calendars.json', 'calendar_calendars.json'),
]
for src, dest_name in svc_user_files:
if src.exists():
try:
shutil.copy2(src, backup_path / dest_name)
except (PermissionError, OSError) as e:
logger.warning(f"Could not back up {src.name}: {e} (skipping)")
# CRITICAL secrets, keys and state under data/. Losing any of these on a
# restore would lock out the admin, re-provision all WireGuard peers, or
# render vault-encrypted secrets unrecoverable. Each path is copied under
# data/<rel> in the archive and skipped gracefully if absent.
critical_data_paths = [
# API auth + identity
'api/auth_users.json',
'api/.flask_secret_key',
'api/peers.json',
'api/peer_service_credentials.json',
'api/cell_links.json',
'api/ddns_token',
# Append-only audit trail (who changed what) + rotated segments
'api/audit',
# WireGuard key material (server + peers) and live confs
'wireguard/keys',
'wireguard/wg_confs',
'api/wireguard/keys',
# Vault: internal CA, certs, fernet.key, trust, encrypted secrets.
# Without keys/fernet.key all vault secrets are unrecoverable.
'vault',
# Connectivity instance configs (host bind-mounts, not docker volumes):
# wg_ext0.conf, redsocks.conf, sshuttle keys/known_hosts, etc.
'api/services',
'services',
# Caddy issued certs / ACME state (avoid re-issuance + rate-limits)
'caddy',
]
for rel in critical_data_paths:
self._copy_data_path(rel, backup_path)
# Live service data volumes (streamed via docker exec)
if service_registry is not None:
self._backup_service_volumes(backup_path, service_registry)
services = ['identity'] + list(self.service_schemas.keys())
encrypted = bool(passphrase)
manifest = {
"backup_id": backup_id,
"timestamp": datetime.now().isoformat(),
"services": services,
"files": sorted(p.relative_to(backup_path).as_posix()
for p in backup_path.rglob('*') if p.is_file()),
"includes_service_data": service_registry is not None,
"encrypted": encrypted,
"contains_secrets": True,
}
with open(backup_path / 'manifest.json', 'w') as f:
json.dump(manifest, f, indent=2)
if encrypted:
archive_id = self._pack_and_encrypt(backup_path, backup_id, passphrase)
logger.info(f"Created encrypted configuration backup: {archive_id}")
return archive_id
# Plaintext backup: lock the staging dir down — it holds key material.
try:
os.chmod(backup_path, 0o700)
except OSError:
pass
logger.info(f"Created configuration backup: {backup_id}")
return backup_id
except Exception as e:
logger.error(f"Error creating backup: {e}")
raise
def _pack_and_encrypt(self, backup_path: Path, backup_id: str,
                      passphrase: str) -> str:
    """Turn a staged backup directory into an encrypted archive.

    The directory is tar+gzip'ed in memory (stored under *backup_id* inside
    the tarball), encrypted with ``backup_crypto.encrypt_bytes`` and written
    to ``<backup_dir>/<backup_id>.tar.gz.age`` with mode 0600. Afterwards the
    plaintext staging directory is removed.

    Returns:
        The archive file name (not the full path).
    """
    archive_name = f'{backup_id}.tar.gz.age'
    target = str(self.backup_dir / archive_name)
    tarball = io.BytesIO()
    with tarfile.open(fileobj=tarball, mode='w:gz') as tar:
        tar.add(backup_path, arcname=backup_id)
    ciphertext = backup_crypto.encrypt_bytes(tarball.getvalue(), passphrase)
    # Create with 0600 up front; the chmod below also covers a file that
    # already existed with wider permissions.
    open_flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
    with os.fdopen(os.open(target, open_flags, 0o600), 'wb') as out:
        out.write(ciphertext)
    os.chmod(target, 0o600)
    shutil.rmtree(backup_path, ignore_errors=True)
    return archive_name
def _resolve_backup_dir(self, backup_id: str, passphrase: Optional[str]):
"""Return (backup_path, cleanup_dir) for a backup id.
For a plaintext backup, backup_path is the on-disk directory and
cleanup_dir is None. For an encrypted archive (<id>.tar.gz.age, detected
either by the id ending in .age or by an archive file existing), the
archive is decrypted and extracted to a temp dir which the caller must
remove via cleanup_dir. Raises PermissionError on a bad/missing
passphrase so the route can return 400.
"""
import tempfile
archive_path = None
if backup_id.endswith('.age'):
archive_path = self.backup_dir / backup_id
else:
candidate = self.backup_dir / f'{backup_id}.tar.gz.age'
if candidate.exists() and not (self.backup_dir / backup_id).is_dir():
archive_path = candidate
if archive_path is None:
return self.backup_dir / backup_id, None
if not archive_path.exists():
raise ValueError(f"Backup {backup_id} not found")
blob = archive_path.read_bytes()
try:
plaintext = backup_crypto.decrypt_bytes(blob, passphrase or '')
except backup_crypto.BackupDecryptError as e:
raise PermissionError(str(e)) from e
tmpdir = Path(tempfile.mkdtemp(prefix='pic_restore_'))
with tarfile.open(fileobj=io.BytesIO(plaintext), mode='r:gz') as tar:
tar.extractall(tmpdir)
inner = [p for p in tmpdir.iterdir() if p.is_dir()]
backup_path = inner[0] if len(inner) == 1 else tmpdir
return backup_path, tmpdir
def restore_config(self, backup_id: str, services: list = None,
service_registry=None, passphrase: Optional[str] = None) -> bool:
"""Restore from backup. If services list given, only restore those service configs (selective).
Encrypted archives (<id>.tar.gz.age) are auto-detected and require the
passphrase; a wrong/missing passphrase raises PermissionError (route → 400).
"""
cleanup_dir = None
try:
backup_path, cleanup_dir = self._resolve_backup_dir(backup_id, passphrase)
if not backup_path.exists():
raise ValueError(f"Backup {backup_id} not found")
manifest_file = backup_path / 'manifest.json'
if not manifest_file.exists():
raise ValueError(f"Backup manifest not found")
if services is not None:
# Selective restore: only update specified services in running config
backup_cfg_path = backup_path / 'cell_config.json'
if backup_cfg_path.exists():
with open(backup_cfg_path) as f:
backup_cfg = json.load(f)
for svc in services:
if svc == 'identity':
if '_identity' in backup_cfg:
self.configs['_identity'] = backup_cfg['_identity']
elif svc in backup_cfg:
self.configs[svc] = backup_cfg[svc]
self._save_all_configs()
logger.info(f"Selectively restored {services} from backup: {backup_id}")
return True
# ── Full restore ─────────────────────────────────────────────────
# Ordering matters: vault (incl. fernet.key) is restored FIRST because
# everything else's secrets are encrypted with it; then identity/.env;
# then WireGuard key material; then cell links; then generated config;
# then per-service connectivity configs; then auth/ddns.
config_dir = Path(os.environ.get('CONFIG_DIR', '/app/config'))
env_file = Path(os.environ.get('ENV_FILE', '/app/.env'))
# (1) Vault FIRST — internal CA, certs, fernet.key, trust, secrets.
self._restore_data_path(backup_path, 'vault')
# (2) Identity / primary config + secrets + .env
config_backup = backup_path / 'cell_config.json'
if config_backup.exists():
shutil.copy2(config_backup, self.config_file)
secrets_backup = backup_path / 'secrets.yaml'
if secrets_backup.exists():
shutil.copy2(secrets_backup, self.secrets_file)
if (backup_path / '.env').exists():
try:
env_file.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(backup_path / '.env', env_file)
except (PermissionError, OSError) as e:
logger.warning(f"Could not restore .env: {e} (skipping)")
# (3) WireGuard key material + live confs, then peers.json
for rel in ('wireguard/keys', 'wireguard/wg_confs', 'api/wireguard/keys'):
self._restore_data_path(backup_path, rel)
for rel in ('api/peers.json', 'api/peer_service_credentials.json'):
self._restore_data_path(backup_path, rel)
# (4) Cell-to-cell links / permissions + audit trail
self._restore_data_path(backup_path, 'api/cell_links.json')
self._restore_data_path(backup_path, 'api/audit')
# (5) Caddy issued certs/ACME, DNS Corefile + zones (generated files are
# reapplied below, but restoring them gives a correct starting point).
self._restore_data_path(backup_path, 'caddy')
if (backup_path / 'Caddyfile').exists():
try:
Path(LIVE_CADDYFILE).parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(backup_path / 'Caddyfile', Path(LIVE_CADDYFILE))
except (PermissionError, OSError) as e:
logger.warning(f"Could not restore Caddyfile: {e} (skipping)")
if (backup_path / 'Corefile').exists():
try:
dest = config_dir / 'dns' / 'Corefile'
dest.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(backup_path / 'Corefile', dest)
except (PermissionError, OSError) as e:
logger.warning(f"Could not restore Corefile: {e} (skipping)")
zones_backup = backup_path / 'dns_zones'
if zones_backup.is_dir():
dns_data = self.data_dir / 'dns'
try:
dns_data.mkdir(parents=True, exist_ok=True)
for zone_file in zones_backup.glob('*.zone'):
try:
shutil.copy2(zone_file, dns_data / zone_file.name)
except (PermissionError, OSError) as zone_err:
logger.warning(f"Could not restore zone {zone_file.name}: {zone_err} (skipping)")
except (PermissionError, OSError) as dir_err:
logger.warning(f"Could not create dns data dir {dns_data}: {dir_err} (skipping)")
# (6) Per-service connectivity configs (host bind-mounts)
for rel in ('api/services', 'services'):
self._restore_data_path(backup_path, rel)
# (7) Auth users, flask secret, ddns token (after vault, before recompose)
for rel in ('api/auth_users.json', 'api/.flask_secret_key', 'api/ddns_token'):
self._restore_data_path(backup_path, rel)
# Service-specific user account files
svc_restore_map = [
(backup_path / 'email_users.json', self.data_dir / 'email' / 'users.json'),
(backup_path / 'calendar_users.json', self.data_dir / 'calendar' / 'users.json'),
(backup_path / 'calendar_calendars.json', self.data_dir / 'calendar' / 'calendars.json'),
]
for src, dest in svc_restore_map:
if src.exists():
try:
dest.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(src, dest)
except (PermissionError, OSError) as e:
logger.warning(f"Could not restore {dest.name}: {e} (skipping)")
# Reload config now that cell_config.json is restored.
self.configs = self._load_all_configs()
# (8) Live service data volumes (after containers exist — best-effort)
if service_registry is not None:
self._restore_service_volumes(backup_path, service_registry)
# (9) Reapply runtime state: regenerate generated config from the
# restored source-of-truth and re-apply routing/links.
self._reapply_runtime_state()
logger.info(f"Restored configuration from backup: {backup_id}")
return True
except PermissionError:
raise
except Exception as e:
logger.error(f"Error restoring backup {backup_id}: {e}")
return False
finally:
if cleanup_dir is not None:
shutil.rmtree(cleanup_dir, ignore_errors=True)
def _restore_data_path(self, backup_path: Path, rel: str) -> None:
"""Restore data/<rel> from the backup into self.data_dir/<rel>.
Handles both files and directory trees. Skips silently if absent."""
src = backup_path / 'data' / rel
if not src.exists():
return
dest = self.data_dir / rel
try:
if src.is_dir():
dest.mkdir(parents=True, exist_ok=True)
for root, _dirs, files in os.walk(src):
root_p = Path(root)
rel_root = root_p.relative_to(src)
for fname in files:
out = dest / rel_root / fname
out.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(root_p / fname, out)
else:
dest.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(src, dest)
except (PermissionError, OSError) as e:
logger.warning(f"Could not restore {rel}: {e} (skipping)")
def _reapply_runtime_state(self) -> None:
"""Regenerate generated config (Caddyfile, Corefile) from the restored
source-of-truth and re-apply routing / cell links. Uses the live
managers; every step is best-effort so a missing manager during a
partial/offline restore never aborts the whole operation.
NOTE: this does NOT stop/start containers. A full restore should be
followed by `make restart` so containers pick up restored key material
and regenerated config. See restore_config docstring / README.
"""
try:
from managers import (caddy_manager, firewall_manager,
connectivity_manager, cell_link_manager,
service_composer, peer_registry)
except Exception as e:
logger.warning(f"Reapply: managers unavailable ({e}); skipping reapply")
return
try:
caddy_manager.regenerate_with_installed([])
except Exception as e:
logger.warning(f"Reapply: regenerate Caddyfile failed: {e}")
try:
peers = peer_registry.list_peers() if peer_registry else []
cell_links = cell_link_manager.list_connections() if cell_link_manager else None
firewall_manager.generate_corefile(
peers, domain=self.get_internal_domain(), cell_links=cell_links)
except Exception as e:
logger.warning(f"Reapply: regenerate Corefile failed: {e}")
try:
if service_composer is not None:
service_composer.reapply_active_services()
except Exception as e:
logger.warning(f"Reapply: reapply_active_services failed: {e}")
try:
if connectivity_manager is not None:
connectivity_manager.apply_routes()
except Exception as e:
logger.warning(f"Reapply: apply_routes failed: {e}")
try:
if cell_link_manager is not None:
cell_link_manager.replay_pending_pushes()
except Exception as e:
logger.warning(f"Reapply: replay_pending_pushes failed: {e}")
def list_backups(self) -> List[Dict[str, Any]]:
"""List all available backups (plaintext dirs and encrypted archives)."""
backups = []
for entry in self.backup_dir.iterdir():
if entry.is_dir():
manifest_file = entry / 'manifest.json'
if manifest_file.exists():
try:
with open(manifest_file, 'r') as f:
manifest = json.load(f)
backups.append(manifest)
except Exception as e:
logger.error(f"Error reading backup manifest {entry.name}: {e}")
elif entry.is_file() and entry.name.endswith('.tar.gz.age'):
# Encrypted archive: manifest is inside and undecryptable without a
# passphrase, so synthesise a listing entry from the filename.
backup_id = entry.name[:-len('.tar.gz')] if entry.name.endswith('.tar.gz.age') else entry.name
# backup_<ts>.tar.gz.age → backup_<ts>
stem = entry.name[:-len('.tar.gz.age')]
ts = stem.replace('backup_', '').replace('_', 'T', 1)
backups.append({
'backup_id': entry.name,
'timestamp': ts,
'encrypted': True,
'contains_secrets': True,
})
return sorted(backups, key=lambda x: x.get('timestamp', ''), reverse=True)
def delete_backup(self, backup_id: str) -> bool:
"""Delete a backup (plaintext directory or encrypted archive)."""
try:
backup_path = self.backup_dir / backup_id
if backup_path.is_dir():
shutil.rmtree(backup_path)
elif backup_path.is_file():
backup_path.unlink()
else:
raise ValueError(f"Backup {backup_id} not found")
logger.info(f"Deleted backup: {backup_id}")
return True
except Exception as e:
logger.error(f"Error deleting backup {backup_id}: {e}")
return False
def get_config_hash(self, service: str) -> str:
"""Get hash of service configuration for change detection"""
config = self.get_service_config(service)
config_str = json.dumps(config, sort_keys=True)
return hashlib.sha256(config_str.encode()).hexdigest()
def has_config_changed(self, service: str, previous_hash: str) -> bool:
"""Check if configuration has changed"""
current_hash = self.get_config_hash(service)
return current_hash != previous_hash
def export_config(self, format: str = 'json', services: list = None) -> str:
"""Export service configurations (excludes internal state like pending_restart)."""
try:
export_data = {}
# Include identity under a clean key
if '_identity' in self.configs:
export_data['identity'] = dict(self.configs['_identity'])
# Include service configs, skip internal _ keys
for key, val in self.configs.items():
if key.startswith('_'):
continue
if services is not None and key not in services:
continue
export_data[key] = val
if format == 'json':
return json.dumps(export_data, indent=2)
elif format == 'yaml':
return yaml.dump(export_data, default_flow_style=False)
else:
raise ValueError(f"Unsupported format: {format}")
except Exception as e:
logger.error(f"Error exporting config: {e}")
raise
def import_config(self, config_data: str, format: str = 'json', services: list = None) -> bool:
    """Import configurations from a string, merging into the existing config.

    The merge is staged on copies and committed to ``self.configs`` only
    after every section has merged cleanly, so a malformed section cannot
    leave a half-applied import in memory.

    Args:
        config_data: Serialised configuration document.
        format: 'json' or 'yaml'.
        services: Optional allow-list of section names to import; None
            imports everything. Identity is imported only when this is None
            or contains 'identity'.

    Returns:
        True when the import was merged and saved. False when parsing,
        merging or saving failed; the error is logged.
    """
    try:
        if format == 'json':
            configs = json.loads(config_data)
        elif format == 'yaml':
            configs = yaml.safe_load(config_data)
        else:
            raise ValueError(f"Unsupported format: {format}")
        if not isinstance(configs, dict):
            raise ValueError("Imported configuration must be a mapping at the top level")

        # Build every updated section here first; nothing in self.configs
        # is touched until all of them have been produced without error.
        staged = {}

        # Handle identity (exported as 'identity', stored as '_identity').
        # Only the whitelisted fields are taken from the import.
        if 'identity' in configs and (services is None or 'identity' in services):
            ident = configs['identity']
            cur = dict(self.configs.get('_identity', {}))
            for k in ('cell_name', 'domain', 'ip_range', 'wireguard_port'):
                if k in ident:
                    cur[k] = ident[k]
            staged['_identity'] = cur

        # Merge service configs (don't replace wholesale -- keep existing
        # fields that the import does not mention). Sections without a
        # schema are ignored.
        for key, val in configs.items():
            if key == 'identity':
                continue
            if key not in self.service_schemas:
                continue
            if services is not None and key not in services:
                continue
            base = staged[key] if key in staged else self.configs.get(key, {})
            cur_svc = dict(base)
            cur_svc.update(val)
            staged[key] = cur_svc

        # Commit in one step, then persist.
        self.configs.update(staged)
        self._save_all_configs()
        logger.info("Imported configurations successfully")
        return True
    except Exception as e:
        logger.error(f"Error importing config: {e}")
        return False
def _backup_service_config(self, service: str):
    """Placeholder for the per-service backup taken before an update.

    Intentionally does nothing for the unified config; the method is kept
    only for compatibility.
    """
    return None
def get_identity(self) -> Dict[str, Any]:
    """Return the stored identity section, or an empty dict when there is none."""
    if '_identity' in self.configs:
        return self.configs['_identity']
    return {}
def get_effective_domain(self) -> str:
    """Return the FQDN that public-facing services should use.

    In 'lan' mode (the default when no domain_mode is set) this is the
    identity's `domain`. In any other mode `domain_name` is preferred and
    `domain` is the fallback. When neither yields a value, the CELL_DOMAIN
    environment variable is used, defaulting to 'cell'.
    """
    ident = self.get_identity()
    candidates = [ident.get('domain')]
    if ident.get('domain_mode', 'lan') != 'lan':
        candidates.insert(0, ident.get('domain_name'))
    for name in candidates:
        if name:
            return name
    return os.environ.get('CELL_DOMAIN', 'cell')
def get_internal_domain(self) -> str:
    """Return the CoreDNS zone name.

    This is always the identity's `domain`; when that is unset the
    CELL_DOMAIN environment variable is used, defaulting to 'cell'.
    """
    zone = self.get_identity().get('domain')
    if zone:
        return zone
    return os.environ.get('CELL_DOMAIN', 'cell')
def set_identity_field(self, key: str, value: Any):
    """Store one identity field and save the configuration.

    The '_identity' section is created first if it does not exist yet.
    """
    identity = self.configs.setdefault('_identity', {})
    identity[key] = value
    self._save_all_configs()
def get_installed_services(self) -> dict:
    """Return the identity's `installed_services` mapping, or {} when absent."""
    identity = self.configs.get('_identity', {})
    return identity.get('installed_services', {})
def set_installed_service(self, service_id: str, record: dict):
    """Store `record` under `service_id` in the identity's installed services and save.

    The '_identity' section and its 'installed_services' mapping are created
    when missing.
    """
    identity = self.configs.setdefault('_identity', {})
    installed = identity.setdefault('installed_services', {})
    installed[service_id] = record
    self._save_all_configs()
def remove_installed_service(self, service_id: str):
    """Drop `service_id` from the identity's installed services and service IPs, then save.

    Unknown ids are ignored. The 'installed_services' and 'service_ips'
    mappings are created (empty) when they do not exist yet.
    """
    identity = self.configs.setdefault('_identity', {})
    for section in ('installed_services', 'service_ips'):
        identity.setdefault(section, {}).pop(service_id, None)
    self._save_all_configs()
# ── Image signature verification configuration ────────────────────────
#
# Controls how a cell treats store-service container images at install:
# off — skip cosign verification and the digest-pin requirement
# warn — log a warning on a missing digest / failed signature, proceed
# enforce — refuse to start a service whose image is undigested,
# unsigned, or whose signature does not verify
#
# All store images are now signed + digest-pinned via the publish pipeline,
# so the default is "enforce". The section is backed up and restored with
# the rest of cell_config.json automatically.
def get_image_verification(self) -> Dict[str, Any]:
    """Return a copy of the image verification config, e.g. {'mode': 'enforce'}.

    When the stored section is missing, is not a dict, or carries an
    unknown mode, it is replaced in memory by {'mode': 'enforce'} and a copy
    of that default is returned.
    """
    stored = self.configs.get('image_verification')
    is_valid = isinstance(stored, dict) and stored.get('mode') in _IMAGE_VERIFY_MODES
    if is_valid:
        return dict(stored)
    fallback = {'mode': 'enforce'}
    self.configs['image_verification'] = fallback
    return dict(fallback)
def get_image_verification_mode(self) -> str:
    """Return only the `mode` value (off|warn|enforce) of the image verification config."""
    verification = self.get_image_verification()
    return verification['mode']
def set_image_verification_mode(self, mode: str) -> None:
    """Persist the image verification mode.

    Args:
        mode: One of the known verification modes; matched
            case-insensitively and stored lower-cased.

    Raises:
        ValueError: When `mode` is empty, not a string, or not a known mode.
    """
    candidate = mode or ''
    # Only strings can name a mode. Anything else (a number or list from a
    # request body, say) must be reported as invalid rather than failing
    # with AttributeError on .lower().
    if isinstance(candidate, str):
        candidate = candidate.lower()
    if not isinstance(candidate, str) or candidate not in _IMAGE_VERIFY_MODES:
        raise ValueError(
            f"Invalid image verification mode: {candidate!r} "
            f"(expected one of {sorted(_IMAGE_VERIFY_MODES)})"
        )
    self.configs['image_verification'] = {'mode': candidate}
    self._save_all_configs()
# ── Logging verbosity configuration ───────────────────────────────────
def _ensure_logging_config(self) -> None:
    """Ensure a well-formed `logging` section exists, migrating the legacy
    config/api/log_levels.json side-file on first load.

    The legacy file held a flat {service: LEVEL} map for the picell.* python
    loggers. It is read once and merged in; the section then becomes the
    single source of truth (the side-file is ignored thereafter).

    Missing or invalid levels for the root logger, the known python
    services and the known containers are reset to 'INFO'. The section is
    only changed in memory here; nothing is saved by this method.
    """
    def _dict_section(parent: dict, key: str) -> dict:
        # Return parent[key] as a dict, replacing a missing or malformed
        # (non-dict) value with a fresh empty dict.
        section = parent.get(key)
        if not isinstance(section, dict):
            section = {}
            parent[key] = section
        return section

    cfg = self.configs.get('logging')
    if not isinstance(cfg, dict):
        cfg = _default_logging_config()
        self.configs['logging'] = cfg

    python = _dict_section(cfg, 'python')
    if python.get('root') not in _VALID_LOG_LEVELS:
        python['root'] = 'INFO'

    # Assign rather than setdefault(): setdefault() leaves an existing key
    # alone, so an invalid stored level would be detected but never fixed.
    services = _dict_section(python, 'services')
    for svc in _LOGGING_PYTHON_SERVICES:
        if services.get(svc) not in _VALID_LOG_LEVELS:
            services[svc] = 'INFO'

    containers = _dict_section(cfg, 'containers')
    for c in _LOGGING_CONTAINERS:
        if containers.get(c) not in _VALID_LOG_LEVELS:
            containers[c] = 'INFO'

    # One-time migration from the legacy side-file.
    if not cfg.get('_migrated_log_levels'):
        legacy = self.config_file.parent / 'api' / 'log_levels.json'
        legacy_flat = self.config_file.parent / 'log_levels.json'
        for path in (legacy, legacy_flat):
            try:
                if path.exists():
                    with open(path) as lf:
                        for svc, lvl in (json.load(lf) or {}).items():
                            # Only known services with a valid level are
                            # taken over; everything else is ignored.
                            if (isinstance(lvl, str)
                                    and lvl.upper() in _VALID_LOG_LEVELS
                                    and svc in services):
                                services[svc] = lvl.upper()
            except Exception as e:
                logger.warning('log_levels.json migration skipped (%s): %s', path, e)
        # Flag the section so the side-files are not read again, even when
        # reading them failed.
        cfg['_migrated_log_levels'] = True
def get_logging_config(self) -> Dict[str, Any]:
    """Return a snapshot of the logging config (python + containers sections).

    The section is normalised via `_ensure_logging_config()` first. The
    `services` and `containers` mappings in the result are copies.
    """
    self._ensure_logging_config()
    section = self.configs['logging']
    py = section['python']
    python_view = {'root': py['root'], 'services': dict(py['services'])}
    return {'python': python_view, 'containers': dict(section['containers'])}
def set_python_log_level(self, service: str, level: str) -> None:
    """Persist a python service (or 'root') log level.

    Args:
        service: 'root' for the root logger level, otherwise the name of
            the entry to set under the python `services` mapping.
        level: Log level name; matched case-insensitively and stored
            upper-cased.

    Raises:
        ValueError: When `level` is empty, not a string, or not a valid
            log level.
    """
    candidate = level or ''
    # Non-string input (a number from a request body, say) must be reported
    # as an invalid level rather than failing with AttributeError on .upper().
    if isinstance(candidate, str):
        candidate = candidate.upper()
    if candidate not in _VALID_LOG_LEVELS:
        raise ValueError(f"Invalid log level: {candidate!r}")
    self._ensure_logging_config()
    python = self.configs['logging']['python']
    if service == 'root':
        python['root'] = candidate
    else:
        python.setdefault('services', {})[service] = candidate
    self._save_all_configs()
def set_container_log_level(self, container: str, level: str) -> None:
    """Persist a container log level.

    Args:
        container: Key to set under the logging `containers` mapping.
        level: Log level name; matched case-insensitively and stored
            upper-cased.

    Raises:
        ValueError: When `level` is empty, not a string, or not a valid
            log level.
    """
    candidate = level or ''
    # Non-string input (a number from a request body, say) must be reported
    # as an invalid level rather than failing with AttributeError on .upper().
    if isinstance(candidate, str):
        candidate = candidate.upper()
    if candidate not in _VALID_LOG_LEVELS:
        raise ValueError(f"Invalid log level: {candidate!r}")
    self._ensure_logging_config()
    self.configs['logging']['containers'][container] = candidate
    self._save_all_configs()
# Phase 5 — Extended connectivity configuration helpers
def get_connectivity_config(self) -> Dict[str, Any]:
    """Return a shallow copy of the connectivity config (exits + peer_exit_map).

    A missing or non-dict section is replaced in memory, and the 'exits' and
    'peer_exit_map' keys are added as empty dicts when absent.
    """
    section = self.configs.get('connectivity')
    if not isinstance(section, dict):
        section = {}
        self.configs['connectivity'] = section
    for key in ('exits', 'peer_exit_map'):
        section.setdefault(key, {})
    return dict(section)
def set_ddns_config(self, ddns_cfg: Dict[str, Any]) -> None:
    """Replace the top-level `ddns` section and save.

    A 'token' key in `ddns_cfg` is dropped before storing, so the token is
    never written into cell_config.json -- tokens live in data/. The
    caller's dict is not modified.
    """
    sanitized = dict(ddns_cfg.items())
    sanitized.pop('token', None)
    self.configs['ddns'] = sanitized
    self._save_all_configs()
@property
def _ddns_token_path(self) -> Path:
    """Path of the DDNS token file: <data_dir>/api/ddns_token."""
    return self.data_dir.joinpath('api', 'ddns_token')
def get_ddns_token(self) -> str:
    """Return the DDNS bearer token from data/api/ddns_token.

    When the token file is missing, empty, unreadable or not decodable as
    text, the legacy copy under `ddns.token` in the config is used instead
    and, if present, migrated through `set_ddns_token()` so existing
    installs keep working without manual intervention.

    Returns:
        The stripped token from the file, otherwise the legacy config value
        (an empty string by default when no token is stored).
    """
    try:
        tok = self._ddns_token_path.read_text().strip()
    except (OSError, UnicodeDecodeError):
        # Best effort: a missing, unreadable or corrupt token file must not
        # fail the caller -- fall through to the legacy location instead.
        # UnicodeDecodeError is not an OSError, so it is listed explicitly.
        tok = ''
    if tok:
        return tok
    # Migrate legacy token from cell_config.json
    old_token = self.configs.get('ddns', {}).get('token', '')
    if old_token:
        self.set_ddns_token(old_token)
    return old_token
def set_ddns_token(self, token: str) -> None:
    """Write the DDNS bearer token to data/api/ddns_token (not cell_config.json).

    The parent directory is created when needed. If writing fails the error
    is logged and nothing else happens. After a successful write, a legacy
    `ddns.token` entry still present in the config is removed and the
    config is saved.
    """
    token_file = self._ddns_token_path
    try:
        token_file.parent.mkdir(parents=True, exist_ok=True)
        token_file.write_text(token)
    except OSError as exc:  # PermissionError is a subclass of OSError
        logger.error('set_ddns_token: failed to write token file: %s', exc)
        return
    # Remove from cell_config.json if a legacy copy is there
    legacy = self.configs.get('ddns', {})
    if not legacy.get('token'):
        return
    self.configs['ddns'] = {
        name: value for name, value in legacy.items() if name != 'token'
    }
    self._save_all_configs()
def set_connectivity_field(self, field: str, value: Any) -> bool:
    """Set one field of the connectivity section and save.

    The section is created with empty 'exits' and 'peer_exit_map' when it
    does not exist yet.

    Returns:
        True when the save succeeded; False when it raised (the error is
        logged). The in-memory value is set in either case.
    """
    section = self.configs.setdefault('connectivity', {'exits': {}, 'peer_exit_map': {}})
    section[field] = value
    try:
        self._save_all_configs()
    except Exception as e:
        logger.error(f"set_connectivity_field({field}): {e}")
        return False
    return True
# ── Connectivity v2 — named connection instances ──────────────────────
#
# The legacy schema stored at most one exit per type under
# `connectivity.exits` plus a `peer_exit_map`. v2 replaces this with a list
# of named connection instances under `connectivity.connections`, each with
# its own allocated routing resources (mark/table/iface/redirect_port) and
# vault secret references. The legacy keys are kept readable so the one-time
# migration can consume them; the new code path uses `connections`.
def register_connectivity_migrator(self, migrator) -> None:
    """Remember the v1→v2 migration callback (owned by ConnectivityManager).

    The callback has the shape
    `migrator(legacy_section) -> list[connection_record]`: given the legacy
    connectivity section it builds the v2 connection records (allocating
    resources, repointing secrets). It is not invoked here; get_connectivity()
    calls it lazily when it has to migrate.
    """
    self._connectivity_migrator = migrator
def get_connectivity(self) -> Dict[str, Any]:
    """Return a copy of the connectivity v2 section, migrating from v1 if needed.

    When the stored section already has `version` 2 and a list under
    `connections`, it is returned (copied) without further work. Otherwise
    the registered migrator, if any, is called with a copy of the legacy
    section to build the connection list; a non-list result or a missing
    migrator yields an empty list. The upgraded section is stored, saved
    and returned as a copy.

    Raises:
        Exception: Whatever the migrator raises, after logging it; nothing
            is stored or saved in that case.
    """
    with self._connectivity_lock:
        stored = self.configs.get('connectivity')
        legacy = stored if isinstance(stored, dict) else {}

        already_v2 = (legacy.get('version') == 2
                      and isinstance(legacy.get('connections'), list))
        if already_v2:
            return self._copy_connectivity(legacy)

        migrated = []
        migrator = self._connectivity_migrator
        if migrator is not None:
            try:
                result = migrator(dict(legacy))
            except Exception as e:
                logger.error(f"connectivity v1→v2 migration failed: {e}")
                raise
            if isinstance(result, list):
                migrated = result

        # Legacy keys are carried over unchanged next to the new ones.
        upgraded = {**legacy, 'version': 2, 'connections': migrated}
        self.configs['connectivity'] = upgraded
        self._save_all_configs()
        return self._copy_connectivity(upgraded)
@staticmethod
def _copy_connectivity(cfg: Dict[str, Any]) -> Dict[str, Any]:
    """Copy the connectivity section one level deep.

    The top-level dict and each connection record are copied, so callers
    cannot mutate stored state through them; values nested any deeper are
    still shared. The result always has a `connections` list.
    """
    return {**cfg, 'connections': [dict(record) for record in cfg.get('connections', [])]}
def list_connections(self) -> List[Dict[str, Any]]:
    """Return the `connections` list from get_connectivity() ([] when absent)."""
    with self._connectivity_lock:
        snapshot = self.get_connectivity()
        return snapshot.get('connections', [])
def get_connection(self, conn_id: str) -> Optional[Dict[str, Any]]:
    """Return a copy of the first connection record whose id is `conn_id`, or None."""
    with self._connectivity_lock:
        records = self.get_connectivity().get('connections', [])
        match = next((rec for rec in records if rec.get('id') == conn_id), None)
        return dict(match) if match is not None else None
def add_connection(self, record: Dict[str, Any]) -> bool:
    """Append a copy of `record` to the connection list, save, and return True.

    Other keys of the stored connectivity section are kept; `version` is
    set to 2.
    """
    with self._connectivity_lock:
        records = self.get_connectivity().get('connections', [])
        records.append(dict(record))
        merged = dict(self.configs.get('connectivity', {}))
        merged['version'] = 2
        merged['connections'] = records
        self.configs['connectivity'] = merged
        self._save_all_configs()
        return True
def update_connection(self, conn_id: str, fields: Dict[str, Any]) -> bool:
    """Merge `fields` into the first connection record whose id is `conn_id`.

    Returns:
        True after the change was stored and saved; False, without saving,
        when no record has that id.
    """
    with self._connectivity_lock:
        records = self.get_connectivity().get('connections', [])
        target = next((rec for rec in records if rec.get('id') == conn_id), None)
        if target is None:
            return False
        target.update(fields)
        self.configs['connectivity'] = {
            **self.configs.get('connectivity', {}),
            'version': 2,
            'connections': records,
        }
        self._save_all_configs()
        return True
def delete_connection(self, conn_id: str) -> bool:
    """Remove every connection record whose id is `conn_id`.

    Returns:
        True after the shortened list was stored and saved; False, without
        saving, when no record has that id.
    """
    with self._connectivity_lock:
        records = self.get_connectivity().get('connections', [])
        kept = [rec for rec in records if rec.get('id') != conn_id]
        removed = len(records) - len(kept)
        if removed == 0:
            return False
        updated = dict(self.configs.get('connectivity', {}))
        updated['version'] = 2
        updated['connections'] = kept
        self.configs['connectivity'] = updated
        self._save_all_configs()
        return True
def set_connection_status(self, conn_id: str, status: Dict[str, Any]) -> bool:
    """Replace the `status` entry of one connection record with a copy of `status`.

    Delegates to update_connection() and returns its result.
    """
    snapshot = dict(status)
    return self.update_connection(conn_id, {'status': snapshot})
def get_all_configs(self) -> Dict[str, Dict]:
    """Return a shallow copy of the whole config mapping.

    Only the top level is copied; the section values are the stored
    objects themselves.
    """
    snapshot = self.configs.copy()
    return snapshot
def get_config_summary(self) -> Dict[str, Any]:
    """Summarise the configuration state of all known services.

    Returns:
        A dict with these keys:
          - total_services: number of entries in ``self.service_schemas``
          - configured_services: services whose config is non-empty and has
            no truthy "error" entry
          - unconfigured_services: all remaining services
          - backup_count: number of entries returned by ``list_backups()``
          - last_backup: "timestamp" of the first entry in that listing, or
            None when there are no backups
    """
    # List the backups once and reuse the result: this avoids a second scan
    # and guarantees backup_count and last_backup describe the same listing.
    backups = self.list_backups()

    configured = []
    unconfigured = []
    for service in self.service_schemas:
        config = self.get_service_config(service)
        if config and not config.get("error"):
            configured.append(service)
        else:
            unconfigured.append(service)

    return {
        "total_services": len(self.service_schemas),
        "configured_services": configured,
        "unconfigured_services": unconfigured,
        "backup_count": len(backups),
        "last_backup": backups[0]["timestamp"] if backups else None,
    }