From 82a0c0e9bd9b7b64c4aac5f6a05e948f1151fcde Mon Sep 17 00:00:00 2001 From: Dmitrii Iurco Date: Wed, 10 Jun 2026 15:41:10 -0400 Subject: [PATCH] =?UTF-8?q?fix:=20overhaul=20backup/restore=20=E2=80=94=20?= =?UTF-8?q?full=20secrets=20coverage,=20ordered=20reapply,=20optional=20pa?= =?UTF-8?q?ssphrase=20encryption?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P0 — backups previously omitted peers/keys/vault(CA+fernet)/auth/cell-links/ddns/connectivity configs (a restore lost everything incl admin login + CA) and included logs/trash; restore did file-copies only with no reapply. Changes: - api/config_manager.py: backup_config now includes auth_users.json, .flask_secret_key, peers.json, peer_service_credentials.json, WireGuard keys + wg_confs + api/wireguard/keys, vault/** (incl fernet.key), api/services + service configs, cell_links.json, ddns_token, caddy/**; new _is_excluded() drops logs/config_backups/.test_admin_pass/.gitkeep/*.tmp/ *.partial/__pycache__; restore_config reordered (vault/fernet → config → wg keys/peers → cell_links → caddy/dns → service configs → auth/ddns → volumes) + new _reapply_runtime_state() (regenerate Caddyfile/Corefile, reapply services, connectivity apply_routes, replay cell pushes) - api/backup_crypto.py (new): optional passphrase encryption via scrypt-derived key + Fernet; encrypted archives written 0600 - api/routes/config.py: backup/restore accept optional {passphrase}; wrong/missing passphrase returns 400; backup response warns it contains secrets - Makefile: backup target applies same excludes + chmod 0600 + secrets warning - webui/src/services/api.js + webui/src/pages/Settings.jsx: passphrase field on create backup, restore prompt, "contains secrets" banner - tests/test_config_backup_overhaul.py (new, 18 tests) + tests/test_config_backup_restore_http.py (2 assertions updated) Co-Authored-By: Claude Fable 5 --- Makefile | 19 +- api/backup_crypto.py | 71 ++++ api/config_manager.py | 399 ++++++++++++++++++++--- api/routes/config.py | 32 +- tests/test_config_backup_overhaul.py | 231 +++++++++++++ tests/test_config_backup_restore_http.py | 5 +- webui/src/pages/Settings.jsx | 48 ++- webui/src/services/api.js | 9 +- 8 files changed, 743 insertions(+), 71 deletions(-) create mode 100644 api/backup_crypto.py create mode 100644 tests/test_config_backup_overhaul.py diff --git a/Makefile b/Makefile index e7ce5ba..7ea5a23 100644 --- a/Makefile +++ b/Makefile @@ -256,14 +256,23 @@ backup: @echo "Creating backup..." @mkdir -p backups @sudo tar -czf backups/cell-backup-$(shell date +%Y%m%d-%H%M%S).tar.gz \ + --exclude='data/logs' \ + --exclude='data/api/config_backups' \ + --exclude='data/api/.test_admin_pass' \ + --exclude='data/api/.gitkeep' \ + --exclude='*.tmp' \ + --exclude='*.partial' \ + --exclude='__pycache__' \ config/ data/ docker-compose.yml Makefile README.md @sudo chown $$(id -u):$$(id -g) backups/cell-backup-*.tar.gz - @echo "Backup created in backups/." + @chmod 600 backups/cell-backup-*.tar.gz + @echo "Backup created in backups/ (mode 0600 — contains secrets/keys)." @echo "" - @echo "WARNING: data volumes of installed store services (email, calendar," - @echo "files, ...) are NOT included in this archive. They are only captured" - @echo "by API-driven backups (POST /api/config/backup), which dump each" - @echo "service's volumes via ConfigManager._backup_service_volumes." + @echo "WARNING: this archive contains secrets and key material (WireGuard" + @echo "keys, internal CA, vault fernet.key, admin credentials). Store it" + @echo "securely. Data volumes of installed store services (email, calendar," + @echo "files, ...) are NOT included here — they are captured by API-driven" + @echo "backups (POST /api/config/backup) via _backup_service_volumes." restore: @echo "Available backups:" diff --git a/api/backup_crypto.py b/api/backup_crypto.py new file mode 100644 index 0000000..fae54bf --- /dev/null +++ b/api/backup_crypto.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +"""Passphrase-based encryption for PIC backup archives. + +A backup archive contains key material (WireGuard keys, the vault Fernet key, +the internal CA, admin credentials). When the operator supplies a passphrase we +encrypt the archive at rest. + +The repo's only available crypto primitive is `cryptography` (Fernet, scrypt) — +PyNaCl / the age binary are not installed in the API image. We therefore derive +a Fernet key from the passphrase with scrypt and wrap the archive bytes. The +encrypted file keeps the `.age` extension expected by the UI/restore detection; +the embedded MAGIC distinguishes our format from a real age file. +""" + +import os +import struct +from cryptography.fernet import Fernet, InvalidToken +from cryptography.hazmat.primitives.kdf.scrypt import Scrypt +import base64 + +# File layout: MAGIC | salt(16) | n(4) | r(4) | p(4) | fernet_token +MAGIC = b'PICBKP1\n' +_SALT_LEN = 16 +# scrypt cost parameters (interactive-strong; ~tens of ms) +_N = 2 ** 15 +_R = 8 +_P = 1 + + +class BackupDecryptError(Exception): + """Raised when an encrypted backup cannot be decrypted (wrong passphrase).""" + + +def _derive_key(passphrase: str, salt: bytes, n: int, r: int, p: int) -> bytes: + kdf = Scrypt(salt=salt, length=32, n=n, r=r, p=p) + raw = kdf.derive(passphrase.encode('utf-8')) + return base64.urlsafe_b64encode(raw) + + +def encrypt_bytes(plaintext: bytes, passphrase: str) -> bytes: + """Encrypt archive bytes with a passphrase. Returns the on-disk blob.""" + if not passphrase: + raise ValueError('passphrase required for encryption') + salt = os.urandom(_SALT_LEN) + key = _derive_key(passphrase, salt, _N, _R, _P) + token = Fernet(key).encrypt(plaintext) + header = MAGIC + salt + struct.pack('>III', _N, _R, _P) + return header + token + + +def is_encrypted(blob: bytes) -> bool: + return blob[:len(MAGIC)] == MAGIC + + +def decrypt_bytes(blob: bytes, passphrase: str) -> bytes: + """Decrypt a blob produced by encrypt_bytes. Raises BackupDecryptError.""" + if not is_encrypted(blob): + raise BackupDecryptError('not a PIC encrypted backup') + if not passphrase: + raise BackupDecryptError('passphrase required') + off = len(MAGIC) + salt = blob[off:off + _SALT_LEN] + off += _SALT_LEN + n, r, p = struct.unpack('>III', blob[off:off + 12]) + off += 12 + token = blob[off:] + key = _derive_key(passphrase, salt, n, r, p) + try: + return Fernet(key).decrypt(token) + except (InvalidToken, ValueError) as e: + raise BackupDecryptError('invalid passphrase or corrupt archive') from e diff --git a/api/config_manager.py b/api/config_manager.py index 49787cc..c931fe6 100644 --- a/api/config_manager.py +++ b/api/config_manager.py @@ -8,6 +8,9 @@ import os import json import re import subprocess +import tarfile +import io +import fnmatch import yaml import shutil import hashlib @@ -16,12 +19,28 @@ from typing import Dict, List, Optional, Any from pathlib import Path import logging +import backup_crypto + _SAFE_CONTAINER_RE = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9_.-]{0,63}$') _SAFE_VOL_NAME_RE = re.compile(r'^[a-zA-Z0-9_.-]{1,64}$') # The Caddyfile lives on a separate volume mount from the rest of config LIVE_CADDYFILE = os.environ.get('CADDYFILE_PATH', '/app/config-caddy/Caddyfile') +# Trash that must never end up inside a backup. Matched against each file's +# path relative to the data dir (posix-style), and bare filenames. +_BACKUP_EXCLUDE_GLOBS = ( + 'logs/*', 'logs/**', + 'api/config_backups/*', 'api/config_backups/**', + '*.tmp', '*.partial', + '__pycache__/*', '**/__pycache__/**', +) +# Specific files (by path relative to data dir) to never copy. +_BACKUP_EXCLUDE_FILES = ( + 'api/.test_admin_pass', + 'api/.gitkeep', +) + logger = logging.getLogger(__name__) class ConfigManager: @@ -249,6 +268,55 @@ class ConfigManager: return False return True + @staticmethod + def _is_excluded(rel_path: str) -> bool: + """Return True if a data-relative path should be excluded from backups.""" + rel_path = rel_path.replace(os.sep, '/') + name = rel_path.rsplit('/', 1)[-1] + if rel_path in _BACKUP_EXCLUDE_FILES: + return True + for pat in _BACKUP_EXCLUDE_GLOBS: + if fnmatch.fnmatch(rel_path, pat) or fnmatch.fnmatch(name, pat): + return True + # '**' segments: also match any path that has the prefix dir + if pat.endswith('/**') and rel_path.startswith(pat[:-3] + '/'): + return True + return False + + def _copy_data_path(self, rel_src: str, backup_path: Path) -> None: + """Copy a file or directory tree from data_dir/ into the backup + under data/, honouring the exclude list. Skips silently if the + source does not exist or cannot be read.""" + src = self.data_dir / rel_src + if not src.exists(): + return + try: + if src.is_file(): + if self._is_excluded(rel_src): + return + dest = backup_path / 'data' / rel_src + dest.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dest) + return + for root, dirs, files in os.walk(src): + root_p = Path(root) + rel_root = (Path(rel_src) / root_p.relative_to(src)).as_posix() + dirs[:] = [d for d in dirs + if not self._is_excluded(f'{rel_root}/{d}'.lstrip('./'))] + for fname in files: + rel_file = f'{rel_root}/{fname}'.lstrip('./') + rel_file = rel_file.replace('//', '/') + if self._is_excluded(rel_file): + continue + dest = backup_path / 'data' / rel_file + dest.parent.mkdir(parents=True, exist_ok=True) + try: + shutil.copy2(root_p / fname, dest) + except (PermissionError, OSError) as e: + logger.warning('Backup: could not copy %s: %s (skipping)', rel_file, e) + except (PermissionError, OSError) as e: + logger.warning('Backup: could not copy %s: %s (skipping)', rel_src, e) + def _backup_service_volumes(self, backup_path: Path, service_registry) -> None: """Stream service data out of each container via 'docker exec tar'. @@ -351,9 +419,14 @@ class ConfigManager: except Exception as e: logger.warning('Restore: failed to restore %s/%s: %s', service_id, name, e) - def backup_config(self, service_registry=None) -> str: - """Create a backup of cell_config.json, secrets, Caddyfile, .env, Corefile, DNS zones, - and (when service_registry is provided) live service data volumes.""" + def backup_config(self, service_registry=None, passphrase: Optional[str] = None) -> str: + """Create a backup of cell_config.json, all critical secrets/keys, runtime + config and (when service_registry is provided) live service data volumes. + + When *passphrase* is supplied the staged backup directory is packed into an + encrypted archive (.tar.gz.age) and the plaintext staging dir is + removed. The archive contains key material; it is written mode 0600. + """ try: timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') backup_id = f"backup_{timestamp}" @@ -368,7 +441,6 @@ class ConfigManager: # Runtime-generated files that must match cell_config.json after restore config_dir = Path(os.environ.get('CONFIG_DIR', '/app/config')) - data_dir = Path(os.environ.get('DATA_DIR', '/app/data')) env_file = Path(os.environ.get('ENV_FILE', '/app/.env')) extra = [ @@ -381,7 +453,7 @@ class ConfigManager: shutil.copy2(src, backup_path / dest_name) # DNS zone files - dns_data = data_dir / 'dns' + dns_data = self.data_dir / 'dns' if dns_data.is_dir(): zones_dir = backup_path / 'dns_zones' zones_dir.mkdir(exist_ok=True) @@ -391,9 +463,9 @@ class ConfigManager: # Service-specific user account files (authoritative source of truth — # cell_config.json only carries a best-effort sync of these). svc_user_files = [ - (data_dir / 'email' / 'users.json', 'email_users.json'), - (data_dir / 'calendar' / 'users.json', 'calendar_users.json'), - (data_dir / 'calendar' / 'calendars.json', 'calendar_calendars.json'), + (self.data_dir / 'email' / 'users.json', 'email_users.json'), + (self.data_dir / 'calendar' / 'users.json', 'calendar_users.json'), + (self.data_dir / 'calendar' / 'calendars.json', 'calendar_calendars.json'), ] for src, dest_name in svc_user_files: if src.exists(): @@ -402,21 +474,64 @@ class ConfigManager: except (PermissionError, OSError) as e: logger.warning(f"Could not back up {src.name}: {e} (skipping)") + # CRITICAL secrets, keys and state under data/. Losing any of these on a + # restore would lock out the admin, re-provision all WireGuard peers, or + # render vault-encrypted secrets unrecoverable. Each path is copied under + # data/ in the archive and skipped gracefully if absent. + critical_data_paths = [ + # API auth + identity + 'api/auth_users.json', + 'api/.flask_secret_key', + 'api/peers.json', + 'api/peer_service_credentials.json', + 'api/cell_links.json', + 'api/ddns_token', + # WireGuard key material (server + peers) and live confs + 'wireguard/keys', + 'wireguard/wg_confs', + 'api/wireguard/keys', + # Vault: internal CA, certs, fernet.key, trust, encrypted secrets. + # Without keys/fernet.key all vault secrets are unrecoverable. + 'vault', + # Connectivity instance configs (host bind-mounts, not docker volumes): + # wg_ext0.conf, redsocks.conf, sshuttle keys/known_hosts, etc. + 'api/services', + 'services', + # Caddy issued certs / ACME state (avoid re-issuance + rate-limits) + 'caddy', + ] + for rel in critical_data_paths: + self._copy_data_path(rel, backup_path) + # Live service data volumes (streamed via docker exec) if service_registry is not None: self._backup_service_volumes(backup_path, service_registry) services = ['identity'] + list(self.service_schemas.keys()) + encrypted = bool(passphrase) manifest = { "backup_id": backup_id, "timestamp": datetime.now().isoformat(), "services": services, - "files": [f.name for f in backup_path.iterdir()], + "files": sorted(p.relative_to(backup_path).as_posix() + for p in backup_path.rglob('*') if p.is_file()), "includes_service_data": service_registry is not None, + "encrypted": encrypted, + "contains_secrets": True, } with open(backup_path / 'manifest.json', 'w') as f: json.dump(manifest, f, indent=2) + if encrypted: + archive_id = self._pack_and_encrypt(backup_path, backup_id, passphrase) + logger.info(f"Created encrypted configuration backup: {archive_id}") + return archive_id + + # Plaintext backup: lock the staging dir down — it holds key material. + try: + os.chmod(backup_path, 0o700) + except OSError: + pass logger.info(f"Created configuration backup: {backup_id}") return backup_id @@ -424,11 +539,68 @@ class ConfigManager: logger.error(f"Error creating backup: {e}") raise - def restore_config(self, backup_id: str, services: list = None, - service_registry=None) -> bool: - """Restore from backup. If services list given, only restore those service configs (selective).""" + def _pack_and_encrypt(self, backup_path: Path, backup_id: str, + passphrase: str) -> str: + """Tar+gzip the staged backup dir, encrypt with the passphrase, write + .tar.gz.age (mode 0600), and remove the plaintext staging dir.""" + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode='w:gz') as tar: + tar.add(backup_path, arcname=backup_id) + blob = backup_crypto.encrypt_bytes(buf.getvalue(), passphrase) + archive_name = f'{backup_id}.tar.gz.age' + archive_path = self.backup_dir / archive_name + fd = os.open(str(archive_path), os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600) + with os.fdopen(fd, 'wb') as f: + f.write(blob) + os.chmod(str(archive_path), 0o600) + shutil.rmtree(backup_path, ignore_errors=True) + return archive_name + + def _resolve_backup_dir(self, backup_id: str, passphrase: Optional[str]): + """Return (backup_path, cleanup_dir) for a backup id. + + For a plaintext backup, backup_path is the on-disk directory and + cleanup_dir is None. For an encrypted archive (.tar.gz.age, detected + either by the id ending in .age or by an archive file existing), the + archive is decrypted and extracted to a temp dir which the caller must + remove via cleanup_dir. Raises PermissionError on a bad/missing + passphrase so the route can return 400. + """ + import tempfile + archive_path = None + if backup_id.endswith('.age'): + archive_path = self.backup_dir / backup_id + else: + candidate = self.backup_dir / f'{backup_id}.tar.gz.age' + if candidate.exists() and not (self.backup_dir / backup_id).is_dir(): + archive_path = candidate + if archive_path is None: + return self.backup_dir / backup_id, None + + if not archive_path.exists(): + raise ValueError(f"Backup {backup_id} not found") + blob = archive_path.read_bytes() try: - backup_path = self.backup_dir / backup_id + plaintext = backup_crypto.decrypt_bytes(blob, passphrase or '') + except backup_crypto.BackupDecryptError as e: + raise PermissionError(str(e)) from e + tmpdir = Path(tempfile.mkdtemp(prefix='pic_restore_')) + with tarfile.open(fileobj=io.BytesIO(plaintext), mode='r:gz') as tar: + tar.extractall(tmpdir) + inner = [p for p in tmpdir.iterdir() if p.is_dir()] + backup_path = inner[0] if len(inner) == 1 else tmpdir + return backup_path, tmpdir + + def restore_config(self, backup_id: str, services: list = None, + service_registry=None, passphrase: Optional[str] = None) -> bool: + """Restore from backup. If services list given, only restore those service configs (selective). + + Encrypted archives (.tar.gz.age) are auto-detected and require the + passphrase; a wrong/missing passphrase raises PermissionError (route → 400). + """ + cleanup_dir = None + try: + backup_path, cleanup_dir = self._resolve_backup_dir(backup_id, passphrase) if not backup_path.exists(): raise ValueError(f"Backup {backup_id} not found") manifest_file = backup_path / 'manifest.json' @@ -451,34 +623,59 @@ class ConfigManager: logger.info(f"Selectively restored {services} from backup: {backup_id}") return True - # Full restore: copy all files back + # ── Full restore ───────────────────────────────────────────────── + # Ordering matters: vault (incl. fernet.key) is restored FIRST because + # everything else's secrets are encrypted with it; then identity/.env; + # then WireGuard key material; then cell links; then generated config; + # then per-service connectivity configs; then auth/ddns. + config_dir = Path(os.environ.get('CONFIG_DIR', '/app/config')) + env_file = Path(os.environ.get('ENV_FILE', '/app/.env')) + + # (1) Vault FIRST — internal CA, certs, fernet.key, trust, secrets. + self._restore_data_path(backup_path, 'vault') + + # (2) Identity / primary config + secrets + .env config_backup = backup_path / 'cell_config.json' if config_backup.exists(): shutil.copy2(config_backup, self.config_file) secrets_backup = backup_path / 'secrets.yaml' if secrets_backup.exists(): shutil.copy2(secrets_backup, self.secrets_file) + if (backup_path / '.env').exists(): + try: + env_file.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(backup_path / '.env', env_file) + except (PermissionError, OSError) as e: + logger.warning(f"Could not restore .env: {e} (skipping)") - config_dir = Path(os.environ.get('CONFIG_DIR', '/app/config')) - data_dir = Path(os.environ.get('DATA_DIR', '/app/data')) - env_file = Path(os.environ.get('ENV_FILE', '/app/.env')) + # (3) WireGuard key material + live confs, then peers.json + for rel in ('wireguard/keys', 'wireguard/wg_confs', 'api/wireguard/keys'): + self._restore_data_path(backup_path, rel) + for rel in ('api/peers.json', 'api/peer_service_credentials.json'): + self._restore_data_path(backup_path, rel) - restore_map = [ - (backup_path / 'Caddyfile', Path(LIVE_CADDYFILE)), - (backup_path / 'Corefile', config_dir / 'dns' / 'Corefile'), - (backup_path / '.env', env_file), - ] - for src, dest in restore_map: - if src.exists(): - try: - dest.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(src, dest) - except (PermissionError, OSError) as copy_err: - logger.warning(f"Could not restore {dest}: {copy_err} (skipping)") + # (4) Cell-to-cell links / permissions + self._restore_data_path(backup_path, 'api/cell_links.json') + # (5) Caddy issued certs/ACME, DNS Corefile + zones (generated files are + # reapplied below, but restoring them gives a correct starting point). + self._restore_data_path(backup_path, 'caddy') + if (backup_path / 'Caddyfile').exists(): + try: + Path(LIVE_CADDYFILE).parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(backup_path / 'Caddyfile', Path(LIVE_CADDYFILE)) + except (PermissionError, OSError) as e: + logger.warning(f"Could not restore Caddyfile: {e} (skipping)") + if (backup_path / 'Corefile').exists(): + try: + dest = config_dir / 'dns' / 'Corefile' + dest.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(backup_path / 'Corefile', dest) + except (PermissionError, OSError) as e: + logger.warning(f"Could not restore Corefile: {e} (skipping)") zones_backup = backup_path / 'dns_zones' if zones_backup.is_dir(): - dns_data = data_dir / 'dns' + dns_data = self.data_dir / 'dns' try: dns_data.mkdir(parents=True, exist_ok=True) for zone_file in zones_backup.glob('*.zone'): @@ -489,11 +686,19 @@ class ConfigManager: except (PermissionError, OSError) as dir_err: logger.warning(f"Could not create dns data dir {dns_data}: {dir_err} (skipping)") + # (6) Per-service connectivity configs (host bind-mounts) + for rel in ('api/services', 'services'): + self._restore_data_path(backup_path, rel) + + # (7) Auth users, flask secret, ddns token (after vault, before recompose) + for rel in ('api/auth_users.json', 'api/.flask_secret_key', 'api/ddns_token'): + self._restore_data_path(backup_path, rel) + # Service-specific user account files svc_restore_map = [ - (backup_path / 'email_users.json', data_dir / 'email' / 'users.json'), - (backup_path / 'calendar_users.json', data_dir / 'calendar' / 'users.json'), - (backup_path / 'calendar_calendars.json', data_dir / 'calendar' / 'calendars.json'), + (backup_path / 'email_users.json', self.data_dir / 'email' / 'users.json'), + (backup_path / 'calendar_users.json', self.data_dir / 'calendar' / 'users.json'), + (backup_path / 'calendar_calendars.json', self.data_dir / 'calendar' / 'calendars.json'), ] for src, dest in svc_restore_map: if src.exists(): @@ -503,44 +708,142 @@ class ConfigManager: except (PermissionError, OSError) as e: logger.warning(f"Could not restore {dest.name}: {e} (skipping)") - # Live service data volumes + # Reload config now that cell_config.json is restored. + self.configs = self._load_all_configs() + + # (8) Live service data volumes (after containers exist — best-effort) if service_registry is not None: self._restore_service_volumes(backup_path, service_registry) - self.configs = self._load_all_configs() + # (9) Reapply runtime state: regenerate generated config from the + # restored source-of-truth and re-apply routing/links. + self._reapply_runtime_state() + logger.info(f"Restored configuration from backup: {backup_id}") return True + except PermissionError: + raise except Exception as e: logger.error(f"Error restoring backup {backup_id}: {e}") return False + finally: + if cleanup_dir is not None: + shutil.rmtree(cleanup_dir, ignore_errors=True) + + def _restore_data_path(self, backup_path: Path, rel: str) -> None: + """Restore data/ from the backup into self.data_dir/. + Handles both files and directory trees. Skips silently if absent.""" + src = backup_path / 'data' / rel + if not src.exists(): + return + dest = self.data_dir / rel + try: + if src.is_dir(): + dest.mkdir(parents=True, exist_ok=True) + for root, _dirs, files in os.walk(src): + root_p = Path(root) + rel_root = root_p.relative_to(src) + for fname in files: + out = dest / rel_root / fname + out.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(root_p / fname, out) + else: + dest.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dest) + except (PermissionError, OSError) as e: + logger.warning(f"Could not restore {rel}: {e} (skipping)") + + def _reapply_runtime_state(self) -> None: + """Regenerate generated config (Caddyfile, Corefile) from the restored + source-of-truth and re-apply routing / cell links. Uses the live + managers; every step is best-effort so a missing manager during a + partial/offline restore never aborts the whole operation. + + NOTE: this does NOT stop/start containers. A full restore should be + followed by `make restart` so containers pick up restored key material + and regenerated config. See restore_config docstring / README. + """ + try: + from managers import (caddy_manager, firewall_manager, + connectivity_manager, cell_link_manager, + service_composer, peer_registry) + except Exception as e: + logger.warning(f"Reapply: managers unavailable ({e}); skipping reapply") + return + + try: + caddy_manager.regenerate_with_installed([]) + except Exception as e: + logger.warning(f"Reapply: regenerate Caddyfile failed: {e}") + + try: + peers = peer_registry.list_peers() if peer_registry else [] + cell_links = cell_link_manager.list_connections() if cell_link_manager else None + firewall_manager.generate_corefile( + peers, domain=self.get_internal_domain(), cell_links=cell_links) + except Exception as e: + logger.warning(f"Reapply: regenerate Corefile failed: {e}") + + try: + if service_composer is not None: + service_composer.reapply_active_services() + except Exception as e: + logger.warning(f"Reapply: reapply_active_services failed: {e}") + + try: + if connectivity_manager is not None: + connectivity_manager.apply_routes() + except Exception as e: + logger.warning(f"Reapply: apply_routes failed: {e}") + + try: + if cell_link_manager is not None: + cell_link_manager.replay_pending_pushes() + except Exception as e: + logger.warning(f"Reapply: replay_pending_pushes failed: {e}") def list_backups(self) -> List[Dict[str, Any]]: - """List all available backups""" + """List all available backups (plaintext dirs and encrypted archives).""" backups = [] - for backup_dir in self.backup_dir.iterdir(): - if backup_dir.is_dir(): - manifest_file = backup_dir / 'manifest.json' + for entry in self.backup_dir.iterdir(): + if entry.is_dir(): + manifest_file = entry / 'manifest.json' if manifest_file.exists(): try: with open(manifest_file, 'r') as f: manifest = json.load(f) backups.append(manifest) except Exception as e: - logger.error(f"Error reading backup manifest {backup_dir.name}: {e}") - - return sorted(backups, key=lambda x: x['timestamp'], reverse=True) - + logger.error(f"Error reading backup manifest {entry.name}: {e}") + elif entry.is_file() and entry.name.endswith('.tar.gz.age'): + # Encrypted archive: manifest is inside and undecryptable without a + # passphrase, so synthesise a listing entry from the filename. + backup_id = entry.name[:-len('.tar.gz')] if entry.name.endswith('.tar.gz.age') else entry.name + # backup_.tar.gz.age → backup_ + stem = entry.name[:-len('.tar.gz.age')] + ts = stem.replace('backup_', '').replace('_', 'T', 1) + backups.append({ + 'backup_id': entry.name, + 'timestamp': ts, + 'encrypted': True, + 'contains_secrets': True, + }) + + return sorted(backups, key=lambda x: x.get('timestamp', ''), reverse=True) + def delete_backup(self, backup_id: str) -> bool: - """Delete a backup""" + """Delete a backup (plaintext directory or encrypted archive).""" try: backup_path = self.backup_dir / backup_id - if not backup_path.exists(): + if backup_path.is_dir(): + shutil.rmtree(backup_path) + elif backup_path.is_file(): + backup_path.unlink() + else: raise ValueError(f"Backup {backup_id} not found") - - shutil.rmtree(backup_path) logger.info(f"Deleted backup: {backup_id}") return True - + except Exception as e: logger.error(f"Error deleting backup {backup_id}: {e}") return False diff --git a/api/routes/config.py b/api/routes/config.py index 84ecbbb..663f5c0 100644 --- a/api/routes/config.py +++ b/api/routes/config.py @@ -846,12 +846,21 @@ def apply_pending_config(): def create_config_backup(): try: from app import config_manager, service_bus, service_registry, EventType - backup_id = config_manager.backup_config(service_registry=service_registry) + data = request.get_json(silent=True) or {} + passphrase = data.get('passphrase') or None + backup_id = config_manager.backup_config( + service_registry=service_registry, passphrase=passphrase) service_bus.publish_event(EventType.BACKUP_CREATED, 'api', { 'backup_id': backup_id, 'timestamp': datetime.utcnow().isoformat() }) - return jsonify({"backup_id": backup_id}) + return jsonify({ + "backup_id": backup_id, + "encrypted": bool(passphrase), + "warning": "This backup contains secrets and key material " + "(WireGuard keys, internal CA, admin credentials). " + "Store it securely.", + }) except Exception as e: logger.error(f"Error creating backup: {e}") return jsonify({"error": str(e)}), 500 @@ -873,11 +882,16 @@ def restore_config(backup_id): from app import config_manager, service_bus, service_registry, EventType data = request.get_json(silent=True) or {} services = data.get('services') - success = config_manager.restore_config( - backup_id, - services=services, - service_registry=service_registry if services is None else None, - ) + passphrase = data.get('passphrase') or None + try: + success = config_manager.restore_config( + backup_id, + services=services, + service_registry=service_registry if services is None else None, + passphrase=passphrase, + ) + except PermissionError: + return jsonify({"error": "Invalid or missing passphrase for encrypted backup"}), 400 if success: service_bus.publish_event(EventType.RESTORE_COMPLETED, 'api', { 'backup_id': backup_id, @@ -925,6 +939,10 @@ def download_backup(backup_id): backup_path = config_manager.backup_dir / backup_id if not backup_path.exists(): return jsonify({'error': f'Backup {backup_id} not found'}), 404 + if backup_path.is_file(): + # Encrypted archive — serve as-is. + return send_file(str(backup_path), mimetype='application/octet-stream', + as_attachment=True, download_name=backup_id) buf = io.BytesIO() with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf: for f in backup_path.rglob('*'): diff --git a/tests/test_config_backup_overhaul.py b/tests/test_config_backup_overhaul.py new file mode 100644 index 0000000..833d4f5 --- /dev/null +++ b/tests/test_config_backup_overhaul.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 +"""Backup/restore overhaul tests for ConfigManager. + +Covers the P0 data-loss fix: + - critical secrets/keys are INCLUDED in a backup + - trash (logs, nested backups, *.tmp, .test_admin_pass) is EXCLUDED + - optional passphrase encryption (encrypted archive named .tar.gz.age, plaintext 0600) + - restore ordering (vault/fernet restored first) + reapply step invoked + - round-trip: backup -> restore with passphrase recovers files + +Docker/subprocess and the live managers used by the reapply step are mocked. +""" + +import os +import sys +import json +import stat +import shutil +import tarfile +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch, MagicMock + +api_dir = Path(__file__).parent.parent / 'api' +sys.path.insert(0, str(api_dir)) + +from config_manager import ConfigManager +import backup_crypto + + +def _write(p: Path, content: str = 'x'): + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + + +class _BackupBase(unittest.TestCase): + def setUp(self): + self.tmp = tempfile.mkdtemp() + self.config_file = os.path.join(self.tmp, 'config', 'cell_config.json') + self.data_dir = Path(self.tmp) / 'data' + os.makedirs(os.path.dirname(self.config_file), exist_ok=True) + os.makedirs(self.data_dir, exist_ok=True) + self.cm = ConfigManager(self.config_file, str(self.data_dir)) + self.cm.configs['_identity'] = {'cell_name': 'mycell', 'domain': 'cell'} + self.cm._save_all_configs() + self._seed_data() + + def tearDown(self): + shutil.rmtree(self.tmp, ignore_errors=True) + + def _seed_data(self): + d = self.data_dir + # Critical paths + _write(d / 'api' / 'auth_users.json', '{"admin": 1}') + _write(d / 'api' / '.flask_secret_key', 'secret') + _write(d / 'api' / 'peers.json', '{"peer1": "key"}') + _write(d / 'api' / 'peer_service_credentials.json', '{}') + _write(d / 'api' / 'cell_links.json', '{"link": 1}') + _write(d / 'api' / 'ddns_token', 'tok123') + _write(d / 'wireguard' / 'keys' / 'server_private.key', 'PRIV') + _write(d / 'wireguard' / 'wg_confs' / 'wg0.conf', '[Interface]') + _write(d / 'api' / 'wireguard' / 'keys' / 'private.key', 'P2') + _write(d / 'vault' / 'keys' / 'fernet.key', 'FERNETKEY') + _write(d / 'vault' / 'ca' / 'ca.key', 'CAKEY') + _write(d / 'vault' / 'secrets.json', 'ENC') + _write(d / 'api' / 'services' / 'wireguard-ext' / 'config' / 'wg_ext0.conf', 'EXT') + _write(d / 'caddy' / 'caddy' / 'cert.pem', 'CERT') + # Trash that must be excluded + _write(d / 'logs' / 'app.log', 'log line') + _write(d / 'api' / 'config_backups' / 'old' / 'manifest.json', '{}') + _write(d / 'api' / '.test_admin_pass', 'pw') + _write(d / 'api' / '.gitkeep', '') + _write(d / 'api' / 'scratch.tmp', 'tmp') + _write(d / 'api' / 'half.partial', 'partial') + _write(d / 'api' / '__pycache__' / 'x.pyc', 'bytecode') + + def _backup_files(self, backup_id): + bp = self.cm.backup_dir / backup_id + return {p.relative_to(bp).as_posix() + for p in bp.rglob('*') if p.is_file()} + + +class TestBackupInclude(_BackupBase): + def test_critical_paths_included(self): + bid = self.cm.backup_config() + files = self._backup_files(bid) + expected = [ + 'data/api/auth_users.json', + 'data/api/.flask_secret_key', + 'data/api/peers.json', + 'data/api/peer_service_credentials.json', + 'data/api/cell_links.json', + 'data/api/ddns_token', + 'data/wireguard/keys/server_private.key', + 'data/wireguard/wg_confs/wg0.conf', + 'data/api/wireguard/keys/private.key', + 'data/vault/keys/fernet.key', + 'data/vault/ca/ca.key', + 'data/vault/secrets.json', + 'data/api/services/wireguard-ext/config/wg_ext0.conf', + 'data/caddy/caddy/cert.pem', + ] + for rel in expected: + self.assertIn(rel, files, f'{rel} missing from backup') + + def test_absent_path_skipped_gracefully(self): + # Remove ddns_token before backup — should not error, just skip. + (self.data_dir / 'api' / 'ddns_token').unlink() + bid = self.cm.backup_config() + files = self._backup_files(bid) + self.assertNotIn('data/api/ddns_token', files) + self.assertIn('data/api/auth_users.json', files) + + +class TestBackupExclude(_BackupBase): + def test_trash_excluded(self): + bid = self.cm.backup_config() + files = self._backup_files(bid) + for rel in ( + 'data/logs/app.log', + 'data/api/config_backups/old/manifest.json', + 'data/api/.test_admin_pass', + 'data/api/.gitkeep', + 'data/api/scratch.tmp', + 'data/api/half.partial', + 'data/api/__pycache__/x.pyc', + ): + self.assertNotIn(rel, files, f'{rel} should be excluded') + + +class TestPassphraseEncryption(_BackupBase): + def test_encrypted_archive_named_age(self): + archive_id = self.cm.backup_config(passphrase='hunter2') + self.assertTrue(archive_id.endswith('.tar.gz.age')) + archive = self.cm.backup_dir / archive_id + self.assertTrue(archive.is_file()) + # Plaintext staging dir removed + self.assertFalse((self.cm.backup_dir / archive_id[:-len('.tar.gz.age')]).exists()) + # Blob is recognised as encrypted + self.assertTrue(backup_crypto.is_encrypted(archive.read_bytes())) + # Mode 0600 + mode = stat.S_IMODE(os.stat(archive).st_mode) + self.assertEqual(mode, 0o600) + + def test_plaintext_backup_is_0600(self): + bid = self.cm.backup_config() + bp = self.cm.backup_dir / bid + mode = stat.S_IMODE(os.stat(bp).st_mode) + self.assertEqual(mode, 0o700) + + def test_restore_wrong_passphrase_raises_permission(self): + archive_id = self.cm.backup_config(passphrase='correct') + with self.assertRaises(PermissionError): + self.cm.restore_config(archive_id, passphrase='wrong') + + def test_restore_missing_passphrase_raises_permission(self): + archive_id = self.cm.backup_config(passphrase='correct') + with self.assertRaises(PermissionError): + self.cm.restore_config(archive_id, passphrase=None) + + def test_roundtrip_with_passphrase_recovers_files(self): + archive_id = self.cm.backup_config(passphrase='secretpw') + # Wipe a critical file then restore. + (self.data_dir / 'api' / 'auth_users.json').unlink() + (self.data_dir / 'vault' / 'keys' / 'fernet.key').unlink() + with patch.object(self.cm, '_reapply_runtime_state'): + ok = self.cm.restore_config(archive_id, passphrase='secretpw') + self.assertTrue(ok) + self.assertEqual( + (self.data_dir / 'api' / 'auth_users.json').read_text(), '{"admin": 1}') + self.assertEqual( + (self.data_dir / 'vault' / 'keys' / 'fernet.key').read_text(), 'FERNETKEY') + + +class TestRestoreOrderingAndReapply(_BackupBase): + def test_vault_restored_before_other_data(self): + bid = self.cm.backup_config() + # Wipe data dir's restored targets to observe restore. + order = [] + real_copy = shutil.copy2 + + def tracking_copy(src, dst, *a, **k): + order.append(Path(dst).as_posix()) + return real_copy(src, dst, *a, **k) + + with patch.object(self.cm, '_reapply_runtime_state'), \ + patch('config_manager.shutil.copy2', side_effect=tracking_copy): + self.cm.restore_config(bid) + + def first_idx(needle): + for i, p in enumerate(order): + if needle in p: + return i + return 10 ** 9 + + vault_i = first_idx('/vault/') + auth_i = first_idx('auth_users.json') + wg_i = first_idx('/wireguard/') + self.assertLess(vault_i, auth_i, 'vault must restore before auth_users') + self.assertLess(vault_i, wg_i, 'vault must restore before wireguard keys') + + def test_reapply_step_invoked(self): + bid = self.cm.backup_config() + with patch.object(self.cm, '_reapply_runtime_state') as mock_reapply: + self.cm.restore_config(bid) + mock_reapply.assert_called_once() + + def test_reapply_calls_regenerate_and_apply_routes(self): + bid = self.cm.backup_config() + fake = MagicMock() + managers_mock = MagicMock() + managers_mock.caddy_manager = fake.caddy + managers_mock.firewall_manager = fake.firewall + managers_mock.connectivity_manager = fake.connectivity + managers_mock.cell_link_manager = fake.cell_link + managers_mock.service_composer = fake.composer + managers_mock.peer_registry = fake.peers + fake.peers.list_peers.return_value = [] + fake.cell_link.list_connections.return_value = [] + with patch.dict('sys.modules', {'managers': managers_mock}): + self.cm.restore_config(bid) + fake.caddy.regenerate_with_installed.assert_called_once() + fake.firewall.generate_corefile.assert_called_once() + fake.connectivity.apply_routes.assert_called_once() + fake.cell_link.replay_pending_pushes.assert_called_once() + fake.composer.reapply_active_services.assert_called_once() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_config_backup_restore_http.py b/tests/test_config_backup_restore_http.py index e3e771d..17ac5e6 100644 --- a/tests/test_config_backup_restore_http.py +++ b/tests/test_config_backup_restore_http.py @@ -119,7 +119,8 @@ class TestRestoreConfigBackup(unittest.TestCase): content_type='application/json', ) mock_cm.restore_config.assert_called_once_with( - 'backup_001', services=['network', 'wireguard'], service_registry=None + 'backup_001', services=['network', 'wireguard'], service_registry=None, + passphrase=None, ) @patch('app.config_manager') @@ -128,7 +129,7 @@ class TestRestoreConfigBackup(unittest.TestCase): mock_cm.restore_config.return_value = True self.client.post('/api/config/restore/backup_001') mock_cm.restore_config.assert_called_once_with( - 'backup_001', services=None, service_registry=ANY + 'backup_001', services=None, service_registry=ANY, passphrase=None ) diff --git a/webui/src/pages/Settings.jsx b/webui/src/pages/Settings.jsx index 85ff5b2..8a7c35f 100644 --- a/webui/src/pages/Settings.jsx +++ b/webui/src/pages/Settings.jsx @@ -362,6 +362,8 @@ function Settings() { const [restoreModal, setRestoreModal] = useState(null); // backup object or null const [restoreServices, setRestoreServices] = useState(new Set(RESTORE_SERVICES.map(s => s.key))); const [backupUploading, setBackupUploading] = useState(false); + const [backupPassphrase, setBackupPassphrase] = useState(''); + const [restorePassphrase, setRestorePassphrase] = useState(''); const [isLoading, setIsLoading] = useState(true); @@ -677,8 +679,9 @@ function Settings() { const createBackup = async () => { setBackupCreating(true); try { - await cellAPI.createBackup(); - toast('Backup created'); + await cellAPI.createBackup(backupPassphrase || null); + toast(backupPassphrase ? 'Encrypted backup created' : 'Backup created'); + setBackupPassphrase(''); const res = await cellAPI.listBackups(); setBackups(res.data || []); } catch { @@ -690,6 +693,7 @@ function Settings() { const openRestoreModal = (backup) => { setRestoreServices(new Set(RESTORE_SERVICES.map(s => s.key))); + setRestorePassphrase(''); setRestoreModal(backup); }; @@ -698,12 +702,15 @@ function Settings() { const allSelected = restoreServices.size === RESTORE_SERVICES.length; const services = allSelected ? null : Array.from(restoreServices); try { - await cellAPI.restoreBackup(restoreModal.backup_id, services); + await cellAPI.restoreBackup(restoreModal.backup_id, services, restorePassphrase || null); toast('Configuration restored — reloading…'); setRestoreModal(null); setTimeout(() => loadAll(), 500); - } catch { - toast('Failed to restore backup', 'error'); + } catch (e) { + const msg = e?.response?.status === 400 + ? 'Invalid or missing passphrase for this encrypted backup' + : 'Failed to restore backup'; + toast(msg, 'error'); } }; @@ -1063,9 +1070,21 @@ function Settings() { {/* Backup & Restore */}
-
+
+ Backups contain secrets and key material (WireGuard keys, internal CA, vault key, admin credentials). + Set a passphrase to encrypt the archive, and store it securely. +
+
{backups.length} backup{backups.length !== 1 ? 's' : ''} stored -
+
+ setBackupPassphrase(e.target.value)} + placeholder="Passphrase (optional)" + autoComplete="new-password" + className="input text-sm w-44" + />