"""
ServiceComposer — docker-compose generation and container lifecycle for PIC services.

Responsibilities:
  - Render compose-template.yml → per-service docker-compose.yml with PIC_* substitution
  - Manage store-service container lifecycle (up / down / restart / status / reconfigure)
  - Manage builtin-service restarts and status via the main compose stack
  - Generate and persist PIC_SECRET_* variables in a dedicated secrets file

Template variable reference (for compose-template.yml authors):
  ${PIC_CFG_<KEY>}      — value from manifest config_schema, key uppercased
  ${PIC_SECRET_<NAME>}  — auto-generated random secret, persisted across reconfigures
  ${PIC_DOMAIN}         — effective domain (e.g. cell.pic.ngo)
  ${PIC_CELL_NAME}      — cell name (e.g. mycell)
  ${PIC_SERVICE_ID}     — service identifier (e.g. nextcloud)
  ${PIC_DATA_DIR}       — resolved absolute path of the composer's data directory
"""

import json
import logging
import os
import re
import secrets as _secrets_lib
import shutil
import subprocess
import threading
from pathlib import Path
from typing import Dict, List, Optional

from manifest_validator import validate_rendered_compose

logger = logging.getLogger('picell')

# Matches ${PIC_SECRET_<NAME>} placeholders; group 1 is the variable name.
_SECRET_RE = re.compile(r'\$\{(PIC_SECRET_\w+)\}')
# Service IDs become directory names and compose project names, so the
# alphabet is deliberately narrow (no dots, slashes or upper case).
_SAFE_ID_RE = re.compile(r'^[a-z0-9][a-z0-9_-]{0,63}$')


class ServiceComposer:
    """Render per-service compose files and drive `docker compose` for them.

    Store services each get their own directory under ``<data_dir>/services``
    and their own compose project (``pic-<service_id>``). Builtin services
    live in the main compose stack and are only restarted / queried here.
    """

    def __init__(self, config_manager, data_dir: str):
        """Remember the config manager and derive the on-disk layout.

        Args:
            config_manager: object providing ``configs``, ``get_identity()``,
                ``get_effective_domain()`` and ``get_installed_services()``.
            data_dir: base directory holding ``services/`` and the secrets file.
        """
        self.cm = config_manager
        self.data_dir = data_dir
        self._services_dir = os.path.join(data_dir, 'services')
        self._secrets_path = os.path.join(data_dir, 'service_secrets.json')
        # Serialises read-modify-write cycles on the secrets file.
        self._lock = threading.Lock()

    # ── Path helpers ──────────────────────────────────────────────────────

    @staticmethod
    def _validate_service_id(service_id: str) -> None:
        """Raise ValueError if service_id could be used for path traversal.

        ``fullmatch`` is used rather than ``match``: with ``match`` the
        trailing ``$`` also accepts an ID that ends in a newline.
        """
        if not isinstance(service_id, str) or not _SAFE_ID_RE.fullmatch(service_id):
            raise ValueError(
                f'Invalid service_id {service_id!r}: '
                'must match ^[a-z0-9][a-z0-9_-]{0,63}$'
            )

    def _svc_dir(self, service_id: str) -> str:
        """Return the directory for ``service_id`` (not created here).

        Raises:
            ValueError: if the ID is malformed or the resolved path leaves
                the services directory (e.g. through a symlink).
        """
        self._validate_service_id(service_id)
        candidate = os.path.join(self._services_dir, service_id)
        # Paranoia: ensure the resolved path stays inside _services_dir
        real_base = os.path.realpath(self._services_dir)
        real_cand = os.path.realpath(candidate)
        if not real_cand.startswith(real_base + os.sep) and real_cand != real_base:
            raise ValueError(f'service_id {service_id!r} escapes services directory')
        return candidate

    def _compose_path(self, service_id: str) -> str:
        """Return the path of the service's rendered docker-compose.yml."""
        return os.path.join(self._svc_dir(service_id), 'docker-compose.yml')

    def has_compose_file(self, service_id: str) -> bool:
        """Return True if a rendered compose file exists; False for invalid IDs."""
        try:
            return os.path.exists(self._compose_path(service_id))
        except ValueError:
            return False

    @staticmethod
    def _write_private_atomic(path: str, text: str) -> None:
        """Write ``text`` to ``path`` via a 0o600 temp file and ``os.replace``.

        The temp file is fsynced before the rename so a crash leaves either
        the old file or the complete new one.
        """
        tmp = path + '.tmp'
        # 0o600: readable only by the process owner
        with open(tmp, 'w', opener=lambda p, flags: os.open(p, flags, 0o600)) as f:
            # The mode passed to os.open only applies when the file is
            # created; tighten a stale temp file before writing into it.
            os.chmod(tmp, 0o600)
            f.write(text)
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp, path)

    # ── Secrets management ────────────────────────────────────────────────

    def _load_secrets(self) -> Dict:
        """Load the secrets mapping ``{service_id: {var_name: value}}``.

        Returns an empty dict when the file is missing, unreadable, not valid
        JSON, or does not contain a JSON object.
        """
        if not os.path.exists(self._secrets_path):
            return {}
        try:
            with open(self._secrets_path) as f:
                data = json.load(f)
        except (OSError, json.JSONDecodeError) as e:
            logger.warning('ServiceComposer: failed to load secrets: %s', e)
            return {}
        if not isinstance(data, dict):
            logger.warning(
                'ServiceComposer: failed to load secrets: expected a JSON object, got %s',
                type(data).__name__,
            )
            return {}
        return data

    def _save_secrets(self, secrets: Dict) -> None:
        """Persist the secrets mapping atomically with owner-only permissions."""
        # Serialise first so a non-serialisable value fails before any I/O.
        # secrets must not be world-readable, hence the 0o600 writer.
        self._write_private_atomic(self._secrets_path, json.dumps(secrets, indent=2))

    def _get_or_create_secret(self, service_id: str, var_name: str) -> str:
        """Return the stored secret for ``var_name``, generating it on first use."""
        with self._lock:
            secrets = self._load_secrets()
            svc_secrets = secrets.setdefault(service_id, {})
            if var_name not in svc_secrets:
                svc_secrets[var_name] = _secrets_lib.token_urlsafe(24)
                self._save_secrets(secrets)
            return svc_secrets[var_name]

    def _clear_secrets(self, service_id: str) -> None:
        """Forget every stored secret belonging to ``service_id``."""
        with self._lock:
            secrets = self._load_secrets()
            if service_id in secrets:
                del secrets[service_id]
                self._save_secrets(secrets)

    # ── Template rendering ────────────────────────────────────────────────

    def render_template(self, service_id: str, manifest: Dict, template_content: str) -> str:
        """
        Substitute all PIC_* variables in a compose-template.yml string.

        Config values come from the manifest's ``config_schema`` defaults,
        overridden by values saved for the service in the config manager.
        Only keys declared in the schema are substituted.

        Returns the rendered compose YAML.
        """
        schema = manifest.get('config_schema') or {}
        saved = self.cm.configs.get(service_id, {}) or {}
        config: Dict = {k: v['default'] for k, v in schema.items() if 'default' in v}
        config.update({k: saved[k] for k in schema if k in saved})

        identity = self.cm.get_identity()
        domain = self.cm.get_effective_domain() or identity.get('domain', 'cell.local')
        cell_name = identity.get('cell_name', 'mycell')

        result = template_content
        for key, value in config.items():
            # Strip newlines/tabs to prevent YAML injection (a config string containing
            # \n could inject new YAML keys into the compose file)
            safe_val = str(value).replace('\n', '').replace('\r', '').replace('\t', ' ')
            result = result.replace(f'${{PIC_CFG_{key.upper()}}}', safe_val)

        result = result.replace('${PIC_DOMAIN}', domain)
        result = result.replace('${PIC_CELL_NAME}', cell_name)
        result = result.replace('${PIC_SERVICE_ID}', service_id)
        result = result.replace('${PIC_DATA_DIR}', str(Path(self.data_dir).resolve()))

        # PIC_SECRET_* — generate on first use, reuse on reconfigure.
        # Placeholders are collected from the *template* (not the partially
        # rendered result) and de-duplicated so each secret is looked up once.
        for var_name in dict.fromkeys(_SECRET_RE.findall(template_content)):
            secret = self._get_or_create_secret(service_id, var_name)
            result = result.replace(f'${{{var_name}}}', secret)

        return result

    def write_compose(self, service_id: str, manifest: Dict, template_content: str) -> str:
        """Render and atomically write the per-service compose file. Returns rendered content.

        Raises:
            ValueError: if ``service_id`` is invalid or the rendered compose
                fails security validation. In the latter case neither the
                service directory nor the compose file is created.
        """
        # Resolve (and thereby validate) the target directory before rendering,
        # so an invalid ID fails before any secret is generated for it.
        svc_dir = self._svc_dir(service_id)
        content = self.render_template(service_id, manifest, template_content)

        # Validate before creating the directory or compose file so a bad
        # template never touches disk.
        # Pass the resolved data_dir so that bind mounts created by ${PIC_DATA_DIR}
        # substitution are allowed; all other absolute paths are still rejected.
        # Connectivity services (wireguard-ext, openvpn-client, tor) set
        # requires_host_network: true in their manifest to opt into network_mode: host.
        allow_host_network = bool(manifest.get('requires_host_network'))
        ok, errs = validate_rendered_compose(
            content,
            allowed_data_dir=str(Path(self.data_dir).resolve()),
            allow_host_network=allow_host_network,
        )
        if not ok:
            raise ValueError(
                f'Compose template failed security validation: {"; ".join(errs)}'
            )

        os.makedirs(svc_dir, exist_ok=True)
        # The rendered file embeds PIC_SECRET_* values, so it gets the same
        # owner-only permissions as the secrets file.
        self._write_private_atomic(os.path.join(svc_dir, 'docker-compose.yml'), content)
        logger.info('ServiceComposer: wrote compose file for %s', service_id)
        return content

    # ── Subprocess helper ─────────────────────────────────────────────────

    def _run(self, cmd: List[str], timeout: int = 120) -> Dict:
        """Run ``cmd`` and return a result dict; never raises.

        Returns:
            ``{'ok', 'stdout', 'stderr'}`` when the process ran to completion,
            or ``{'ok': False, 'error': ...}`` on timeout / launch failure.
        """
        try:
            r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
            if r.returncode != 0 and r.stderr:
                logger.warning('ServiceComposer command failed: %s', r.stderr.strip())
            return {
                'ok': r.returncode == 0,
                'stdout': r.stdout.strip(),
                'stderr': r.stderr.strip(),
            }
        except subprocess.TimeoutExpired:
            logger.warning('ServiceComposer command timed out after %ss', timeout)
            return {'ok': False, 'error': 'docker compose command timed out'}
        except Exception as e:
            # Boundary handler: callers expect a result dict, not an exception.
            logger.error('ServiceComposer._run error: %s', e)
            return {'ok': False, 'error': str(e)}

    @staticmethod
    def _parse_ps_json(output: str) -> List[Dict]:
        """Parse `docker compose ps --format json` output into container dicts.

        Accepts both shapes: one JSON object per line, or a single JSON array
        of objects. Unparseable lines and non-object values are skipped.
        """
        containers: List[Dict] = []

        def _collect(parsed) -> None:
            if isinstance(parsed, dict):
                containers.append(parsed)
            elif isinstance(parsed, list):
                containers.extend(item for item in parsed if isinstance(item, dict))

        text = (output or '').strip()
        if not text:
            return containers

        # Fast path: the whole output is one JSON document (single object, or
        # an array — possibly pretty-printed across several lines).
        try:
            _collect(json.loads(text))
            return containers
        except json.JSONDecodeError:
            pass

        # Otherwise treat it as newline-delimited JSON.
        for line in text.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                _collect(json.loads(line))
            except json.JSONDecodeError:
                continue
        return containers

    # ── Store-service lifecycle (per-service compose file) ────────────────

    def _store_cmd(self, service_id: str, *args, timeout: int = 120) -> Dict:
        """Run ``docker compose <args>`` against the service's own compose file.

        Returns an error dict (without invoking docker) if no compose file
        exists. Raises ValueError for an invalid ``service_id``.
        """
        compose_file = self._compose_path(service_id)
        if not os.path.exists(compose_file):
            return {'ok': False, 'error': f'No compose file found for service {service_id!r}'}
        cmd = [
            'docker', 'compose',
            '-f', compose_file,
            '--project-name', f'pic-{service_id}',
            *args,
        ]
        return self._run(cmd, timeout)

    def up(self, service_id: str) -> Dict:
        """Start (or update) the service's containers in detached mode."""
        # 600s: image pulls on slow connections can take several minutes
        return self._store_cmd(service_id, 'up', '-d', '--remove-orphans', timeout=600)

    def down(self, service_id: str, remove_volumes: bool = False) -> Dict:
        """Stop and remove the service's containers, optionally with volumes."""
        args = ['down']
        if remove_volumes:
            args.append('--volumes')
        return self._store_cmd(service_id, *args)

    def restart(self, service_id: str) -> Dict:
        """Restart the service's containers."""
        return self._store_cmd(service_id, 'restart')

    def status(self, service_id: str) -> Dict:
        """Return the command result plus a parsed ``containers`` list."""
        result = self._store_cmd(service_id, 'ps', '--format', 'json')
        result['containers'] = self._parse_ps_json(result.get('stdout', ''))
        return result

    def reconfigure(self, service_id: str, manifest: Dict, template_content: str) -> Dict:
        """Re-render the compose file then re-apply with `up -d` (rolling update)."""
        self.write_compose(service_id, manifest, template_content)
        return self.up(service_id)

    def install(self, service_id: str, manifest: Dict, template_content: str) -> Dict:
        """Write compose file, pull image, then start containers.

        pull is run first so the up step doesn't time out on slow connections.
        A single retry handles transient registry hiccups on first install.
        """
        self.write_compose(service_id, manifest, template_content)
        pull = self._store_cmd(service_id, 'pull', timeout=600)
        if not pull.get('ok'):
            # Timeouts and launch failures are reported under 'error', not 'stderr'.
            detail = pull.get('error') or pull.get('stderr', '')
            logger.warning('service_composer: image pull for %s failed, proceeding anyway: %s',
                           service_id, detail[:200])
        result = self.up(service_id)
        if not result.get('ok'):
            logger.info('service_composer: retrying up for %s after initial failure', service_id)
            result = self.up(service_id)
        return result

    def remove(self, service_id: str, purge_data: bool = False) -> Dict:
        """Stop containers, optionally delete compose file, secrets, and service data dir.

        Returns the result of the ``down`` step; cleanup failures are logged
        but do not change the returned dict.
        """
        result = self.down(service_id, remove_volumes=purge_data)
        if purge_data:
            self._clear_secrets(service_id)
            svc_dir = self._svc_dir(service_id)  # already validates service_id + realpath
            if os.path.isdir(svc_dir):
                # Final realpath check: reject symlinks that escape the services dir
                real_svc = os.path.realpath(svc_dir)
                real_base = os.path.realpath(self._services_dir)
                if not real_svc.startswith(real_base + os.sep):
                    logger.error('ServiceComposer: refusing rmtree outside services dir: %s', svc_dir)
                else:
                    try:
                        shutil.rmtree(svc_dir)
                    except OSError as e:
                        logger.warning('ServiceComposer: could not remove %s: %s', svc_dir, e)
        else:
            compose_file = self._compose_path(service_id)
            if os.path.exists(compose_file):
                # Remove compose file even without purge so stale file doesn't confuse future installs
                try:
                    os.remove(compose_file)
                except OSError as e:
                    # Best effort: report it, but don't fail the removal.
                    logger.warning('ServiceComposer: could not remove %s: %s', compose_file, e)
        return result

    # ── Dependency resolution ─────────────────────────────────────────────

    def _resolve_requires(self, manifest: Dict, installed_services: Dict) -> Optional[str]:
        """Return an error string if any required services are missing, else None."""
        requires = manifest.get('requires') or []
        missing = [r for r in requires if r not in installed_services]
        if missing:
            return f"Required services not installed: {', '.join(sorted(missing))}"
        return None

    def _resolve_dependents(self, service_id: str, installed_services: Dict) -> List[str]:
        """Return list of installed service IDs that declare service_id in their requires."""
        return [
            svc_id
            for svc_id, record in installed_services.items()
            if svc_id != service_id
            and service_id in (((record or {}).get('manifest') or {}).get('requires') or [])
        ]

    def reapply_active_services(self) -> None:
        """Call up() for every installed service that has a compose file. Called at startup."""
        installed = self.cm.get_installed_services() or {}
        for svc_id in installed:
            if not self.has_compose_file(svc_id):
                logger.warning('reapply_active_services: no compose file for %s, skipping', svc_id)
                continue
            result = self.up(svc_id)
            if not result.get('ok'):
                logger.warning('reapply_active_services: up failed for %s: %s',
                               svc_id, result.get('error') or result.get('stderr', ''))

    # ── Builtin-service lifecycle (main compose stack) ─────────────────────

    @staticmethod
    def _main_compose() -> str:
        """Return the main stack's compose file (``COMPOSE_FILE`` env override)."""
        return os.environ.get('COMPOSE_FILE', '/app/docker-compose.yml')

    def restart_builtin(self, container_names: List[str]) -> Dict:
        """Restart one or more containers that live in the main docker-compose stack."""
        if not container_names:
            return {'ok': False, 'error': 'No container names provided'}
        cmd = ['docker', 'compose', '-f', self._main_compose(), 'restart', *container_names]
        return self._run(cmd)

    def status_builtin(self, container_names: List[str]) -> Dict:
        """Return status of containers from the main compose stack."""
        if not container_names:
            return {'ok': False, 'error': 'No container names provided'}
        cmd = ['docker', 'compose', '-f', self._main_compose(), 'ps', '--format', 'json',
               *container_names]
        result = self._run(cmd)
        result['containers'] = self._parse_ps_json(result.get('stdout', ''))
        return result

    # ── Unified lifecycle (dispatches based on service kind) ───────────────

    def restart_service(self, service_id: str, manifest: Dict) -> Dict:
        """
        Restart any service — builtin or store — using the right compose stack.

        Builtin: uses manifest.containers + main docker-compose.yml.
        Store:   uses per-service compose file.
        """
        if manifest.get('kind') == 'builtin':
            containers = manifest.get('containers') or []
            return self.restart_builtin(containers)
        return self.restart(service_id)

    def status_service(self, service_id: str, manifest: Dict) -> Dict:
        """
        Return container status for any service.

        Builtin: queries manifest.containers from main compose stack.
        Store:   queries per-service compose project.
        """
        if manifest.get('kind') == 'builtin':
            containers = manifest.get('containers') or []
            return self.status_builtin(containers)
        return self.status(service_id)