Files
pic/api/service_composer.py
T
roof 2ab3d2d5ac feat: secure build phase 2 — enforce image verification by default
All store images are now digest-pinned and cosign-signed by the publish
pipeline, so the warn-by-default training-wheels period is over: an
unsigned or undigested image must not install unless the admin
explicitly opts out. The service_composer fallback used when the config
manager is unavailable or corrupt also flips to enforce — config
corruption must fail closed rather than silently weaken verification.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-11 14:12:58 -04:00

620 lines
28 KiB
Python

"""
ServiceComposer — docker-compose generation and container lifecycle for PIC services.
Responsibilities:
- Render compose-template.yml → per-service docker-compose.yml with PIC_* substitution
- Manage store-service container lifecycle (up / down / restart / status / reconfigure)
- Manage builtin-service restarts and status via the main compose stack
- Generate and persist PIC_SECRET_* variables in a dedicated secrets file
Template variable reference (for compose-template.yml authors):
${PIC_CFG_<KEY>} — value from manifest config_schema, uppercased
${PIC_SECRET_<NAME>} — auto-generated random secret, persisted across reconfigures
${PIC_DOMAIN} — effective domain (e.g. cell.pic.ngo)
${PIC_CELL_NAME} — cell name (e.g. mycell)
${PIC_SERVICE_ID} — service identifier (e.g. nextcloud)
"""
import json
import logging
import os
import re
import secrets as _secrets_lib
import shutil
import subprocess
import threading
from pathlib import Path
from typing import Dict, List, Optional
from manifest_validator import validate_rendered_compose
# Module logger; every message in this file goes to the 'picell' logger.
logger = logging.getLogger('picell')
# Finds ${PIC_SECRET_<NAME>} placeholders in a template; group 1 is the bare
# variable name (PIC_SECRET_<NAME>) without the ${ } wrapper.
_SECRET_RE = re.compile(r'\$\{(PIC_SECRET_\w+)\}')
# Identifier shape accepted for service ids and instance ids: starts with a
# lowercase letter or digit, then up to 63 more of [a-z0-9_-]. No dots or
# slashes, so a matching id cannot name a parent directory.
# NOTE: `$` also matches just before a trailing newline; use fullmatch() when
# a strict whole-string check is required.
_SAFE_ID_RE = re.compile(r'^[a-z0-9][a-z0-9_-]{0,63}$')
# An image reference that ends in a sha256 digest (…@sha256:<64 lowercase hex>).
_DIGEST_RE = re.compile(r'@sha256:[0-9a-f]{64}$')
# Bundled cosign public key — shipped in the repo (config/cosign/cosign.pub) so
# every cell can verify store-service image signatures offline. install.sh keeps
# it at /opt/pic/config/cosign/cosign.pub; in the cell-api container it is
# COPYed to /app/config/cosign/cosign.pub.
# The PIC_COSIGN_PUBKEY environment variable overrides the default path.
_COSIGN_PUBKEY_PATH = os.environ.get(
'PIC_COSIGN_PUBKEY', '/app/config/cosign/cosign.pub'
)
# cosign executable to invoke; the PIC_COSIGN_BIN environment variable
# overrides the default of resolving `cosign` on PATH.
_COSIGN_BIN = os.environ.get('PIC_COSIGN_BIN', 'cosign')
class ServiceComposer:
    """Render compose files and drive `docker compose` for PIC services.

    One instance owns a data directory laid out as::

        <data_dir>/service_secrets.json            generated PIC_SECRET_* values
        <data_dir>/services/<service_id>/          per-service compose dir
        <data_dir>/services/<service_id>/<inst>/   per-connection instance dir

    All docker/cosign invocations go through `_run`, which never raises and
    returns a dict with an 'ok' flag, so lifecycle methods report failures
    through their return value rather than through exceptions. Methods that
    render templates or resolve paths may raise ValueError for unsafe ids or
    for a compose file that fails security validation.
    """

    # Modes the config manager may legitimately return. Anything else is
    # treated as 'enforce' so a corrupt setting can never weaken verification.
    _VERIFICATION_MODES = ('off', 'warn', 'enforce')

    def __init__(self, config_manager, data_dir: str):
        """Bind the composer to a config manager and a data directory.

        Nothing is created on disk here; directories are made lazily by the
        write_* methods.
        """
        self.cm = config_manager
        self.data_dir = data_dir
        self._services_dir = os.path.join(data_dir, 'services')
        self._secrets_path = os.path.join(data_dir, 'service_secrets.json')
        # Serialises read-modify-write cycles on the secrets file.
        self._lock = threading.Lock()

    # ── Path helpers ──────────────────────────────────────────────────────

    @staticmethod
    def _validate_service_id(service_id: str) -> None:
        """Raise ValueError if service_id could be used for path traversal."""
        # fullmatch, not match: with match() the pattern's trailing `$` would
        # also accept an id that ends in a newline. Non-strings are rejected
        # with the same ValueError that callers already handle.
        if not isinstance(service_id, str) or not _SAFE_ID_RE.fullmatch(service_id):
            raise ValueError(
                f'Invalid service_id {service_id!r}: '
                'must match ^[a-z0-9][a-z0-9_-]{0,63}$'
            )

    def _svc_dir(self, service_id: str) -> str:
        """Return the directory for a service, refusing ids that escape it.

        Raises:
            ValueError: the id is malformed or resolves outside the services
                directory (for example through a symlink).
        """
        self._validate_service_id(service_id)
        candidate = os.path.join(self._services_dir, service_id)
        # Paranoia: ensure the resolved path stays inside _services_dir
        real_base = os.path.realpath(self._services_dir)
        real_cand = os.path.realpath(candidate)
        if not real_cand.startswith(real_base + os.sep) and real_cand != real_base:
            raise ValueError(f'service_id {service_id!r} escapes services directory')
        return candidate

    def _compose_path(self, service_id: str) -> str:
        """Return the path of the service's rendered docker-compose.yml."""
        return os.path.join(self._svc_dir(service_id), 'docker-compose.yml')

    def has_compose_file(self, service_id: str) -> bool:
        """Return True when a rendered compose file exists; False for bad ids."""
        try:
            return os.path.exists(self._compose_path(service_id))
        except ValueError:
            return False

    @staticmethod
    def _write_private_file(path: str, text: str) -> None:
        """Atomically replace *path* with *text*, creating it with mode 0o600.

        The data is written to `<path>.tmp`, flushed and fsynced, then moved
        into place with os.replace. Used for both the secrets file and the
        rendered compose files, because the latter embed PIC_SECRET_* values.
        """
        tmp = path + '.tmp'
        try:
            # 0o600: readable only by the process owner — secrets must not be
            # world-readable
            with open(tmp, 'w', encoding='utf-8',
                      opener=lambda p, flags: os.open(p, flags, 0o600)) as f:
                f.write(text)
                f.flush()
                os.fsync(f.fileno())
            os.replace(tmp, path)
        except Exception:
            # Best-effort cleanup so a half-written temp file (possibly
            # holding secrets) is not left behind; the original error is
            # what the caller needs to see.
            try:
                os.unlink(tmp)
            except OSError:
                pass
            raise

    # ── Secrets management ────────────────────────────────────────────────

    def _load_secrets(self) -> Dict:
        """Load the secrets mapping; return {} if missing, unreadable or corrupt."""
        if not os.path.exists(self._secrets_path):
            return {}
        try:
            with open(self._secrets_path) as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and undecodable bytes.
            logger.warning('ServiceComposer: failed to load secrets: %s', e)
            return {}
        if not isinstance(data, dict):
            # Valid JSON but not an object: callers use dict methods on it.
            logger.warning('ServiceComposer: failed to load secrets: %s',
                           'top-level JSON value is not an object')
            return {}
        return data

    def _save_secrets(self, secrets: Dict) -> None:
        """Persist the secrets mapping atomically with owner-only permissions."""
        # Serialise first so a non-serialisable value fails before any I/O.
        self._write_private_file(self._secrets_path, json.dumps(secrets, indent=2))

    def _get_or_create_secret(self, service_id: str, var_name: str) -> str:
        """Return the stored secret for (service, variable), generating it once."""
        with self._lock:
            secrets = self._load_secrets()
            svc_secrets = secrets.setdefault(service_id, {})
            if var_name not in svc_secrets:
                svc_secrets[var_name] = _secrets_lib.token_urlsafe(24)
                self._save_secrets(secrets)
            return svc_secrets[var_name]

    def _clear_secrets(self, service_id: str) -> None:
        """Forget every stored secret of a service (no-op if it has none)."""
        with self._lock:
            secrets = self._load_secrets()
            if service_id in secrets:
                del secrets[service_id]
                self._save_secrets(secrets)

    # ── Template rendering ────────────────────────────────────────────────

    @staticmethod
    def _single_line(value) -> str:
        """Stringify *value* with newlines removed and tabs turned into spaces.

        Prevents YAML injection: a value containing a newline could otherwise
        add new keys to the rendered compose file.
        """
        return str(value).replace('\n', '').replace('\r', '').replace('\t', ' ')

    def render_template(self, service_id: str, manifest: Dict,
                        template_content: str,
                        instance_vars: Optional[Dict[str, str]] = None) -> str:
        """
        Substitute all PIC_* variables in a compose-template.yml string.
        Returns the rendered compose YAML.

        Config values come from the manifest's config_schema defaults,
        overridden by values saved in the config manager for this service.

        instance_vars optionally supplies per-connection-instance values for
        ${INSTANCE_ID} and ${REDIRECT_PORT} so an instanceable connectivity
        service can be rendered once per connection without collisions. They
        are ignored for non-instanceable services (the placeholders simply
        never appear in the template).

        Side effect: any ${PIC_SECRET_*} placeholder found in the template
        gets a secret generated and persisted on first use.
        """
        schema = manifest.get('config_schema') or {}
        saved = self.cm.configs.get(service_id, {})
        config: Dict = {k: v['default'] for k, v in schema.items() if 'default' in v}
        config.update({k: saved[k] for k in schema if k in saved})

        identity = self.cm.get_identity()
        domain = self.cm.get_effective_domain() or identity.get('domain', 'cell.local')
        cell_name = identity.get('cell_name', 'mycell')

        result = template_content
        for key, value in config.items():
            result = result.replace(f'${{PIC_CFG_{key.upper()}}}',
                                    self._single_line(value))

        result = result.replace('${PIC_DOMAIN}', domain)
        result = result.replace('${PIC_CELL_NAME}', cell_name)
        result = result.replace('${PIC_SERVICE_ID}', service_id)
        result = result.replace('${PIC_DATA_DIR}', str(Path(self.data_dir).resolve()))

        if instance_vars:
            for var in ('INSTANCE_ID', 'REDIRECT_PORT'):
                if var in instance_vars and instance_vars[var] is not None:
                    result = result.replace(f'${{{var}}}',
                                            self._single_line(instance_vars[var]))

        # PIC_SECRET_* — generate on first use, reuse on reconfigure.
        # Placeholders are collected from the original template and
        # de-duplicated so each secret is looked up only once.
        secret_names = dict.fromkeys(
            m.group(1) for m in _SECRET_RE.finditer(template_content))
        for var_name in secret_names:
            secret = self._get_or_create_secret(service_id, var_name)
            result = result.replace(f'${{{var_name}}}', secret)
        return result

    def _check_rendered(self, content: str, manifest: Dict, what: str) -> None:
        """Run the security validator on rendered compose YAML.

        The resolved data_dir is passed so that bind mounts created by
        ${PIC_DATA_DIR} substitution are allowed. Connectivity services
        (wireguard-ext, openvpn-client, tor) set requires_host_network: true
        in their manifest to opt into network_mode: host.

        Raises:
            ValueError: validation failed; the message starts with *what*.
        """
        ok, errs = validate_rendered_compose(
            content,
            allowed_data_dir=str(Path(self.data_dir).resolve()),
            allow_host_network=bool(manifest.get('requires_host_network')),
        )
        if not ok:
            raise ValueError(
                f'{what} failed security validation: {"; ".join(errs)}'
            )

    def write_compose(self, service_id: str, manifest: Dict,
                      template_content: str) -> str:
        """Render and atomically write the per-service compose file. Returns rendered content.

        Raises:
            ValueError: bad service_id, or the rendered compose failed
                security validation (in which case no compose file is written).
        """
        os.makedirs(self._svc_dir(service_id), exist_ok=True)
        content = self.render_template(service_id, manifest, template_content)
        # Validate before writing so a bad template never reaches the compose file.
        self._check_rendered(content, manifest, 'Compose template')
        self._write_private_file(self._compose_path(service_id), content)
        logger.info('ServiceComposer: wrote compose file for %s', service_id)
        return content

    # ── Subprocess helper ─────────────────────────────────────────────────

    def _run(self, cmd: List[str], timeout: int = 120) -> Dict:
        """Run a command and summarise the outcome; never raises.

        Returns:
            {'ok', 'stdout', 'stderr'} when the process ran to completion, or
            {'ok': False, 'error'} on timeout or when it could not be run.
        """
        try:
            r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
            if r.returncode != 0 and r.stderr:
                logger.warning('ServiceComposer command failed: %s', r.stderr.strip())
            return {
                'ok': r.returncode == 0,
                'stdout': r.stdout.strip(),
                'stderr': r.stderr.strip(),
            }
        except subprocess.TimeoutExpired:
            return {'ok': False, 'error': 'docker compose command timed out'}
        except Exception as e:
            logger.error('ServiceComposer._run error: %s', e)
            return {'ok': False, 'error': str(e)}

    @staticmethod
    def _parse_ps_json(output: str) -> List[Dict]:
        """Parse `docker compose ps --format json` output into container dicts.

        Accepts both output shapes: one JSON object per line, and a single
        JSON array of objects. Blank lines, unparseable lines and non-object
        values are skipped.
        """
        containers: List[Dict] = []
        for line in output.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                containers.append(parsed)
            elif isinstance(parsed, list):
                containers.extend(item for item in parsed if isinstance(item, dict))
        return containers

    # ── Store-service lifecycle (per-service compose file) ────────────────

    def _store_cmd(self, service_id: str, *args, timeout: int = 120) -> Dict:
        """Run `docker compose <args>` against the service's own compose file.

        The compose project is named `pic-<service_id>`. Returns an error dict
        (without running anything) when no compose file has been rendered.
        """
        compose_file = self._compose_path(service_id)
        if not os.path.exists(compose_file):
            return {'ok': False, 'error': f'No compose file found for service {service_id!r}'}
        cmd = [
            'docker', 'compose',
            '-f', compose_file,
            '--project-name', f'pic-{service_id}',
            *args,
        ]
        return self._run(cmd, timeout)

    def up(self, service_id: str) -> Dict:
        """Start (or update) the service's containers in detached mode."""
        # 600s: image pulls on slow connections can take several minutes
        return self._store_cmd(service_id, 'up', '-d', '--remove-orphans', timeout=600)

    def down(self, service_id: str, remove_volumes: bool = False) -> Dict:
        """Stop and remove the service's containers, optionally its volumes too."""
        args = ['down']
        if remove_volumes:
            args.append('--volumes')
        return self._store_cmd(service_id, *args)

    def restart(self, service_id: str) -> Dict:
        """Restart the service's containers."""
        return self._store_cmd(service_id, 'restart')

    def status(self, service_id: str) -> Dict:
        """Return the `ps` result with a parsed 'containers' list added."""
        result = self._store_cmd(service_id, 'ps', '--format', 'json')
        result['containers'] = self._parse_ps_json(result.get('stdout', ''))
        return result

    def reconfigure(self, service_id: str, manifest: Dict,
                    template_content: str) -> Dict:
        """Re-render the compose file then re-apply with `up -d` (rolling update)."""
        self.write_compose(service_id, manifest, template_content)
        return self.up(service_id)

    # ── Image signature verification ──────────────────────────────────────

    def _verification_mode(self) -> str:
        """Resolve the configured image verification mode (off|warn|enforce).

        Fails closed: if the config manager has no getter, the getter raises,
        or it returns anything other than one of the three known modes, the
        result is 'enforce'.
        """
        getter = getattr(self.cm, 'get_image_verification_mode', None)
        if callable(getter):
            try:
                mode = getter()
            except Exception as e:  # config corruption must not crash install
                logger.warning('service_composer: could not read verification mode: %s', e)
            else:
                if mode in self._VERIFICATION_MODES:
                    return mode
                # An unrecognised value would otherwise be handled like
                # 'warn' downstream, silently weakening verification.
                logger.warning('service_composer: unknown verification mode %r; '
                               'falling back to enforce', mode)
        return 'enforce'

    def _cosign_verify(self, image_ref: str) -> Dict:
        """Run `cosign verify` against the bundled public key for one image ref.

        Factored out so tests can mock it / mock the subprocess call. Returns a
        _run-style dict ({'ok': bool, 'stdout', 'stderr'/'error'}).
        """
        cmd = [
            _COSIGN_BIN, 'verify',
            '--key', _COSIGN_PUBKEY_PATH,
            '--insecure-ignore-tlog=true',
            image_ref,
        ]
        return self._run(cmd, timeout=120)

    def verify_image(self, service_id: str, manifest: Dict) -> Dict:
        """Verify a store image's signature subject to the configured mode.

        Returns {'ok': True, 'skipped'|'verified'|'warned': ...} when the install
        may proceed, or {'ok': False, 'error': ...} when it must abort (enforce
        mode with a missing digest or a failed/absent signature).

        NOTE(review): only manifest['image'] is checked here; whether the
        rendered compose file is constrained to that same image is not
        visible in this class — confirm against the compose validator.
        """
        mode = self._verification_mode()
        if mode == 'off':
            return {'ok': True, 'skipped': True}

        image_ref = (manifest or {}).get('image', '')
        if not image_ref:
            # No image to verify (e.g. builtin-style manifest); nothing to do.
            return {'ok': True, 'skipped': True}

        # Store images must be digest-pinned to be verifiable by digest.
        if not _DIGEST_RE.search(image_ref):
            msg = (f'image {image_ref!r} for {service_id} is not digest-pinned '
                   '(@sha256:) — cannot verify signature')
            if mode == 'enforce':
                logger.error('service_composer: %s; aborting install (enforce)', msg)
                return {'ok': False, 'error': msg}
            logger.warning('service_composer: %s; proceeding (warn)', msg)
            return {'ok': True, 'warned': True}

        result = self._cosign_verify(image_ref)
        if result.get('ok'):
            logger.info('service_composer: cosign verified %s', image_ref)
            return {'ok': True, 'verified': True}

        detail = result.get('stderr') or result.get('error') or 'signature verification failed'
        msg = f'cosign verification failed for {image_ref}: {str(detail)[:200]}'
        if mode == 'enforce':
            logger.error('service_composer: %s; aborting install (enforce)', msg)
            return {'ok': False, 'error': msg}
        logger.warning('service_composer: %s; proceeding (warn)', msg)
        return {'ok': True, 'warned': True}

    def install(self, service_id: str, manifest: Dict,
                template_content: str) -> Dict:
        """Write compose file, verify + pull image, then start containers.

        Image signature verification runs before pull/up. Under enforce mode a
        missing digest, missing signature, or failed verification aborts the
        install (containers are never started); under warn mode the problem is
        logged and the install proceeds; under off mode verification is skipped.
        pull is run first so the up step doesn't time out on slow connections.
        A single retry handles transient registry hiccups on first install.

        Raises:
            ValueError: propagated from write_compose for a bad service_id or
                a compose file that fails security validation.
        """
        self.write_compose(service_id, manifest, template_content)

        verify = self.verify_image(service_id, manifest)
        if not verify.get('ok'):
            return {'ok': False, 'error': verify.get('error', 'image verification failed')}

        mode = self._verification_mode()
        pull = self._store_cmd(service_id, 'pull', timeout=600)
        if not pull.get('ok'):
            pull_err = pull.get('stderr') or pull.get('error') or 'unknown error'
            if mode == 'enforce':
                logger.error('service_composer: image pull for %s failed under enforce, '
                             'aborting: %s', service_id, str(pull_err)[:200])
                return {'ok': False,
                        'error': f'image pull failed (enforce): {str(pull_err)[:200]}'}
            logger.warning('service_composer: image pull for %s failed, proceeding anyway: %s',
                           service_id, str(pull_err)[:200])

        result = self.up(service_id)
        if not result.get('ok'):
            logger.info('service_composer: retrying up for %s after initial failure', service_id)
            result = self.up(service_id)
        return result

    def remove(self, service_id: str, purge_data: bool = False) -> Dict:
        """Stop containers, optionally delete compose file, secrets, and service data dir.

        Returns the result of the `down` command; cleanup failures are logged
        but do not change it.
        """
        result = self.down(service_id, remove_volumes=purge_data)
        if purge_data:
            self._clear_secrets(service_id)
            svc_dir = self._svc_dir(service_id)  # already validates service_id + realpath
            if os.path.isdir(svc_dir):
                # Final realpath check: reject symlinks that escape the services dir
                real_svc = os.path.realpath(svc_dir)
                real_base = os.path.realpath(self._services_dir)
                if not real_svc.startswith(real_base + os.sep):
                    logger.error('ServiceComposer: refusing rmtree outside services dir: %s', svc_dir)
                else:
                    try:
                        shutil.rmtree(svc_dir)
                    except OSError as e:
                        logger.warning('ServiceComposer: could not remove %s: %s', svc_dir, e)
            return result

        # Remove compose file even without purge so stale file doesn't confuse
        # future installs
        compose_file = self._compose_path(service_id)
        if os.path.exists(compose_file):
            try:
                os.remove(compose_file)
            except OSError as e:
                logger.warning('ServiceComposer: could not remove %s: %s', compose_file, e)
        return result

    # ── Connection-instance lifecycle (one container per connection) ──────
    #
    # An instanceable connectivity service (wireguard-ext / openvpn-client /
    # sshuttle / proxy) backs MANY connections — one container per connection.
    # The store service supplies the image + raw compose-template; each
    # connection renders that template with its own ${INSTANCE_ID} (short id),
    # ${REDIRECT_PORT} and a per-instance config dir, so two connections of the
    # same type never collide on container name, config mount, or listen port.
    #
    # Layout (all under data/services/<service_id>/<instance_id>/):
    #   docker-compose.yml   rendered per-instance compose
    #   config/              per-instance bind-mounted config dir
    # Tor is single-instance and keeps using the plain store-service path.

    @staticmethod
    def instance_id_for(conn_id: str) -> str:
        """Derive a short INSTANCE_ID from a connection id.

        Takes the text after the last underscore, truncated to 12 characters.
        The result is not validated here; `_instance_dir` rejects ids that are
        not docker/path safe.
        """
        return conn_id.split('_')[-1][:12]

    def _instance_dir(self, service_id: str, instance_id: str) -> str:
        """Return the directory of one connection instance.

        Raises:
            ValueError: either id is malformed, or the path resolves outside
                the service directory.
        """
        self._validate_service_id(service_id)
        # fullmatch for the same reason as in _validate_service_id.
        if not isinstance(instance_id, str) or not _SAFE_ID_RE.fullmatch(instance_id):
            raise ValueError(f'invalid instance_id {instance_id!r}')
        svc_dir = self._svc_dir(service_id)
        candidate = os.path.join(svc_dir, instance_id)
        real_base = os.path.realpath(svc_dir)
        real_cand = os.path.realpath(candidate)
        if not real_cand.startswith(real_base + os.sep) and real_cand != real_base:
            raise ValueError(f'instance_id {instance_id!r} escapes service directory')
        return candidate

    def _instance_compose_path(self, service_id: str, instance_id: str) -> str:
        """Return the path of the instance's rendered docker-compose.yml."""
        return os.path.join(self._instance_dir(service_id, instance_id),
                            'docker-compose.yml')

    def instance_config_dir(self, service_id: str, instance_id: str) -> str:
        """Per-instance config dir that the compose template bind-mounts."""
        return os.path.join(self._instance_dir(service_id, instance_id), 'config')

    def has_instance_compose(self, service_id: str, instance_id: str) -> bool:
        """Return True when the instance has a rendered compose file; False for bad ids."""
        try:
            return os.path.exists(self._instance_compose_path(service_id, instance_id))
        except ValueError:
            return False

    def write_instance_compose(self, service_id: str, instance_id: str,
                               manifest: Dict, template_content: str,
                               redirect_port: Optional[int] = None) -> str:
        """Render + atomically write a per-instance compose file. Returns content.

        Also creates the instance's config/ directory.

        Raises:
            ValueError: bad ids, or the rendered compose failed security
                validation (in which case no compose file is written).
        """
        inst_dir = self._instance_dir(service_id, instance_id)
        os.makedirs(os.path.join(inst_dir, 'config'), exist_ok=True)

        instance_vars = {'INSTANCE_ID': instance_id}
        if redirect_port is not None:
            instance_vars['REDIRECT_PORT'] = str(redirect_port)
        content = self.render_template(
            service_id, manifest, template_content, instance_vars=instance_vars)

        self._check_rendered(content, manifest, 'Instance compose')
        self._write_private_file(
            self._instance_compose_path(service_id, instance_id), content)
        logger.info('ServiceComposer: wrote instance compose %s/%s',
                    service_id, instance_id)
        return content

    def _instance_cmd(self, service_id: str, instance_id: str, *args,
                      timeout: int = 120) -> Dict:
        """Run `docker compose <args>` against one instance's compose file.

        The compose project is named `pic-conn-<instance_id>`.
        """
        compose_file = self._instance_compose_path(service_id, instance_id)
        if not os.path.exists(compose_file):
            return {'ok': False,
                    'error': f'No compose file for instance {service_id}/{instance_id}'}
        cmd = [
            'docker', 'compose',
            '-f', compose_file,
            '--project-name', f'pic-conn-{instance_id}',
            *args,
        ]
        return self._run(cmd, timeout)

    def up_instance(self, service_id: str, instance_id: str, manifest: Dict,
                    template_content: str,
                    redirect_port: Optional[int] = None) -> Dict:
        """Render + bring up the container for one connection instance.

        Validation problems are returned as {'ok': False, 'error': ...}
        instead of being raised.
        """
        try:
            self.write_instance_compose(service_id, instance_id, manifest,
                                        template_content, redirect_port)
        except ValueError as e:
            return {'ok': False, 'error': str(e)}
        return self._instance_cmd(service_id, instance_id, 'up', '-d',
                                  '--remove-orphans', timeout=600)

    def down_instance(self, service_id: str, instance_id: str,
                      purge_data: bool = False) -> Dict:
        """Stop the connection instance's container and remove its compose/dir.

        Returns the `down` result, or {'ok': True} when there was no compose
        file to bring down. Directory-removal failures are only logged.
        """
        result = {'ok': True}
        if self.has_instance_compose(service_id, instance_id):
            args = ['down']
            if purge_data:
                args.append('--volumes')
            result = self._instance_cmd(service_id, instance_id, *args)

        try:
            inst_dir = self._instance_dir(service_id, instance_id)
        except ValueError as e:
            logger.warning('down_instance: %s', e)
            return result

        if os.path.isdir(inst_dir):
            # Reject a symlinked instance dir that points outside the service dir.
            real_inst = os.path.realpath(inst_dir)
            real_base = os.path.realpath(self._svc_dir(service_id))
            if not real_inst.startswith(real_base + os.sep):
                logger.error('ServiceComposer: refusing rmtree outside service dir: %s',
                             inst_dir)
            else:
                try:
                    shutil.rmtree(inst_dir)
                except OSError as e:
                    logger.warning('ServiceComposer: could not remove %s: %s',
                                   inst_dir, e)
        return result

    def status_instance(self, service_id: str, instance_id: str) -> Dict:
        """Return the instance's `ps` result with a parsed 'containers' list added."""
        result = self._instance_cmd(service_id, instance_id, 'ps', '--format', 'json')
        result['containers'] = self._parse_ps_json(result.get('stdout', ''))
        return result

    # ── Dependency resolution ─────────────────────────────────────────────

    def _resolve_requires(self, manifest: Dict, installed_services: Dict) -> Optional[str]:
        """Return an error string if any required services are missing, else None."""
        requires = manifest.get('requires') or []
        missing = [r for r in requires if r not in installed_services]
        if missing:
            return f"Required services not installed: {', '.join(sorted(missing))}"
        return None

    def _resolve_dependents(self, service_id: str, installed_services: Dict) -> List[str]:
        """Return list of installed service IDs that declare service_id in their requires."""
        dependents = []
        for svc_id, record in installed_services.items():
            if svc_id == service_id:
                continue
            # A record without a manifest (or an empty record) requires nothing.
            m = (record or {}).get('manifest') or {}
            if service_id in (m.get('requires') or []):
                dependents.append(svc_id)
        return dependents

    def reapply_active_services(self) -> None:
        """Call up() for every installed service that has a compose file. Called at startup."""
        installed = self.cm.get_installed_services()
        for svc_id in installed:
            if not self.has_compose_file(svc_id):
                logger.warning('reapply_active_services: no compose file for %s, skipping', svc_id)
                continue
            result = self.up(svc_id)
            if not result.get('ok'):
                logger.warning('reapply_active_services: up failed for %s: %s',
                               svc_id, result.get('error') or result.get('stderr', ''))

    # ── Builtin-service lifecycle (main compose stack) ─────────────────────

    @staticmethod
    def _main_compose() -> str:
        """Path of the main stack's compose file (COMPOSE_FILE env var or default)."""
        return os.environ.get('COMPOSE_FILE', '/app/docker-compose.yml')

    def restart_builtin(self, container_names: List[str]) -> Dict:
        """Restart one or more containers that live in the main docker-compose stack."""
        if not container_names:
            return {'ok': False, 'error': 'No container names provided'}
        cmd = ['docker', 'compose', '-f', self._main_compose(),
               'restart', *container_names]
        return self._run(cmd)

    def status_builtin(self, container_names: List[str]) -> Dict:
        """Return status of containers from the main compose stack."""
        if not container_names:
            return {'ok': False, 'error': 'No container names provided'}
        cmd = ['docker', 'compose', '-f', self._main_compose(),
               'ps', '--format', 'json', *container_names]
        result = self._run(cmd)
        result['containers'] = self._parse_ps_json(result.get('stdout', ''))
        return result

    # ── Unified lifecycle (dispatches based on service kind) ───────────────

    def restart_service(self, service_id: str, manifest: Dict) -> Dict:
        """
        Restart any service — builtin or store — using the right compose stack.
        Builtin: uses manifest.containers + main docker-compose.yml.
        Store: uses per-service compose file.
        """
        if manifest.get('kind') == 'builtin':
            containers = manifest.get('containers') or []
            return self.restart_builtin(containers)
        return self.restart(service_id)

    def status_service(self, service_id: str, manifest: Dict) -> Dict:
        """
        Return container status for any service.
        Builtin: queries manifest.containers from main compose stack.
        Store: queries per-service compose project.
        """
        if manifest.get('kind') == 'builtin':
            containers = manifest.get('containers') or []
            return self.status_builtin(containers)
        return self.status(service_id)