feat: Phase 3 — ServiceComposer deps + store install via per-service compose
Unit Tests / test (push) Successful in 11m21s

ServiceStoreManager.install() now delegates container lifecycle to
ServiceComposer (per-service docker-compose.yml) instead of appending to a
shared compose override. This eliminates IP pool allocation, compose override
rendering, and the single-stack docker exec approach.

Changes:
- service_composer.py: add _resolve_requires(), _resolve_dependents(),
  reapply_active_services() — dependency graph and startup reapply
- service_store_manager.py: rewrite install() and remove() to use
  ServiceComposer; add _fetch_template(); delete _allocate_service_ip(),
  _render_compose_override(), _write_compose_override(); remove() now guards
  against removing services that others depend on
- managers.py: pass service_composer= to ServiceStoreManager
- Tests: 13 new composer dep tests; TestInstall/TestRemove rewritten for
  the new composer-driven path; test_optional_services_feature.py updated

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-29 09:33:02 -04:00
parent 0bfe95320b
commit 87c321c1c9
6 changed files with 442 additions and 767 deletions
+74 -260
View File
@@ -14,17 +14,14 @@ import logging
import os
import re
import threading
import subprocess
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple
import json
import requests
import yaml
from base_service_manager import BaseServiceManager
from ip_utils import CONTAINER_OFFSETS
from manifest_validator import validate_manifest, validate_provision_hook
logger = logging.getLogger(__name__)
@@ -33,15 +30,15 @@ logger = logging.getLogger(__name__)
# Constants
# ---------------------------------------------------------------------------
SERVICE_POOL_START = 20
SERVICE_POOL_END = 254
INDEX_URL_DEFAULT = (
'https://git.pic.ngo/roof/pic-services/raw/branch/main/index.json'
)
MANIFEST_URL_TPL = (
'https://git.pic.ngo/roof/pic-services/raw/branch/main/services/{id}/manifest.json'
)
TEMPLATE_URL_TPL = (
'https://git.pic.ngo/roof/pic-services/raw/branch/main/services/{id}/compose-template.yml'
)
IMAGE_ALLOWLIST_RE = re.compile(
r'^git\.pic\.ngo/roof/[a-z0-9._/-]+(:[a-zA-Z0-9._-]+)?(@sha256:[a-f0-9]{64})?$'
@@ -77,11 +74,13 @@ class ServiceStoreManager(BaseServiceManager):
"""Manages service store: install, remove, and list available/installed services."""
def __init__(self, config_manager, caddy_manager, container_manager,
data_dir: str = '', config_dir: str = ''):
data_dir: str = '', config_dir: str = '',
service_composer=None):
super().__init__('service_store', data_dir, config_dir)
self.config_manager = config_manager
self.caddy_manager = caddy_manager
self.container_manager = container_manager
self.service_composer = service_composer
self.compose_override = os.environ.get(
'COMPOSE_SERVICES_PATH', '/app/docker-compose.services.yml'
)
@@ -239,125 +238,6 @@ class ServiceStoreManager(BaseServiceManager):
return (len(errors) == 0, errors)
# ── IP allocation ─────────────────────────────────────────────────────
def _allocate_service_ip(self, service_id: str) -> str:
"""Allocate the next free IP from the service pool."""
identity = self.config_manager.get_identity()
ip_range = identity.get('ip_range', '172.20.0.0/16')
import ipaddress
network = ipaddress.IPv4Network(ip_range, strict=False)
base = int(network.network_address)
# IPs already assigned to named containers
reserved_offsets = set(CONTAINER_OFFSETS.values())
# IPs already assigned to installed services
service_ips: Dict[str, str] = identity.get('service_ips', {})
taken_ips = set(service_ips.values())
for offset in range(SERVICE_POOL_START, SERVICE_POOL_END + 1):
if offset in reserved_offsets:
continue
candidate = str(ipaddress.IPv4Address(base + offset))
if candidate not in taken_ips:
return candidate
raise RuntimeError('Service IP pool exhausted (offsets 20-254 all taken)')
# ── Compose override ──────────────────────────────────────────────────
def _render_compose_override(self, installed_records: dict) -> str:
"""Generate docker-compose YAML override for all installed services."""
services: Dict[str, Any] = {}
for svc_id, record in installed_records.items():
manifest = record.get('manifest', {})
container_name = record.get('container_name', svc_id)
image = manifest.get('image', record.get('image', ''))
service_ip = record.get('service_ip', '')
# Volumes
volumes = []
for vol in manifest.get('volumes', []):
vol_name = vol.get('name', '')
mount = vol.get('mount', '')
if vol_name and mount:
volumes.append(f'{vol_name}:{mount}')
# Environment
environment: Dict[str, str] = {}
for env_entry in manifest.get('env', []):
k = env_entry.get('key', '')
v = str(env_entry.get('value', ''))
if k:
environment[k] = v
svc_def: Dict[str, Any] = {
'image': image,
'container_name': container_name,
'restart': 'unless-stopped',
'logging': {
'driver': 'json-file',
'options': {
'max-size': '10m',
'max-file': '5',
},
},
'networks': {
'cell-network': {
'ipv4_address': service_ip,
}
},
}
if volumes:
svc_def['volumes'] = volumes
if environment:
svc_def['environment'] = environment
services[container_name] = svc_def
# Collect named volumes
named_volumes: Dict[str, Any] = {}
for svc_id, record in installed_records.items():
manifest = record.get('manifest', {})
for vol in manifest.get('volumes', []):
vol_name = vol.get('name', '')
if vol_name:
named_volumes[vol_name] = None # Docker default driver
doc: Dict[str, Any] = {
'version': '3.8',
'services': services,
'networks': {
'cell-network': {
'external': True,
}
},
}
if named_volumes:
doc['volumes'] = named_volumes
return yaml.dump(doc, default_flow_style=False, allow_unicode=True)
def _write_compose_override(self, content: str) -> None:
"""Atomic write of the compose override file."""
tmp_path = self.compose_override + '.tmp'
try:
os.makedirs(os.path.dirname(os.path.abspath(self.compose_override)),
exist_ok=True)
except (PermissionError, OSError):
pass
with open(tmp_path, 'w') as f:
f.write(content)
f.flush()
try:
os.fsync(f.fileno())
except OSError:
pass
os.replace(tmp_path, self.compose_override)
# ── Index / manifest fetching ─────────────────────────────────────────
def fetch_index(self) -> list:
@@ -394,14 +274,22 @@ class ServiceStoreManager(BaseServiceManager):
)
return json.loads(content)
def _fetch_template(self, service_id: str, manifest: dict) -> str:
"""Fetch the compose template for a service."""
_SIZE_LIMIT = 256 * 1024
url = TEMPLATE_URL_TPL.format(id=service_id)
resp = requests.get(url, timeout=10, stream=True)
resp.raise_for_status()
content = resp.raw.read(_SIZE_LIMIT + 1, decode_content=True)
if len(content) > _SIZE_LIMIT:
raise ValueError(f'Compose template for {service_id} exceeds 256 KB limit')
return content.decode('utf-8')
# ── Core operations ───────────────────────────────────────────────────
def install(self, service_id: str) -> dict:
"""Install a service from the store."""
from firewall_manager import apply_service_rules
with self._lock:
# Already installed?
installed = self.config_manager.get_installed_services()
if service_id in installed:
return {'ok': True, 'already_installed': True}
@@ -416,154 +304,80 @@ class ServiceStoreManager(BaseServiceManager):
if not ok:
return {'ok': False, 'errors': errs}
# Allocate IP
try:
ip = self._allocate_service_ip(service_id)
except RuntimeError as e:
return {'ok': False, 'error': str(e)}
ok2, errs2 = validate_manifest(manifest)
if not ok2:
return {'ok': False, 'errors': errs2}
# Build install record
# Dependency check
if self.service_composer is not None:
err = self.service_composer._resolve_requires(manifest, installed)
if err:
return {'ok': False, 'error': err}
# Fetch compose template
try:
template_content = self._fetch_template(service_id, manifest)
except Exception as e:
return {'ok': False, 'error': f'Failed to fetch compose template: {e}'}
# Write compose file and start containers (validation inside write_compose)
if self.service_composer is not None:
try:
result = self.service_composer.install(service_id, manifest, template_content)
except ValueError as e:
return {'ok': False, 'error': str(e)}
except Exception as e:
return {'ok': False, 'error': f'Failed to start service: {e}'}
if not result.get('ok'):
return {'ok': False, 'error': result.get('error') or result.get('stderr', 'docker up failed')}
# Persist minimal install record
record = {
'id': service_id,
'name': manifest.get('name', service_id),
'container_name': manifest['container_name'],
'image': manifest.get('image', ''),
'service_ip': ip,
'caddy_route': manifest.get('caddy_route'),
'iptables_rules': manifest.get('iptables_rules', []),
'manifest': manifest,
'installed_at': datetime.utcnow().isoformat(),
}
# Persist to config
self.config_manager.set_installed_service(service_id, record)
identity = self.config_manager.get_identity()
service_ips = dict(identity.get('service_ips', {}))
service_ips[service_id] = ip
self.config_manager.set_identity_field('service_ips', service_ips)
# Write compose override
all_installed = self.config_manager.get_installed_services()
# Regenerate Caddy (registry now drives routes, no caddy_routes list needed)
try:
content = self._render_compose_override(all_installed)
self._write_compose_override(content)
self.caddy_manager.regenerate_with_installed([])
except Exception as e:
logger.error(f'Failed to write compose override: {e}')
logger.warning('install: caddy regenerate failed for %s (non-fatal): %s', service_id, e)
# Apply iptables rules (best-effort)
try:
apply_service_rules(service_id, ip, manifest.get('iptables_rules', []))
except Exception as e:
logger.warning(f'apply_service_rules for {service_id} failed (non-fatal): {e}')
# Regenerate Caddyfile
try:
caddy_routes = [
r.get('caddy_route')
for r in all_installed.values()
if r.get('caddy_route')
]
self.caddy_manager.regenerate_with_installed(caddy_routes)
except Exception as e:
logger.warning(f'caddy regenerate for {service_id} failed (non-fatal): {e}')
# Start the container via docker compose
base_compose = os.environ.get('COMPOSE_FILE', '/app/docker-compose.yml')
try:
result = subprocess.run(
['docker', 'compose',
'-f', base_compose,
'-f', self.compose_override,
'up', '-d', manifest['container_name']],
capture_output=True, text=True, timeout=120,
)
if result.returncode != 0:
logger.warning(
f'docker compose up for {service_id} failed: {result.stderr.strip()}'
)
except Exception as e:
logger.warning(f'docker compose up for {service_id} failed (non-fatal): {e}')
return {
'ok': True,
'service_ip': ip,
'container_name': manifest['container_name'],
}
return {'ok': True}
def remove(self, service_id: str, purge_data: bool = False) -> dict:
"""Remove an installed service."""
from firewall_manager import clear_service_rules
with self._lock:
installed = self.config_manager.get_installed_services()
record = installed.get(service_id)
if not record:
if service_id not in installed:
return {'ok': False, 'error': f'Service {service_id} is not installed'}
container_name = record.get('container_name', service_id)
manifest = record.get('manifest', {})
base_compose = os.environ.get('COMPOSE_FILE', '/app/docker-compose.yml')
# Prevent removing a service that others depend on
if self.service_composer is not None:
dependents = self.service_composer._resolve_dependents(service_id, installed)
if dependents:
return {
'ok': False,
'error': f'Cannot remove {service_id}: required by {", ".join(sorted(dependents))}',
}
# Stop and remove container
try:
subprocess.run(
['docker', 'compose',
'-f', base_compose,
'-f', self.compose_override,
'stop', container_name],
capture_output=True, text=True, timeout=60,
)
except Exception as e:
logger.warning(f'docker compose stop for {service_id} failed (non-fatal): {e}')
# Stop and remove containers (best-effort)
if self.service_composer is not None:
try:
self.service_composer.remove(service_id, purge_data=purge_data)
except Exception as e:
logger.warning('remove: composer.remove failed for %s (non-fatal): %s', service_id, e)
try:
subprocess.run(
['docker', 'rm', '-f', container_name],
capture_output=True, text=True, timeout=30,
)
except Exception as e:
logger.warning(f'docker rm for {service_id} failed (non-fatal): {e}')
# Clear iptables rules
try:
clear_service_rules(service_id)
except Exception as e:
logger.warning(f'clear_service_rules for {service_id} failed (non-fatal): {e}')
# Remove from config, regenerate compose + caddy
# Remove from config
self.config_manager.remove_installed_service(service_id)
remaining = self.config_manager.get_installed_services()
# Regenerate Caddy
try:
content = self._render_compose_override(remaining)
self._write_compose_override(content)
self.caddy_manager.regenerate_with_installed([])
except Exception as e:
logger.error(f'Failed to write compose override after remove: {e}')
try:
caddy_routes = [
r.get('caddy_route')
for r in remaining.values()
if r.get('caddy_route')
]
self.caddy_manager.regenerate_with_installed(caddy_routes)
except Exception as e:
logger.warning(f'caddy regenerate after remove failed (non-fatal): {e}')
# Purge named volumes if requested
if purge_data:
for vol in manifest.get('volumes', []):
vol_name = vol.get('name', '')
if vol_name:
try:
subprocess.run(
['docker', 'volume', 'rm', vol_name],
capture_output=True, text=True, timeout=30,
)
except Exception as e:
logger.warning(
f'docker volume rm {vol_name} failed (non-fatal): {e}'
)
logger.warning('remove: caddy regenerate failed for %s (non-fatal): %s', service_id, e)
return {'ok': True}
@@ -581,13 +395,6 @@ class ServiceStoreManager(BaseServiceManager):
if not installed:
return
# Regenerate compose override in case it was deleted
try:
content = self._render_compose_override(installed)
self._write_compose_override(content)
except Exception as e:
logger.warning(f'reapply_on_startup: compose override write failed: {e}')
# Re-apply iptables rules
for svc_id, record in installed.items():
ip = record.get('service_ip', '')
@@ -607,3 +414,10 @@ class ServiceStoreManager(BaseServiceManager):
self.caddy_manager.regenerate_with_installed(caddy_routes)
except Exception as e:
logger.warning(f'reapply_on_startup: caddy regenerate failed: {e}')
# Bring up per-service compose stacks
if self.service_composer is not None:
try:
self.service_composer.reapply_active_services()
except Exception as e:
logger.warning('reapply_on_startup: reapply_active_services failed: %s', e)