feat: secure build phase 1 — cosign cell-side image verification (warn default) + Dockerfile validation
Unit Tests / test (push) Successful in 13m28s

- config/cosign/cosign.pub: public verification key committed to repo (safe);
  cosign private key lives in /home/roof/.pic-secrets/ and is NEVER committed
- api/config_manager.py: image_verification config block (modes: off|warn|enforce,
  default: warn) so existing deployments are unaffected until images are signed
- api/service_composer.py: cosign verify before pull/up; enforce aborts the
  operation, warn logs and proceeds, off skips entirely; also fixes the prior
  unsafe proceed-on-pull-failure path
- api/service_store_manager.py: store-image digest requirement (warn default,
  reject under enforce)
- api/Dockerfile: cosign binary copied from the official cosign image
- docker-compose.yml: config/cosign/ bind-mounted into cell-api container
- install.sh: ensure/verify bundled cosign pubkey on new cell installs
- api/manifest_validator.py: validate_build_context() — Dockerfile lint
- tests: full coverage for config modes, composer verify paths, store digest
  guard, and validate_build_context

Verification defaults to warn so nothing breaks in production until images are
signed (phase 2). Private key stored outside git at /home/roof/.pic-secrets/.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-06-11 03:53:47 -04:00
parent 8d904b1b8f
commit 238db60702
12 changed files with 622 additions and 2 deletions
+94 -2
View File
@@ -32,6 +32,16 @@ logger = logging.getLogger('picell')
_SECRET_RE = re.compile(r'\$\{(PIC_SECRET_\w+)\}')
_SAFE_ID_RE = re.compile(r'^[a-z0-9][a-z0-9_-]{0,63}$')
_DIGEST_RE = re.compile(r'@sha256:[0-9a-f]{64}$')
# Bundled cosign public key — shipped in the repo (config/cosign/cosign.pub) so
# every cell can verify store-service image signatures offline. install.sh keeps
# it at /opt/pic/config/cosign/cosign.pub; in the cell-api container it is
# COPYed to /app/config/cosign/cosign.pub.
_COSIGN_PUBKEY_PATH = os.environ.get(
'PIC_COSIGN_PUBKEY', '/app/config/cosign/cosign.pub'
)
_COSIGN_BIN = os.environ.get('PIC_COSIGN_BIN', 'cosign')
class ServiceComposer:
@@ -265,18 +275,100 @@ class ServiceComposer:
self.write_compose(service_id, manifest, template_content)
return self.up(service_id)
# ── Image signature verification ──────────────────────────────────────
def _verification_mode(self) -> str:
"""Resolve the configured image verification mode (off|warn|enforce)."""
getter = getattr(self.cm, 'get_image_verification_mode', None)
if callable(getter):
try:
return getter()
except Exception as e: # config corruption must not crash install
logger.warning('service_composer: could not read verification mode: %s', e)
return 'warn'
def _cosign_verify(self, image_ref: str) -> Dict:
    """Invoke `cosign verify` for a single image reference.

    Kept as its own method so tests can replace it (or the underlying
    subprocess call) without touching the rest of the install flow.

    Args:
        image_ref: Image reference passed to cosign as the final argument.

    Returns:
        Whatever ``self._run`` returns for the command — per the original
        docstring, a dict shaped like {'ok': bool, 'stdout', 'stderr'/'error'}.
    """
    argv = [_COSIGN_BIN, 'verify']
    # Verify against the bundled public key.
    argv.extend(('--key', _COSIGN_PUBKEY_PATH))
    # Ask cosign to ignore the transparency log for this verification.
    argv.append('--insecure-ignore-tlog=true')
    argv.append(image_ref)
    return self._run(argv, timeout=120)
def verify_image(self, service_id: str, manifest: Dict) -> Dict:
    """Verify a store image's signature subject to the configured mode.

    Args:
        service_id: Identifier of the service being installed; used only in
            log and error messages here.
        manifest: Service manifest; its ``'image'`` entry is the reference to
            verify. ``None`` or a manifest without an image is skipped.

    Returns:
        ``{'ok': True, 'skipped': True}`` when the mode is ``'off'`` or there
        is no image to verify; ``{'ok': True, 'verified': True}`` when cosign
        accepts the signature; ``{'ok': True, 'warned': True}`` when a problem
        was found but the mode is not ``'enforce'``; and
        ``{'ok': False, 'error': ...}`` when the install must abort (enforce
        mode with a malformed or non-digest-pinned image reference, or a
        failed/absent signature).
    """
    mode = self._verification_mode()
    if mode == 'off':
        return {'ok': True, 'skipped': True}

    image_ref = (manifest or {}).get('image', '')
    if not image_ref:
        # No image to verify (e.g. builtin-style manifest); nothing to do.
        return {'ok': True, 'skipped': True}

    def _unverified(msg: str) -> Dict:
        # Single place deciding abort-vs-proceed for an unverifiable image.
        if mode == 'enforce':
            logger.error('service_composer: %s; aborting install (enforce)', msg)
            return {'ok': False, 'error': msg}
        logger.warning('service_composer: %s; proceeding (warn)', msg)
        return {'ok': True, 'warned': True}

    # A malformed manifest may carry a non-string image (number, list, ...).
    # Handle it through the normal mode logic instead of letting the regex
    # search below raise TypeError.
    if not isinstance(image_ref, str):
        return _unverified(f'image {image_ref!r} for {service_id} is not a string '
                           '— cannot verify signature')

    # Store images must be digest-pinned to be verifiable by digest.
    if not _DIGEST_RE.search(image_ref):
        return _unverified(f'image {image_ref!r} for {service_id} is not digest-pinned '
                           '(@sha256:) — cannot verify signature')

    result = self._cosign_verify(image_ref)
    if result.get('ok'):
        logger.info('service_composer: cosign verified %s', image_ref)
        return {'ok': True, 'verified': True}

    # Prefer cosign's stderr, then a runner-level error; cap the detail length
    # so the message stays log- and API-friendly.
    detail = result.get('stderr') or result.get('error') or 'signature verification failed'
    return _unverified(f'cosign verification failed for {image_ref}: {str(detail)[:200]}')
def install(self, service_id: str, manifest: Dict,
template_content: str) -> Dict:
"""Write compose file, pull image, then start containers.
"""Write compose file, verify + pull image, then start containers.
Image signature verification runs before pull/up. Under enforce mode a
missing digest, missing signature, or failed verification aborts the
install (containers are never started); under warn mode the problem is
logged and the install proceeds; under off mode verification is skipped.
pull is run first so the up step doesn't time out on slow connections.
A single retry handles transient registry hiccups on first install.
"""
self.write_compose(service_id, manifest, template_content)
verify = self.verify_image(service_id, manifest)
if not verify.get('ok'):
return {'ok': False, 'error': verify.get('error', 'image verification failed')}
mode = self._verification_mode()
pull = self._store_cmd(service_id, 'pull', timeout=600)
if not pull.get('ok'):
pull_err = pull.get('stderr') or pull.get('error') or 'unknown error'
if mode == 'enforce':
logger.error('service_composer: image pull for %s failed under enforce, '
'aborting: %s', service_id, str(pull_err)[:200])
return {'ok': False,
'error': f'image pull failed (enforce): {str(pull_err)[:200]}'}
logger.warning('service_composer: image pull for %s failed, proceeding anyway: %s',
service_id, pull.get('stderr', '')[:200])
service_id, str(pull_err)[:200])
result = self.up(service_id)
if not result.get('ok'):
logger.info('service_composer: retrying up for %s after initial failure', service_id)