fix: unblock instanceable connectivity store-service install + clean up on delete

Live verification on pic1 of the connectivity v2 multi-instance feature
surfaced four integration bugs that prevented installing any published
connectivity store service (proxy/wireguard-ext/openvpn-client/sshuttle)
and left stale host routing state behind. All four are fixed here:

1. manifest_validator rejected the CI-published `name:tag@sha256:<digest>`
   image form (it required digest-only), while service_store_manager already
   accepted it — so every published store image failed validation. Allow an
   optional tag before the digest, matching service_store_manager.

2. The cell-api image shipped the docker CLI but not the Compose v2 plugin,
   so every `docker compose` ServiceComposer runs (pull/up/down for store
   services) failed with "'compose' is not a docker command". Copy the
   compose plugin binary from the docker-cli stage.

3. service_store_manager.install ran the base compose up for instanceable
   services, whose template still contains ${INSTANCE_ID}/${REDIRECT_PORT}
   (there is no base container — one runs per connection instance). It now
   verifies the image signature but defers the container to connection
   creation for instanceable manifests.

4. delete_connection freed the record/secrets/container but never removed the
   connection's individually-managed `ip rule fwmark->table` or its FORWARD
   kill-switch (apply_routes only flushes the PIC_CONNECTIVITY chains and
   re-adds rules for surviving connections), leaking stale host routing state.
   It now tears both down; added _remove_killswitch.

Verified end-to-end on pic1: two proxy instances allocate distinct
marks/tables/ports (skipping in-use resources), render distinct per-instance
containers, two peers route through distinct instances (per-peer MARK +
REDIRECT), delete is blocked while referenced (409) and cleans its ip rule.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 08:45:32 -04:00
parent 4b3d695805
commit 6bc1d625bf
7 changed files with 160 additions and 10 deletions
+4
View File
@@ -8,7 +8,11 @@ WORKDIR /app/api
# The API runs as root by design: it drives iptables, the docker socket, and # The API runs as root by design: it drives iptables, the docker socket, and
# docker-execs into sibling containers. Non-root is not feasible here. # docker-execs into sibling containers. Non-root is not feasible here.
# The Compose v2 plugin is a separate binary under cli-plugins/ — ServiceComposer
# shells out to `docker compose` for every store-service lifecycle op, so it must
# be copied alongside the docker CLI, not just the docker binary.
COPY --from=dockercli /usr/local/bin/docker /usr/local/bin/docker COPY --from=dockercli /usr/local/bin/docker /usr/local/bin/docker
COPY --from=dockercli /usr/local/libexec/docker/cli-plugins/docker-compose /usr/local/libexec/docker/cli-plugins/docker-compose
# cosign verifies store-service image signatures against the bundled public key # cosign verifies store-service image signatures against the bundled public key
# (config/cosign/cosign.pub) before ServiceComposer starts a container. # (config/cosign/cosign.pub) before ServiceComposer starts a container.
+37
View File
@@ -1419,6 +1419,25 @@ class ConnectivityManager(BaseServiceManager):
logger.warning(f"delete_connection: container teardown failed " logger.warning(f"delete_connection: container teardown failed "
f"(non-fatal): {e}") f"(non-fatal): {e}")
# Free this connection's host policy-routing rule and kill-switch.
# apply_routes only re-adds rules for *existing* connections and only
# flushes the PIC_CONNECTIVITY chains — it never removes the deleted
# connection's individually-managed `ip rule fwmark→table` or its
# FORWARD kill-switch, so they must be torn down here or they leak.
mark, table = record.get('mark'), record.get('table')
if (record.get('type') != self.CELL_RELAY_TYPE
and isinstance(mark, int) and isinstance(table, int)):
try:
self._remove_ip_rule(mark, table)
except Exception as e:
logger.warning(f"delete_connection: ip rule cleanup failed "
f"(non-fatal): {e}")
try:
self._remove_killswitch(mark, record.get('iface'))
except Exception as e:
logger.warning(f"delete_connection: killswitch cleanup failed "
f"(non-fatal): {e}")
for secret_ref in record.get('secret_refs', []): for secret_ref in record.get('secret_refs', []):
if self.vault_manager is not None: if self.vault_manager is not None:
try: try:
@@ -2138,6 +2157,24 @@ class ConnectivityManager(BaseServiceManager):
'-m', 'mark', '--mark', hex(mark), '-m', 'mark', '--mark', hex(mark),
'!', '-o', iface, '-j', 'DROP']) '!', '-o', iface, '-j', 'DROP'])
def _remove_killswitch(self, mark: int, iface: Optional[str]) -> None:
"""Remove a connection's kill-switch FORWARD DROP (idempotent).
Unlike the per-peer MARK/REDIRECT rules (which live in the flushed
PIC_CONNECTIVITY chains), the kill-switch is appended directly to
FORWARD, so it is not cleared by apply_routes' chain flush — a deleted
connection would otherwise leave a stale DROP that blocks a later
connection reusing the same mark. Drain duplicates with a bounded loop.
"""
if not iface:
return
for _ in range(8):
r = self._wg_iptables(['-D', 'FORWARD',
'-m', 'mark', '--mark', hex(mark),
'!', '-o', iface, '-j', 'DROP'])
if r.returncode != 0:
break
def _exit_status(self, exit_type: str) -> Dict[str, Any]: def _exit_status(self, exit_type: str) -> Dict[str, Any]:
"""Return per-exit status (config presence + interface up/down). """Return per-exit status (config presence + interface up/down).
+1 -1
View File
@@ -45,7 +45,7 @@ _HOOK_BINARY_RE = re.compile(r'^[a-z][a-z0-9_-]{0,31}$')
_CAP_NAME_RE = re.compile(r'^[A-Z_]+$') _CAP_NAME_RE = re.compile(r'^[A-Z_]+$')
_ID_RE = re.compile(r'^[a-z][a-z0-9_-]{0,30}$') _ID_RE = re.compile(r'^[a-z][a-z0-9_-]{0,30}$')
_IMAGE_DIGEST_RE = re.compile( _IMAGE_DIGEST_RE = re.compile(
r'^git\.pic\.ngo/roof/[a-zA-Z0-9._/-]+@sha256:[0-9a-f]{64}$' r'^git\.pic\.ngo/roof/[a-zA-Z0-9._/-]+(:[a-zA-Z0-9._-]+)?@sha256:[0-9a-f]{64}$'
) )
# ── Build-context (Dockerfile) lint ─────────────────────────────────────── # ── Build-context (Dockerfile) lint ───────────────────────────────────────
+25 -9
View File
@@ -333,16 +333,32 @@ class ServiceStoreManager(BaseServiceManager):
except Exception as e: except Exception as e:
return {'ok': False, 'error': f'Failed to fetch compose template: {e}'} return {'ok': False, 'error': f'Failed to fetch compose template: {e}'}
# Write compose file and start containers (validation inside write_compose) # Write compose file and start containers (validation inside write_compose).
# Instanceable connectivity services back one container PER connection
# instance, rendered later by ConnectivityManager with a concrete
# ${INSTANCE_ID}/${REDIRECT_PORT}. Their base template still contains
# those placeholders, so there is no base container to bring up at
# install time — rendering/pulling/up-ing it here fails on the unset
# variables. Verify the image signature now (the enforce gate still
# applies), but defer the container to connection creation.
if self.service_composer is not None: if self.service_composer is not None:
try: if manifest.get('instanceable'):
result = self.service_composer.install(service_id, manifest, template_content) try:
except ValueError as e: verify = self.service_composer.verify_image(service_id, manifest)
return {'ok': False, 'error': str(e)} except Exception as e:
except Exception as e: return {'ok': False, 'error': f'image verification failed: {e}'}
return {'ok': False, 'error': f'Failed to start service: {e}'} if not verify.get('ok'):
if not result.get('ok'): return {'ok': False,
return {'ok': False, 'error': result.get('error') or result.get('stderr', 'docker up failed')} 'error': verify.get('error', 'image verification failed')}
else:
try:
result = self.service_composer.install(service_id, manifest, template_content)
except ValueError as e:
return {'ok': False, 'error': str(e)}
except Exception as e:
return {'ok': False, 'error': f'Failed to start service: {e}'}
if not result.get('ok'):
return {'ok': False, 'error': result.get('error') or result.get('stderr', 'docker up failed')}
# Persist minimal install record. For instanceable connectivity # Persist minimal install record. For instanceable connectivity
# services the raw compose template is stored so ConnectivityManager # services the raw compose template is stored so ConnectivityManager
+43
View File
@@ -110,6 +110,13 @@ class _Base(unittest.TestCase):
for _svc in ('wireguard-ext', 'openvpn-client', 'tor', 'sshuttle', 'proxy'): for _svc in ('wireguard-ext', 'openvpn-client', 'tor', 'sshuttle', 'proxy'):
self.cm.set_installed_service(_svc, {'id': _svc, 'manifest': {}}) self.cm.set_installed_service(_svc, {'id': _svc, 'manifest': {}})
self.cm._save_all_configs() self.cm._save_all_configs()
# No test in this module should shell into the WireGuard container. Stub
# the exec helpers so host-rule cleanup paths (delete_connection) never
# touch real docker/iptables; returncode 1 makes the drain loops stop.
self.mgr._wg_ip = MagicMock(
return_value=MagicMock(returncode=1, stdout='', stderr=''))
self.mgr._wg_iptables = MagicMock(
return_value=MagicMock(returncode=1, stdout='', stderr=''))
def tearDown(self): def tearDown(self):
shutil.rmtree(self.tmp, ignore_errors=True) shutil.rmtree(self.tmp, ignore_errors=True)
@@ -296,6 +303,42 @@ class TestDeleteConnection(_Base):
out = self.mgr.delete_connection('conn_nope') out = self.mgr.delete_connection('conn_nope')
self.assertFalse(out['ok']) self.assertFalse(out['ok'])
def test_delete_removes_host_ip_rule(self):
"""Deleting a connection must remove its fwmark->table ip rule.
apply_routes only re-adds rules for surviving connections and only
flushes the PIC_CONNECTIVITY chains, so the deleted connection's
individually-managed `ip rule` would otherwise leak in cell-wireguard.
"""
res = self.mgr.create_connection('proxy', 'gone', _proxy_cfg())
conn = res['connection']
mark, table = conn['mark'], conn['table']
self.mgr._wg_ip.reset_mock()
out = self.mgr.delete_connection(conn['id'])
self.assertTrue(out['ok'], out)
ip_calls = [c.args[0] for c in self.mgr._wg_ip.call_args_list]
self.assertIn(
['rule', 'del', 'fwmark', hex(mark), 'lookup', str(table)],
ip_calls,
)
def test_delete_removes_iface_killswitch(self):
"""An iface-type connection's FORWARD kill-switch is removed on delete."""
res = self.mgr.create_connection(
'wireguard_ext', 'wgks', {},
secrets={'conf': '[Interface]\nPrivateKey = x\n'})
conn = res['connection']
mark, iface = conn['mark'], conn['iface']
self.mgr._wg_iptables.reset_mock()
out = self.mgr.delete_connection(conn['id'])
self.assertTrue(out['ok'], out)
ipt_calls = [c.args[0] for c in self.mgr._wg_iptables.call_args_list]
self.assertTrue(
any(c[:2] == ['-D', 'FORWARD'] and '--mark' in c and hex(mark) in c
and iface in c for c in ipt_calls),
ipt_calls,
)
class TestUpdateConnection(_Base): class TestUpdateConnection(_Base):
+13
View File
@@ -175,6 +175,19 @@ class TestValidateManifest(unittest.TestCase):
self.assertTrue(ok) self.assertTrue(ok)
self.assertEqual(errs, []) self.assertEqual(errs, [])
def test_image_tag_and_digest_passes(self):
# The publish pipeline writes back name:tag@sha256:<digest> (a valid OCI
# reference). The validator must accept the tag alongside the digest —
# service_store_manager already does, and rejecting it here blocks every
# published store image from installing.
digest = 'a' * 64
ok, errs = validate_manifest(
_minimal_manifest(
image=f'git.pic.ngo/roof/svc-proxy:latest@sha256:{digest}')
)
self.assertTrue(ok, errs)
self.assertEqual(errs, [])
def test_image_wrong_registry_rejected(self): def test_image_wrong_registry_rejected(self):
digest = 'a' * 64 digest = 'a' * 64
ok, errs = validate_manifest( ok, errs = validate_manifest(
+37
View File
@@ -726,6 +726,43 @@ class TestInstall(unittest.TestCase):
self.assertIn('digest', result['error'].lower()) self.assertIn('digest', result['error'].lower())
composer.install.assert_not_called() composer.install.assert_not_called()
def test_install_instanceable_verifies_image_but_does_not_up_container(self):
"""Instanceable services defer the container to connection creation.
Their base compose template still contains ${INSTANCE_ID}/${REDIRECT_PORT},
so the base container must NOT be rendered/pulled/up'd at install time —
only the image signature is verified, and the record (with the raw
template) is stored for ConnectivityManager to render per instance.
"""
manifest = _valid_manifest(
id='proxy', container_name='cell-proxy-${INSTANCE_ID}',
instanceable=True,
)
ssm, cm, _, composer = _make_ssm(manifest=manifest)
cm.get_image_verification_mode.return_value = 'enforce'
composer.verify_image.return_value = {'ok': True}
result = ssm.install('proxy')
self.assertTrue(result['ok'], result)
composer.verify_image.assert_called_once()
composer.install.assert_not_called()
# The raw template is persisted so per-instance rendering needs no refetch.
record = cm.set_installed_service.call_args[0][1]
self.assertIn('compose_template', record)
def test_install_instanceable_aborts_when_image_verification_fails(self):
"""An instanceable service whose image fails verification must not install."""
manifest = _valid_manifest(
id='proxy', container_name='cell-proxy-${INSTANCE_ID}',
instanceable=True,
)
ssm, cm, _, composer = _make_ssm(manifest=manifest)
cm.get_image_verification_mode.return_value = 'enforce'
composer.verify_image.return_value = {'ok': False, 'error': 'signature verification failed'}
result = ssm.install('proxy')
self.assertFalse(result['ok'])
composer.install.assert_not_called()
cm.set_installed_service.assert_not_called()
def test_install_without_composer_stores_record(self): def test_install_without_composer_stores_record(self):
"""When service_composer=None, skip compose but still store the install record.""" """When service_composer=None, skip compose but still store the install record."""
manifest = _valid_manifest(id='myapp', container_name='cell-myapp') manifest = _valid_manifest(id='myapp', container_name='cell-myapp')