fix: unblock instanceable connectivity store-service install + clean up on delete
Live verification on pic1 of the connectivity v2 multi-instance feature
surfaced four integration bugs that prevented installing any published
connectivity store service (proxy/wireguard-ext/openvpn-client/sshuttle)
and left stale host routing state behind. All four are fixed here:
1. manifest_validator rejected the CI-published `name:tag@sha256:<digest>`
image form (it required digest-only), while service_store_manager already
accepted it — so every published store image failed validation. Allow an
optional tag before the digest, matching service_store_manager.
2. The cell-api image shipped the docker CLI but not the Compose v2 plugin,
so every `docker compose` command that ServiceComposer runs (pull/up/down
for store services) failed with "'compose' is not a docker command". Copy
the compose plugin binary from the `dockercli` build stage.
3. service_store_manager.install ran the base `compose up` for instanceable
services, whose template still contains ${INSTANCE_ID}/${REDIRECT_PORT}
(there is no base container — one runs per connection instance). For
instanceable manifests it now verifies the image signature at install time
but defers starting the container until connection creation.
4. delete_connection freed the record/secrets/container but never removed the
connection's individually-managed `ip rule fwmark->table` or its FORWARD
kill-switch (apply_routes only flushes the PIC_CONNECTIVITY chains and
re-adds rules for surviving connections), leaking stale host routing state.
It now tears both down; added _remove_killswitch.
Verified end-to-end on pic1: two proxy instances allocate distinct
marks/tables/ports (skipping in-use resources) and render distinct
per-instance containers; two peers route through distinct instances
(per-peer MARK + REDIRECT); and deleting a connection is blocked while it is
still referenced (409) and, once allowed, cleans up its ip rule.
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -8,7 +8,11 @@ WORKDIR /app/api
|
||||
|
||||
# The API runs as root by design: it drives iptables, the docker socket, and
|
||||
# docker-execs into sibling containers. Non-root is not feasible here.
|
||||
# The Compose v2 plugin is a separate binary under cli-plugins/ — ServiceComposer
|
||||
# shells out to `docker compose` for every store-service lifecycle op, so it must
|
||||
# be copied alongside the docker CLI, not just the docker binary.
|
||||
COPY --from=dockercli /usr/local/bin/docker /usr/local/bin/docker
|
||||
COPY --from=dockercli /usr/local/libexec/docker/cli-plugins/docker-compose /usr/local/libexec/docker/cli-plugins/docker-compose
|
||||
|
||||
# cosign verifies store-service image signatures against the bundled public key
|
||||
# (config/cosign/cosign.pub) before ServiceComposer starts a container.
|
||||
|
||||
@@ -1419,6 +1419,25 @@ class ConnectivityManager(BaseServiceManager):
|
||||
logger.warning(f"delete_connection: container teardown failed "
|
||||
f"(non-fatal): {e}")
|
||||
|
||||
# Free this connection's host policy-routing rule and kill-switch.
|
||||
# apply_routes only re-adds rules for *existing* connections and only
|
||||
# flushes the PIC_CONNECTIVITY chains — it never removes the deleted
|
||||
# connection's individually-managed `ip rule fwmark→table` or its
|
||||
# FORWARD kill-switch, so they must be torn down here or they leak.
|
||||
mark, table = record.get('mark'), record.get('table')
|
||||
if (record.get('type') != self.CELL_RELAY_TYPE
|
||||
and isinstance(mark, int) and isinstance(table, int)):
|
||||
try:
|
||||
self._remove_ip_rule(mark, table)
|
||||
except Exception as e:
|
||||
logger.warning(f"delete_connection: ip rule cleanup failed "
|
||||
f"(non-fatal): {e}")
|
||||
try:
|
||||
self._remove_killswitch(mark, record.get('iface'))
|
||||
except Exception as e:
|
||||
logger.warning(f"delete_connection: killswitch cleanup failed "
|
||||
f"(non-fatal): {e}")
|
||||
|
||||
for secret_ref in record.get('secret_refs', []):
|
||||
if self.vault_manager is not None:
|
||||
try:
|
||||
@@ -2138,6 +2157,24 @@ class ConnectivityManager(BaseServiceManager):
|
||||
'-m', 'mark', '--mark', hex(mark),
|
||||
'!', '-o', iface, '-j', 'DROP'])
|
||||
|
||||
def _remove_killswitch(self, mark: int, iface: Optional[str]) -> None:
|
||||
"""Remove a connection's kill-switch FORWARD DROP (idempotent).
|
||||
|
||||
Unlike the per-peer MARK/REDIRECT rules (which live in the flushed
|
||||
PIC_CONNECTIVITY chains), the kill-switch is appended directly to
|
||||
FORWARD, so it is not cleared by apply_routes' chain flush — a deleted
|
||||
connection would otherwise leave a stale DROP that blocks a later
|
||||
connection reusing the same mark. Drain duplicates with a bounded loop.
|
||||
"""
|
||||
if not iface:
|
||||
return
|
||||
for _ in range(8):
|
||||
r = self._wg_iptables(['-D', 'FORWARD',
|
||||
'-m', 'mark', '--mark', hex(mark),
|
||||
'!', '-o', iface, '-j', 'DROP'])
|
||||
if r.returncode != 0:
|
||||
break
|
||||
|
||||
def _exit_status(self, exit_type: str) -> Dict[str, Any]:
|
||||
"""Return per-exit status (config presence + interface up/down).
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ _HOOK_BINARY_RE = re.compile(r'^[a-z][a-z0-9_-]{0,31}$')
|
||||
_CAP_NAME_RE = re.compile(r'^[A-Z_]+$')
|
||||
_ID_RE = re.compile(r'^[a-z][a-z0-9_-]{0,30}$')
|
||||
_IMAGE_DIGEST_RE = re.compile(
|
||||
r'^git\.pic\.ngo/roof/[a-zA-Z0-9._/-]+@sha256:[0-9a-f]{64}$'
|
||||
r'^git\.pic\.ngo/roof/[a-zA-Z0-9._/-]+(:[a-zA-Z0-9._-]+)?@sha256:[0-9a-f]{64}$'
|
||||
)
|
||||
|
||||
# ── Build-context (Dockerfile) lint ───────────────────────────────────────
|
||||
|
||||
@@ -333,16 +333,32 @@ class ServiceStoreManager(BaseServiceManager):
|
||||
except Exception as e:
|
||||
return {'ok': False, 'error': f'Failed to fetch compose template: {e}'}
|
||||
|
||||
# Write compose file and start containers (validation inside write_compose)
|
||||
# Write compose file and start containers (validation inside write_compose).
|
||||
# Instanceable connectivity services back one container PER connection
|
||||
# instance, rendered later by ConnectivityManager with a concrete
|
||||
# ${INSTANCE_ID}/${REDIRECT_PORT}. Their base template still contains
|
||||
# those placeholders, so there is no base container to bring up at
|
||||
# install time — rendering/pulling/up-ing it here fails on the unset
|
||||
# variables. Verify the image signature now (the enforce gate still
|
||||
# applies), but defer the container to connection creation.
|
||||
if self.service_composer is not None:
|
||||
try:
|
||||
result = self.service_composer.install(service_id, manifest, template_content)
|
||||
except ValueError as e:
|
||||
return {'ok': False, 'error': str(e)}
|
||||
except Exception as e:
|
||||
return {'ok': False, 'error': f'Failed to start service: {e}'}
|
||||
if not result.get('ok'):
|
||||
return {'ok': False, 'error': result.get('error') or result.get('stderr', 'docker up failed')}
|
||||
if manifest.get('instanceable'):
|
||||
try:
|
||||
verify = self.service_composer.verify_image(service_id, manifest)
|
||||
except Exception as e:
|
||||
return {'ok': False, 'error': f'image verification failed: {e}'}
|
||||
if not verify.get('ok'):
|
||||
return {'ok': False,
|
||||
'error': verify.get('error', 'image verification failed')}
|
||||
else:
|
||||
try:
|
||||
result = self.service_composer.install(service_id, manifest, template_content)
|
||||
except ValueError as e:
|
||||
return {'ok': False, 'error': str(e)}
|
||||
except Exception as e:
|
||||
return {'ok': False, 'error': f'Failed to start service: {e}'}
|
||||
if not result.get('ok'):
|
||||
return {'ok': False, 'error': result.get('error') or result.get('stderr', 'docker up failed')}
|
||||
|
||||
# Persist minimal install record. For instanceable connectivity
|
||||
# services the raw compose template is stored so ConnectivityManager
|
||||
|
||||
Reference in New Issue
Block a user