diff --git a/api/Dockerfile b/api/Dockerfile index f1b5cfa..249d5ae 100644 --- a/api/Dockerfile +++ b/api/Dockerfile @@ -8,7 +8,11 @@ WORKDIR /app/api # The API runs as root by design: it drives iptables, the docker socket, and # docker-execs into sibling containers. Non-root is not feasible here. +# The Compose v2 plugin is a separate binary under cli-plugins/ — ServiceComposer +# shells out to `docker compose` for every store-service lifecycle op, so it must +# be copied alongside the docker CLI, not just the docker binary. COPY --from=dockercli /usr/local/bin/docker /usr/local/bin/docker +COPY --from=dockercli /usr/local/libexec/docker/cli-plugins/docker-compose /usr/local/libexec/docker/cli-plugins/docker-compose # cosign verifies store-service image signatures against the bundled public key # (config/cosign/cosign.pub) before ServiceComposer starts a container. diff --git a/api/connectivity_manager.py b/api/connectivity_manager.py index df03103..45260e1 100644 --- a/api/connectivity_manager.py +++ b/api/connectivity_manager.py @@ -1419,6 +1419,25 @@ class ConnectivityManager(BaseServiceManager): logger.warning(f"delete_connection: container teardown failed " f"(non-fatal): {e}") + # Free this connection's host policy-routing rule and kill-switch. + # apply_routes only re-adds rules for *existing* connections and only + # flushes the PIC_CONNECTIVITY chains — it never removes the deleted + # connection's individually-managed `ip rule fwmark→table` or its + # FORWARD kill-switch, so they must be torn down here or they leak. + mark, table = record.get('mark'), record.get('table') + if (record.get('type') != self.CELL_RELAY_TYPE + and isinstance(mark, int) and isinstance(table, int)): + try: + self._remove_ip_rule(mark, table) + except Exception as e: + logger.warning(f"delete_connection: ip rule cleanup failed " + f"(non-fatal): {e}") + try: + self._remove_killswitch(mark, record.get('iface')) + except Exception as e: + logger.warning(f"delete_connection: killswitch cleanup failed " + f"(non-fatal): {e}") + for secret_ref in record.get('secret_refs', []): if self.vault_manager is not None: try: @@ -2138,6 +2157,24 @@ class ConnectivityManager(BaseServiceManager): '-m', 'mark', '--mark', hex(mark), '!', '-o', iface, '-j', 'DROP']) + def _remove_killswitch(self, mark: int, iface: Optional[str]) -> None: + """Remove a connection's kill-switch FORWARD DROP (idempotent). + + Unlike the per-peer MARK/REDIRECT rules (which live in the flushed + PIC_CONNECTIVITY chains), the kill-switch is appended directly to + FORWARD, so it is not cleared by apply_routes' chain flush — a deleted + connection would otherwise leave a stale DROP that blocks a later + connection reusing the same mark. Drain duplicates with a bounded loop. + """ + if not iface: + return + for _ in range(8): + r = self._wg_iptables(['-D', 'FORWARD', + '-m', 'mark', '--mark', hex(mark), + '!', '-o', iface, '-j', 'DROP']) + if r.returncode != 0: + break + def _exit_status(self, exit_type: str) -> Dict[str, Any]: """Return per-exit status (config presence + interface up/down). diff --git a/api/manifest_validator.py b/api/manifest_validator.py index 5f56cc0..7f43e05 100644 --- a/api/manifest_validator.py +++ b/api/manifest_validator.py @@ -45,7 +45,7 @@ _HOOK_BINARY_RE = re.compile(r'^[a-z][a-z0-9_-]{0,31}$') _CAP_NAME_RE = re.compile(r'^[A-Z_]+$') _ID_RE = re.compile(r'^[a-z][a-z0-9_-]{0,30}$') _IMAGE_DIGEST_RE = re.compile( - r'^git\.pic\.ngo/roof/[a-zA-Z0-9._/-]+@sha256:[0-9a-f]{64}$' + r'^git\.pic\.ngo/roof/[a-zA-Z0-9._/-]+(:[a-zA-Z0-9._-]+)?@sha256:[0-9a-f]{64}$' ) # ── Build-context (Dockerfile) lint ─────────────────────────────────────── diff --git a/api/service_store_manager.py b/api/service_store_manager.py index df24a97..529ad0e 100644 --- a/api/service_store_manager.py +++ b/api/service_store_manager.py @@ -333,16 +333,32 @@ class ServiceStoreManager(BaseServiceManager): except Exception as e: return {'ok': False, 'error': f'Failed to fetch compose template: {e}'} - # Write compose file and start containers (validation inside write_compose) + # Write compose file and start containers (validation inside write_compose). + # Instanceable connectivity services back one container PER connection + # instance, rendered later by ConnectivityManager with a concrete + # ${INSTANCE_ID}/${REDIRECT_PORT}. Their base template still contains + # those placeholders, so there is no base container to bring up at + # install time — rendering/pulling/up-ing it here fails on the unset + # variables. Verify the image signature now (the enforce gate still + # applies), but defer the container to connection creation. if self.service_composer is not None: - try: - result = self.service_composer.install(service_id, manifest, template_content) - except ValueError as e: - return {'ok': False, 'error': str(e)} - except Exception as e: - return {'ok': False, 'error': f'Failed to start service: {e}'} - if not result.get('ok'): - return {'ok': False, 'error': result.get('error') or result.get('stderr', 'docker up failed')} + if manifest.get('instanceable'): + try: + verify = self.service_composer.verify_image(service_id, manifest) + except Exception as e: + return {'ok': False, 'error': f'image verification failed: {e}'} + if not verify.get('ok'): + return {'ok': False, + 'error': verify.get('error', 'image verification failed')} + else: + try: + result = self.service_composer.install(service_id, manifest, template_content) + except ValueError as e: + return {'ok': False, 'error': str(e)} + except Exception as e: + return {'ok': False, 'error': f'Failed to start service: {e}'} + if not result.get('ok'): + return {'ok': False, 'error': result.get('error') or result.get('stderr', 'docker up failed')} # Persist minimal install record. For instanceable connectivity # services the raw compose template is stored so ConnectivityManager diff --git a/tests/test_connectivity_connections.py b/tests/test_connectivity_connections.py index dc12fc0..250fbe0 100644 --- a/tests/test_connectivity_connections.py +++ b/tests/test_connectivity_connections.py @@ -110,6 +110,13 @@ class _Base(unittest.TestCase): for _svc in ('wireguard-ext', 'openvpn-client', 'tor', 'sshuttle', 'proxy'): self.cm.set_installed_service(_svc, {'id': _svc, 'manifest': {}}) self.cm._save_all_configs() + # No test in this module should shell into the WireGuard container. Stub + # the exec helpers so host-rule cleanup paths (delete_connection) never + # touch real docker/iptables; returncode 1 makes the drain loops stop. + self.mgr._wg_ip = MagicMock( + return_value=MagicMock(returncode=1, stdout='', stderr='')) + self.mgr._wg_iptables = MagicMock( + return_value=MagicMock(returncode=1, stdout='', stderr='')) def tearDown(self): shutil.rmtree(self.tmp, ignore_errors=True) @@ -296,6 +303,42 @@ class TestDeleteConnection(_Base): out = self.mgr.delete_connection('conn_nope') self.assertFalse(out['ok']) + def test_delete_removes_host_ip_rule(self): + """Deleting a connection must remove its fwmark->table ip rule. + + apply_routes only re-adds rules for surviving connections and only + flushes the PIC_CONNECTIVITY chains, so the deleted connection's + individually-managed `ip rule` would otherwise leak in cell-wireguard. + """ + res = self.mgr.create_connection('proxy', 'gone', _proxy_cfg()) + conn = res['connection'] + mark, table = conn['mark'], conn['table'] + self.mgr._wg_ip.reset_mock() + out = self.mgr.delete_connection(conn['id']) + self.assertTrue(out['ok'], out) + ip_calls = [c.args[0] for c in self.mgr._wg_ip.call_args_list] + self.assertIn( + ['rule', 'del', 'fwmark', hex(mark), 'lookup', str(table)], + ip_calls, + ) + + def test_delete_removes_iface_killswitch(self): + """An iface-type connection's FORWARD kill-switch is removed on delete.""" + res = self.mgr.create_connection( + 'wireguard_ext', 'wgks', {}, + secrets={'conf': '[Interface]\nPrivateKey = x\n'}) + conn = res['connection'] + mark, iface = conn['mark'], conn['iface'] + self.mgr._wg_iptables.reset_mock() + out = self.mgr.delete_connection(conn['id']) + self.assertTrue(out['ok'], out) + ipt_calls = [c.args[0] for c in self.mgr._wg_iptables.call_args_list] + self.assertTrue( + any(c[:2] == ['-D', 'FORWARD'] and '--mark' in c and hex(mark) in c + and iface in c for c in ipt_calls), + ipt_calls, + ) + class TestUpdateConnection(_Base): diff --git a/tests/test_manifest_validator.py b/tests/test_manifest_validator.py index a3ec927..0de8e95 100644 --- a/tests/test_manifest_validator.py +++ b/tests/test_manifest_validator.py @@ -175,6 +175,19 @@ class TestValidateManifest(unittest.TestCase): self.assertTrue(ok) self.assertEqual(errs, []) + def test_image_tag_and_digest_passes(self): + # The publish pipeline writes back name:tag@sha256: (a valid OCI + # reference). The validator must accept the tag alongside the digest — + # service_store_manager already does, and rejecting it here blocks every + # published store image from installing. + digest = 'a' * 64 + ok, errs = validate_manifest( + _minimal_manifest( + image=f'git.pic.ngo/roof/svc-proxy:latest@sha256:{digest}') + ) + self.assertTrue(ok, errs) + self.assertEqual(errs, []) + def test_image_wrong_registry_rejected(self): digest = 'a' * 64 ok, errs = validate_manifest( diff --git a/tests/test_service_store_manager.py b/tests/test_service_store_manager.py index c3c1df4..55bd0d4 100644 --- a/tests/test_service_store_manager.py +++ b/tests/test_service_store_manager.py @@ -726,6 +726,43 @@ class TestInstall(unittest.TestCase): self.assertIn('digest', result['error'].lower()) composer.install.assert_not_called() + def test_install_instanceable_verifies_image_but_does_not_up_container(self): + """Instanceable services defer the container to connection creation. + + Their base compose template still contains ${INSTANCE_ID}/${REDIRECT_PORT}, + so the base container must NOT be rendered/pulled/up'd at install time — + only the image signature is verified, and the record (with the raw + template) is stored for ConnectivityManager to render per instance. + """ + manifest = _valid_manifest( + id='proxy', container_name='cell-proxy-${INSTANCE_ID}', + instanceable=True, + ) + ssm, cm, _, composer = _make_ssm(manifest=manifest) + cm.get_image_verification_mode.return_value = 'enforce' + composer.verify_image.return_value = {'ok': True} + result = ssm.install('proxy') + self.assertTrue(result['ok'], result) + composer.verify_image.assert_called_once() + composer.install.assert_not_called() + # The raw template is persisted so per-instance rendering needs no refetch. + record = cm.set_installed_service.call_args[0][1] + self.assertIn('compose_template', record) + + def test_install_instanceable_aborts_when_image_verification_fails(self): + """An instanceable service whose image fails verification must not install.""" + manifest = _valid_manifest( + id='proxy', container_name='cell-proxy-${INSTANCE_ID}', + instanceable=True, + ) + ssm, cm, _, composer = _make_ssm(manifest=manifest) + cm.get_image_verification_mode.return_value = 'enforce' + composer.verify_image.return_value = {'ok': False, 'error': 'signature verification failed'} + result = ssm.install('proxy') + self.assertFalse(result['ok']) + composer.install.assert_not_called() + cm.set_installed_service.assert_not_called() + def test_install_without_composer_stores_record(self): """When service_composer=None, skip compose but still store the install record.""" manifest = _valid_manifest(id='myapp', container_name='cell-myapp')