fix: architecture audit — security, atomicity, broken endpoints, test coverage

Sprint 1 — Security & correctness:
- Restore all 10 commented-out is_local_request() checks (vault, containers, images, volumes)
- Fix XFF spoofing: only trust the LAST X-Forwarded-For entry (Caddy's append), not all
- Require prefix length in wireguard.address (was accepting bare IPs like 10.0.0.1)
- Validate service_access list in add_peer (valid: calendar/files/mail/webdav)
- Fix dhcp/reservations POST/DELETE: unpack mac/ip/hostname from body (was passing dict as positional arg)
- Fix network/test POST: remove spurious data arg (test_connectivity takes no args)
- Fix remove_peer: clear iptables rules and regenerate DNS ACLs on deletion (was leaving stale rules)
- Fix CoreDNS reload: SIGHUP → SIGUSR1 (SIGHUP kills the process; SIGUSR1 triggers reload plugin)
- Remove local.{domain} block from Corefile template (local.zone doesn't exist, caused log spam)
- Fix routing_manager._remove_nat_rule: targeted -D instead of flushing entire POSTROUTING chain

Sprint 2 — State consistency:
- Atomic config writes in config_manager, ip_utils, firewall_manager, network_manager
  (write to .tmp → fsync → os.replace, prevents truncated files on kill)
- backup_config: now also backs up Caddyfile, Corefile, .env, DNS zone files
- restore_config: restores all of the above so config stays consistent after restore

Sprint 3 — Dead code / documentation:
- Remove CellManager instantiation from app startup (was never called, double-instantiated all managers)
- Document routing_manager scope (targets host, not cell-wireguard; methods not called by any active route)

Sprint 4 — Test infrastructure:
- Add tests/conftest.py with shared tmp_dir, tmp_config_dir, tmp_data_dir, flask_client fixtures
- Add tests/test_config_validation.py: 400 paths for ip_range, port, wireguard.address validation
- Add tests/test_ip_utils_caddyfile.py: 14 tests for write_caddyfile (was completely untested)
- Expand test_app_misc.py: 7 new is_local_request tests covering XFF spoofing and cell-network IPs
- Add --cov-fail-under=70 to make test-coverage
- Add pre-commit hook that runs pytest before every commit

414 tests pass (was 372).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-24 03:27:52 -04:00
parent 55bec04603
commit d5018c2b34
13 changed files with 801 additions and 633 deletions
+53 -35
View File
@@ -179,7 +179,6 @@ email_manager = EmailManager(data_dir=_DATA_DIR, config_dir=_CONFIG_DIR)
calendar_manager = CalendarManager(data_dir=_DATA_DIR, config_dir=_CONFIG_DIR)
file_manager = FileManager(data_dir=_DATA_DIR, config_dir=_CONFIG_DIR)
routing_manager = RoutingManager(data_dir=_DATA_DIR, config_dir=_CONFIG_DIR)
cell_manager = CellManager(data_dir=_DATA_DIR, config_dir=_CONFIG_DIR)
app.vault_manager = VaultManager(data_dir=_DATA_DIR, config_dir=_CONFIG_DIR)
container_manager = ContainerManager(data_dir=_DATA_DIR, config_dir=_CONFIG_DIR)
cell_link_manager = CellLinkManager(
@@ -345,10 +344,12 @@ def is_local_request():
if _allowed(remote_addr):
return True
# Only trust the LAST X-Forwarded-For entry — that is what Caddy appended.
# Iterating all entries allows clients to spoof local origin by prepending 127.0.0.1.
if forwarded_for:
for addr in forwarded_for.split(','):
if _allowed(addr.strip()):
return True
last_hop = forwarded_for.split(',')[-1].strip()
if _allowed(last_hop):
return True
return False
@app.route('/health', methods=['GET'])
@@ -481,6 +482,8 @@ def update_config():
_addr = data['wireguard'].get('address')
if _addr:
import ipaddress as _ipa2
if '/' not in str(_addr):
return jsonify({'error': 'wireguard.address must include a prefix length (e.g. 10.0.0.1/24)'}), 400
try:
_ipa2.ip_interface(_addr)
except ValueError as _e:
@@ -1166,10 +1169,13 @@ def get_dhcp_leases():
def add_dhcp_reservation():
try:
data = request.get_json(silent=True)
if data is None:
if not data:
return jsonify({"error": "No data provided"}), 400
result = network_manager.add_dhcp_reservation(data)
return jsonify(result)
for field in ('mac', 'ip'):
if field not in data:
return jsonify({"error": f"Missing required field: {field}"}), 400
result = network_manager.add_dhcp_reservation(data['mac'], data['ip'], data.get('hostname', ''))
return jsonify({"success": result})
except Exception as e:
logger.error(f"Error adding DHCP reservation: {e}")
return jsonify({"error": str(e)}), 500
@@ -1179,8 +1185,10 @@ def remove_dhcp_reservation():
"""Remove DHCP reservation."""
try:
data = request.get_json(silent=True)
result = network_manager.remove_dhcp_reservation(data)
return jsonify(result)
if not data or 'mac' not in data:
return jsonify({"error": "Missing required field: mac"}), 400
result = network_manager.remove_dhcp_reservation(data['mac'])
return jsonify({"success": result})
except Exception as e:
logger.error(f"Error removing DHCP reservation: {e}")
return jsonify({"error": str(e)}), 500
@@ -1218,10 +1226,7 @@ def get_dns_status():
@app.route('/api/network/test', methods=['POST'])
def test_network():
try:
data = request.get_json(silent=True)
if data is None:
return jsonify({"error": "No data provided"}), 400
result = network_manager.test_connectivity(data)
result = network_manager.test_connectivity()
return jsonify(result)
except Exception as e:
logger.error(f"Error testing network: {e}")
@@ -1572,6 +1577,12 @@ def add_peer():
assigned_ip = data.get('ip') or _next_peer_ip()
# Validate service_access if provided
_valid_services = {'calendar', 'files', 'mail', 'webdav'}
service_access = data.get('service_access', list(_valid_services))
if not isinstance(service_access, list) or not all(s in _valid_services for s in service_access):
return jsonify({"error": f"service_access must be a list of: {sorted(_valid_services)}"}), 400
# Add peer to registry with all provided fields
peer_info = {
'peer': data['name'],
@@ -1584,7 +1595,7 @@ def add_peer():
'persistent_keepalive': data.get('persistent_keepalive'),
'description': data.get('description'),
'internet_access': data.get('internet_access', True),
'service_access': data.get('service_access', ['calendar', 'files', 'mail', 'webdav']),
'service_access': service_access,
'peer_access': data.get('peer_access', True),
'config_needs_reinstall': False,
}
@@ -1651,10 +1662,17 @@ def clear_peer_reinstall(peer_name):
@app.route('/api/peers/<peer_name>', methods=['DELETE'])
def remove_peer(peer_name):
"""Remove a peer."""
"""Remove a peer and clean up its firewall rules and DNS ACLs."""
try:
peer = peer_registry.get_peer(peer_name)
if not peer:
return jsonify({"message": f"Peer {peer_name} not found or already removed"})
peer_ip = peer.get('ip')
success = peer_registry.remove_peer(peer_name)
if success:
if peer_ip:
firewall_manager.clear_peer_rules(peer_ip)
firewall_manager.apply_all_dns_rules(peer_registry.list_peers(), COREFILE_PATH, _configured_domain())
return jsonify({"message": f"Peer {peer_name} removed successfully"})
else:
return jsonify({"message": f"Peer {peer_name} not found or already removed"})
@@ -2558,8 +2576,8 @@ def restart_container(name):
@app.route('/api/containers/<name>/logs', methods=['GET'])
def get_container_logs(name):
# Temporarily disable access control for debugging
# if not is_local_request():
# return jsonify({'error': 'Access denied'}), 403
if not is_local_request():
return jsonify({'error': 'Access denied'}), 403
tail = request.args.get('tail', default=100, type=int)
try:
logs = container_manager.get_container_logs(name, tail=tail)
@@ -2571,8 +2589,8 @@ def get_container_logs(name):
@app.route('/api/containers/<name>/stats', methods=['GET'])
def get_container_stats(name):
# Temporarily disable access control for debugging
# if not is_local_request():
# return jsonify({'error': 'Access denied'}), 403
if not is_local_request():
return jsonify({'error': 'Access denied'}), 403
try:
stats = container_manager.get_container_stats(name)
return jsonify(stats)
@@ -2583,16 +2601,16 @@ def get_container_stats(name):
@app.route('/api/vault/secrets', methods=['GET'])
def list_secrets():
# Temporarily disable access control for debugging
# if not is_local_request():
# return jsonify({'error': 'Access denied'}), 403
if not is_local_request():
return jsonify({'error': 'Access denied'}), 403
secrets = app.vault_manager.list_secrets()
return jsonify({'secrets': secrets})
@app.route('/api/vault/secrets', methods=['POST'])
def store_secret():
# Temporarily disable access control for debugging
# if not is_local_request():
# return jsonify({'error': 'Access denied'}), 403
if not is_local_request():
return jsonify({'error': 'Access denied'}), 403
data = request.get_json(silent=True)
if not data or 'name' not in data or 'value' not in data:
return jsonify({'error': 'Missing name or value'}), 400
@@ -2602,8 +2620,8 @@ def store_secret():
@app.route('/api/vault/secrets/<name>', methods=['GET'])
def get_secret(name):
# Temporarily disable access control for debugging
# if not is_local_request():
# return jsonify({'error': 'Access denied'}), 403
if not is_local_request():
return jsonify({'error': 'Access denied'}), 403
value = app.vault_manager.get_secret(name)
if value is None:
return jsonify({'error': 'Not found'}), 404
@@ -2612,8 +2630,8 @@ def get_secret(name):
@app.route('/api/vault/secrets/<name>', methods=['DELETE'])
def delete_secret(name):
# Temporarily disable access control for debugging
# if not is_local_request():
# return jsonify({'error': 'Access denied'}), 403
if not is_local_request():
return jsonify({'error': 'Access denied'}), 403
result = app.vault_manager.delete_secret(name)
return jsonify({'deleted': result})
@@ -2621,8 +2639,8 @@ def delete_secret(name):
@app.route('/api/containers', methods=['POST'])
def create_container():
# Temporarily disable access control for debugging
# if not is_local_request():
# return jsonify({'error': 'Access denied'}), 403
if not is_local_request():
return jsonify({'error': 'Access denied'}), 403
data = request.get_json(silent=True)
if not data or 'image' not in data:
return jsonify({'error': 'Missing image parameter'}), 400
@@ -2653,8 +2671,8 @@ def create_container():
@app.route('/api/containers/<name>', methods=['DELETE'])
def remove_container(name):
# Temporarily disable access control for debugging
# if not is_local_request():
# return jsonify({'error': 'Access denied'}), 403
if not is_local_request():
return jsonify({'error': 'Access denied'}), 403
force = request.args.get('force', default=False, type=bool)
success = container_manager.remove_container(name, force=force)
return jsonify({'removed': success})
@@ -2662,8 +2680,8 @@ def remove_container(name):
@app.route('/api/images', methods=['GET'])
def list_images():
# Temporarily disable access control for debugging
# if not is_local_request():
# return jsonify({'error': 'Access denied'}), 403
if not is_local_request():
return jsonify({'error': 'Access denied'}), 403
images = container_manager.list_images()
return jsonify(images)
@@ -2690,8 +2708,8 @@ def remove_image(image):
@app.route('/api/volumes', methods=['GET'])
def list_volumes():
# Temporarily disable access control for debugging
# if not is_local_request():
# return jsonify({'error': 'Access denied'}), 403
if not is_local_request():
return jsonify({'error': 'Access denied'}), 403
volumes = container_manager.list_volumes()
return jsonify(volumes)
+62 -22
View File
@@ -117,11 +117,15 @@ class ConfigManager:
return {}
def _save_all_configs(self):
"""Save all service configurations to the unified config file"""
"""Save all service configurations to the unified config file (atomic write)."""
try:
self.config_file.parent.mkdir(parents=True, exist_ok=True)
with open(self.config_file, 'w') as f:
tmp = self.config_file.with_suffix('.tmp')
with open(tmp, 'w') as f:
json.dump(self.configs, f, indent=2)
f.flush()
os.fsync(f.fileno())
os.replace(tmp, self.config_file)
except (PermissionError, OSError):
pass
@@ -208,62 +212,98 @@ class ConfigManager:
}
def backup_config(self) -> str:
"""Create a backup of all configurations"""
"""Create a backup of cell_config.json, secrets, Caddyfile, .env, Corefile, and DNS zones."""
try:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
backup_id = f"backup_{timestamp}"
backup_path = self.backup_dir / backup_id
# Create backup directory
backup_path.mkdir(parents=True, exist_ok=True)
# Copy all config files
# Primary config and secrets
if self.config_file.exists():
shutil.copy2(self.config_file, backup_path / 'cell_config.json')
# Copy secrets file if it exists
if self.secrets_file.exists():
shutil.copy2(self.secrets_file, backup_path / 'secrets.yaml')
# Create backup manifest
# Runtime-generated files that must match cell_config.json after restore
config_dir = Path(os.environ.get('CONFIG_DIR', '/app/config'))
data_dir = Path(os.environ.get('DATA_DIR', '/app/data'))
env_file = Path(os.environ.get('ENV_FILE', '/app/.env'))
extra = [
(config_dir / 'caddy' / 'Caddyfile', 'Caddyfile'),
(config_dir / 'dns' / 'Corefile', 'Corefile'),
(env_file, '.env'),
]
for src, dest_name in extra:
if src.exists():
shutil.copy2(src, backup_path / dest_name)
# DNS zone files
dns_data = data_dir / 'dns'
if dns_data.is_dir():
zones_dir = backup_path / 'dns_zones'
zones_dir.mkdir(exist_ok=True)
for zone_file in dns_data.glob('*.zone'):
shutil.copy2(zone_file, zones_dir / zone_file.name)
manifest = {
"backup_id": backup_id,
"timestamp": datetime.now().isoformat(),
"services": list(self.service_schemas.keys()),
"files": [f.name for f in backup_path.iterdir()]
"files": [f.name for f in backup_path.iterdir()],
}
with open(backup_path / 'manifest.json', 'w') as f:
json.dump(manifest, f, indent=2)
logger.info(f"Created configuration backup: {backup_id}")
return backup_id
except Exception as e:
logger.error(f"Error creating backup: {e}")
raise
def restore_config(self, backup_id: str) -> bool:
"""Restore configuration from backup"""
"""Restore cell_config.json, secrets, Caddyfile, .env, Corefile, and DNS zones from backup."""
try:
backup_path = self.backup_dir / backup_id
if not backup_path.exists():
raise ValueError(f"Backup {backup_id} not found")
# Read manifest
manifest_file = backup_path / 'manifest.json'
if not manifest_file.exists():
raise ValueError(f"Backup manifest not found")
with open(manifest_file, 'r') as f:
manifest = json.load(f)
# Restore config files
# Restore primary config
config_backup = backup_path / 'cell_config.json'
if config_backup.exists():
shutil.copy2(config_backup, self.config_file)
# Restore secrets file if it exists
secrets_backup = backup_path / 'secrets.yaml'
if secrets_backup.exists():
shutil.copy2(secrets_backup, self.secrets_file)
# Reload configurations — restore only what was in the backup
# Restore runtime-generated files so they stay consistent with cell_config.json
config_dir = Path(os.environ.get('CONFIG_DIR', '/app/config'))
data_dir = Path(os.environ.get('DATA_DIR', '/app/data'))
env_file = Path(os.environ.get('ENV_FILE', '/app/.env'))
restore_map = [
(backup_path / 'Caddyfile', config_dir / 'caddy' / 'Caddyfile'),
(backup_path / 'Corefile', config_dir / 'dns' / 'Corefile'),
(backup_path / '.env', env_file),
]
for src, dest in restore_map:
if src.exists():
dest.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(src, dest)
# Restore DNS zone files
zones_backup = backup_path / 'dns_zones'
if zones_backup.is_dir():
dns_data = data_dir / 'dns'
dns_data.mkdir(parents=True, exist_ok=True)
for zone_file in zones_backup.glob('*.zone'):
shutil.copy2(zone_file, dns_data / zone_file.name)
self.configs = self._load_all_configs()
logger.info(f"Restored configuration from backup: {backup_id}")
return True
+11 -9
View File
@@ -276,14 +276,16 @@ def generate_corefile(peers: List[Dict[str, Any]], corefile_path: str = COREFILE
}}
{primary_zone_block}
local.{domain} {{
file /data/local.zone
log
}}
"""
# local.{domain} block intentionally omitted: /data/local.zone does not exist
# and CoreDNS logs errors on every reload for a missing zone file.
os.makedirs(os.path.dirname(corefile_path), exist_ok=True)
with open(corefile_path, 'w') as f:
tmp_path = corefile_path + '.tmp'
with open(tmp_path, 'w') as f:
f.write(corefile)
f.flush()
os.fsync(f.fileno())
os.replace(tmp_path, corefile_path)
logger.info(f"Wrote Corefile to {corefile_path}")
return True
@@ -293,13 +295,13 @@ local.{domain} {{
def reload_coredns() -> bool:
"""Send SIGHUP to CoreDNS container to reload config."""
"""Signal CoreDNS to reload its config. SIGUSR1 triggers the reload plugin; SIGHUP kills the process."""
try:
result = _run(['docker', 'kill', '--signal=SIGHUP', 'cell-dns'], check=False)
result = _run(['docker', 'kill', '--signal=SIGUSR1', 'cell-dns'], check=False)
if result.returncode == 0:
logger.info("Sent SIGHUP to cell-dns")
logger.info("Sent SIGUSR1 to cell-dns (reload)")
return True
logger.warning(f"SIGHUP to cell-dns failed: {result.stderr.strip()}")
logger.warning(f"SIGUSR1 to cell-dns failed: {result.stderr.strip()}")
return False
except Exception as e:
logger.error(f"reload_coredns: {e}")
+10 -2
View File
@@ -200,8 +200,12 @@ http://api.{domain} {{
}}
"""
os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
with open(path, 'w') as f:
tmp = path + '.tmp'
with open(tmp, 'w') as f:
f.write(content)
f.flush()
os.fsync(f.fileno())
os.replace(tmp, path)
return True
except Exception:
return False
@@ -229,8 +233,12 @@ def write_env_file(ip_range: str, path: str, ports: Optional[Dict[str, int]] = N
for key, var in PORT_ENV_VAR_NAMES.items():
lines.append(f'{var}={merged_ports[key]}\n')
os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
with open(path, 'w') as f:
tmp = path + '.tmp'
with open(tmp, 'w') as f:
f.writelines(lines)
f.flush()
os.fsync(f.fileno())
os.replace(tmp, path)
return True
except Exception:
return False
+7 -3
View File
@@ -33,10 +33,14 @@ class NetworkManager(BaseServiceManager):
# Create zone file content
content = self._generate_zone_content(zone_name, records)
with open(zone_file, 'w') as f:
tmp_file = zone_file + '.tmp'
with open(tmp_file, 'w') as f:
f.write(content)
f.flush()
os.fsync(f.fileno())
os.replace(tmp_file, zone_file)
# Reload DNS service
self._reload_dns_service()
+21 -7
View File
@@ -2,6 +2,16 @@
"""
Routing Manager for Personal Internet Cell
Handles VPN gateway, NAT, iptables, and advanced routing
NOTE: This manager runs iptables/ip-route commands on the HOST (the machine running
docker-compose), not inside cell-wireguard. This is intentional for host-level
routing features (exit-node, bridge, split-route) that are not yet wired to any
UI endpoint. The manager is instantiated but its methods are not called by any
active API route.
NOTE: _remove_nat_rule previously flushed ALL of POSTROUTING (-F), which would
have wiped the WireGuard MASQUERADE rule. It now deletes only the targeted rule
(-D, matched by its comment tag), so it is safe to call.
"""
import os
@@ -766,14 +776,18 @@ class RoutingManager(BaseServiceManager):
logger.error(f"Failed to apply NAT rule: {e}")
def _remove_nat_rule(self, rule_id: str):
"""Remove NAT rule from iptables"""
"""Remove NAT rule from iptables by rule_id comment tag."""
try:
# This is a simplified removal - in practice you'd need to track the exact rule
cmd = ['iptables', '-t', 'nat', '-F', 'POSTROUTING']
subprocess.run(cmd, check=True, timeout=10)
logger.info(f"Removed NAT rule: {rule_id}")
# Use -D with the comment tag to remove the specific rule rather than
# flushing the entire POSTROUTING chain (which would wipe WireGuard MASQUERADE).
cmd = ['iptables', '-t', 'nat', '-D', 'POSTROUTING',
'-m', 'comment', '--comment', rule_id, '-j', 'MASQUERADE']
result = subprocess.run(cmd, timeout=10)
if result.returncode != 0:
# Rule may not exist — not an error
logger.debug(f"NAT rule {rule_id} not found (already removed?)")
else:
logger.info(f"Removed NAT rule: {rule_id}")
except Exception as e:
logger.error(f"Failed to remove NAT rule: {e}")