From 1b61e9e290759067fd68acb90a8c6681ddc0f64f Mon Sep 17 00:00:00 2001 From: Dmitrii Iurco Date: Tue, 5 May 2026 18:51:38 -0400 Subject: [PATCH] Fix ICMP latency: re-anchor ESTABLISHED,RELATED to FORWARD position 1 on every health tick Co-Authored-By: Claude Sonnet 4.6 --- api/app.py | 5 ++-- api/firewall_manager.py | 18 +++++++----- tests/test_firewall_manager.py | 52 ++++++++++++++++++++++++++-------- 3 files changed, 54 insertions(+), 21 deletions(-) diff --git a/api/app.py b/api/app.py index 0cc43dc..dca7322 100644 --- a/api/app.py +++ b/api/app.py @@ -526,9 +526,10 @@ def health_monitor_loop(): with app.app_context(): health_result = perform_health_check() health_history.appendleft(health_result) - - # Publish health check event service_bus.publish_event(EventType.HEALTH_CHECK, 'api', health_result) + # Re-anchor stateful rule every cycle: wg0 PostUp uses -I FORWARD which + # pushes ESTABLISHED,RELATED down below per-peer DROPs on restart. + firewall_manager.ensure_forward_stateful() time.sleep(60) # Check every 60 seconds # Start health monitor thread diff --git a/api/firewall_manager.py b/api/firewall_manager.py index 02b2d30..37b4e7b 100644 --- a/api/firewall_manager.py +++ b/api/firewall_manager.py @@ -447,23 +447,27 @@ def apply_all_cell_rules(cell_links: List[Dict[str, Any]]) -> None: def ensure_forward_stateful() -> bool: - """Insert a stateful ESTABLISHED/RELATED ACCEPT at the top of FORWARD. + """Ensure ESTABLISHED/RELATED ACCEPT is at position 1 (top) of FORWARD. Cell rules DROP all traffic from a connected cell's subnet except specific service ports. Without conntrack, ICMP replies and TCP ACKs for connections initiated BY local peers to the connected cell are also dropped, making cross-cell routing (peer → cell → remote cell) broken. - This rule is inserted once and does not carry a peer/cell comment tag, so it - is never removed by clear_peer_rules or clear_cell_rules. + This function always deletes any existing instance and re-inserts at position 1. + That re-anchoring is necessary because wg0 PostUp uses -I FORWARD (insert at top), + which pushes this rule down every time wg0 restarts — causing ICMP to hit the + per-peer DROP rule before reaching the stateful ACCEPT. """ try: - check = ['-C', 'FORWARD', '-m', 'state', '--state', 'ESTABLISHED,RELATED', '-j', 'ACCEPT'] - if _wg_exec(['iptables'] + check).returncode == 0: - return True # already present + # Remove all existing instances so we can re-anchor at position 1. + # PostUp -I FORWARD rules drift this rule down on every wg0 restart. + while _wg_exec(['iptables', '-D', 'FORWARD', '-m', 'state', + '--state', 'ESTABLISHED,RELATED', '-j', 'ACCEPT']).returncode == 0: + pass _wg_exec(['iptables', '-I', 'FORWARD', '1', '-m', 'state', '--state', 'ESTABLISHED,RELATED', '-j', 'ACCEPT']) - logger.info('ensure_forward_stateful: inserted ESTABLISHED,RELATED ACCEPT into FORWARD') + logger.info('ensure_forward_stateful: ESTABLISHED,RELATED anchored at FORWARD position 1') return True except Exception as e: logger.error(f'ensure_forward_stateful: {e}') diff --git a/tests/test_firewall_manager.py b/tests/test_firewall_manager.py index 8da6444..99aff5d 100644 --- a/tests/test_firewall_manager.py +++ b/tests/test_firewall_manager.py @@ -560,7 +560,8 @@ class TestCellRules(unittest.TestCase): with patch.object(firewall_manager, '_wg_exec', side_effect=fake_wg_exec), \ patch.object(firewall_manager, '_get_cell_api_ip', return_value=self._FAKE_API_IP), \ patch.object(firewall_manager, '_get_caddy_container_ip', return_value=self._FAKE_CADDY_IP), \ - patch.object(firewall_manager, '_get_dns_container_ip', return_value=self._FAKE_DNS_IP): + patch.object(firewall_manager, '_get_dns_container_ip', return_value=self._FAKE_DNS_IP), \ + patch.object(firewall_manager, 'ensure_forward_stateful', return_value=True): firewall_manager.apply_cell_rules(cell_name, vpn_subnet, inbound_services) return [c for c in calls_made if 'iptables' in c] @@ -650,7 +651,8 @@ class TestCellRules(unittest.TestCase): with patch.object(firewall_manager, '_wg_exec', side_effect=fake_wg_exec), \ patch.object(firewall_manager, '_get_cell_api_ip', return_value='172.20.0.10'), \ patch.object(firewall_manager, '_get_caddy_container_ip', return_value='172.20.0.2'), \ - patch.object(firewall_manager, '_get_dns_container_ip', return_value='172.20.0.3'): + patch.object(firewall_manager, '_get_dns_container_ip', return_value='172.20.0.3'), \ + patch.object(firewall_manager, 'ensure_forward_stateful', return_value=True): firewall_manager.apply_cell_rules('office', '10.0.1.0/24', []) # The API-sync ACCEPT must be the LAST -I FORWARD insertion so it sits at position 1 @@ -957,40 +959,66 @@ class TestReconcileStale(unittest.TestCase): # --------------------------------------------------------------------------- class TestEnsureForwardStateful(unittest.TestCase): - """ensure_forward_stateful must insert ESTABLISHED,RELATED ACCEPT only once.""" + """ensure_forward_stateful deletes any existing copies then re-inserts at position 1.""" - def _make_exec(self, already_present=False): + def _make_exec(self, existing_copies=0): + """Return (calls_list, fake_wg_exec). + + The fake simulates *existing_copies* existing ESTABLISHED,RELATED rules. + Each -D call with returncode 0 "removes" one copy; once they are all gone + subsequent -D calls return 1 (rule not found). All other calls succeed. + """ calls = [] + state = {'remaining': existing_copies} + def fake_wg_exec(args): calls.append(args) r = MagicMock() - # -C (check) returns 0 if present, 1 if not - if '-C' in args: - r.returncode = 0 if already_present else 1 + r.stdout = '' + if '-D' in args: + if state['remaining'] > 0: + state['remaining'] -= 1 + r.returncode = 0 # deletion succeeded + else: + r.returncode = 1 # nothing left to delete else: r.returncode = 0 - r.stdout = '' return r + return calls, fake_wg_exec def test_inserts_rule_when_not_present(self): - calls, fake = self._make_exec(already_present=False) + """With no pre-existing rule the -D loop exits immediately and -I inserts once.""" + calls, fake = self._make_exec(existing_copies=0) with patch.object(firewall_manager, '_wg_exec', side_effect=fake): result = firewall_manager.ensure_forward_stateful() self.assertTrue(result) + # Exactly one -D attempt (returns 1 straight away, loop body never ran) + delete_calls = [c for c in calls if '-D' in c] + self.assertEqual(len(delete_calls), 1) + # Exactly one -I insert insert_calls = [c for c in calls if '-I' in c] self.assertEqual(len(insert_calls), 1) flat = ' '.join(insert_calls[0]) self.assertIn('ESTABLISHED,RELATED', flat) self.assertIn('ACCEPT', flat) - def test_skips_insert_when_already_present(self): - calls, fake = self._make_exec(already_present=True) + def test_deletes_existing_and_reinserts(self): + """With 2 stale copies the loop deletes both, then inserts once at position 1.""" + calls, fake = self._make_exec(existing_copies=2) with patch.object(firewall_manager, '_wg_exec', side_effect=fake): result = firewall_manager.ensure_forward_stateful() self.assertTrue(result) + # Two successful -D calls to drain existing rules, one more that fails + delete_calls = [c for c in calls if '-D' in c] + self.assertEqual(len(delete_calls), 3) # 2 succeed + 1 fails (loop exit) + # Exactly one -I insert anchored at position 1 insert_calls = [c for c in calls if '-I' in c] - self.assertEqual(len(insert_calls), 0, "Must not insert duplicate rule") + self.assertEqual(len(insert_calls), 1) + flat = ' '.join(insert_calls[0]) + self.assertIn('1', flat) + self.assertIn('ESTABLISHED,RELATED', flat) + self.assertIn('ACCEPT', flat) def test_apply_cell_rules_calls_ensure_forward_stateful(self): """apply_cell_rules must call ensure_forward_stateful so replies are never dropped."""