Fix ICMP latency: re-anchor ESTABLISHED,RELATED to FORWARD position 1 on every health tick

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-05 18:51:38 -04:00
parent 6f84a3ffe1
commit 1b61e9e290
3 changed files with 54 additions and 21 deletions
+3 -2
View File
@@ -526,9 +526,10 @@ def health_monitor_loop():
with app.app_context(): with app.app_context():
health_result = perform_health_check() health_result = perform_health_check()
health_history.appendleft(health_result) health_history.appendleft(health_result)
# Publish health check event
service_bus.publish_event(EventType.HEALTH_CHECK, 'api', health_result) service_bus.publish_event(EventType.HEALTH_CHECK, 'api', health_result)
# Re-anchor stateful rule every cycle: wg0 PostUp uses -I FORWARD which
# pushes ESTABLISHED,RELATED down below per-peer DROPs on restart.
firewall_manager.ensure_forward_stateful()
time.sleep(60) # Check every 60 seconds time.sleep(60) # Check every 60 seconds
# Start health monitor thread # Start health monitor thread
+11 -7
View File
@@ -447,23 +447,27 @@ def apply_all_cell_rules(cell_links: List[Dict[str, Any]]) -> None:
def ensure_forward_stateful() -> bool: def ensure_forward_stateful() -> bool:
"""Insert a stateful ESTABLISHED/RELATED ACCEPT at the top of FORWARD. """Ensure ESTABLISHED/RELATED ACCEPT is at position 1 (top) of FORWARD.
Cell rules DROP all traffic from a connected cell's subnet except specific Cell rules DROP all traffic from a connected cell's subnet except specific
service ports. Without conntrack, ICMP replies and TCP ACKs for connections service ports. Without conntrack, ICMP replies and TCP ACKs for connections
initiated BY local peers to the connected cell are also dropped, making initiated BY local peers to the connected cell are also dropped, making
cross-cell routing (peer → cell → remote cell) broken. cross-cell routing (peer → cell → remote cell) broken.
This rule is inserted once and does not carry a peer/cell comment tag, so it This function always deletes any existing instance and re-inserts at position 1.
is never removed by clear_peer_rules or clear_cell_rules. That re-anchoring is necessary because wg0 PostUp uses -I FORWARD (insert at top),
which pushes this rule down every time wg0 restarts — causing ICMP to hit the
per-peer DROP rule before reaching the stateful ACCEPT.
""" """
try: try:
check = ['-C', 'FORWARD', '-m', 'state', '--state', 'ESTABLISHED,RELATED', '-j', 'ACCEPT'] # Remove all existing instances so we can re-anchor at position 1.
if _wg_exec(['iptables'] + check).returncode == 0: # PostUp -I FORWARD rules drift this rule down on every wg0 restart.
return True # already present while _wg_exec(['iptables', '-D', 'FORWARD', '-m', 'state',
'--state', 'ESTABLISHED,RELATED', '-j', 'ACCEPT']).returncode == 0:
pass
_wg_exec(['iptables', '-I', 'FORWARD', '1', '-m', 'state', _wg_exec(['iptables', '-I', 'FORWARD', '1', '-m', 'state',
'--state', 'ESTABLISHED,RELATED', '-j', 'ACCEPT']) '--state', 'ESTABLISHED,RELATED', '-j', 'ACCEPT'])
logger.info('ensure_forward_stateful: inserted ESTABLISHED,RELATED ACCEPT into FORWARD') logger.info('ensure_forward_stateful: ESTABLISHED,RELATED anchored at FORWARD position 1')
return True return True
except Exception as e: except Exception as e:
logger.error(f'ensure_forward_stateful: {e}') logger.error(f'ensure_forward_stateful: {e}')
+40 -12
View File
@@ -560,7 +560,8 @@ class TestCellRules(unittest.TestCase):
with patch.object(firewall_manager, '_wg_exec', side_effect=fake_wg_exec), \ with patch.object(firewall_manager, '_wg_exec', side_effect=fake_wg_exec), \
patch.object(firewall_manager, '_get_cell_api_ip', return_value=self._FAKE_API_IP), \ patch.object(firewall_manager, '_get_cell_api_ip', return_value=self._FAKE_API_IP), \
patch.object(firewall_manager, '_get_caddy_container_ip', return_value=self._FAKE_CADDY_IP), \ patch.object(firewall_manager, '_get_caddy_container_ip', return_value=self._FAKE_CADDY_IP), \
patch.object(firewall_manager, '_get_dns_container_ip', return_value=self._FAKE_DNS_IP): patch.object(firewall_manager, '_get_dns_container_ip', return_value=self._FAKE_DNS_IP), \
patch.object(firewall_manager, 'ensure_forward_stateful', return_value=True):
firewall_manager.apply_cell_rules(cell_name, vpn_subnet, inbound_services) firewall_manager.apply_cell_rules(cell_name, vpn_subnet, inbound_services)
return [c for c in calls_made if 'iptables' in c] return [c for c in calls_made if 'iptables' in c]
@@ -650,7 +651,8 @@ class TestCellRules(unittest.TestCase):
with patch.object(firewall_manager, '_wg_exec', side_effect=fake_wg_exec), \ with patch.object(firewall_manager, '_wg_exec', side_effect=fake_wg_exec), \
patch.object(firewall_manager, '_get_cell_api_ip', return_value='172.20.0.10'), \ patch.object(firewall_manager, '_get_cell_api_ip', return_value='172.20.0.10'), \
patch.object(firewall_manager, '_get_caddy_container_ip', return_value='172.20.0.2'), \ patch.object(firewall_manager, '_get_caddy_container_ip', return_value='172.20.0.2'), \
patch.object(firewall_manager, '_get_dns_container_ip', return_value='172.20.0.3'): patch.object(firewall_manager, '_get_dns_container_ip', return_value='172.20.0.3'), \
patch.object(firewall_manager, 'ensure_forward_stateful', return_value=True):
firewall_manager.apply_cell_rules('office', '10.0.1.0/24', []) firewall_manager.apply_cell_rules('office', '10.0.1.0/24', [])
# The API-sync ACCEPT must be the LAST -I FORWARD insertion so it sits at position 1 # The API-sync ACCEPT must be the LAST -I FORWARD insertion so it sits at position 1
@@ -957,40 +959,66 @@ class TestReconcileStale(unittest.TestCase):
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
class TestEnsureForwardStateful(unittest.TestCase): class TestEnsureForwardStateful(unittest.TestCase):
"""ensure_forward_stateful must insert ESTABLISHED,RELATED ACCEPT only once.""" """ensure_forward_stateful deletes any existing copies then re-inserts at position 1."""
def _make_exec(self, already_present=False): def _make_exec(self, existing_copies=0):
"""Return (calls_list, fake_wg_exec).
The fake simulates *existing_copies* existing ESTABLISHED,RELATED rules.
Each -D call with returncode 0 "removes" one copy; once they are all gone
subsequent -D calls return 1 (rule not found). All other calls succeed.
"""
calls = [] calls = []
state = {'remaining': existing_copies}
def fake_wg_exec(args): def fake_wg_exec(args):
calls.append(args) calls.append(args)
r = MagicMock() r = MagicMock()
# -C (check) returns 0 if present, 1 if not r.stdout = ''
if '-C' in args: if '-D' in args:
r.returncode = 0 if already_present else 1 if state['remaining'] > 0:
state['remaining'] -= 1
r.returncode = 0 # deletion succeeded
else:
r.returncode = 1 # nothing left to delete
else: else:
r.returncode = 0 r.returncode = 0
r.stdout = ''
return r return r
return calls, fake_wg_exec return calls, fake_wg_exec
def test_inserts_rule_when_not_present(self): def test_inserts_rule_when_not_present(self):
calls, fake = self._make_exec(already_present=False) """With no pre-existing rule the -D loop exits immediately and -I inserts once."""
calls, fake = self._make_exec(existing_copies=0)
with patch.object(firewall_manager, '_wg_exec', side_effect=fake): with patch.object(firewall_manager, '_wg_exec', side_effect=fake):
result = firewall_manager.ensure_forward_stateful() result = firewall_manager.ensure_forward_stateful()
self.assertTrue(result) self.assertTrue(result)
# Exactly one -D attempt (returns 1 straight away, loop body never ran)
delete_calls = [c for c in calls if '-D' in c]
self.assertEqual(len(delete_calls), 1)
# Exactly one -I insert
insert_calls = [c for c in calls if '-I' in c] insert_calls = [c for c in calls if '-I' in c]
self.assertEqual(len(insert_calls), 1) self.assertEqual(len(insert_calls), 1)
flat = ' '.join(insert_calls[0]) flat = ' '.join(insert_calls[0])
self.assertIn('ESTABLISHED,RELATED', flat) self.assertIn('ESTABLISHED,RELATED', flat)
self.assertIn('ACCEPT', flat) self.assertIn('ACCEPT', flat)
def test_skips_insert_when_already_present(self): def test_deletes_existing_and_reinserts(self):
calls, fake = self._make_exec(already_present=True) """With 2 stale copies the loop deletes both, then inserts once at position 1."""
calls, fake = self._make_exec(existing_copies=2)
with patch.object(firewall_manager, '_wg_exec', side_effect=fake): with patch.object(firewall_manager, '_wg_exec', side_effect=fake):
result = firewall_manager.ensure_forward_stateful() result = firewall_manager.ensure_forward_stateful()
self.assertTrue(result) self.assertTrue(result)
# Two successful -D calls to drain existing rules, one more that fails
delete_calls = [c for c in calls if '-D' in c]
self.assertEqual(len(delete_calls), 3) # 2 succeed + 1 fails (loop exit)
# Exactly one -I insert anchored at position 1
insert_calls = [c for c in calls if '-I' in c] insert_calls = [c for c in calls if '-I' in c]
self.assertEqual(len(insert_calls), 0, "Must not insert duplicate rule") self.assertEqual(len(insert_calls), 1)
flat = ' '.join(insert_calls[0])
self.assertIn('1', flat)
self.assertIn('ESTABLISHED,RELATED', flat)
self.assertIn('ACCEPT', flat)
def test_apply_cell_rules_calls_ensure_forward_stateful(self): def test_apply_cell_rules_calls_ensure_forward_stateful(self):
"""apply_cell_rules must call ensure_forward_stateful so replies are never dropped.""" """apply_cell_rules must call ensure_forward_stateful so replies are never dropped."""