Fix ICMP latency: re-anchor ESTABLISHED,RELATED to FORWARD position 1 on every health tick
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+3
-2
@@ -526,9 +526,10 @@ def health_monitor_loop():
|
||||
with app.app_context():
|
||||
health_result = perform_health_check()
|
||||
health_history.appendleft(health_result)
|
||||
|
||||
# Publish health check event
|
||||
service_bus.publish_event(EventType.HEALTH_CHECK, 'api', health_result)
|
||||
# Re-anchor stateful rule every cycle: wg0 PostUp uses -I FORWARD which
|
||||
# pushes ESTABLISHED,RELATED down below per-peer DROPs on restart.
|
||||
firewall_manager.ensure_forward_stateful()
|
||||
time.sleep(60) # Check every 60 seconds
|
||||
|
||||
# Start health monitor thread
|
||||
|
||||
+11
-7
@@ -447,23 +447,27 @@ def apply_all_cell_rules(cell_links: List[Dict[str, Any]]) -> None:
|
||||
|
||||
|
||||
def ensure_forward_stateful() -> bool:
|
||||
"""Insert a stateful ESTABLISHED/RELATED ACCEPT at the top of FORWARD.
|
||||
"""Ensure ESTABLISHED/RELATED ACCEPT is at position 1 (top) of FORWARD.
|
||||
|
||||
Cell rules DROP all traffic from a connected cell's subnet except specific
|
||||
service ports. Without conntrack, ICMP replies and TCP ACKs for connections
|
||||
initiated BY local peers to the connected cell are also dropped, making
|
||||
cross-cell routing (peer → cell → remote cell) broken.
|
||||
|
||||
This rule is inserted once and does not carry a peer/cell comment tag, so it
|
||||
is never removed by clear_peer_rules or clear_cell_rules.
|
||||
This function always deletes any existing instance and re-inserts at position 1.
|
||||
That re-anchoring is necessary because wg0 PostUp uses -I FORWARD (insert at top),
|
||||
which pushes this rule down every time wg0 restarts — causing ICMP to hit the
|
||||
per-peer DROP rule before reaching the stateful ACCEPT.
|
||||
"""
|
||||
try:
|
||||
check = ['-C', 'FORWARD', '-m', 'state', '--state', 'ESTABLISHED,RELATED', '-j', 'ACCEPT']
|
||||
if _wg_exec(['iptables'] + check).returncode == 0:
|
||||
return True # already present
|
||||
# Remove all existing instances so we can re-anchor at position 1.
|
||||
# PostUp -I FORWARD rules drift this rule down on every wg0 restart.
|
||||
while _wg_exec(['iptables', '-D', 'FORWARD', '-m', 'state',
|
||||
'--state', 'ESTABLISHED,RELATED', '-j', 'ACCEPT']).returncode == 0:
|
||||
pass
|
||||
_wg_exec(['iptables', '-I', 'FORWARD', '1', '-m', 'state',
|
||||
'--state', 'ESTABLISHED,RELATED', '-j', 'ACCEPT'])
|
||||
logger.info('ensure_forward_stateful: inserted ESTABLISHED,RELATED ACCEPT into FORWARD')
|
||||
logger.info('ensure_forward_stateful: ESTABLISHED,RELATED anchored at FORWARD position 1')
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f'ensure_forward_stateful: {e}')
|
||||
|
||||
@@ -560,7 +560,8 @@ class TestCellRules(unittest.TestCase):
|
||||
with patch.object(firewall_manager, '_wg_exec', side_effect=fake_wg_exec), \
|
||||
patch.object(firewall_manager, '_get_cell_api_ip', return_value=self._FAKE_API_IP), \
|
||||
patch.object(firewall_manager, '_get_caddy_container_ip', return_value=self._FAKE_CADDY_IP), \
|
||||
patch.object(firewall_manager, '_get_dns_container_ip', return_value=self._FAKE_DNS_IP):
|
||||
patch.object(firewall_manager, '_get_dns_container_ip', return_value=self._FAKE_DNS_IP), \
|
||||
patch.object(firewall_manager, 'ensure_forward_stateful', return_value=True):
|
||||
firewall_manager.apply_cell_rules(cell_name, vpn_subnet, inbound_services)
|
||||
|
||||
return [c for c in calls_made if 'iptables' in c]
|
||||
@@ -650,7 +651,8 @@ class TestCellRules(unittest.TestCase):
|
||||
with patch.object(firewall_manager, '_wg_exec', side_effect=fake_wg_exec), \
|
||||
patch.object(firewall_manager, '_get_cell_api_ip', return_value='172.20.0.10'), \
|
||||
patch.object(firewall_manager, '_get_caddy_container_ip', return_value='172.20.0.2'), \
|
||||
patch.object(firewall_manager, '_get_dns_container_ip', return_value='172.20.0.3'):
|
||||
patch.object(firewall_manager, '_get_dns_container_ip', return_value='172.20.0.3'), \
|
||||
patch.object(firewall_manager, 'ensure_forward_stateful', return_value=True):
|
||||
firewall_manager.apply_cell_rules('office', '10.0.1.0/24', [])
|
||||
|
||||
# The API-sync ACCEPT must be the LAST -I FORWARD insertion so it sits at position 1
|
||||
@@ -957,40 +959,66 @@ class TestReconcileStale(unittest.TestCase):
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEnsureForwardStateful(unittest.TestCase):
|
||||
"""ensure_forward_stateful must insert ESTABLISHED,RELATED ACCEPT only once."""
|
||||
"""ensure_forward_stateful deletes any existing copies then re-inserts at position 1."""
|
||||
|
||||
def _make_exec(self, already_present=False):
|
||||
def _make_exec(self, existing_copies=0):
|
||||
"""Return (calls_list, fake_wg_exec).
|
||||
|
||||
The fake simulates *existing_copies* existing ESTABLISHED,RELATED rules.
|
||||
Each -D call with returncode 0 "removes" one copy; once they are all gone
|
||||
subsequent -D calls return 1 (rule not found). All other calls succeed.
|
||||
"""
|
||||
calls = []
|
||||
state = {'remaining': existing_copies}
|
||||
|
||||
def fake_wg_exec(args):
|
||||
calls.append(args)
|
||||
r = MagicMock()
|
||||
# -C (check) returns 0 if present, 1 if not
|
||||
if '-C' in args:
|
||||
r.returncode = 0 if already_present else 1
|
||||
r.stdout = ''
|
||||
if '-D' in args:
|
||||
if state['remaining'] > 0:
|
||||
state['remaining'] -= 1
|
||||
r.returncode = 0 # deletion succeeded
|
||||
else:
|
||||
r.returncode = 1 # nothing left to delete
|
||||
else:
|
||||
r.returncode = 0
|
||||
r.stdout = ''
|
||||
return r
|
||||
|
||||
return calls, fake_wg_exec
|
||||
|
||||
def test_inserts_rule_when_not_present(self):
|
||||
calls, fake = self._make_exec(already_present=False)
|
||||
"""With no pre-existing rule the -D loop exits immediately and -I inserts once."""
|
||||
calls, fake = self._make_exec(existing_copies=0)
|
||||
with patch.object(firewall_manager, '_wg_exec', side_effect=fake):
|
||||
result = firewall_manager.ensure_forward_stateful()
|
||||
self.assertTrue(result)
|
||||
# Exactly one -D attempt (returns 1 straight away, loop body never ran)
|
||||
delete_calls = [c for c in calls if '-D' in c]
|
||||
self.assertEqual(len(delete_calls), 1)
|
||||
# Exactly one -I insert
|
||||
insert_calls = [c for c in calls if '-I' in c]
|
||||
self.assertEqual(len(insert_calls), 1)
|
||||
flat = ' '.join(insert_calls[0])
|
||||
self.assertIn('ESTABLISHED,RELATED', flat)
|
||||
self.assertIn('ACCEPT', flat)
|
||||
|
||||
def test_skips_insert_when_already_present(self):
|
||||
calls, fake = self._make_exec(already_present=True)
|
||||
def test_deletes_existing_and_reinserts(self):
|
||||
"""With 2 stale copies the loop deletes both, then inserts once at position 1."""
|
||||
calls, fake = self._make_exec(existing_copies=2)
|
||||
with patch.object(firewall_manager, '_wg_exec', side_effect=fake):
|
||||
result = firewall_manager.ensure_forward_stateful()
|
||||
self.assertTrue(result)
|
||||
# Two successful -D calls to drain existing rules, one more that fails
|
||||
delete_calls = [c for c in calls if '-D' in c]
|
||||
self.assertEqual(len(delete_calls), 3) # 2 succeed + 1 fails (loop exit)
|
||||
# Exactly one -I insert anchored at position 1
|
||||
insert_calls = [c for c in calls if '-I' in c]
|
||||
self.assertEqual(len(insert_calls), 0, "Must not insert duplicate rule")
|
||||
self.assertEqual(len(insert_calls), 1)
|
||||
flat = ' '.join(insert_calls[0])
|
||||
self.assertIn('1', flat)
|
||||
self.assertIn('ESTABLISHED,RELATED', flat)
|
||||
self.assertIn('ACCEPT', flat)
|
||||
|
||||
def test_apply_cell_rules_calls_ensure_forward_stateful(self):
|
||||
"""apply_cell_rules must call ensure_forward_stateful so replies are never dropped."""
|
||||
|
||||
Reference in New Issue
Block a user