fix: spurious health alerts, show rotated logs, clear history button

app.py:
- Alert logic now checks status.running (container up/down) instead of healthy (which requires connectivity tests), so services are alerted only when they are actually down.
- Add POST /api/health/history/clear endpoint to reset the health history and alert counters.

log_manager.py:
- get_all_log_file_infos: include rotated backup files (*.log.1, *.log.2, ...) in the listing, marked with backup=true so the UI can dim them and hide the rotate button.

api.js:
- Add monitoringAPI.clearHealthHistory.

Logs page:
- Health History: add a Clear button with confirmation.
- File panel: show the full filename (including .log.1 backups), explain the host path and naming, and hide the rotate button for backup files.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-21 03:05:04 -04:00
parent a5381b2ebc
commit 8e1814c7d2
4 changed files with 61 additions and 40 deletions
@@ -234,36 +234,26 @@ def perform_health_check():
            except Exception as e:
                result[service_name] = {'error': str(e), 'status': 'offline'}
        
-        # Health alerting logic - improved to be more robust
+        # Health alerting logic — alert only when a service container is not running
        global service_alert_counters
        for service_name in service_bus.list_services():
            if service_name in result:
                status = result[service_name]
                healthy = True
-                
-                # Improved health determination logic
+
                if isinstance(status, dict):
-                    # Check for explicit healthy field first
-                    if 'healthy' in status:
-                        healthy = status['healthy']
-                    # Check for running status
+                    # Prefer status.running (container actually up) over healthy (connectivity tests)
+                    inner = status.get('status', {})
+                    if isinstance(inner, dict):
+                        if 'running' in inner:
+                            healthy = inner['running']
+                        elif 'status' in inner:
+                            healthy = str(inner['status']).lower() in ('ok', 'healthy', 'online', 'active')
                    elif 'running' in status:
                        healthy = status['running']
-                    # Check for status field with various healthy values
-                    elif 'status' in status:
-                        status_value = status['status']
-                        if isinstance(status_value, str):
-                            healthy = status_value.lower() in ('ok', 'healthy', 'online', 'active')
-                        else:
-                            healthy = bool(status_value)
-                    # Check for error field
                    elif 'error' in status:
                        healthy = False
-                    # If no health indicators, assume healthy if service exists
-                    else:
-                        healthy = True
                else:
-                    # If status is not a dict, assume it's a boolean
                    healthy = bool(status)
                
                # Only count as unhealthy if we're certain it's down
@@ -1985,6 +1975,14 @@ def get_health_history():
    """Get recent unified health check results."""
    return jsonify(list(health_history))

+@app.route('/api/health/history/clear', methods=['POST'])
+def clear_health_history():
+    """Clear health history and reset alert counters."""
+    global service_alert_counters
+    health_history.clear()
+    service_alert_counters = {}
+    return jsonify({'message': 'Health history cleared'})
+
@app.route('/api/logs', methods=['GET'])
 def get_backend_logs():
    """Get backend log file contents (last N lines)."""