# pic/api/audit_manager.py

#!/usr/bin/env python3
"""
Audit Manager for Personal Internet Cell.

Owner-visible, append-only audit trail of WHO (actor + role + ip) did WHAT
(action) to WHICH target, WHEN, with a redacted summary. Storage is a JSONL
file with a per-entry SHA-256 hash chain so tampering is detectable. Request
bodies and secret values are never written; summaries only ever list changed
config KEY NAMES, never their values.
"""

import os
import io
import re
import csv
import json
import hashlib
import logging
import threading
from datetime import datetime
from typing import Dict, List, Optional, Any

from base_service_manager import BaseServiceManager

logger = logging.getLogger(__name__)


def _utcnow_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision).

    Built from an aware ``datetime.now(timezone.utc)`` instead of the naive
    ``datetime.utcnow()``, which is deprecated since Python 3.12. The rendered
    string is identical.
    """
    # Local import: the module header only imports the ``datetime`` class.
    from datetime import timezone

    return datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')


# Keys whose values must never be recorded — name-only in summaries.
# Case-insensitive substring match on the key NAME (e.g. "api_token", "Passphrase").
# NOTE(review): not referenced in the visible part of this module — presumably
# used by callers that build summaries; confirm.
_SECRET_KEY_RE = re.compile(r'(pass|secret|key|token|private|cred|otp|psk)', re.IGNORECASE)
# Final scrub of anything that looks like base64 key material / encoded blobs:
# any run of 40+ characters from the base64 alphabet, plus optional '=' padding.
# The alphabet includes '/', and hex digits are a subset of it, so long
# slash-separated paths and 64-char hex digests match as well.
_BASE64_BLOCK_RE = re.compile(r'[A-Za-z0-9+/]{40,}={0,2}')
# Secret-looking prefixes, each matched up to the next whitespace (PEM: up to
# end of line): bcrypt hashes ($2a$/$2b$/$2y$), AGE-SECRET-KEY-…, "age1…"
# strings, and "-----BEGIN …" PEM header lines.
_SECRET_PREFIX_RE = re.compile(
    r'(\$2[aby]\$[^\s]+|AGE-SECRET-KEY-[^\s]+|age1[^\s]+|-----BEGIN[^\n]+)'
)

# Accepted values for an entry's 'result' field.
_VALID_RESULTS = ('success', 'failure')


class AuditManager(BaseServiceManager):
    """Append-only, hash-chained audit trail."""

    MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB before rotation
    BACKUP_COUNT = 10                 # audit.log.1 .. audit.log.10

    def __init__(self, data_dir: str = '/app/data', config_dir: str = '/app/config',
                 tamper_chain: bool = True):
        """Prepare the audit directory and resume the chain from an existing log.

        Args:
            data_dir: Root data directory; the log lives at
                ``<data_dir>/api/audit/audit.log``.
            config_dir: Passed through to the base service manager.
            tamper_chain: When True, every entry carries a SHA-256 hash linked
                to the previous entry's hash.
        """
        super().__init__('audit', data_dir=data_dir, config_dir=config_dir)

        # self.data_dir is read back after the base initialiser has run
        # (presumably it is set there — confirm against BaseServiceManager).
        audit_dir = os.path.join(self.data_dir, 'api', 'audit')
        self._audit_dir = audit_dir
        self._audit_file = os.path.join(audit_dir, 'audit.log')

        self.tamper_chain = tamper_chain
        self._lock = threading.RLock()

        # Chain cursor; replaced by _load_chain_state() when a log already exists.
        self._seq, self._prev_hash = 0, ''

        self.safe_makedirs(audit_dir)
        self._load_chain_state()

    # ── chain bootstrap ─────────────────────────────────────────────────────
    def _load_chain_state(self) -> None:
        """Recover seq + prev_hash from the last line of the live file."""
        try:
            if not os.path.exists(self._audit_file):
                return
            last = None
            with open(self._audit_file, 'r', encoding='utf-8', errors='ignore') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        last = line
            if last:
                entry = json.loads(last)
                self._seq = int(entry.get('seq', 0))
                self._prev_hash = entry.get('hash', '') or ''
        except Exception as e:
            logger.warning(f"audit: could not load chain state: {e}")

    # ── redaction ───────────────────────────────────────────────────────────
    @staticmethod
    def _scrub(text: str) -> str:
        """Strip anything resembling a secret value from a summary string."""
        if not text:
            return ''
        text = _SECRET_PREFIX_RE.sub('[REDACTED]', text)
        text = _BASE64_BLOCK_RE.sub('[REDACTED]', text)
        return text

    @classmethod
    def _redact(cls, entry: Dict[str, Any]) -> Dict[str, Any]:
        """Enforce the redaction rules on a built entry before write.

        - summary is scrubbed of base64/secret-prefixed blobs.
        - any string field is scrubbed too (defence in depth).
        Request bodies are never present — the caller passes only a summary.
        """
        for field in ('summary', 'target_id', 'action', 'path'):
            val = entry.get(field)
            if isinstance(val, str):
                entry[field] = cls._scrub(val)
        return entry

    @classmethod
    def summarize_keys(cls, keys: List[str]) -> str:
        """Build a redacted summary listing changed config KEY NAMES only.

        Secret-looking key names are kept (they are names, not values) but the
        whole string is still scrubbed of any accidental value material.
        """
        names = [str(k) for k in keys if k is not None]
        return cls._scrub('changed: ' + ', '.join(names)) if names else 'no changes'

    # ── hashing ─────────────────────────────────────────────────────────────
    @staticmethod
    def _canonical(entry: Dict[str, Any]) -> str:
        return json.dumps(entry, sort_keys=True, separators=(',', ':'), ensure_ascii=False)

    def _hash_entry(self, entry_without_hash: Dict[str, Any]) -> str:
        return hashlib.sha256(self._canonical(entry_without_hash).encode('utf-8')).hexdigest()

    # ── recording ───────────────────────────────────────────────────────────
    def record(self, actor: str, role: str, ip: str, action: str,
               target_type: str = '', target_id: str = '', summary: str = '',
               result: str = 'success', status: int = 200, method: str = '',
               path: str = '', request_id: str = '') -> Optional[Dict[str, Any]]:
        """Append one redacted, hash-chained JSON line. Never raises.

        Args:
            actor: Who performed the action; stored as ``'anonymous'`` when empty.
            role: Actor's role; stored as ``'system'`` when empty.
            ip: Source address, or empty.
            action: What was done.
            target_type: Kind of object acted on.
            target_id: Identifier of that object.
            summary: Caller-built, already value-free description.
            result: ``'success'`` or ``'failure'``. Any other value is replaced
                by one derived from ``status`` (below 400 means success).
            status: HTTP-style status code; stored as ``int(status or 0)``.
            method: Request method, or empty.
            path: Request path, or empty.
            request_id: Correlation id, or empty.

        Returns:
            The entry as written (including ``seq``, ``prev_hash`` and
            ``hash``), or ``None`` when anything failed; failures are logged
            as warnings.

        The sequence number and ``prev_hash`` cursor are committed only after
        the line has been appended, so a failed write (or a value that cannot
        be serialised) does not burn a sequence number or leave the in-memory
        chain state ahead of what is on disk.
        """
        try:
            with self._lock:
                self._maybe_rotate()
                if result not in _VALID_RESULTS:
                    result = 'success' if int(status or 200) < 400 else 'failure'
                seq = self._seq + 1
                entry: Dict[str, Any] = {
                    'ts': _utcnow_iso(),
                    'actor': actor or 'anonymous',
                    'role': role or 'system',
                    'ip': ip or '',
                    'action': action or '',
                    'target_type': target_type or '',
                    'target_id': target_id or '',
                    'summary': summary or '',
                    'result': result,
                    'status': int(status or 0),
                    'method': method or '',
                    'path': path or '',
                    'request_id': request_id or '',
                    'seq': seq,
                    'prev_hash': self._prev_hash if self.tamper_chain else '',
                }
                # Redact before hashing so the hash covers exactly what is stored.
                entry = self._redact(entry)
                entry['hash'] = self._hash_entry(entry) if self.tamper_chain else ''
                self._append_line(json.dumps(entry, ensure_ascii=False))
                # Commit the chain cursor only once the line is on disk.
                self._seq = seq
                self._prev_hash = entry['hash']
                return entry
        except Exception as e:
            logger.warning(f"audit.record failed: {e}")
            return None

    def _append_line(self, line: str) -> None:
        self.safe_makedirs(self._audit_dir)
        fd = os.open(self._audit_file, os.O_WRONLY | os.O_CREAT | os.O_APPEND, 0o600)
        try:
            os.write(fd, (line + '\n').encode('utf-8'))
        finally:
            os.close(fd)
        try:
            os.chmod(self._audit_file, 0o600)
        except OSError:
            pass

    # ── rotation ────────────────────────────────────────────────────────────
    def _maybe_rotate(self) -> None:
        try:
            if not os.path.exists(self._audit_file):
                return
            if os.path.getsize(self._audit_file) < self.MAX_FILE_SIZE:
                return
        except OSError:
            return
        # audit.log.(N-1) -> audit.log.N, ... audit.log -> audit.log.1
        for i in range(self.BACKUP_COUNT - 1, 0, -1):
            src = f"{self._audit_file}.{i}"
            dst = f"{self._audit_file}.{i + 1}"
            if os.path.exists(src):
                try:
                    os.replace(src, dst)
                except OSError as e:
                    logger.warning(f"audit rotate {src}->{dst}: {e}")
        try:
            os.replace(self._audit_file, f"{self._audit_file}.1")
        except OSError as e:
            logger.warning(f"audit rotate live->.1: {e}")

    def _segment_files(self) -> List[str]:
        """Live file first (newest), then rotated segments .1 .. .N (older)."""
        files = []
        if os.path.exists(self._audit_file):
            files.append(self._audit_file)
        for i in range(1, self.BACKUP_COUNT + 1):
            seg = f"{self._audit_file}.{i}"
            if os.path.exists(seg):
                files.append(seg)
        return files

    # ── reading / filtering ─────────────────────────────────────────────────
    @staticmethod
    def _matches(entry: Dict[str, Any], filters: Dict[str, Any]) -> bool:
        for field in ('actor', 'action', 'target_type', 'target_id', 'result'):
            want = filters.get(field)
            if want and str(entry.get(field, '')) != str(want):
                return False
        since = filters.get('since')
        until = filters.get('until')
        ts = entry.get('ts', '')
        if since and ts < since:
            return False
        if until and ts > until:
            return False
        return True

    def _read_all(self, filters: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Return matching entries, newest-first across all segments."""
        results: List[Dict[str, Any]] = []
        with self._lock:
            for seg in self._segment_files():
                try:
                    with open(seg, 'r', encoding='utf-8', errors='ignore') as f:
                        lines = f.readlines()
                except OSError:
                    continue
                for line in reversed(lines):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        entry = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    if self._matches(entry, filters):
                        results.append(entry)
        return results

    def query(self, filters: Optional[Dict[str, Any]] = None,
              limit: int = 100, offset: int = 0) -> Dict[str, Any]:
        filters = filters or {}
        try:
            limit = max(1, min(int(limit), 1000))
        except (TypeError, ValueError):
            limit = 100
        try:
            offset = max(0, int(offset))
        except (TypeError, ValueError):
            offset = 0
        entries = self._read_all(filters)
        total = len(entries)
        page = entries[offset:offset + limit]
        next_offset = offset + limit if offset + limit < total else None
        return {'entries': page, 'total': total, 'next_offset': next_offset}

    def export_csv(self, filters: Optional[Dict[str, Any]] = None) -> str:
        filters = filters or {}
        entries = self._read_all(filters)
        fields = ['ts', 'actor', 'role', 'ip', 'action', 'target_type',
                  'target_id', 'summary', 'result', 'status', 'method', 'path',
                  'request_id', 'seq']
        buf = io.StringIO()
        writer = csv.writer(buf)
        writer.writerow(fields)
        for e in entries:
            writer.writerow([e.get(f, '') for f in fields])
        return buf.getvalue()

    # ── integrity ───────────────────────────────────────────────────────────
    def verify_chain(self) -> Dict[str, Any]:
        """Walk all segments oldest-first; verify each entry's hash + link."""
        if not self.tamper_chain:
            return {'ok': True, 'broken_at_seq': None, 'disabled': True}
        with self._lock:
            segs = list(reversed(self._segment_files()))  # oldest -> newest
            prev_hash = ''
            first = True  # oldest available record: its predecessor may be pruned
            for seg in segs:
                try:
                    with open(seg, 'r', encoding='utf-8', errors='ignore') as f:
                        lines = f.readlines()
                except OSError:
                    continue
                for line in lines:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        entry = json.loads(line)
                    except json.JSONDecodeError:
                        return {'ok': False, 'broken_at_seq': None}
                    stored_hash = entry.get('hash', '')
                    # Don't fail the prev_hash link on the very first available
                    # record — older segments may have rotated off the end.
                    if not first and entry.get('prev_hash', '') != prev_hash:
                        return {'ok': False, 'broken_at_seq': entry.get('seq')}
                    recomputed = self._hash_entry({k: v for k, v in entry.items() if k != 'hash'})
                    if recomputed != stored_hash:
                        return {'ok': False, 'broken_at_seq': entry.get('seq')}
                    prev_hash = stored_hash
                    first = False
        return {'ok': True, 'broken_at_seq': None}

    # ── BaseServiceManager interface ────────────────────────────────────────
    def get_status(self) -> Dict[str, Any]:
        size = 0
        try:
            if os.path.exists(self._audit_file):
                size = os.path.getsize(self._audit_file)
        except OSError:
            pass
        return {
            'running': True,
            'tamper_chain': self.tamper_chain,
            'seq': self._seq,
            'file': self._audit_file,
            'file_size': size,
        }

    def test_connectivity(self) -> Dict[str, Any]:
        return {'success': True}