""" 컨테이너 이상 감지 + SR 자동 생성 Docker/K8s 컨테이너 헬스 상태를 주기적으로 체크하여 이상 감지 시 SR을 자동으로 생성한다. 엔드포인트: GET /api/container-alerts/check — 컨테이너 상태 즉시 체크 GET /api/container-alerts/list — 최근 알림 목록 POST /api/container-alerts/rules — 알림 규칙 등록 GET /api/container-alerts/rules — 알림 규칙 목록 DELETE /api/container-alerts/rules/{id} — 규칙 삭제 """ from __future__ import annotations import json import logging from datetime import datetime from typing import List, Optional import paramiko from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException from pydantic import BaseModel, Field from sqlalchemy import select, desc from sqlalchemy.ext.asyncio import AsyncSession from core.auth import get_current_user, require_admin_role from database import get_db from models import User, Server, SRRequest, SRStatus, ContainerAlertRule, ContainerAlertLog # 신규 logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/container-alerts", tags=["Container Alerts"]) class AlertRuleCreate(BaseModel): name: str = Field(..., max_length=200) server_id: int container_name: Optional[str] = None # None = 전체 컨테이너 alert_on_stopped: bool = True alert_on_high_cpu: bool = True cpu_threshold: float = Field(90.0, ge=10, le=100) alert_on_high_mem: bool = True mem_threshold: float = Field(90.0, ge=10, le=100) auto_sr: bool = True async def _ssh_run(server: Server, cmd: str) -> str: """SSH 명령 실행 (에이전트리스).""" from core.crypto import decrypt_password try: pw = decrypt_password(server.os_pw_enc) ssh = paramiko.SSHClient() ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) ssh.connect(server.ip_addr, username=server.ssh_user, password=pw, timeout=10) _, stdout, _ = ssh.exec_command(cmd, timeout=20) result = stdout.read().decode('utf-8', 'replace').strip() ssh.close() return result except Exception as e: logger.error(f"SSH 실패 ({server.ip_addr}): {e}") return "" async def _check_containers(server: Server, rule: ContainerAlertRule) -> list[dict]: """서버의 Docker 컨테이너 상태 체크.""" alerts = [] # 컨테이너 목록 및 상태 output = await _ssh_run(server, 'docker ps -a --format \'{"name":"{{.Names}}","status":"{{.Status}}","cpu":"0","mem":"0"}\' 2>/dev/null' ) if not output: return alerts for line in output.strip().split('\n'): try: info = json.loads(line) except Exception: continue cname = info.get("name", "") if rule.container_name and rule.container_name != cname: continue status = info.get("status", "") # 중지된 컨테이너 감지 if rule.alert_on_stopped and ("Exited" in status or "Dead" in status): alerts.append({ "container": cname, "type": "CONTAINER_STOPPED", "severity": "HIGH", "message": f"컨테이너 {cname} 중지됨: {status}", "server": server.ip_addr, }) # docker stats로 CPU/Memory 체크 if rule.alert_on_high_cpu or rule.alert_on_high_mem: stats_out = await _ssh_run(server, f'docker stats --no-stream --format "{{{{.Name}}}} {{{{.CPUPerc}}}} {{{{.MemPerc}}}}" 2>/dev/null' ) for line in (stats_out or "").strip().split('\n'): parts = line.split() if len(parts) < 3: continue cname = parts[0] if rule.container_name and rule.container_name != cname: continue try: cpu = float(parts[1].replace('%', '')) mem = float(parts[2].replace('%', '')) except ValueError: continue if rule.alert_on_high_cpu and cpu >= rule.cpu_threshold: alerts.append({ "container": cname, "type": "HIGH_CPU", "severity": "MEDIUM", "message": f"{cname} CPU {cpu:.1f}% (임계값 {rule.cpu_threshold}%)", "server": server.ip_addr, }) if rule.alert_on_high_mem and mem >= rule.mem_threshold: alerts.append({ "container": cname, "type": "HIGH_MEM", "severity": "MEDIUM", "message": f"{cname} 메모리 {mem:.1f}% (임계값 {rule.mem_threshold}%)", "server": server.ip_addr, }) return alerts @router.post("/rules") async def create_alert_rule( req: AlertRuleCreate, db: AsyncSession = Depends(get_db), user: User = Depends(require_admin_role), ): srv_row = await db.execute(select(Server).where(Server.id == req.server_id)) if not srv_row.scalar_one_or_none(): raise HTTPException(404, "서버를 찾을 수 없습니다") rule = ContainerAlertRule( tenant_id=user.tenant_id, name=req.name, server_id=req.server_id, container_name=req.container_name, alert_on_stopped=req.alert_on_stopped, alert_on_high_cpu=req.alert_on_high_cpu, cpu_threshold=req.cpu_threshold, alert_on_high_mem=req.alert_on_high_mem, mem_threshold=req.mem_threshold, auto_sr=req.auto_sr, is_active=True, created_at=datetime.utcnow(), ) db.add(rule) await db.commit() await db.refresh(rule) return {"ok": True, "id": rule.id} @router.get("/rules") async def list_rules( db: AsyncSession = Depends(get_db), user: User = Depends(get_current_user), ): rows = await db.execute( select(ContainerAlertRule).where( ContainerAlertRule.tenant_id == user.tenant_id, ContainerAlertRule.is_active == True, ) ) rules = rows.scalars().all() return [ {"id": r.id, "name": r.name, "server_id": r.server_id, "container": r.container_name, "auto_sr": r.auto_sr} for r in rules ] @router.delete("/rules/{rule_id}") async def delete_rule( rule_id: int, db: AsyncSession = Depends(get_db), user: User = Depends(require_admin_role), ): row = await db.execute( select(ContainerAlertRule).where( ContainerAlertRule.id == rule_id, ContainerAlertRule.tenant_id == user.tenant_id, ) ) rule = row.scalar_one_or_none() if not rule: raise HTTPException(404) rule.is_active = False await db.commit() return {"ok": True} @router.get("/check") async def check_all_containers( background_tasks: BackgroundTasks, db: AsyncSession = Depends(get_db), user: User = Depends(get_current_user), ): """모든 규칙에 대해 컨테이너 상태 즉시 체크.""" rules_row = await db.execute( select(ContainerAlertRule).where( ContainerAlertRule.tenant_id == user.tenant_id, ContainerAlertRule.is_active == True, ) ) rules = rules_row.scalars().all() all_alerts = [] for rule in rules: srv_row = await db.execute(select(Server).where(Server.id == rule.server_id)) server = srv_row.scalar_one_or_none() if not server: continue alerts = await _check_containers(server, rule) for alert in alerts: log = ContainerAlertLog( rule_id=rule.id, alert_type=alert["type"], container_name=alert["container"], severity=alert["severity"], message=alert["message"], detected_at=datetime.utcnow(), ) db.add(log) # SR 자동 생성 if rule.auto_sr: sr = SRRequest( title=f"[컨테이너 알림] {alert['type']}: {alert['container']}", description=alert["message"], category="MONITORING", priority=alert["severity"], status=SRStatus.OPEN, created_at=datetime.utcnow(), ) db.add(sr) all_alerts.extend(alerts) await db.commit() return {"alerts": all_alerts, "total": len(all_alerts)} @router.get("/list") async def alert_list( limit: int = 50, db: AsyncSession = Depends(get_db), user: User = Depends(get_current_user), ): rows = await db.execute( select(ContainerAlertLog).order_by(desc(ContainerAlertLog.detected_at)).limit(limit) ) logs = rows.scalars().all() return [ {"id": l.id, "type": l.alert_type, "container": l.container_name, "severity": l.severity, "message": l.message, "detected_at": l.detected_at} for l in logs ]