guardia-itsm/routers/container_alerts.py
2026-06-02 06:07:36 +09:00

258 lines
8.7 KiB
Python

"""
컨테이너 이상 감지 + SR 자동 생성
Docker/K8s 컨테이너 헬스 상태를 주기적으로 체크하여
이상 감지 시 SR을 자동으로 생성한다.
엔드포인트:
GET /api/container-alerts/check — 컨테이너 상태 즉시 체크
GET /api/container-alerts/list — 최근 알림 목록
POST /api/container-alerts/rules — 알림 규칙 등록
GET /api/container-alerts/rules — 알림 규칙 목록
DELETE /api/container-alerts/rules/{id} — 규칙 삭제
"""
from __future__ import annotations
import json
import logging
from datetime import datetime
from typing import List, Optional
import paramiko
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
from pydantic import BaseModel, Field
from sqlalchemy import select, desc
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user, require_admin_role
from database import get_db
from models import User, Server, SRRequest, SRStatus, ContainerAlertRule, ContainerAlertLog # 신규
# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)
# Router that groups every endpoint in this module under /api/container-alerts.
router = APIRouter(prefix="/api/container-alerts", tags=["Container Alerts"])
class AlertRuleCreate(BaseModel):
    """Request body accepted by ``POST /api/container-alerts/rules``."""

    # Display name of the rule; at most 200 characters.
    name: str = Field(..., max_length=200)
    # Primary key of the Server row whose containers the rule watches.
    server_id: int
    container_name: Optional[str] = None  # None = all containers
    # Alert when a container's status text contains "Exited" or "Dead".
    alert_on_stopped: bool = True
    # Alert when a container's CPU percentage is >= cpu_threshold.
    alert_on_high_cpu: bool = True
    # CPU threshold in percent; validated to lie in [10, 100].
    cpu_threshold: float = Field(90.0, ge=10, le=100)
    # Alert when a container's memory percentage is >= mem_threshold.
    alert_on_high_mem: bool = True
    # Memory threshold in percent; validated to lie in [10, 100].
    mem_threshold: float = Field(90.0, ge=10, le=100)
    # When true, the check endpoint also creates an SRRequest for every alert.
    auto_sr: bool = True
async def _ssh_run(server: Server, cmd: str) -> str:
    """Run ``cmd`` on ``server`` over SSH (agentless) and return its stdout.

    The paramiko calls are blocking, so they are executed in the default
    thread-pool executor instead of directly on the event loop.

    Args:
        server: Server row providing ``ip_addr``, ``ssh_user`` and the
            encrypted password ``os_pw_enc``.
        cmd: Shell command line to execute on the remote host.

    Returns:
        The command's stdout decoded as UTF-8 (undecodable bytes replaced)
        with surrounding whitespace stripped, or ``""`` when anything fails.
        Failures are logged, never raised, so callers treat an empty string
        as "no data".
    """
    import asyncio

    from core.crypto import decrypt_password

    try:
        # Read the ORM attributes and decrypt on the event-loop side so the
        # worker thread only ever touches plain strings.
        host = server.ip_addr
        username = server.ssh_user
        password = decrypt_password(server.os_pw_enc)

        def _exec_blocking() -> str:
            client = paramiko.SSHClient()
            try:
                # NOTE(review): AutoAddPolicy trusts any unknown host key, which
                # leaves the connection open to man-in-the-middle attacks.
                # Kept as-is to preserve behaviour; consider a known_hosts
                # based policy.
                client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
                client.connect(host, username=username, password=password, timeout=10)
                _, stdout, _ = client.exec_command(cmd, timeout=20)
                return stdout.read().decode("utf-8", "replace").strip()
            finally:
                # Always release the connection, including on connect/exec/read errors.
                client.close()

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _exec_blocking)
    except Exception as e:
        # Best-effort by design: a failed host must not abort the whole check run.
        logger.error("SSH 실패 (%s): %s", server.ip_addr, e)
        return ""
# Emits one JSON object per container. The "cpu"/"mem" fields are constant
# placeholders that nothing in this module reads.
_DOCKER_PS_CMD = (
    'docker ps -a --format \'{"name":"{{.Names}}","status":"{{.Status}}","cpu":"0","mem":"0"}\' 2>/dev/null'
)
# Emits "<name> <cpu%> <mem%>" per container, one line each.
_DOCKER_STATS_CMD = (
    'docker stats --no-stream --format "{{.Name}} {{.CPUPerc}} {{.MemPerc}}" 2>/dev/null'
)


def _rule_covers(rule: ContainerAlertRule, container: str) -> bool:
    """Return True when ``rule`` applies to ``container`` (no name set = all)."""
    return not rule.container_name or rule.container_name == container


def _stopped_container_alerts(ps_output: str, rule: ContainerAlertRule, server_ip: str) -> list[dict]:
    """Turn the JSON-lines output of ``docker ps`` into CONTAINER_STOPPED alerts."""
    found: list[dict] = []
    for line in ps_output.strip().split('\n'):
        try:
            info = json.loads(line)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if not isinstance(info, dict):
            # Valid JSON that is not an object (e.g. a bare number) carries no fields.
            continue
        cname = info.get("name", "")
        if not _rule_covers(rule, cname):
            continue
        status = info.get("status", "")
        if rule.alert_on_stopped and ("Exited" in status or "Dead" in status):
            found.append({
                "container": cname,
                "type": "CONTAINER_STOPPED",
                "severity": "HIGH",
                "message": f"컨테이너 {cname} 중지됨: {status}",
                "server": server_ip,
            })
    return found


def _resource_usage_alerts(stats_output: str, rule: ContainerAlertRule, server_ip: str) -> list[dict]:
    """Turn the ``docker stats`` lines into HIGH_CPU / HIGH_MEM alerts."""
    found: list[dict] = []
    for line in (stats_output or "").strip().split('\n'):
        parts = line.split()
        if len(parts) < 3:
            continue
        cname = parts[0]
        if not _rule_covers(rule, cname):
            continue
        try:
            cpu = float(parts[1].replace('%', ''))
            mem = float(parts[2].replace('%', ''))
        except ValueError:
            # Non-numeric percentage column: skip the line.
            continue
        if rule.alert_on_high_cpu and cpu >= rule.cpu_threshold:
            found.append({
                "container": cname, "type": "HIGH_CPU", "severity": "MEDIUM",
                "message": f"{cname} CPU {cpu:.1f}% (임계값 {rule.cpu_threshold}%)",
                "server": server_ip,
            })
        if rule.alert_on_high_mem and mem >= rule.mem_threshold:
            found.append({
                "container": cname, "type": "HIGH_MEM", "severity": "MEDIUM",
                "message": f"{cname} 메모리 {mem:.1f}% (임계값 {rule.mem_threshold}%)",
                "server": server_ip,
            })
    return found


async def _check_containers(server: Server, rule: ContainerAlertRule) -> list[dict]:
    """Check the Docker containers on ``server`` against ``rule``.

    Runs ``docker ps -a`` over SSH and, when the rule asks for CPU or memory
    alerts, ``docker stats --no-stream`` as well.

    Args:
        server: Host to inspect; its ``ip_addr`` is copied into every alert.
        rule: Alert rule supplying the optional container-name filter, the
            enabled checks and the thresholds.

    Returns:
        A list of alert dicts with the keys ``container``, ``type``,
        ``severity``, ``message`` and ``server``. Stopped-container alerts
        come first, followed by CPU/memory alerts in ``docker stats`` order.
        The list is empty when ``docker ps`` yields no output, in which case
        ``docker stats`` is not run at all.
    """
    ps_output = await _ssh_run(server, _DOCKER_PS_CMD)
    if not ps_output:
        return []

    alerts = _stopped_container_alerts(ps_output, rule, server.ip_addr)
    if rule.alert_on_high_cpu or rule.alert_on_high_mem:
        stats_output = await _ssh_run(server, _DOCKER_STATS_CMD)
        alerts.extend(_resource_usage_alerts(stats_output, rule, server.ip_addr))
    return alerts
@router.post("/rules")
async def create_alert_rule(
    req: AlertRuleCreate,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(require_admin_role),
):
    """Register a new container alert rule for the caller's tenant.

    Responds with 404 when ``req.server_id`` does not match any Server row.
    On success the rule is stored as active and its new id is returned as
    ``{"ok": True, "id": <rule id>}``.
    """
    # NOTE(review): the server lookup is by id only; whether Server rows are
    # tenant-scoped cannot be told from this module - confirm.
    server_lookup = await db.execute(
        select(Server).where(Server.id == req.server_id)
    )
    target_server = server_lookup.scalar_one_or_none()
    if not target_server:
        raise HTTPException(404, "서버를 찾을 수 없습니다")

    rule_fields = dict(
        tenant_id=user.tenant_id,
        name=req.name,
        server_id=req.server_id,
        container_name=req.container_name,
        alert_on_stopped=req.alert_on_stopped,
        alert_on_high_cpu=req.alert_on_high_cpu,
        cpu_threshold=req.cpu_threshold,
        alert_on_high_mem=req.alert_on_high_mem,
        mem_threshold=req.mem_threshold,
        auto_sr=req.auto_sr,
        is_active=True,
        created_at=datetime.utcnow(),
    )
    new_rule = ContainerAlertRule(**rule_fields)

    db.add(new_rule)
    await db.commit()
    # Reload the row so the database-generated id is available.
    await db.refresh(new_rule)
    return {"ok": True, "id": new_rule.id}
@router.get("/rules")
async def list_rules(
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Return the active alert rules that belong to the caller's tenant.

    Each entry exposes ``id``, ``name``, ``server_id``, ``container`` and
    ``auto_sr``.
    """
    active_rules_stmt = select(ContainerAlertRule).where(
        ContainerAlertRule.tenant_id == user.tenant_id,
        ContainerAlertRule.is_active == True,
    )
    result = await db.execute(active_rules_stmt)

    payload = []
    for rule in result.scalars().all():
        payload.append({
            "id": rule.id,
            "name": rule.name,
            "server_id": rule.server_id,
            "container": rule.container_name,
            "auto_sr": rule.auto_sr,
        })
    return payload
@router.delete("/rules/{rule_id}")
async def delete_rule(
    rule_id: int,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(require_admin_role),
):
    """Deactivate one alert rule of the caller's tenant.

    The row is kept and only flagged ``is_active = False``. Responds with 404
    when no rule with ``rule_id`` exists inside the caller's tenant.
    """
    owned_rule_stmt = select(ContainerAlertRule).where(
        ContainerAlertRule.id == rule_id,
        ContainerAlertRule.tenant_id == user.tenant_id,
    )
    target = (await db.execute(owned_rule_stmt)).scalar_one_or_none()
    if target:
        target.is_active = False
        await db.commit()
        return {"ok": True}
    raise HTTPException(404)
@router.get("/check")
async def check_all_containers(
    background_tasks: BackgroundTasks,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Check container state immediately for every active rule of the tenant.

    Every alert found is stored as a ContainerAlertLog row; rules with
    ``auto_sr`` set additionally get one SRRequest per alert. All rows are
    committed together after the last rule has been processed.

    Returns:
        ``{"alerts": [...], "total": <count>}`` where ``alerts`` holds the
        alert dicts produced for all rules, in rule order.
    """
    # ``background_tasks`` is not used here; it stays in the signature so the
    # endpoint's interface is unchanged.
    rules_result = await db.execute(
        select(ContainerAlertRule).where(
            ContainerAlertRule.tenant_id == user.tenant_id,
            ContainerAlertRule.is_active == True,
        )
    )
    rules = rules_result.scalars().all()

    # Load every referenced server with one query instead of one query per rule.
    servers_by_id = {}
    server_ids = list({rule.server_id for rule in rules})
    if server_ids:
        servers_result = await db.execute(
            select(Server).where(Server.id.in_(server_ids))
        )
        servers_by_id = {srv.id: srv for srv in servers_result.scalars().all()}

    all_alerts = []
    for rule in rules:
        server = servers_by_id.get(rule.server_id)
        if server is None:
            # The rule points at a server that no longer exists: skip it.
            continue
        alerts = await _check_containers(server, rule)
        for alert in alerts:
            db.add(ContainerAlertLog(
                rule_id=rule.id, alert_type=alert["type"],
                container_name=alert["container"], severity=alert["severity"],
                message=alert["message"], detected_at=datetime.utcnow(),
            ))
            # Automatically open a service request for the alert.
            # NOTE(review): the SRRequest is created without any tenant or
            # requester field - confirm the model fills those in itself.
            if rule.auto_sr:
                db.add(SRRequest(
                    title=f"[컨테이너 알림] {alert['type']}: {alert['container']}",
                    description=alert["message"],
                    category="MONITORING", priority=alert["severity"],
                    status=SRStatus.OPEN, created_at=datetime.utcnow(),
                ))
        all_alerts.extend(alerts)

    await db.commit()
    return {"alerts": all_alerts, "total": len(all_alerts)}
@router.get("/list")
async def alert_list(
    limit: int = 50,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Return the most recent alert log entries of the caller's tenant.

    Logs carry no tenant column of their own in this module, so they are
    joined to their rule via ``rule_id`` and filtered on the rule's
    ``tenant_id``. This matches the tenant scoping of the rule endpoints and
    prevents one tenant from reading another tenant's alerts.

    Args:
        limit: Maximum number of entries to return, newest first.

    Returns:
        A list of dicts with ``id``, ``type``, ``container``, ``severity``,
        ``message`` and ``detected_at``.
    """
    stmt = (
        select(ContainerAlertLog)
        .join(ContainerAlertRule, ContainerAlertLog.rule_id == ContainerAlertRule.id)
        .where(ContainerAlertRule.tenant_id == user.tenant_id)
        .order_by(desc(ContainerAlertLog.detected_at))
        .limit(limit)
    )
    rows = await db.execute(stmt)
    return [
        {"id": entry.id, "type": entry.alert_type, "container": entry.container_name,
         "severity": entry.severity, "message": entry.message, "detected_at": entry.detected_at}
        for entry in rows.scalars().all()
    ]