zioinfo-mail/workspace/guardia-itsm/routers/container_alerts.py
DESKTOP-TKLFCPR\ython fc0ba65e05 feat(expansion): GUARDiA v3 P3 완성 — 13 routers + 14 DB tables
라우터 (667개 엔드포인트, P3 신규 69개):
- multimodal.py:      llava 이미지 분석 + 에러 자동 분류
- learning_loop.py:   Ollama 파인튜닝 + 품질 지표
- ai_insights.py:     주간 인사이트 + 반복 패턴 + 개선 권고
- container_alerts.py: Docker 이상 감지 → SR 자동 생성
- ncloud.py:          NCloud API (서버/LB/스토리지/비용)
- billing.py:         구독 플랜 + 사용량 측정 + 청구서
- servicenow.py:      ServiceNow CMDB/Incident 양방향 연동
- erp_connector.py:   그룹웨어/HR ERP 연동 + 결재 웹훅
- kakao_notify.py:    카카오 알림톡 + 대량 발송
- auto_report.py:     Excel/PDF 보고서 자동 생성·다운로드
- benchmark.py:       기관 간 익명 벤치마킹 (완전 익명화)
- cohort_analysis.py: 도입 코호트 + 리텐션 + 기능 도입률

DB 모델 (14개 신규 테이블):
tb_learning_run, tb_container_alert_{rule,log},
tb_ncloud_config, tb_subscription, tb_invoice,
tb_servicenow_{config,mapping}, tb_erp_config,
tb_kakao_{config,notify_log}, tb_report_{record,schedule},
tb_benchmark_contrib

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-02 06:06:59 +09:00

258 lines
8.7 KiB
Python

"""
컨테이너 이상 감지 + SR 자동 생성
Docker/K8s 컨테이너 헬스 상태를 주기적으로 체크하여
이상 감지 시 SR을 자동으로 생성한다.
엔드포인트:
GET /api/container-alerts/check — 컨테이너 상태 즉시 체크
GET /api/container-alerts/list — 최근 알림 목록
POST /api/container-alerts/rules — 알림 규칙 등록
GET /api/container-alerts/rules — 알림 규칙 목록
DELETE /api/container-alerts/rules/{id} — 규칙 삭제
"""
from __future__ import annotations
import json
import logging
from datetime import datetime
from typing import List, Optional
import paramiko
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
from pydantic import BaseModel, Field
from sqlalchemy import select, desc
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user, require_admin_role
from database import get_db
from models import User, Server, SRRequest, SRStatus, ContainerAlertRule, ContainerAlertLog # 신규
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router collecting every endpoint of this file under the /api/container-alerts prefix.
router = APIRouter(prefix="/api/container-alerts", tags=["Container Alerts"])
class AlertRuleCreate(BaseModel):
    """Request body accepted by ``POST /api/container-alerts/rules``."""

    # Human-readable rule name (required, at most 200 characters).
    name: str = Field(..., max_length=200)
    # Primary key of the server whose containers are inspected.
    server_id: int
    # Restrict the rule to one container; None means every container on the server.
    container_name: Optional[str] = None
    # Raise an alert for containers whose status reports Exited/Dead.
    alert_on_stopped: bool = True
    # CPU alerting switch and its threshold in percent (10-100).
    alert_on_high_cpu: bool = True
    cpu_threshold: float = Field(default=90.0, ge=10, le=100)
    # Memory alerting switch and its threshold in percent (10-100).
    alert_on_high_mem: bool = True
    mem_threshold: float = Field(default=90.0, ge=10, le=100)
    # When true, each detected alert also opens a service request (SR).
    auto_sr: bool = True
async def _ssh_run(server: Server, cmd: str) -> str:
    """Run ``cmd`` on ``server`` over SSH (agentless) and return its stdout.

    The blocking paramiko calls are executed in the event loop's default
    executor so that a slow or unreachable host does not stall every other
    request for the duration of the connect/exec timeouts.

    Args:
        server: Target host; ``ip_addr``, ``ssh_user`` and ``os_pw_enc`` are read.
        cmd: Shell command line to execute remotely.

    Returns:
        The command's stdout decoded as UTF-8 (undecodable bytes replaced) and
        stripped, or an empty string when anything fails. Failures are logged,
        never raised, so callers treat "" as "no data".
    """
    import asyncio

    from core.crypto import decrypt_password

    def _run_blocking(host: str, username: str, password: str) -> str:
        """Open the SSH session, run the command and always close the client."""
        ssh = paramiko.SSHClient()
        # NOTE(security): AutoAddPolicy accepts unknown host keys without
        # verification, which permits man-in-the-middle attacks. Kept to
        # preserve existing behaviour; consider loading known host keys.
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            ssh.connect(host, username=username, password=password, timeout=10)
            _, stdout, _ = ssh.exec_command(cmd, timeout=20)
            return stdout.read().decode('utf-8', 'replace').strip()
        finally:
            # Previously skipped whenever connect/exec/read raised.
            ssh.close()

    try:
        pw = decrypt_password(server.os_pw_enc)
        # Read the model attributes here, on the event-loop side, and hand
        # plain values to the worker thread.
        host, username = server.ip_addr, server.ssh_user
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _run_blocking, host, username, pw)
    except Exception as e:
        # Best-effort by design: any failure degrades to an empty result.
        logger.error("SSH 실패 (%s): %s", server.ip_addr, e)
        return ""
async def _check_containers(server: Server, rule: ContainerAlertRule) -> list[dict]:
    """Inspect Docker containers on ``server`` and collect alerts for ``rule``.

    Two remote commands are issued through ``_ssh_run``:

    1. ``docker ps -a`` to find containers whose status contains ``Exited``
       or ``Dead`` (only reported when ``rule.alert_on_stopped`` is set).
    2. ``docker stats --no-stream`` to compare CPU / memory percentages with
       the rule thresholds (only when one of the high-usage flags is set).

    If ``rule.container_name`` is set, only that container is considered.

    Args:
        server: Host to inspect; ``ip_addr`` is copied into every alert.
        rule: Alert rule supplying the switches, thresholds and name filter.

    Returns:
        A list of alert dicts with the keys ``container``, ``type``,
        ``severity``, ``message`` and ``server``. Empty when ``docker ps``
        yields no output (including SSH failure).
    """
    alerts: list[dict] = []

    # Container list with status, one hand-built JSON object per line.
    output = await _ssh_run(
        server,
        'docker ps -a --format \'{"name":"{{.Names}}","status":"{{.Status}}","cpu":"0","mem":"0"}\' 2>/dev/null'
    )
    if not output:
        return alerts

    for line in output.strip().split('\n'):
        try:
            info = json.loads(line)
        except ValueError:
            # json.JSONDecodeError is a ValueError: skip lines that are not JSON.
            continue
        if not isinstance(info, dict):
            # Valid JSON that is not an object cannot describe a container.
            continue
        cname = info.get("name", "")
        if rule.container_name and rule.container_name != cname:
            continue
        status = info.get("status", "")
        # Detect stopped containers.
        if rule.alert_on_stopped and ("Exited" in status or "Dead" in status):
            alerts.append({
                "container": cname,
                "type": "CONTAINER_STOPPED",
                "severity": "HIGH",
                "message": f"컨테이너 {cname} 중지됨: {status}",
                "server": server.ip_addr,
            })

    # CPU / memory check via docker stats.
    if rule.alert_on_high_cpu or rule.alert_on_high_mem:
        stats_out = await _ssh_run(
            server,
            'docker stats --no-stream --format "{{.Name}} {{.CPUPerc}} {{.MemPerc}}" 2>/dev/null'
        )
        for line in (stats_out or "").strip().split('\n'):
            parts = line.split()
            if len(parts) < 3:
                continue
            cname = parts[0]
            if rule.container_name and rule.container_name != cname:
                continue
            try:
                cpu = float(parts[1].replace('%', ''))
                mem = float(parts[2].replace('%', ''))
            except ValueError:
                # Non-numeric column (e.g. a placeholder) — nothing to compare.
                continue
            if rule.alert_on_high_cpu and cpu >= rule.cpu_threshold:
                alerts.append({
                    "container": cname, "type": "HIGH_CPU", "severity": "MEDIUM",
                    "message": f"{cname} CPU {cpu:.1f}% (임계값 {rule.cpu_threshold}%)",
                    "server": server.ip_addr,
                })
            if rule.alert_on_high_mem and mem >= rule.mem_threshold:
                alerts.append({
                    "container": cname, "type": "HIGH_MEM", "severity": "MEDIUM",
                    "message": f"{cname} 메모리 {mem:.1f}% (임계값 {rule.mem_threshold}%)",
                    "server": server.ip_addr,
                })
    return alerts
@router.post("/rules")
async def create_alert_rule(
    req: AlertRuleCreate,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(require_admin_role),
):
    """Register a container alert rule for the caller's tenant (admin only).

    Responds with 404 when ``req.server_id`` does not match any server;
    otherwise stores the rule as active and returns ``{"ok": True, "id": ...}``.
    """
    # NOTE(review): the server lookup is by id only, without a tenant filter —
    # confirm whether Server rows are tenant-scoped.
    server_query = select(Server).where(Server.id == req.server_id)
    lookup = await db.execute(server_query)
    if not lookup.scalar_one_or_none():
        raise HTTPException(404, "서버를 찾을 수 없습니다")

    rule_fields = {
        "tenant_id": user.tenant_id,
        "name": req.name,
        "server_id": req.server_id,
        "container_name": req.container_name,
        "alert_on_stopped": req.alert_on_stopped,
        "alert_on_high_cpu": req.alert_on_high_cpu,
        "cpu_threshold": req.cpu_threshold,
        "alert_on_high_mem": req.alert_on_high_mem,
        "mem_threshold": req.mem_threshold,
        "auto_sr": req.auto_sr,
        "is_active": True,
        "created_at": datetime.utcnow(),
    }
    new_rule = ContainerAlertRule(**rule_fields)
    db.add(new_rule)
    await db.commit()
    # Reload so the database-generated primary key is available.
    await db.refresh(new_rule)
    return {"ok": True, "id": new_rule.id}
@router.get("/rules")
async def list_rules(
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Return a summary of every active alert rule owned by the caller's tenant."""

    def _summarize(rule):
        # Only these five fields are exposed; thresholds and flags are omitted.
        return {
            "id": rule.id,
            "name": rule.name,
            "server_id": rule.server_id,
            "container": rule.container_name,
            "auto_sr": rule.auto_sr,
        }

    active_rules_query = select(ContainerAlertRule).where(
        ContainerAlertRule.tenant_id == user.tenant_id,
        # SQL expression, not a Python truth test — must stay as ``== True``.
        ContainerAlertRule.is_active == True,
    )
    result = await db.execute(active_rules_query)
    return [_summarize(rule) for rule in result.scalars().all()]
@router.delete("/rules/{rule_id}")
async def delete_rule(
    rule_id: int,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(require_admin_role),
):
    """Soft-delete an alert rule of the caller's tenant (admin only).

    The row is kept and only flagged inactive. Responds with 404 when no rule
    with ``rule_id`` exists for the caller's tenant.
    """
    owned_rule_query = select(ContainerAlertRule).where(
        ContainerAlertRule.id == rule_id,
        ContainerAlertRule.tenant_id == user.tenant_id,
    )
    target = (await db.execute(owned_rule_query)).scalar_one_or_none()
    if not target:
        raise HTTPException(404)

    # Deactivate instead of deleting the row.
    target.is_active = False
    await db.commit()
    return {"ok": True}
@router.get("/check")
async def check_all_containers(
    background_tasks: BackgroundTasks,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Immediately check container state for every active rule of the tenant.

    For each rule the target server is inspected with ``_check_containers``.
    Every alert found is stored as a ``ContainerAlertLog`` row and, when the
    rule has ``auto_sr`` enabled, a ``SRRequest`` is opened for it. All rows
    are committed in one transaction after every rule has been processed.

    ``background_tasks`` is currently unused; it stays in the signature so the
    endpoint's interface is unchanged.

    Returns:
        ``{"alerts": [...], "total": n}`` with the alert dicts produced by
        ``_check_containers``.
    """
    rules_row = await db.execute(
        select(ContainerAlertRule).where(
            ContainerAlertRule.tenant_id == user.tenant_id,
            ContainerAlertRule.is_active == True,
        )
    )
    rules = rules_row.scalars().all()

    # Several rules commonly target the same server: look each server up once
    # per request instead of issuing one SELECT per rule.
    # NOTE(review): the lookup is by id only, without a tenant filter —
    # confirm whether Server rows are tenant-scoped.
    servers_by_id = {}
    all_alerts = []
    for rule in rules:
        if rule.server_id not in servers_by_id:
            srv_row = await db.execute(select(Server).where(Server.id == rule.server_id))
            servers_by_id[rule.server_id] = srv_row.scalar_one_or_none()
        server = servers_by_id[rule.server_id]
        if not server:
            # The rule points at a server that no longer exists; skip it.
            continue

        alerts = await _check_containers(server, rule)
        for alert in alerts:
            # One timestamp per alert so the log row and its SR agree.
            detected_at = datetime.utcnow()
            db.add(ContainerAlertLog(
                rule_id=rule.id, alert_type=alert["type"],
                container_name=alert["container"], severity=alert["severity"],
                message=alert["message"], detected_at=detected_at,
            ))
            # Automatically open a service request for the alert.
            # NOTE(review): no tenant/requester is set on the SR here —
            # confirm SRRequest does not require one.
            if rule.auto_sr:
                db.add(SRRequest(
                    title=f"[컨테이너 알림] {alert['type']}: {alert['container']}",
                    description=alert["message"],
                    category="MONITORING", priority=alert["severity"],
                    status=SRStatus.OPEN, created_at=detected_at,
                ))
        all_alerts.extend(alerts)

    await db.commit()
    return {"alerts": all_alerts, "total": len(all_alerts)}
@router.get("/list")
async def alert_list(
    limit: int = 50,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Return the most recent container alerts of the caller's tenant.

    Alert logs carry no tenant of their own, so they are joined to their rule
    (``ContainerAlertLog.rule_id`` -> ``ContainerAlertRule.id``) and filtered
    on the rule's ``tenant_id``. Without this filter the endpoint exposed the
    alerts of every tenant to any authenticated user.

    Args:
        limit: Maximum number of rows, newest first.
            NOTE(review): the value is not bounded here — consider validating it.

    Returns:
        A list of dicts with ``id``, ``type``, ``container``, ``severity``,
        ``message`` and ``detected_at``.
    """
    tenant_logs_query = (
        select(ContainerAlertLog)
        .join(ContainerAlertRule, ContainerAlertRule.id == ContainerAlertLog.rule_id)
        .where(ContainerAlertRule.tenant_id == user.tenant_id)
        .order_by(desc(ContainerAlertLog.detected_at))
        .limit(limit)
    )
    rows = await db.execute(tenant_logs_query)
    return [
        {"id": entry.id, "type": entry.alert_type, "container": entry.container_name,
         "severity": entry.severity, "message": entry.message,
         "detected_at": entry.detected_at}
        for entry in rows.scalars().all()
    ]