라우터 (667개 엔드포인트, P3 신규 69개):
- multimodal.py: llava 이미지 분석 + 에러 자동 분류
- learning_loop.py: Ollama 파인튜닝 + 품질 지표
- ai_insights.py: 주간 인사이트 + 반복 패턴 + 개선 권고
- container_alerts.py: Docker 이상 감지 → SR 자동 생성
- ncloud.py: NCloud API (서버/LB/스토리지/비용)
- billing.py: 구독 플랜 + 사용량 측정 + 청구서
- servicenow.py: ServiceNow CMDB/Incident 양방향 연동
- erp_connector.py: 그룹웨어/HR ERP 연동 + 결재 웹훅
- kakao_notify.py: 카카오 알림톡 + 대량 발송
- auto_report.py: Excel/PDF 보고서 자동 생성·다운로드
- benchmark.py: 기관 간 익명 벤치마킹 (완전 익명화)
- cohort_analysis.py: 도입 코호트 + 리텐션 + 기능 도입률
DB 모델 (14개 신규 테이블):
tb_learning_run, tb_container_alert_{rule,log},
tb_ncloud_config, tb_subscription, tb_invoice,
tb_servicenow_{config,mapping}, tb_erp_config,
tb_kakao_{config,notify_log}, tb_report_{record,schedule},
tb_benchmark_contrib
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
258 lines
8.7 KiB
Python
258 lines
8.7 KiB
Python
"""
|
|
컨테이너 이상 감지 + SR 자동 생성
|
|
|
|
Docker/K8s 컨테이너 헬스 상태를 주기적으로 체크하여
|
|
이상 감지 시 SR을 자동으로 생성한다.
|
|
|
|
엔드포인트:
|
|
GET /api/container-alerts/check — 컨테이너 상태 즉시 체크
|
|
GET /api/container-alerts/list — 최근 알림 목록
|
|
POST /api/container-alerts/rules — 알림 규칙 등록
|
|
GET /api/container-alerts/rules — 알림 규칙 목록
|
|
DELETE /api/container-alerts/rules/{id} — 규칙 삭제
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
from datetime import datetime
|
|
from typing import List, Optional
|
|
|
|
import paramiko
|
|
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
|
|
from pydantic import BaseModel, Field
|
|
from sqlalchemy import select, desc
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from core.auth import get_current_user, require_admin_role
|
|
from database import get_db
|
|
from models import User, Server, SRRequest, SRStatus, ContainerAlertRule, ContainerAlertLog # 신규
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
# Router that groups every endpoint in this file under the /api/container-alerts prefix.
router = APIRouter(prefix="/api/container-alerts", tags=["Container Alerts"])
|
|
|
|
|
|
class AlertRuleCreate(BaseModel):
    """Request body used to register a container alert rule.

    Both thresholds are percentages and are validated to lie in 10-100.
    """

    # Display name of the rule (at most 200 characters).
    name: str = Field(..., max_length=200)
    # Primary key of the server whose containers the rule watches.
    server_id: int
    # Limit the rule to a single container; None means all containers.
    container_name: Optional[str] = None
    # Raise an alert when a container is reported as stopped.
    alert_on_stopped: bool = True
    # Raise an alert when CPU usage reaches cpu_threshold.
    alert_on_high_cpu: bool = True
    cpu_threshold: float = Field(default=90.0, ge=10, le=100)
    # Raise an alert when memory usage reaches mem_threshold.
    alert_on_high_mem: bool = True
    mem_threshold: float = Field(default=90.0, ge=10, le=100)
    # Create an SR automatically for each alert produced by this rule.
    auto_sr: bool = True
|
|
|
|
|
|
async def _ssh_run(server: Server, cmd: str) -> str:
    """Run a shell command on ``server`` over SSH (agentless) and return stdout.

    The blocking paramiko calls are executed in the event loop's default
    executor so that a slow or unreachable host does not stall other
    coroutines, and the SSH connection is always closed, even when the
    connect/exec/read step fails.

    Args:
        server: Host record; ``ip_addr``, ``ssh_user`` and ``os_pw_enc`` are read.
        cmd: Shell command line to execute on the remote host.

    Returns:
        The command's stdout decoded as UTF-8 (undecodable bytes replaced)
        and stripped, or ``""`` if anything fails. Failures are logged and
        never raised (best-effort semantics, as callers treat an empty
        string as "no data").
    """
    import asyncio

    from core.crypto import decrypt_password

    def _exec_blocking(host, username, password) -> str:
        """Connect, run ``cmd`` and read stdout; runs in a worker thread."""
        client = paramiko.SSHClient()
        # NOTE(security): AutoAddPolicy accepts unknown host keys without
        # verification, which permits man-in-the-middle attacks. Kept as in
        # the original; consider loading known hosts and rejecting unknown keys.
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            client.connect(host, username=username, password=password, timeout=10)
            _, stdout, _ = client.exec_command(cmd, timeout=20)
            return stdout.read().decode('utf-8', 'replace').strip()
        finally:
            # Release the connection on success and on failure alike.
            client.close()

    try:
        password = decrypt_password(server.os_pw_enc)
        # Read the attributes here, in the coroutine, so the worker thread
        # only ever touches plain values and never the `server` object.
        host, username = server.ip_addr, server.ssh_user
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _exec_blocking, host, username, password)
    except Exception as e:
        logger.error(f"SSH 실패 ({server.ip_addr}): {e}")
        return ""
|
|
|
|
|
|
async def _check_containers(server: Server, rule: ContainerAlertRule) -> list[dict]:
    """Inspect the Docker containers on ``server`` and collect alerts for ``rule``.

    Two remote commands are issued through ``_ssh_run``: ``docker ps -a`` to
    find stopped containers and, when CPU or memory alerting is enabled on
    the rule, ``docker stats --no-stream`` to compare usage with the rule's
    thresholds.

    Returns:
        A list of alert dicts with the keys ``container``, ``type``,
        ``severity``, ``message`` and ``server``. The list is empty when the
        ``docker ps`` call yields no output.
    """
    found: list[dict] = []

    def in_scope(name: str) -> bool:
        # A rule without a container name applies to every container.
        return not rule.container_name or rule.container_name == name

    def record(name: str, kind: str, severity: str, message: str) -> None:
        found.append({
            "container": name,
            "type": kind,
            "severity": severity,
            "message": message,
            "server": server.ip_addr,
        })

    # Container list and status, one JSON object per output line.
    ps_output = await _ssh_run(
        server,
        'docker ps -a --format \'{"name":"{{.Names}}","status":"{{.Status}}","cpu":"0","mem":"0"}\' 2>/dev/null',
    )
    if not ps_output:
        return found

    for raw in ps_output.strip().split('\n'):
        try:
            entry = json.loads(raw)
        except Exception:
            # Skip lines that are not valid JSON.
            continue

        name = entry.get("name", "")
        if not in_scope(name):
            continue

        state = entry.get("status", "")

        # Detect stopped containers.
        is_down = any(marker in state for marker in ("Exited", "Dead"))
        if rule.alert_on_stopped and is_down:
            record(name, "CONTAINER_STOPPED", "HIGH", f"컨테이너 {name} 중지됨: {state}")

    # CPU/memory check via docker stats; skipped when both checks are disabled.
    if not (rule.alert_on_high_cpu or rule.alert_on_high_mem):
        return found

    # Plain literal: the value equals the original brace-escaped f-string.
    stats_output = await _ssh_run(
        server,
        'docker stats --no-stream --format "{{.Name}} {{.CPUPerc}} {{.MemPerc}}" 2>/dev/null',
    )
    for raw in (stats_output or "").strip().split('\n'):
        columns = raw.split()
        if len(columns) < 3:
            continue
        name = columns[0]
        if not in_scope(name):
            continue
        try:
            cpu_pct = float(columns[1].replace('%', ''))
            mem_pct = float(columns[2].replace('%', ''))
        except ValueError:
            # Non-numeric usage values: ignore this line.
            continue

        if rule.alert_on_high_cpu and cpu_pct >= rule.cpu_threshold:
            record(
                name, "HIGH_CPU", "MEDIUM",
                f"{name} CPU {cpu_pct:.1f}% (임계값 {rule.cpu_threshold}%)",
            )
        if rule.alert_on_high_mem and mem_pct >= rule.mem_threshold:
            record(
                name, "HIGH_MEM", "MEDIUM",
                f"{name} 메모리 {mem_pct:.1f}% (임계값 {rule.mem_threshold}%)",
            )

    return found
|
|
|
|
|
|
@router.post("/rules")
async def create_alert_rule(
    req: AlertRuleCreate,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(require_admin_role),
):
    """Register a new alert rule under the caller's tenant.

    Access is gated by the ``require_admin_role`` dependency.

    Returns:
        ``{"ok": True, "id": <new rule id>}``.

    Raises:
        HTTPException: 404 when no server with ``req.server_id`` exists.
    """
    # NOTE(review): the server lookup is by id only; confirm whether Server
    # rows should also be restricted to the caller's tenant.
    server_lookup = await db.execute(
        select(Server).where(Server.id == req.server_id)
    )
    if not server_lookup.scalar_one_or_none():
        raise HTTPException(404, "서버를 찾을 수 없습니다")

    rule_fields = dict(
        tenant_id=user.tenant_id,
        name=req.name,
        server_id=req.server_id,
        container_name=req.container_name,
        alert_on_stopped=req.alert_on_stopped,
        alert_on_high_cpu=req.alert_on_high_cpu,
        cpu_threshold=req.cpu_threshold,
        alert_on_high_mem=req.alert_on_high_mem,
        mem_threshold=req.mem_threshold,
        auto_sr=req.auto_sr,
        is_active=True,
        created_at=datetime.utcnow(),
    )
    new_rule = ContainerAlertRule(**rule_fields)
    db.add(new_rule)
    await db.commit()
    # Reload the row after commit so the generated id can be read.
    await db.refresh(new_rule)
    return {"ok": True, "id": new_rule.id}
|
|
|
|
|
|
@router.get("/rules")
async def list_rules(
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Return the active alert rules that belong to the caller's tenant.

    Each entry carries ``id``, ``name``, ``server_id``, ``container`` and
    ``auto_sr``.
    """
    active_for_tenant = select(ContainerAlertRule).where(
        ContainerAlertRule.tenant_id == user.tenant_id,
        # SQL expression, not a Python comparison; `== True` is intentional.
        ContainerAlertRule.is_active == True,
    )
    result = await db.execute(active_for_tenant)

    payload = []
    for rule in result.scalars().all():
        payload.append({
            "id": rule.id,
            "name": rule.name,
            "server_id": rule.server_id,
            "container": rule.container_name,
            "auto_sr": rule.auto_sr,
        })
    return payload
|
|
|
|
|
|
@router.delete("/rules/{rule_id}")
async def delete_rule(
    rule_id: int,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(require_admin_role),
):
    """Deactivate one of the caller's tenant's alert rules.

    The row is kept and only its ``is_active`` flag is cleared.

    Raises:
        HTTPException: 404 when the rule does not exist for this tenant.
    """
    lookup = select(ContainerAlertRule).where(
        ContainerAlertRule.id == rule_id,
        ContainerAlertRule.tenant_id == user.tenant_id,
    )
    target = (await db.execute(lookup)).scalar_one_or_none()
    if not target:
        raise HTTPException(404)

    target.is_active = False
    await db.commit()
    return {"ok": True}
|
|
|
|
|
|
@router.get("/check")
async def check_all_containers(
    background_tasks: BackgroundTasks,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Run an immediate container check for every active rule of the tenant.

    Each alert found is stored as a ``ContainerAlertLog``; when the rule has
    ``auto_sr`` set, an ``SRRequest`` is added as well. Everything is
    committed once at the end.

    Returns:
        ``{"alerts": [...], "total": <count>}`` covering all rules checked.
    """
    # NOTE(review): `background_tasks` is not used in this body; the checks
    # run inline within the request.
    active_rules_stmt = select(ContainerAlertRule).where(
        ContainerAlertRule.tenant_id == user.tenant_id,
        # SQL expression, not a Python comparison; `== True` is intentional.
        ContainerAlertRule.is_active == True,
    )
    active_rules = (await db.execute(active_rules_stmt)).scalars().all()

    collected = []
    for rule in active_rules:
        server_result = await db.execute(
            select(Server).where(Server.id == rule.server_id)
        )
        server = server_result.scalar_one_or_none()
        if not server:
            # The rule points at a server that no longer exists.
            continue

        rule_alerts = await _check_containers(server, rule)
        for alert in rule_alerts:
            db.add(ContainerAlertLog(
                rule_id=rule.id,
                alert_type=alert["type"],
                container_name=alert["container"],
                severity=alert["severity"],
                message=alert["message"],
                detected_at=datetime.utcnow(),
            ))
            if not rule.auto_sr:
                continue
            # Automatic SR creation.
            db.add(SRRequest(
                title=f"[컨테이너 알림] {alert['type']}: {alert['container']}",
                description=alert["message"],
                category="MONITORING",
                priority=alert["severity"],
                status=SRStatus.OPEN,
                created_at=datetime.utcnow(),
            ))
        collected.extend(rule_alerts)

    await db.commit()
    return {"alerts": collected, "total": len(collected)}
|
|
|
|
|
|
@router.get("/list")
async def alert_list(
    limit: int = 50,
    db: AsyncSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Return the most recent alert logs of the caller's tenant, newest first.

    Logs are restricted to rules owned by ``user.tenant_id`` (the log table
    is joined to its rule through ``rule_id``), matching the tenant scoping
    used by the other endpoints in this router. Logs of deactivated rules
    are still included.

    Args:
        limit: Maximum number of rows to return; clamped to 1-1000.

    Returns:
        A list of dicts with ``id``, ``type``, ``container``, ``severity``,
        ``message`` and ``detected_at``.
    """
    # Keep the client-supplied limit within sane bounds: a negative value
    # would otherwise go straight to the database, and an unbounded one
    # could load the whole table.
    limit = max(1, min(limit, 1000))

    recent_logs = (
        select(ContainerAlertLog)
        .join(ContainerAlertRule, ContainerAlertLog.rule_id == ContainerAlertRule.id)
        # Tenant isolation: without this filter every tenant's alerts leak.
        .where(ContainerAlertRule.tenant_id == user.tenant_id)
        .order_by(desc(ContainerAlertLog.detected_at))
        .limit(limit)
    )
    rows = await db.execute(recent_logs)
    return [
        {
            "id": entry.id,
            "type": entry.alert_type,
            "container": entry.container_name,
            "severity": entry.severity,
            "message": entry.message,
            "detected_at": entry.detected_at,
        }
        for entry in rows.scalars().all()
    ]
|