""" 장애 관리 (Incident Management) API. 엔드포인트: GET /api/incidents — 장애 목록 (등급·상태·기관 필터) POST /api/incidents — 장애 등록 GET /api/incidents/{id} — 장애 상세 PATCH /api/incidents/{id} — 장애 수정 PATCH /api/incidents/{id}/status — 상태 전환 POST /api/incidents/{id}/link-sr — SR 연결 DELETE /api/incidents/{id}/link-sr/{sr_id} — SR 연결 해제 GET /api/incidents/{id}/srs — 연결된 SR 목록 POST /api/incidents/{id}/close — RCA 포함 종료 GET /api/incidents/stats — 등급별·상태별 통계 장애 번호 형식: INC-YYYYMMDD-NNNNNN (UUID 앞 6자리) """ from __future__ import annotations import logging from datetime import datetime from typing import List, Optional from uuid import uuid4 from fastapi import APIRouter, Depends, HTTPException, Query from sqlalchemy import select, or_, desc from sqlalchemy.ext.asyncio import AsyncSession from core.auth import get_current_user from database import get_db from models import ( Institution, SRRequest, Incident, IncidentCreate, IncidentGrade, IncidentOut, IncidentStatus, IncidentUpdate, IncidentSR, User, UserRole, ) logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/incidents", tags=["incidents"]) def _new_incident_id() -> str: return f"INC-{datetime.now().strftime('%Y%m%d')}-{str(uuid4())[:6].upper()}" # ── 장애 목록 ───────────────────────────────────────────────────────────────── @router.get("", response_model=List[IncidentOut]) async def list_incidents( inst_id: Optional[int] = Query(None), grade: Optional[str] = Query(None, description="P1/P2/P3/P4"), status: Optional[str] = Query(None), keyword: Optional[str] = Query(None), skip: int = 0, limit: int = 100, db: AsyncSession = Depends(get_db), current_user: User = Depends(get_current_user), ): q = select(Incident) if current_user.role == UserRole.CUSTOMER and current_user.inst_code: r_i = await db.execute( select(Institution).where(Institution.inst_code == current_user.inst_code) ) own = r_i.scalars().first() q = q.where(Incident.inst_id == own.id) if own else q.where(Incident.id == -1) elif inst_id: q = q.where(Incident.inst_id == inst_id) if grade: q = q.where(Incident.grade == grade) if status: q = q.where(Incident.status == status) if keyword: q = q.where(or_( Incident.title.contains(keyword), Incident.description.contains(keyword), )) q = q.order_by(desc(Incident.occurred_at)).offset(skip).limit(limit) result = await db.execute(q) return result.scalars().all() # ── 장애 등록 ───────────────────────────────────────────────────────────────── @router.post("", response_model=IncidentOut, status_code=201) async def create_incident( payload: IncidentCreate, db: AsyncSession = Depends(get_db), current_user: User = Depends(get_current_user), ): if current_user.role not in (UserRole.ADMIN, UserRole.PM, UserRole.ENGINEER): raise HTTPException(403, "권한이 없습니다.") inc = Incident( **payload.model_dump(), incident_id = _new_incident_id(), reported_by = payload.reported_by or current_user.username, occurred_at = payload.occurred_at or datetime.now(), detected_at = datetime.now(), ) db.add(inc) await db.commit() await db.refresh(inc) logger.info( "장애 등록: %s grade=%s reported_by=%s", inc.incident_id, inc.grade, inc.reported_by, ) # P1/P2는 즉시 알림 if inc.grade in (IncidentGrade.P1, IncidentGrade.P2): await _notify_incident(inc) return inc # ── 장애 상세 ───────────────────────────────────────────────────────────────── @router.get("/stats") async def incident_stats( inst_id: Optional[int] = Query(None), db: AsyncSession = Depends(get_db), _u: User = Depends(get_current_user), ): """장애 통계 — 등급별·상태별 집계.""" q = select(Incident) if inst_id: q = q.where(Incident.inst_id == inst_id) incs = (await db.execute(q)).scalars().all() by_grade: dict[str, int] = {} by_status: dict[str, int] = {} for inc in incs: by_grade[inc.grade] = by_grade.get(inc.grade, 0) + 1 by_status[inc.status] = by_status.get(inc.status, 0) + 1 # 평균 복구 시간 (MTTRS — resolved 기준) resolved = [ i for i in incs if i.resolved_at and i.occurred_at ] mttr_min = 0 if resolved: total_sec = sum( (i.resolved_at - i.occurred_at).total_seconds() for i in resolved ) mttr_min = int(total_sec / len(resolved) / 60) return { "total": len(incs), "by_grade": by_grade, "by_status": by_status, "mttr_min": mttr_min, "open_p1_p2": sum( 1 for i in incs if i.grade in (IncidentGrade.P1, IncidentGrade.P2) and i.status not in (IncidentStatus.RESOLVED, IncidentStatus.CLOSED) ), } @router.get("/{incident_id}", response_model=IncidentOut) async def get_incident( incident_id: str, db: AsyncSession = Depends(get_db), _u: User = Depends(get_current_user), ): r = await db.execute( select(Incident).where(Incident.incident_id == incident_id) ) inc = r.scalars().first() if not inc: raise HTTPException(404, "장애를 찾을 수 없습니다.") return inc # ── 장애 수정 ───────────────────────────────────────────────────────────────── @router.patch("/{incident_id}", response_model=IncidentOut) async def update_incident( incident_id: str, payload: IncidentUpdate, db: AsyncSession = Depends(get_db), current_user: User = Depends(get_current_user), ): if current_user.role not in (UserRole.ADMIN, UserRole.PM, UserRole.ENGINEER): raise HTTPException(403, "권한이 없습니다.") r = await db.execute( select(Incident).where(Incident.incident_id == incident_id) ) inc = r.scalars().first() if not inc: raise HTTPException(404, "장애를 찾을 수 없습니다.") for k, v in payload.model_dump(exclude_unset=True).items(): setattr(inc, k, v) inc.updated_at = datetime.now() await db.commit() await db.refresh(inc) return inc # ── 상태 전환 ───────────────────────────────────────────────────────────────── _VALID_TRANSITIONS: dict[str, list[str]] = { IncidentStatus.OPEN: [IncidentStatus.INVESTIGATING, IncidentStatus.CLOSED], IncidentStatus.INVESTIGATING: [IncidentStatus.MITIGATED, IncidentStatus.RESOLVED], IncidentStatus.MITIGATED: [IncidentStatus.INVESTIGATING, IncidentStatus.RESOLVED], IncidentStatus.RESOLVED: [IncidentStatus.CLOSED], IncidentStatus.CLOSED: [], } @router.patch("/{incident_id}/status", response_model=IncidentOut) async def change_status( incident_id: str, new_status: IncidentStatus = Query(...), note: Optional[str] = Query(None), db: AsyncSession = Depends(get_db), current_user: User = Depends(get_current_user), ): if current_user.role not in (UserRole.ADMIN, UserRole.PM, UserRole.ENGINEER): raise HTTPException(403, "권한이 없습니다.") r = await db.execute( select(Incident).where(Incident.incident_id == incident_id) ) inc = r.scalars().first() if not inc: raise HTTPException(404, "장애를 찾을 수 없습니다.") allowed = _VALID_TRANSITIONS.get(inc.status, []) if new_status not in allowed: raise HTTPException( 422, f"'{inc.status}' 상태에서 '{new_status}'로 전환할 수 없습니다. " f"허용: {allowed}", ) inc.status = new_status inc.updated_at = datetime.now() now = datetime.now() if new_status == IncidentStatus.MITIGATED: inc.mitigated_at = now elif new_status == IncidentStatus.RESOLVED: inc.resolved_at = now elif new_status == IncidentStatus.CLOSED: inc.closed_at = now await db.commit() await db.refresh(inc) logger.info( "장애 상태 전환: %s → %s by=%s", incident_id, new_status, current_user.username, ) return inc # ── SR 연결 ─────────────────────────────────────────────────────────────────── @router.post("/{incident_id}/link-sr", status_code=201) async def link_sr( incident_id: str, sr_id: str = Query(..., description="연결할 SR 번호"), db: AsyncSession = Depends(get_db), current_user: User = Depends(get_current_user), ): if current_user.role not in (UserRole.ADMIN, UserRole.PM, UserRole.ENGINEER): raise HTTPException(403, "권한이 없습니다.") # 장애 존재 확인 inc_r = await db.execute( select(Incident).where(Incident.incident_id == incident_id) ) if not inc_r.scalars().first(): raise HTTPException(404, "장애를 찾을 수 없습니다.") # SR 존재 확인 sr_r = await db.execute(select(SRRequest).where(SRRequest.sr_id == sr_id)) if not sr_r.scalars().first(): raise HTTPException(404, "SR을 찾을 수 없습니다.") # 중복 확인 dup = (await db.execute( select(IncidentSR).where( IncidentSR.incident_id == incident_id, IncidentSR.sr_id == sr_id, ) )).scalars().first() if dup: raise HTTPException(409, "이미 연결된 SR입니다.") link = IncidentSR(incident_id=incident_id, sr_id=sr_id) db.add(link) await db.commit() return {"incident_id": incident_id, "sr_id": sr_id, "linked": True} @router.delete("/{incident_id}/link-sr/{sr_id}", status_code=204) async def unlink_sr( incident_id: str, sr_id: str, db: AsyncSession = Depends(get_db), current_user: User = Depends(get_current_user), ): if current_user.role not in (UserRole.ADMIN, UserRole.PM): raise HTTPException(403, "ADMIN 또는 PM 권한이 필요합니다.") r = await db.execute( select(IncidentSR).where( IncidentSR.incident_id == incident_id, IncidentSR.sr_id == sr_id, ) ) link = r.scalars().first() if not link: raise HTTPException(404, "연결을 찾을 수 없습니다.") await db.delete(link) await db.commit() @router.get("/{incident_id}/srs") async def list_linked_srs( incident_id: str, db: AsyncSession = Depends(get_db), _u: User = Depends(get_current_user), ): """장애에 연결된 SR 목록.""" links = (await db.execute( select(IncidentSR).where(IncidentSR.incident_id == incident_id) )).scalars().all() sr_ids = [lk.sr_id for lk in links] if not sr_ids: return [] srs = (await db.execute( select(SRRequest).where(SRRequest.sr_id.in_(sr_ids)) )).scalars().all() return [ { "sr_id": sr.sr_id, "title": sr.title, "status": sr.status, "priority": sr.priority, "sr_type": sr.sr_type, "created_at": sr.created_at.isoformat() if sr.created_at else None, } for sr in srs ] # ── 종료 (RCA 포함) ─────────────────────────────────────────────────────────── @router.post("/{incident_id}/close", response_model=IncidentOut) async def close_incident( incident_id: str, rca: str = Query(..., description="근본 원인 분석 (Root Cause Analysis)"), prevention: Optional[str] = Query(None, description="재발 방지 조치"), kb_doc_id: Optional[str] = Query(None, description="연관 KB 문서 ID"), db: AsyncSession = Depends(get_db), current_user: User = Depends(get_current_user), ): if current_user.role not in (UserRole.ADMIN, UserRole.PM, UserRole.ENGINEER): raise HTTPException(403, "권한이 없습니다.") r = await db.execute( select(Incident).where(Incident.incident_id == incident_id) ) inc = r.scalars().first() if not inc: raise HTTPException(404, "장애를 찾을 수 없습니다.") if inc.status == IncidentStatus.CLOSED: raise HTTPException(409, "이미 종료된 장애입니다.") inc.status = IncidentStatus.CLOSED inc.rca = rca inc.prevention = prevention inc.kb_doc_id = kb_doc_id inc.closed_at = datetime.now() if not inc.resolved_at: inc.resolved_at = datetime.now() inc.updated_at = datetime.now() await db.commit() await db.refresh(inc) logger.info( "장애 종료: %s grade=%s closed_by=%s", incident_id, inc.grade, current_user.username, ) return inc # ── G-5: 자동 RCA 분석 ──────────────────────────────────────────────────────── @router.post("/{incident_id}/auto-rca") async def auto_rca( incident_id: str, db: AsyncSession = Depends(get_db), current_user: User = Depends(get_current_user), ): """Ollama LLM으로 RCA 초안 자동 생성 후 장애 레코드에 저장.""" inc = (await db.execute( select(Incident).where(Incident.incident_id == incident_id) )).scalars().first() if not inc: raise HTTPException(404, f"장애 {incident_id}를 찾을 수 없습니다.") try: from core.auto_rca import analyze_rca result = await analyze_rca(inc.id, db) except Exception as e: raise HTTPException(500, f"RCA 분석 오류: {str(e)[:200]}") # 생성된 RCA를 장애 레코드에 저장 import json as _json rca_data = result["rca"] inc.rca = rca_data.get("root_cause", "") inc.prevention = _json.dumps(rca_data.get("prevention", []), ensure_ascii=False) inc.updated_at = datetime.now() await db.commit() return result # ── 내부 알림 ───────────────────────────────────────────────────────────────── async def _notify_incident(inc: Incident) -> None: """P1/P2 장애 긴급 알림.""" try: from core.notify import send_messenger import os room = os.getenv("MESSENGER_OPS_ROOM", "ops") grade_emoji = {"P1": "🚨", "P2": "🔴", "P3": "🟠", "P4": "🟡"}.get(inc.grade, "⚠️") msg = ( f"{grade_emoji} [{inc.grade}] 장애 발생\n" f"장애번호: {inc.incident_id}\n" f"제목: {inc.title}\n" f"발생일시: {inc.occurred_at.strftime('%Y-%m-%d %H:%M')}\n" f"담당: {inc.assigned_to or '미지정'}\n" f"즉시 확인 바랍니다." ) await send_messenger(room, {"type": "text", "text": msg}) except Exception as exc: logger.warning("장애 알림 발송 실패: %s", exc)