guardia-itsm/routers/sr_auto_review.py
2026-06-06 08:13:57 +09:00

430 lines
15 KiB
Python

"""
SR 접수 자동 리뷰 엔진 (SR Auto-Review)
SR 생성 즉시 백그라운드로 기동:
1. 관련 서버 조회 (CMDB)
2. tmux 세션 생성 + 서버 상태 스냅샷 (paramiko → tmux)
3. 하네스 선택 (SR 유형 기반)
4. Ollama AI 리뷰 생성
5. TB_SR_AUTO_REVIEW 저장 + SSE 브로드캐스트
"""
import asyncio
import json
import logging
import re
from datetime import datetime, timedelta
from typing import List, Optional
import httpx
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel, ConfigDict
from sqlalchemy import and_, or_, select
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user
from core.events import broadcast
from database import SessionLocal, get_db
from models import SRAutoReview, SRRequest, Server, User
# FastAPI router for the SR auto-review endpoints; every route below is
# mounted under the /api/sr-review prefix.
router = APIRouter(prefix="/api/sr-review", tags=["SR Auto Review"])
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Ollama generate endpoint and model name used by _generate_ai_review.
OLLAMA_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "llama3"
# SR type -> harness name mapping.
# run_sr_review looks the SR type up here and falls back to "general-ops"
# when the type is missing or not listed.
_HARNESS_MAP = {
    "DEPLOY": "deploy-validation",
    "RESTART": "incident-response",
    "LOG": "log-analysis",
    "INQUIRY": "faq-search",
    "OTHER": "general-ops",
}
# 안전한 읽기 전용 스냅샷 명령
_SNAPSHOT_CMDS = {
"uptime": "uptime",
"disk": "df -h",
"memory": "free -h",
"top_procs": "ps aux --sort=-%cpu 2>/dev/null | head -10",
"services": (
"systemctl list-units --type=service --state=running --no-pager 2>/dev/null "
"| head -15 || service --status-all 2>/dev/null | head -15 "
"|| echo 'service list unavailable'"
),
"recent_log": (
"tail -n 40 /var/log/messages 2>/dev/null "
"|| tail -n 40 /var/log/syslog 2>/dev/null "
"|| journalctl -n 40 --no-pager 2>/dev/null "
"|| echo 'log unavailable'"
),
}
# ── SSH 유틸 ──────────────────────────────────────────────────────────────────
def _decrypt_pw(enc: str) -> str:
try:
from core.crypto import decrypt_field
return decrypt_field(enc)
except Exception:
return ""
def _run_snapshot_commands(conn_kw: dict, session_name: str) -> dict:
    """Blocking worker: open an SSH connection and collect the snapshot.

    paramiko is synchronous, so this function is meant to run in a worker
    thread (see ``_capture_server_snapshot``).

    Args:
        conn_kw: Keyword arguments passed to ``SSHClient.connect``.
        session_name: Name of the tmux session to create on the remote host.

    Returns:
        A dict with one entry per ``_SNAPSHOT_CMDS`` key, plus
        ``"tmux_session"`` when tmux is available and ``"error"`` when the
        SSH session failed. Entries collected before a failure are kept.
    """
    import time

    snapshot: dict = {}
    client = None
    try:
        # Imported inside the try so a missing dependency degrades to a
        # snapshot error instead of killing the background review.
        import paramiko

        client = paramiko.SSHClient()
        # NOTE(review): AutoAddPolicy trusts unknown host keys without
        # verification; consider loading known_hosts for production use.
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(**conn_kw)

        # Detect whether tmux is installed on the remote host.
        _, stdout, _ = client.exec_command(
            "which tmux 2>/dev/null && echo HAS_TMUX || echo NO_TMUX", timeout=10
        )
        has_tmux = "HAS_TMUX" in stdout.read().decode()
        if has_tmux:
            client.exec_command(f"tmux new-session -d -s '{session_name}' 2>/dev/null; true")
            time.sleep(0.3)  # give tmux a moment to create the session
            snapshot["tmux_session"] = session_name

        for key, cmd in _SNAPSHOT_CMDS.items():
            try:
                if has_tmux:
                    # Type the command into the tmux session so it stays in
                    # the session history, then read the pane back.
                    safe_cmd = cmd.replace("'", "'\\''")
                    client.exec_command(
                        f"tmux send-keys -t '{session_name}' '{safe_cmd}' Enter"
                    )
                    time.sleep(0.8)  # let the command finish printing
                    _, out, _ = client.exec_command(
                        f"tmux capture-pane -p -t '{session_name}' | tail -20",
                        timeout=10,
                    )
                else:
                    _, out, _ = client.exec_command(cmd, timeout=10)
                output = out.read().decode(errors="replace").strip()
                snapshot[key] = output[:600]  # cap the stored size per command
            except Exception:
                # One failing command must not abort the remaining ones.
                snapshot[key] = "수집 실패"
    except Exception as e:
        snapshot["error"] = f"SSH 실패: {type(e).__name__}: {e}"
        log.debug("SR 리뷰 스냅샷 오류: %s", e)
    finally:
        # Always release the SSH connection, including on errors. The remote
        # tmux session is detached and is not closed by this.
        if client is not None:
            client.close()
    return snapshot


async def _capture_server_snapshot(server: Server) -> dict:
    """Connect over SSH, create a tmux session and collect a state snapshot.

    The tmux session is intentionally left running so an operator can later
    ``tmux attach`` to it. The blocking paramiko work runs in a worker thread
    so the event loop stays responsive while SSH is in progress.

    Args:
        server: Server record providing the address, port, user and credentials.

    Returns:
        The snapshot dict (see ``_run_snapshot_commands``), or
        ``{"error": ...}`` when credentials cannot be decrypted or the
        snapshot cannot be started. This function does not raise on SSH errors.
    """
    import time

    try:
        # Read everything needed from the ORM object here, on the event-loop
        # thread, so the worker thread only receives plain values.
        conn_kw: dict = {
            "hostname": server.ip_addr,
            "port": server.port or 22,
            "username": server.ssh_user,
            "timeout": 15,
        }
        if server.ssh_method == "KEY" and server.ssh_key_path:
            conn_kw["key_filename"] = server.ssh_key_path
        else:
            pw = _decrypt_pw(server.os_pw_enc or "")
            if not pw:
                return {"error": "자격증명 복호화 실패"}
            conn_kw["password"] = pw

        session_name = f"sr-{int(time.time())}"
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, _run_snapshot_commands, conn_kw, session_name
        )
    except Exception as e:
        log.debug("SR 리뷰 스냅샷 오류: %s", e)
        return {"error": f"SSH 실패: {type(e).__name__}: {e}"}
# ── 유사 SR 조회 ──────────────────────────────────────────────────────────────
async def _find_similar_srs(sr: SRRequest, db: AsyncSession) -> list:
    """Return up to five recently completed SRs that resemble ``sr``.

    An SR qualifies when it is not ``sr`` itself, has status ``COMPLETED``,
    was created within the last 30 days, and shares either the SR type or
    the ``inst_id`` with ``sr``. Results are ordered newest first.

    Each item is a dict with ``sr_id``, ``title`` and ``resolution``; the
    ``resolution`` value is the first 100 characters of the SR description.
    """
    window_start = datetime.now() - timedelta(days=30)
    same_type_or_inst = or_(
        SRRequest.sr_type == sr.sr_type,
        SRRequest.inst_id == sr.inst_id,
    )
    criteria = and_(
        SRRequest.sr_id != sr.sr_id,
        SRRequest.status == "COMPLETED",
        SRRequest.created_at >= window_start,
        same_type_or_inst,
    )
    stmt = (
        select(SRRequest)
        .where(criteria)
        .order_by(SRRequest.created_at.desc())
        .limit(5)
    )
    result = await db.execute(stmt)

    matches = []
    for row in result.scalars().all():
        matches.append(
            {
                "sr_id": row.sr_id,
                "title": row.title,
                "resolution": (row.description or "")[:100],
            }
        )
    return matches
# ── Ollama AI 리뷰 생성 ───────────────────────────────────────────────────────
def _normalize_ai_review(parsed: object) -> Optional[dict]:
    """Coerce model-produced JSON into the review shape, or return None.

    Model output is untrusted, so each field is type-checked and falls back
    to a neutral default when missing or malformed. The defaults match the
    ones ``run_sr_review`` applies when a key is absent.

    Returns None when ``parsed`` is not a non-empty JSON object.
    """
    if not isinstance(parsed, dict) or not parsed:
        return None

    review: dict = {
        "summary": "",
        "root_cause": "",
        "recommended_actions": [],
        "estimated_minutes": 60,
        "risk_level": "MEDIUM",
        "auto_resolvable": False,
    }

    for key in ("summary", "root_cause"):
        value = parsed.get(key)
        if isinstance(value, str):
            review[key] = value.strip()

    actions = parsed.get("recommended_actions")
    if isinstance(actions, str):
        actions = [actions]  # tolerate a single action given as plain text
    if isinstance(actions, list):
        review["recommended_actions"] = [
            str(item).strip() for item in actions if str(item).strip()
        ]

    minutes = parsed.get("estimated_minutes")
    if isinstance(minutes, (int, float, str)) and not isinstance(minutes, bool):
        try:
            minutes_int = int(minutes)
        except (ValueError, OverflowError):
            # Covers "30분", NaN and Infinity; keep the default instead.
            minutes_int = -1
        if minutes_int >= 0:
            review["estimated_minutes"] = minutes_int

    risk = parsed.get("risk_level")
    if isinstance(risk, str) and risk.strip():
        review["risk_level"] = risk.strip().upper()

    # Only an explicit JSON `true` enables auto-resolution.
    review["auto_resolvable"] = parsed.get("auto_resolvable") is True
    return review


async def _generate_ai_review(sr: SRRequest, snapshot: dict, similar: list) -> dict:
    """Ask Ollama to review the SR and return a structured result.

    Args:
        sr: The service request under review.
        snapshot: Server snapshot; ``tmux_session`` and ``error`` entries are
            left out of the prompt.
        similar: Similar SRs; only the first three are put in the prompt.

    Returns:
        A dict with ``summary``, ``root_cause``, ``recommended_actions``,
        ``estimated_minutes``, ``risk_level`` and ``auto_resolvable``.
        When the request fails, the server answers with an error status, or
        the reply holds no usable JSON object, a fixed "manual review needed"
        result is returned. This function does not raise on such errors.
    """
    similar_text = "\n".join(
        f"- [{s['sr_id']}] {s['title']}" for s in similar[:3]
    ) or "없음"
    snap_text = "\n".join(
        f"[{k}]\n{v}" for k, v in snapshot.items()
        if k not in ("tmux_session", "error")
    ) or "서버 정보 없음"
    prompt = f"""공공기관 IT 인프라 전문 엔지니어로서 SR을 분석하고 JSON만 반환하라.
SR:
- ID: {sr.sr_id}
- 유형: {sr.sr_type}
- 제목: {sr.title}
- 내용: {sr.description or '(없음)'}
- 우선순위: {sr.priority}
- 대상 서버: {sr.target_server or '미지정'}
서버 상태:
{snap_text}
유사 SR:
{similar_text}
JSON 형식으로만 응답 (다른 텍스트 없이):
{{
"summary": "문제 요약 (1-2문장)",
"root_cause": "추정 원인",
"recommended_actions": ["조치1", "조치2", "조치3"],
"estimated_minutes": 30,
"risk_level": "LOW",
"auto_resolvable": false
}}"""
    try:
        async with httpx.AsyncClient(timeout=90) as client:
            resp = await client.post(
                OLLAMA_URL,
                json={"model": OLLAMA_MODEL, "prompt": prompt, "stream": False},
            )
        # An error status (e.g. unknown model) must not be parsed as a review.
        resp.raise_for_status()
        text = resp.json().get("response", "")
        # The model may wrap the JSON in extra text; take the outermost braces.
        m = re.search(r"\{[\s\S]*\}", text)
        if m:
            review = _normalize_ai_review(json.loads(m.group()))
            if review is not None:
                return review
        log.debug("Ollama SR 리뷰 오류: %s", "usable JSON object not found in response")
    except Exception as e:
        log.debug("Ollama SR 리뷰 오류: %s", e)
    # Fallback result: tells the operator that manual review is required.
    return {
        "summary": "자동 리뷰 생성 실패 — 수동 검토 필요",
        "root_cause": "알 수 없음",
        "recommended_actions": ["담당자 직접 확인"],
        "estimated_minutes": 60,
        "risk_level": "MEDIUM",
        "auto_resolvable": False,
    }
# ── 핵심 리뷰 실행 (background task 진입점) ───────────────────────────────────
async def _mark_review_failed(sr_id: str, reason: str) -> None:
    """Best-effort: flag the review row as failed so it never stays 'reviewing'.

    Errors while recording the failure are logged and swallowed, because this
    runs inside a fire-and-forget task where nothing could handle them.
    """
    try:
        async with SessionLocal() as db:
            rev = (await db.execute(
                select(SRAutoReview).where(SRAutoReview.sr_id == sr_id)
            )).scalars().first()
            if rev:
                rev.status = "failed"
                rev.summary = reason
                rev.completed_at = datetime.now()
                await db.commit()
    except Exception:
        log.exception("SR 리뷰 실패 상태 기록 오류 %s", sr_id)


async def run_sr_review(sr_id: str) -> None:
    """Run the full auto-review pipeline for one SR (background-task entry point).

    Uses its own DB sessions, independent of any request transaction.

    Steps: create the review row (status ``reviewing``), snapshot the target
    server and look up similar SRs, generate the AI review, store the result
    (status ``completed``) and broadcast ``sr_review_completed``.

    Returns silently when the SR does not exist or a review row already
    exists. Any later failure marks the row ``failed`` instead of leaving it
    in ``reviewing``. This function does not raise.
    """
    async with SessionLocal() as db:
        try:
            sr = (await db.execute(
                select(SRRequest).where(SRRequest.sr_id == sr_id)
            )).scalars().first()
            if not sr:
                return
            # Duplicate guard: one review row per SR.
            if (await db.execute(
                select(SRAutoReview).where(SRAutoReview.sr_id == sr_id)
            )).scalars().first():
                return
            harness = _HARNESS_MAP.get(sr.sr_type or "OTHER", "general-ops")
            # Create the initial review record.
            review = SRAutoReview(
                sr_id=sr_id,
                harness_name=harness,
                status="reviewing",
                started_at=datetime.now(),
            )
            db.add(review)
            await db.commit()
        except Exception as e:
            log.exception("SR 리뷰 초기화 실패 %s: %s", sr_id, e)
            return

    # Steps 1-2: server snapshot, similar SRs, AI review.
    # Guarded as a whole so any failure marks the row failed; otherwise the
    # duplicate guard above would block every later automatic run.
    snapshot: dict = {}
    try:
        async with SessionLocal() as db:
            sr = (await db.execute(
                select(SRRequest).where(SRRequest.sr_id == sr_id)
            )).scalars().first()
            if sr:
                if sr.target_server:
                    srv = (await db.execute(
                        select(Server).where(
                            Server.server_name == sr.target_server,
                            Server.is_active == True,
                        ).limit(1)
                    )).scalars().first()
                    if srv:
                        snapshot = await _capture_server_snapshot(srv)
                similar = await _find_similar_srs(sr, db)
                ai = await _generate_ai_review(sr, snapshot, similar)
        if not sr:
            # The SR vanished after the review row was created.
            await _mark_review_failed(sr_id, "리뷰 실패: SR 없음")
            return
    except Exception as e:
        log.exception("SR 리뷰 분석 실패 %s: %s", sr_id, e)
        await _mark_review_failed(sr_id, f"리뷰 실패: {type(e).__name__}")
        return

    # Step 3: persist the result.
    try:
        async with SessionLocal() as db:
            rev = (await db.execute(
                select(SRAutoReview).where(SRAutoReview.sr_id == sr_id)
            )).scalars().first()
            if rev:
                rev.status = "completed"
                rev.summary = ai.get("summary", "")
                rev.root_cause = ai.get("root_cause", "")
                rev.recommended_actions = json.dumps(
                    ai.get("recommended_actions", []), ensure_ascii=False
                )
                rev.estimated_minutes = ai.get("estimated_minutes", 60)
                rev.risk_level = ai.get("risk_level", "MEDIUM")
                rev.similar_count = len(similar)
                rev.auto_resolvable = ai.get("auto_resolvable", False)
                rev.server_snapshot = json.dumps(snapshot, ensure_ascii=False)
                rev.tmux_session = snapshot.get("tmux_session")
                rev.completed_at = datetime.now()
                await db.commit()
        if not rev:
            # The row was removed meanwhile (e.g. by a manual re-run); do not
            # announce a completed review that was never stored.
            log.warning("SR 리뷰 레코드 없음 — 결과 저장 생략 %s", sr_id)
            return
    except Exception as e:
        log.exception("SR 리뷰 저장 실패 %s: %s", sr_id, e)
        await _mark_review_failed(sr_id, f"리뷰 실패: {type(e).__name__}")
        return

    # Step 4: SSE broadcast. The review is already stored, so a broadcast
    # error is only logged.
    try:
        await broadcast("sr_review_completed", {
            "sr_id": sr_id,
            "risk_level": ai.get("risk_level", "MEDIUM"),
            "summary": ai.get("summary", ""),
            "harness": harness,
            "tmux_session": snapshot.get("tmux_session"),
        })
    except Exception as e:
        log.exception("SR 리뷰 브로드캐스트 실패 %s: %s", sr_id, e)
# ── REST 엔드포인트 ────────────────────────────────────────────────────────────
class SRReviewOut(BaseModel):
    # Response schema for a stored auto-review row.
    # from_attributes=True lets the model be built from ORM objects, which is
    # what the endpoints using it as response_model return.
    model_config = ConfigDict(from_attributes=True)
    id: int
    sr_id: str
    harness_name: str
    # run_sr_review writes "reviewing", "completed" or "failed".
    status: str
    summary: Optional[str] = None
    root_cause: Optional[str] = None
    # JSON-encoded list stored as a string (run_sr_review uses json.dumps).
    recommended_actions: Optional[str] = None
    estimated_minutes: Optional[int] = None
    risk_level: Optional[str] = None
    # Number of similar SRs found while reviewing.
    similar_count: Optional[int] = None
    auto_resolvable: Optional[bool] = None
    # Name of the tmux session created on the target server, if any.
    tmux_session: Optional[str] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
@router.get("", response_model=List[SRReviewOut])
async def list_reviews(
    status: Optional[str] = None,
    risk_level: Optional[str] = None,
    skip: int = 0,
    limit: int = 50,
    db: AsyncSession = Depends(get_db),
    _u: User = Depends(get_current_user),
):
    """List stored reviews, newest first, optionally filtered and paginated.

    ``status`` and ``risk_level`` are exact-match filters applied only when
    given; ``skip`` and ``limit`` are passed through as OFFSET and LIMIT.
    """
    stmt = select(SRAutoReview).order_by(SRAutoReview.started_at.desc())

    # Collect the optional filters, then apply them in the original order.
    conditions = []
    if status:
        conditions.append(SRAutoReview.status == status)
    if risk_level:
        conditions.append(SRAutoReview.risk_level == risk_level)
    for condition in conditions:
        stmt = stmt.where(condition)

    stmt = stmt.offset(skip).limit(limit)
    result = await db.execute(stmt)
    return result.scalars().all()
@router.get("/{sr_id}", response_model=SRReviewOut)
async def get_review(
    sr_id: str,
    db: AsyncSession = Depends(get_db),
    _u: User = Depends(get_current_user),
):
    """Return the review stored for ``sr_id``; respond with 404 when none exists."""
    stmt = select(SRAutoReview).where(SRAutoReview.sr_id == sr_id)
    result = await db.execute(stmt)
    review = result.scalars().first()
    if review:
        return review
    raise HTTPException(404, detail="리뷰 결과 없음 (리뷰 진행 중이거나 미접수 SR)")
# Strong references to in-flight review tasks. The event loop only keeps weak
# references to tasks, so a task nobody references can be garbage-collected
# before it finishes. Each task removes itself from the set when done.
_REVIEW_TASKS: set = set()


@router.post("/{sr_id}/run", status_code=202)
async def trigger_review(
    sr_id: str,
    db: AsyncSession = Depends(get_db),
    _u: User = Depends(get_current_user),
):
    """Manual re-run: delete any existing result, then start a new review.

    Responds with 202 immediately; the review itself runs as a background
    task on the current event loop.
    """
    existing = (await db.execute(
        select(SRAutoReview).where(SRAutoReview.sr_id == sr_id)
    )).scalars().first()
    if existing:
        # run_sr_review skips SRs that already have a row, so remove it first.
        await db.delete(existing)
        await db.commit()
    task = asyncio.create_task(run_sr_review(sr_id))
    _REVIEW_TASKS.add(task)
    task.add_done_callback(_REVIEW_TASKS.discard)
    return {"message": f"SR {sr_id} 리뷰 재실행", "sr_id": sr_id}
@router.get("/{sr_id}/tmux")
async def get_tmux_info(
    sr_id: str,
    db: AsyncSession = Depends(get_db),
    _u: User = Depends(get_current_user),
):
    """Return the tmux session info and server snapshot recorded for a review.

    Responds with 404 when no review exists for ``sr_id``. ``attach_hint`` is
    a ready-to-run ``tmux attach`` command, or None when no session was
    recorded.
    """
    result = await db.execute(
        select(SRAutoReview).where(SRAutoReview.sr_id == sr_id)
    )
    review = result.scalars().first()
    if not review:
        raise HTTPException(404, detail="리뷰 없음")

    session_name = review.tmux_session
    raw_snapshot = review.server_snapshot

    # The snapshot is stored as a JSON string; decode it for the response.
    if raw_snapshot:
        snapshot = json.loads(raw_snapshot)
    else:
        snapshot = {}

    if session_name:
        attach_hint = f"tmux attach -t {session_name}"
    else:
        attach_hint = None

    return {
        "sr_id": sr_id,
        "tmux_session": session_name,
        "snapshot": snapshot,
        "attach_hint": attach_hint,
        "risk_level": review.risk_level,
        "status": review.status,
    }