430 lines
15 KiB
Python
430 lines
15 KiB
Python
"""
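SR intake auto-review engine (SR Auto-Review).

Started in the background as soon as an SR is created:
  1. Look up the related server (CMDB)
  2. Create a tmux session + capture a server status snapshot (paramiko → tmux)
  3. Select a harness (based on the SR type)
  4. Generate the AI review with Ollama
  5. Save to TB_SR_AUTO_REVIEW + SSE broadcast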
"""
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
from typing import List, Optional
|
|
|
|
import httpx
|
|
from fastapi import APIRouter, Depends, HTTPException
|
|
from pydantic import BaseModel, ConfigDict
|
|
from sqlalchemy import and_, or_, select
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from core.auth import get_current_user
|
|
from core.events import broadcast
|
|
from database import SessionLocal, get_db
|
|
from models import SRAutoReview, SRRequest, Server, User
|
|
|
|
# Router for this module; every endpoint below is mounted under /api/sr-review.
router = APIRouter(
    prefix="/api/sr-review",
    tags=["SR Auto Review"],
)

# Module-level logger shared by all helpers in this file.
log = logging.getLogger(__name__)

# Ollama text-generation endpoint and the model name sent with each request.
OLLAMA_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "llama3"
|
|
|
|
# SR 유형 → 하네스 매핑
|
|
_HARNESS_MAP = {
|
|
"DEPLOY": "deploy-validation",
|
|
"RESTART": "incident-response",
|
|
"LOG": "log-analysis",
|
|
"INQUIRY": "faq-search",
|
|
"OTHER": "general-ops",
|
|
}
|
|
|
|
# 안전한 읽기 전용 스냅샷 명령
|
|
_SNAPSHOT_CMDS = {
|
|
"uptime": "uptime",
|
|
"disk": "df -h",
|
|
"memory": "free -h",
|
|
"top_procs": "ps aux --sort=-%cpu 2>/dev/null | head -10",
|
|
"services": (
|
|
"systemctl list-units --type=service --state=running --no-pager 2>/dev/null "
|
|
"| head -15 || service --status-all 2>/dev/null | head -15 "
|
|
"|| echo 'service list unavailable'"
|
|
),
|
|
"recent_log": (
|
|
"tail -n 40 /var/log/messages 2>/dev/null "
|
|
"|| tail -n 40 /var/log/syslog 2>/dev/null "
|
|
"|| journalctl -n 40 --no-pager 2>/dev/null "
|
|
"|| echo 'log unavailable'"
|
|
),
|
|
}
|
|
|
|
|
|
# ── SSH 유틸 ──────────────────────────────────────────────────────────────────
|
|
|
|
def _decrypt_pw(enc: str) -> str:
    """Decrypt an encrypted credential field.

    Args:
        enc: The encrypted value as stored on the server record.

    Returns:
        The decrypted value, or ``""`` when decryption (or importing the
        crypto helper) fails. Callers treat an empty string as "no usable
        credential".
    """
    try:
        # Imported inside the try so that an import failure is reported the
        # same way as a decryption failure (empty result).
        from core.crypto import decrypt_field

        return decrypt_field(enc) or ""
    except Exception as exc:
        # Still best-effort, but no longer silent. Only the exception type is
        # logged so that no credential material can end up in the log.
        log.warning("자격증명 복호화 실패: %s", type(exc).__name__)
        return ""
|
|
|
|
|
|
def _collect_snapshot_blocking(connect_kw: dict, session_name: str) -> dict:
    """Do the blocking SSH work for ``_capture_server_snapshot``.

    Meant to run in a worker thread: every paramiko call here blocks.

    Args:
        connect_kw: Keyword arguments passed to ``SSHClient.connect``.
        session_name: Name of the tmux session to create on the remote host.

    Returns:
        Snapshot dict; see ``_capture_server_snapshot``.
    """
    import time

    snapshot: dict = {}
    client = None
    try:
        # Imported inside the try so that a missing paramiko installation is
        # reported as a snapshot error instead of killing the caller.
        import paramiko

        client = paramiko.SSHClient()
        # NOTE(review): AutoAddPolicy accepts unknown host keys without
        # verification. Kept as-is, but consider pinning known hosts.
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(**connect_kw)

        # Check whether tmux is installed on the remote host.
        _, stdout, _ = client.exec_command(
            "which tmux 2>/dev/null && echo HAS_TMUX || echo NO_TMUX", timeout=10
        )
        has_tmux = "HAS_TMUX" in stdout.read().decode(errors="replace")

        if has_tmux:
            client.exec_command(f"tmux new-session -d -s '{session_name}' 2>/dev/null; true")
            time.sleep(0.3)  # give tmux a moment to create the session
            snapshot["tmux_session"] = session_name

        for key, cmd in _SNAPSHOT_CMDS.items():
            try:
                if has_tmux:
                    # Type the command into the tmux session so that it stays
                    # in the session history for whoever attaches later.
                    safe_cmd = cmd.replace("'", "'\\''")
                    client.exec_command(f"tmux send-keys -t '{session_name}' '{safe_cmd}' Enter")
                    # Fixed wait for the command to finish. A slower command
                    # yields a partial capture.
                    time.sleep(0.8)
                    _, out, _ = client.exec_command(
                        f"tmux capture-pane -p -t '{session_name}' | tail -20",
                        timeout=10,
                    )
                else:
                    _, out, _ = client.exec_command(cmd, timeout=10)
                output = out.read().decode(errors="replace").strip()
                snapshot[key] = output[:600]
            except Exception:
                # One failing command must not abort the remaining ones.
                snapshot[key] = "수집 실패"

    except Exception as e:
        snapshot["error"] = f"SSH 실패: {type(e).__name__}: {e}"
        log.debug("SR 리뷰 스냅샷 오류: %s", e)
    finally:
        # Close the SSH connection on every path, including failures after a
        # successful connect. The remote tmux session is not affected.
        if client is not None:
            client.close()

    return snapshot


async def _capture_server_snapshot(server: Server) -> dict:
    """SSH into *server*, create a tmux session and collect a status snapshot.

    The tmux session is intentionally left running so that the operator can
    attach to it afterwards with ``tmux attach``.

    The blocking paramiko work runs in the default executor so that the event
    loop stays responsive while connecting and reading command output.

    Args:
        server: Server record providing address, port, user and credentials.

    Returns:
        A dict with one entry per ``_SNAPSHOT_CMDS`` key (output truncated to
        600 characters), ``"tmux_session"`` when tmux is available on the
        host, and ``"error"`` when credentials or the SSH connection failed.
    """
    import time
    import uuid

    # Read everything needed from the ORM object here, on the event-loop
    # side, so the worker thread never touches it.
    connect_kw: dict = {
        "hostname": server.ip_addr,
        "port": server.port or 22,
        "username": server.ssh_user,
        "timeout": 15,
    }
    if server.ssh_method == "KEY" and server.ssh_key_path:
        connect_kw["key_filename"] = server.ssh_key_path
    else:
        pw = _decrypt_pw(server.os_pw_enc or "")
        if not pw:
            return {"error": "자격증명 복호화 실패"}
        connect_kw["password"] = pw

    # The random suffix keeps two reviews that start within the same second
    # from sharing (and interleaving output in) one tmux session.
    session_name = f"sr-{int(time.time())}-{uuid.uuid4().hex[:6]}"

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None, _collect_snapshot_blocking, connect_kw, session_name
    )
|
|
|
|
|
|
# ── 유사 SR 조회 ──────────────────────────────────────────────────────────────
|
|
|
|
async def _find_similar_srs(sr: SRRequest, db: AsyncSession) -> list:
    """Return up to five recently completed SRs that resemble *sr*.

    An SR qualifies when it is not *sr* itself, has status ``"COMPLETED"``,
    was created within the last 30 days, and shares either the SR type or
    the ``inst_id`` with *sr*. Newest SRs come first.

    Returns:
        A list of dicts with keys ``sr_id``, ``title`` and ``resolution``.
    """
    window_start = datetime.now() - timedelta(days=30)

    same_type_or_inst = or_(
        SRRequest.sr_type == sr.sr_type,
        SRRequest.inst_id == sr.inst_id,
    )
    criteria = and_(
        SRRequest.sr_id != sr.sr_id,
        SRRequest.status == "COMPLETED",
        SRRequest.created_at >= window_start,
        same_type_or_inst,
    )
    stmt = (
        select(SRRequest)
        .where(criteria)
        .order_by(SRRequest.created_at.desc())
        .limit(5)
    )

    result = await db.execute(stmt)

    matches = []
    for row in result.scalars().all():
        matches.append(
            {
                "sr_id": row.sr_id,
                "title": row.title,
                # NOTE(review): "resolution" is filled from the first 100
                # characters of the description, not a resolution field.
                "resolution": (row.description or "")[:100],
            }
        )
    return matches
|
|
|
|
|
|
# ── Ollama AI 리뷰 생성 ───────────────────────────────────────────────────────
|
|
|
|
def _fallback_review() -> dict:
    """Return the placeholder review used when no AI review is available."""
    return {
        "summary": "자동 리뷰 생성 실패 — 수동 검토 필요",
        "root_cause": "알 수 없음",
        "recommended_actions": ["담당자 직접 확인"],
        "estimated_minutes": 60,
        "risk_level": "MEDIUM",
        "auto_resolvable": False,
    }


def _normalize_review(raw: dict) -> dict:
    """Coerce the model's JSON into the types the rest of the module expects.

    Missing or malformed fields get the same defaults the callers already use
    with ``.get(...)``. Keys that are not part of the schema are kept.
    """
    review = dict(raw)
    review["summary"] = str(raw.get("summary") or "")
    review["root_cause"] = str(raw.get("root_cause") or "")

    actions = raw.get("recommended_actions")
    if isinstance(actions, str):
        actions = [actions]
    elif not isinstance(actions, (list, tuple)):
        actions = []
    review["recommended_actions"] = [str(a) for a in actions]

    try:
        review["estimated_minutes"] = int(raw.get("estimated_minutes"))
    except (TypeError, ValueError, OverflowError):
        review["estimated_minutes"] = 60

    risk = str(raw.get("risk_level") or "").strip().upper()
    review["risk_level"] = risk or "MEDIUM"

    # Only a real JSON `true` counts. A string such as "false" is truthy in
    # Python and must not mark the SR as auto-resolvable.
    review["auto_resolvable"] = raw.get("auto_resolvable") is True
    return review


async def _generate_ai_review(sr: SRRequest, snapshot: dict, similar: list) -> dict:
    """Ask Ollama for a structured review of *sr*.

    Args:
        sr: The SR being reviewed.
        snapshot: Server snapshot dict; the ``tmux_session`` and ``error``
            entries are left out of the prompt.
        similar: Similar SRs (dicts with ``sr_id`` and ``title``); only the
            first three are put into the prompt.

    Returns:
        A dict with ``summary``, ``root_cause``, ``recommended_actions``,
        ``estimated_minutes``, ``risk_level`` and ``auto_resolvable``.
        Never raises: on an HTTP error, a timeout or an unparsable answer,
        the placeholder from ``_fallback_review`` is returned.
    """
    similar_text = "\n".join(
        f"- [{s['sr_id']}] {s['title']}" for s in similar[:3]
    ) or "없음"

    snap_text = "\n".join(
        f"[{k}]\n{v}" for k, v in snapshot.items()
        if k not in ("tmux_session", "error")
    ) or "서버 정보 없음"

    # NOTE(review): SR title/description are user-supplied text placed
    # straight into the prompt, so treat the model's answer as advisory only.
    prompt = f"""공공기관 IT 인프라 전문 엔지니어로서 SR을 분석하고 JSON만 반환하라.

SR:
- ID: {sr.sr_id}
- 유형: {sr.sr_type}
- 제목: {sr.title}
- 내용: {sr.description or '(없음)'}
- 우선순위: {sr.priority}
- 대상 서버: {sr.target_server or '미지정'}

서버 상태:
{snap_text}

유사 SR:
{similar_text}

JSON 형식으로만 응답 (다른 텍스트 없이):
{{
  "summary": "문제 요약 (1-2문장)",
  "root_cause": "추정 원인",
  "recommended_actions": ["조치1", "조치2", "조치3"],
  "estimated_minutes": 30,
  "risk_level": "LOW",
  "auto_resolvable": false
}}"""

    try:
        async with httpx.AsyncClient(timeout=90) as client:
            resp = await client.post(
                OLLAMA_URL,
                json={"model": OLLAMA_MODEL, "prompt": prompt, "stream": False},
            )
            # Report an HTTP error as such instead of trying to parse an
            # error page as a review.
            resp.raise_for_status()
            text = resp.json().get("response", "{}")

        # The model may wrap the JSON in extra prose; take the outermost
        # brace-delimited span.
        m = re.search(r"\{[\s\S]*\}", text)
        if m:
            parsed = json.loads(m.group())
            if isinstance(parsed, dict):
                return _normalize_review(parsed)
        log.warning("Ollama SR 리뷰 응답에 JSON 없음: %s", sr.sr_id)
    except Exception as e:
        log.warning("Ollama SR 리뷰 오류: %s", e)

    return _fallback_review()
|
|
|
|
|
|
# ── 핵심 리뷰 실행 (background task 진입점) ───────────────────────────────────
|
|
|
|
async def _mark_review_failed(sr_id: str, exc: BaseException) -> None:
    """Best-effort: mark the review row of *sr_id* as failed.

    Never raises. The caller runs as a background task, so an error while
    recording the failure is only logged.
    """
    try:
        async with SessionLocal() as db:
            rev = (await db.execute(
                select(SRAutoReview).where(SRAutoReview.sr_id == sr_id)
            )).scalars().first()
            if rev:
                rev.status = "failed"
                rev.summary = f"리뷰 실패: {type(exc).__name__}"
                rev.completed_at = datetime.now()
                await db.commit()
    except Exception:
        log.exception("SR 리뷰 실패 상태 기록 오류 %s", sr_id)


async def run_sr_review(sr_id: str) -> None:
    """Run the complete auto-review for one SR (background-task entry point).

    Per the original author's note, this is called fire-and-forget from
    ``create_task()`` in tasks.py. It opens its own DB sessions, so it runs
    independently of the request transaction that scheduled it.

    Steps: create the review row with status ``"reviewing"``; collect the
    server snapshot and similar SRs; generate the AI review; store the
    result; broadcast ``sr_review_completed``. Any error after the row was
    created marks the row ``"failed"`` instead of leaving it in
    ``"reviewing"``.

    Args:
        sr_id: ID of the SR to review. Does nothing when the SR does not
            exist or a review row for it already exists.
    """
    async with SessionLocal() as db:
        try:
            sr = (await db.execute(
                select(SRRequest).where(SRRequest.sr_id == sr_id)
            )).scalars().first()
            if not sr:
                return

            # Duplicate guard.
            # NOTE(review): check-then-insert is not atomic. Two concurrent
            # runs can both pass unless sr_id has a unique constraint.
            if (await db.execute(
                select(SRAutoReview).where(SRAutoReview.sr_id == sr_id)
            )).scalars().first():
                return

            harness = _HARNESS_MAP.get(sr.sr_type or "OTHER", "general-ops")

            # Create the review row up front so clients can see "reviewing".
            review = SRAutoReview(
                sr_id=sr_id,
                harness_name=harness,
                status="reviewing",
                started_at=datetime.now(),
            )
            db.add(review)
            await db.commit()

        except Exception as e:
            log.exception("SR 리뷰 초기화 실패 %s: %s", sr_id, e)
            return

    try:
        # ── Steps 1-2: related server + tmux snapshot, similar SRs, AI review
        snapshot: dict = {}
        async with SessionLocal() as db:
            sr = (await db.execute(
                select(SRRequest).where(SRRequest.sr_id == sr_id)
            )).scalars().first()
            if not sr:
                return

            if sr.target_server:
                srv = (await db.execute(
                    select(Server).where(
                        Server.server_name == sr.target_server,
                        Server.is_active == True,  # noqa: E712 (SQL expression)
                    ).limit(1)
                )).scalars().first()
                if srv:
                    snapshot = await _capture_server_snapshot(srv)

            similar = await _find_similar_srs(sr, db)
            ai = await _generate_ai_review(sr, snapshot, similar)

        # ── Step 3: store the result
        async with SessionLocal() as db:
            rev = (await db.execute(
                select(SRAutoReview).where(SRAutoReview.sr_id == sr_id)
            )).scalars().first()
            if rev:
                rev.status = "completed"
                rev.summary = ai.get("summary", "")
                rev.root_cause = ai.get("root_cause", "")
                rev.recommended_actions = json.dumps(
                    ai.get("recommended_actions", []), ensure_ascii=False
                )
                rev.estimated_minutes = ai.get("estimated_minutes", 60)
                rev.risk_level = ai.get("risk_level", "MEDIUM")
                rev.similar_count = len(similar)
                rev.auto_resolvable = ai.get("auto_resolvable", False)
                rev.server_snapshot = json.dumps(snapshot, ensure_ascii=False)
                rev.tmux_session = snapshot.get("tmux_session")
                rev.completed_at = datetime.now()
                await db.commit()

    except Exception as e:
        # Covers collection as well as storing, so the row is never left in
        # "reviewing" after an error.
        log.exception("SR 리뷰 실행 실패 %s: %s", sr_id, e)
        await _mark_review_failed(sr_id, e)
        return

    # ── Step 4: SSE broadcast
    try:
        await broadcast("sr_review_completed", {
            "sr_id": sr_id,
            "risk_level": ai.get("risk_level", "MEDIUM"),
            "summary": ai.get("summary", ""),
            "harness": harness,
            "tmux_session": snapshot.get("tmux_session"),
        })
    except Exception:
        # The review is already stored; a notification failure must not
        # surface as an unhandled exception in the background task.
        log.exception("SR 리뷰 브로드캐스트 실패 %s", sr_id)
|
|
|
|
|
|
# ── REST 엔드포인트 ────────────────────────────────────────────────────────────
|
|
|
|
class SRReviewOut(BaseModel):
    """Response schema for one SR auto-review record."""

    # Lets the schema be populated from attribute access on ORM rows.
    model_config = ConfigDict(from_attributes=True)

    # Identity and lifecycle
    id: int
    sr_id: str
    harness_name: str
    status: str

    # Review content; all optional and defaulting to None
    summary: Optional[str] = None
    root_cause: Optional[str] = None
    # Written by run_sr_review as a JSON-encoded list (json.dumps), hence str.
    recommended_actions: Optional[str] = None
    estimated_minutes: Optional[int] = None
    risk_level: Optional[str] = None
    similar_count: Optional[int] = None
    auto_resolvable: Optional[bool] = None

    # Name of the tmux session created during the snapshot, if any.
    tmux_session: Optional[str] = None

    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
|
|
|
|
|
|
@router.get("", response_model=List[SRReviewOut])
async def list_reviews(
    status: Optional[str] = None,
    risk_level: Optional[str] = None,
    skip: int = 0,
    limit: int = 50,
    db: AsyncSession = Depends(get_db),
    _u: User = Depends(get_current_user),
):
    """List auto-review records, newest first.

    Args:
        status: When given, only reviews with exactly this status.
        risk_level: When given, only reviews with exactly this risk level.
        skip: Number of rows to skip; negative values are treated as 0.
        limit: Maximum number of rows; negative values are treated as 0.

    Returns:
        The matching ``SRAutoReview`` rows ordered by ``started_at`` descending.
    """
    q = select(SRAutoReview).order_by(SRAutoReview.started_at.desc())
    if status:
        q = q.where(SRAutoReview.status == status)
    if risk_level:
        q = q.where(SRAutoReview.risk_level == risk_level)

    # Some databases reject a negative OFFSET/LIMIT, which would surface as
    # a 500. Clamp instead of passing the raw query parameters through.
    # NOTE(review): there is still no upper bound on `limit`.
    q = q.offset(max(skip, 0)).limit(max(limit, 0))
    return (await db.execute(q)).scalars().all()
|
|
|
|
|
|
@router.get("/{sr_id}", response_model=SRReviewOut)
async def get_review(
    sr_id: str,
    db: AsyncSession = Depends(get_db),
    _u: User = Depends(get_current_user),
):
    """Return the auto-review stored for *sr_id*.

    Raises:
        HTTPException: 404 when no review row exists for the SR.
    """
    stmt = select(SRAutoReview).where(SRAutoReview.sr_id == sr_id)
    result = await db.execute(stmt)
    review = result.scalars().first()

    if review:
        return review
    raise HTTPException(404, detail="리뷰 결과 없음 (리뷰 진행 중이거나 미접수 SR)")
|
|
|
|
|
|
# Strong references to in-flight review tasks. The event loop only keeps weak
# references to tasks, so a task nobody else references can be
# garbage-collected before it finishes.
_REVIEW_TASKS: set = set()


@router.post("/{sr_id}/run", status_code=202)
async def trigger_review(
    sr_id: str,
    db: AsyncSession = Depends(get_db),
    _u: User = Depends(get_current_user),
):
    """Manually re-run the review: delete any existing result, then restart.

    The review runs as a background task. The 202 response only confirms
    that it was scheduled.

    NOTE(review): the existence of the SR is not checked here; for an
    unknown ``sr_id`` the background task simply does nothing.
    """
    existing = (await db.execute(
        select(SRAutoReview).where(SRAutoReview.sr_id == sr_id)
    )).scalars().first()
    if existing:
        await db.delete(existing)
        await db.commit()

    task = asyncio.create_task(run_sr_review(sr_id))
    # Keep the task alive until it is done, then drop the reference.
    _REVIEW_TASKS.add(task)
    task.add_done_callback(_REVIEW_TASKS.discard)
    return {"message": f"SR {sr_id} 리뷰 재실행", "sr_id": sr_id}
|
|
|
|
|
|
@router.get("/{sr_id}/tmux")
async def get_tmux_info(
    sr_id: str,
    db: AsyncSession = Depends(get_db),
    _u: User = Depends(get_current_user),
):
    """Return the tmux session created during the review plus the snapshot.

    Raises:
        HTTPException: 404 when no review row exists for the SR.
    """
    stmt = select(SRAutoReview).where(SRAutoReview.sr_id == sr_id)
    review = (await db.execute(stmt)).scalars().first()
    if not review:
        raise HTTPException(404, detail="리뷰 없음")

    session = review.tmux_session

    # The snapshot is stored as a JSON string; an empty value means none.
    if review.server_snapshot:
        snapshot = json.loads(review.server_snapshot)
    else:
        snapshot = {}

    # Ready-to-paste attach command, only when a session was created.
    hint = f"tmux attach -t {session}" if session else None

    return {
        "sr_id": sr_id,
        "tmux_session": session,
        "snapshot": snapshot,
        "attach_hint": hint,
        "risk_level": review.risk_level,
        "status": review.status,
    }
|