guardia-itsm/routers/digital_twin.py

525 lines
18 KiB
Python

"""
Digital Twin: 서버 가상 복제본 + 장애/변경 시뮬레이션
엔드포인트:
GET /api/digital-twin/servers — 트윈 서버 목록
POST /api/digital-twin/sync/{server_id} — 실제 서버 -> 트윈 동기화 (SSH)
POST /api/digital-twin/simulate/failure — 장애 시뮬레이션 + 영향도 분석
POST /api/digital-twin/simulate/change — 변경 영향도 분석
GET /api/digital-twin/diff/{server_id} — 실제 vs 트윈 차이점
POST /api/digital-twin/snapshot — 현재 상태 스냅샷 저장
GET /api/digital-twin/snapshots — 스냅샷 이력
보안 원칙:
- ip_addr, ssh_user, os_pw_enc 절대 API 응답 미포함
- 트윈은 읽기 전용 — 실제 서버 변경 불가
- 외부 API 완전 금지 — paramiko + Ollama localhost:11434 only
"""
from __future__ import annotations
import json
import logging
from datetime import datetime
from typing import Any, Dict, List, Optional
from fastapi import APIRouter, Body, Depends, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import select, desc
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user
from database import get_db
from models import (
DigitalTwinServer, DigitalTwinServerOut,
TwinSimulation, TwinSimulationOut,
TwinSnapshot, TwinSnapshotOut,
Server, User, UserRole,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/digital-twin", tags=["digital-twin"])
# ── SSH 유틸리티 ──────────────────────────────────────────────────────────────
def _get_server_credentials(server: Server) -> dict:
"""서버 자격증명 복호화. ip/user/pw 외부 노출 금지."""
from core.crypto import decrypt_value
ip = server.ip_addr or ""
user = server.ssh_user or "opsagent"
port = server.port or 22
pw = None
if server.os_pw_enc:
try:
pw = decrypt_value(server.os_pw_enc)
except Exception:
pw = None
return {"ip": ip, "user": user, "port": port, "pw": pw,
"key_path": server.ssh_key_path, "method": server.ssh_method or "PASSWORD"}
def _collect_server_state(creds: dict) -> Dict[str, Any]:
"""
SSH로 서버 상태 수집.
실행 명령: top -bn1 / df -h / free -m / ss -tlnp
자격증명은 수집 내부에서만 사용 — 반환값에 미포함.
"""
try:
import paramiko # noqa: PLC0415
client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
connect_kwargs: dict = {
"hostname": creds["ip"],
"port": creds["port"],
"username": creds["user"],
"timeout": 15,
}
if creds["method"] in ("KEY", "KEY_WITH_PASS") and creds.get("key_path"):
pk = paramiko.RSAKey.from_private_key_file(
creds["key_path"],
password=creds["pw"] if creds["method"] == "KEY_WITH_PASS" else None,
)
connect_kwargs["pkey"] = pk
else:
connect_kwargs["password"] = creds["pw"]
client.connect(**connect_kwargs)
def _run(cmd: str) -> str:
_, stdout, _ = client.exec_command(cmd, timeout=10)
return stdout.read().decode("utf-8", errors="replace").strip()
cpu_raw = _run("top -bn1 | grep 'Cpu(s)'")
disk_raw = _run("df -h --total 2>/dev/null | tail -1")
mem_raw = _run("free -m | awk '/Mem:/{print $2,$3,$4}'")
ports_raw = _run("ss -tlnp 2>/dev/null | awk 'NR>1{print $4}' | sort -u | head -20")
client.close()
# CPU 사용률 파싱
cpu_usage = 0.0
if cpu_raw:
for part in cpu_raw.split(","):
if "id" in part:
try:
idle = float(part.strip().split()[0].replace(",", "."))
cpu_usage = round(100.0 - idle, 1)
except (ValueError, IndexError):
pass
# 메모리 파싱 (total used free)
mem_info: dict = {}
if mem_raw:
parts = mem_raw.split()
if len(parts) >= 2:
try:
mem_info = {
"total_mb": int(parts[0]),
"used_mb": int(parts[1]),
"free_mb": int(parts[2]) if len(parts) > 2 else 0,
}
if mem_info["total_mb"] > 0:
mem_info["usage_pct"] = round(
mem_info["used_mb"] / mem_info["total_mb"] * 100, 1
)
except (ValueError, IndexError):
pass
# 디스크 파싱 (total used avail use%)
disk_info: dict = {}
if disk_raw:
parts = disk_raw.split()
if len(parts) >= 5:
disk_info = {
"total": parts[1],
"used": parts[2],
"avail": parts[3],
"use_pct": parts[4],
}
listening_ports = [p.strip() for p in ports_raw.splitlines() if p.strip()]
return {
"collected_at": datetime.utcnow().isoformat(),
"cpu_usage_pct": cpu_usage,
"memory": mem_info,
"disk": disk_info,
"listening_ports": listening_ports,
"ssh_reachable": True,
}
except Exception as exc:
logger.warning("SSH 수집 실패 (server=%s): %s", creds.get("ip", "?"), exc)
return {
"collected_at": datetime.utcnow().isoformat(),
"ssh_reachable": False,
"error_summary": "SSH 연결 실패",
}
def _compute_diff(twin_state: dict, real_state: dict) -> dict:
"""twin_state vs real_state 차이점 추출."""
keys = {"cpu_usage_pct", "memory", "disk", "listening_ports", "ssh_reachable"}
diff: dict = {}
for k in keys:
t_val = twin_state.get(k)
r_val = real_state.get(k)
if t_val != r_val:
diff[k] = {"twin": t_val, "real": r_val}
return diff
# ── Ollama 영향도 분석 ────────────────────────────────────────────────────────
def _ollama_analyze(prompt: str) -> str:
"""Ollama localhost:11434 호출 — 외부 API 절대 금지."""
try:
import urllib.request # noqa: PLC0415
payload = json.dumps({
"model": "llama3",
"prompt": prompt,
"stream": False,
"options": {"temperature": 0.2, "num_predict": 300},
}).encode()
req = urllib.request.Request(
"http://localhost:11434/api/generate",
data=payload,
headers={"Content-Type": "application/json"},
method="POST",
)
with urllib.request.urlopen(req, timeout=20) as resp:
data = json.loads(resp.read().decode())
return data.get("response", "").strip()
except Exception as exc:
logger.warning("Ollama 호출 실패: %s", exc)
return ""
# ── 요청/응답 스키마 ──────────────────────────────────────────────────────────
class FailureSimRequest(BaseModel):
server_name: str
failure_type: str # cpu_overload | memory_full | disk_full | service_down | network_partition
description: Optional[str] = None
class ChangeSimRequest(BaseModel):
server_name: str
change_description: str
affected_services: Optional[List[str]] = None
class SnapshotRequest(BaseModel):
label: str
server_ids: Optional[List[int]] = None # None이면 전체
# ── 엔드포인트 ────────────────────────────────────────────────────────────────
@router.get("/servers", response_model=List[DigitalTwinServerOut])
async def list_twin_servers(
keyword: Optional[str] = Query(None),
limit: int = Query(50, ge=1, le=200),
offset: int = Query(0, ge=0),
db: AsyncSession = Depends(get_db),
_u: User = Depends(get_current_user),
):
"""트윈 서버 목록 조회."""
q = select(DigitalTwinServer).order_by(desc(DigitalTwinServer.last_sync_at))
if keyword:
q = q.where(DigitalTwinServer.server_name.ilike(f"%{keyword}%"))
q = q.limit(limit).offset(offset)
rows = (await db.execute(q)).scalars().all()
return rows
@router.post("/sync/{server_id}")
async def sync_server(
server_id: int,
db: AsyncSession = Depends(get_db),
current_user: User = Depends(get_current_user),
):
"""
실제 서버 -> 트윈 동기화.
SSH로 top/df/free/ss 수집하여 real_state 갱신,
twin_state와 diff 계산.
"""
# 서버 조회
server = (await db.execute(
select(Server).where(Server.id == server_id)
)).scalars().first()
if not server:
raise HTTPException(404, "서버를 찾을 수 없습니다.")
# 자격증명 (응답에 절대 미포함)
creds = _get_server_credentials(server)
real_state = _collect_server_state(creds)
# 기존 트윈 조회 또는 신규 생성
twin = (await db.execute(
select(DigitalTwinServer).where(DigitalTwinServer.server_id == server_id)
)).scalars().first()
if twin is None:
twin = DigitalTwinServer(
server_id = server_id,
server_name = server.server_name,
)
db.add(twin)
# 기존 twin_state가 없으면 real_state를 초기값으로 사용
old_twin_state: dict = {}
if twin.twin_state:
try:
old_twin_state = json.loads(twin.twin_state)
except (json.JSONDecodeError, TypeError):
old_twin_state = {}
if not old_twin_state:
old_twin_state = real_state
diff = _compute_diff(old_twin_state, real_state)
twin.real_state = json.dumps(real_state, ensure_ascii=False)
twin.twin_state = json.dumps(old_twin_state, ensure_ascii=False)
twin.diff = json.dumps(diff, ensure_ascii=False)
twin.last_sync_at = datetime.utcnow()
await db.commit()
await db.refresh(twin)
return {
"twin_id": twin.id,
"server_name": twin.server_name,
"ssh_reachable": real_state.get("ssh_reachable", False),
"collected_at": real_state.get("collected_at"),
"diff_fields": list(diff.keys()),
"synced": True,
}
@router.post("/simulate/failure", response_model=TwinSimulationOut, status_code=201)
async def simulate_failure(
body: FailureSimRequest,
db: AsyncSession = Depends(get_db),
current_user: User = Depends(get_current_user),
):
"""
장애 시뮬레이션.
- CMDB 의존성 조회하여 영향 서버 목록 생성
- Ollama로 복구 시간 예측 + 위험도 점수 산출 (0.0~1.0)
"""
# CMDB에서 동일 이름/연관 서버 조회
related_rows = (await db.execute(
select(Server.server_name, Server.server_role)
.where(Server.is_active == True)
)).all()
affected_servers = []
for sname, srole in related_rows:
if sname != body.server_name:
affected_servers.append({"server_name": sname, "role": srole})
# 위험도 기본값 (장애 유형별)
base_risk = {
"cpu_overload": 0.6,
"memory_full": 0.75,
"disk_full": 0.8,
"service_down": 0.85,
"network_partition": 0.9,
}.get(body.failure_type, 0.5)
# 연관 서버가 많을수록 위험도 가중
risk_score = min(1.0, base_risk + len(affected_servers) * 0.02)
# Ollama 복구 예측
prompt = (
f"서버 장애 시뮬레이션 결과를 JSON으로만 답하시오.\n"
f"장애 서버: {body.server_name}\n"
f"장애 유형: {body.failure_type}\n"
f"영향 서버 수: {len(affected_servers)}\n"
f"출력 형식: {{\"estimated_recovery_min\": <숫자>, \"impact_summary\": \"<한 문장>\", "
f"\"recommended_action\": \"<한 문장>\"}}"
)
ai_raw = _ollama_analyze(prompt)
ai_result: dict = {}
try:
ai_result = json.loads(ai_raw) if ai_raw else {}
except (json.JSONDecodeError, ValueError):
ai_result = {"impact_summary": ai_raw} if ai_raw else {}
scenario = {
"failure_type": body.failure_type,
"description": body.description,
"affected_servers": affected_servers[:20], # 최대 20개
}
result = {
"risk_score": risk_score,
"affected_count": len(affected_servers),
"ai_analysis": ai_result,
}
sim = TwinSimulation(
sim_type = "failure",
target = body.server_name,
scenario = json.dumps(scenario, ensure_ascii=False),
result = json.dumps(result, ensure_ascii=False),
risk_score = risk_score,
)
db.add(sim)
await db.commit()
await db.refresh(sim)
return sim
@router.post("/simulate/change", response_model=TwinSimulationOut, status_code=201)
async def simulate_change(
body: ChangeSimRequest,
db: AsyncSession = Depends(get_db),
current_user: User = Depends(get_current_user),
):
"""
변경 영향도 분석.
- 변경 설명과 영향 서비스를 Ollama로 분석
- 위험도 점수 및 롤백 권고 생성
"""
affected_services = body.affected_services or []
# 관련 서버 CMDB 조회
related: list = []
if body.server_name:
rows = (await db.execute(
select(Server.server_name, Server.server_role)
.where(Server.server_name.ilike(f"%{body.server_name}%"), Server.is_active == True)
)).all()
related = [{"name": r[0], "role": r[1]} for r in rows]
# 위험도: 영향 서비스 수 + 관련 서버 수 기반 간이 계산
risk_score = min(1.0, 0.3 + len(affected_services) * 0.1 + len(related) * 0.05)
prompt = (
f"변경 영향도 분석 결과를 JSON으로만 답하시오.\n"
f"변경 대상 서버: {body.server_name}\n"
f"변경 내용: {body.change_description}\n"
f"영향 서비스: {', '.join(affected_services) if affected_services else '미지정'}\n"
f"출력 형식: {{\"risk_level\": \"low|medium|high\", \"rollback_recommended\": true|false, "
f"\"impact_summary\": \"<한 문장>\", \"precautions\": \"<한 문장>\"}}"
)
ai_raw = _ollama_analyze(prompt)
ai_result: dict = {}
try:
ai_result = json.loads(ai_raw) if ai_raw else {}
except (json.JSONDecodeError, ValueError):
ai_result = {"impact_summary": ai_raw} if ai_raw else {}
scenario = {
"change_description": body.change_description,
"affected_services": affected_services,
"related_servers": related[:10],
}
result = {
"risk_score": risk_score,
"ai_analysis": ai_result,
}
sim = TwinSimulation(
sim_type = "change",
target = body.server_name,
scenario = json.dumps(scenario, ensure_ascii=False),
result = json.dumps(result, ensure_ascii=False),
risk_score = risk_score,
)
db.add(sim)
await db.commit()
await db.refresh(sim)
return sim
@router.get("/diff/{server_id}")
async def get_diff(
server_id: int,
db: AsyncSession = Depends(get_db),
_u: User = Depends(get_current_user),
):
"""실제 서버 vs 트윈 차이점 조회."""
twin = (await db.execute(
select(DigitalTwinServer).where(DigitalTwinServer.server_id == server_id)
)).scalars().first()
if twin is None:
raise HTTPException(404, f"server_id={server_id}에 대한 트윈이 없습니다. /sync 먼저 실행하세요.")
diff: dict = {}
if twin.diff:
try:
diff = json.loads(twin.diff)
except (json.JSONDecodeError, TypeError):
diff = {}
return {
"twin_id": twin.id,
"server_name": twin.server_name,
"last_sync_at": twin.last_sync_at.isoformat() if twin.last_sync_at else None,
"diff": diff,
"diff_count": len(diff),
"in_sync": len(diff) == 0,
}
@router.post("/snapshot", response_model=TwinSnapshotOut, status_code=201)
async def create_snapshot(
body: SnapshotRequest,
db: AsyncSession = Depends(get_db),
current_user: User = Depends(get_current_user),
):
"""현재 트윈 상태 전체를 스냅샷으로 저장."""
q = select(DigitalTwinServer)
if body.server_ids:
q = q.where(DigitalTwinServer.server_id.in_(body.server_ids))
twins = (await db.execute(q)).scalars().all()
state_data = {
"snapshot_label": body.label,
"captured_at": datetime.utcnow().isoformat(),
"servers": [
{
"twin_id": t.id,
"server_id": t.server_id,
"server_name": t.server_name,
"twin_state": json.loads(t.twin_state) if t.twin_state else None,
"last_sync_at": t.last_sync_at.isoformat() if t.last_sync_at else None,
}
for t in twins
],
}
snap = TwinSnapshot(
label = body.label,
state = json.dumps(state_data, ensure_ascii=False),
)
db.add(snap)
await db.commit()
await db.refresh(snap)
return snap
@router.get("/snapshots", response_model=List[TwinSnapshotOut])
async def list_snapshots(
limit: int = Query(20, ge=1, le=100),
offset: int = Query(0, ge=0),
db: AsyncSession = Depends(get_db),
_u: User = Depends(get_current_user),
):
"""스냅샷 이력 조회."""
rows = (await db.execute(
select(TwinSnapshot)
.order_by(desc(TwinSnapshot.created_at))
.limit(limit).offset(offset)
)).scalars().all()
return rows