525 lines
18 KiB
Python
525 lines
18 KiB
Python
"""
|
|
Digital Twin: 서버 가상 복제본 + 장애/변경 시뮬레이션
|
|
|
|
엔드포인트:
|
|
GET /api/digital-twin/servers — 트윈 서버 목록
|
|
POST /api/digital-twin/sync/{server_id} — 실제 서버 -> 트윈 동기화 (SSH)
|
|
POST /api/digital-twin/simulate/failure — 장애 시뮬레이션 + 영향도 분석
|
|
POST /api/digital-twin/simulate/change — 변경 영향도 분석
|
|
GET /api/digital-twin/diff/{server_id} — 실제 vs 트윈 차이점
|
|
POST /api/digital-twin/snapshot — 현재 상태 스냅샷 저장
|
|
GET /api/digital-twin/snapshots — 스냅샷 이력
|
|
|
|
보안 원칙:
|
|
- ip_addr, ssh_user, os_pw_enc 절대 API 응답 미포함
|
|
- 트윈은 읽기 전용 — 실제 서버 변경 불가
|
|
- 외부 API 완전 금지 — paramiko + Ollama localhost:11434 only
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
from datetime import datetime
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from fastapi import APIRouter, Body, Depends, HTTPException, Query
|
|
from pydantic import BaseModel
|
|
from sqlalchemy import select, desc
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from core.auth import get_current_user
|
|
from database import get_db
|
|
from models import (
|
|
DigitalTwinServer, DigitalTwinServerOut,
|
|
TwinSimulation, TwinSimulationOut,
|
|
TwinSnapshot, TwinSnapshotOut,
|
|
Server, User, UserRole,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
router = APIRouter(prefix="/api/digital-twin", tags=["digital-twin"])
|
|
|
|
|
|
# ── SSH 유틸리티 ──────────────────────────────────────────────────────────────
|
|
|
|
def _get_server_credentials(server: Server) -> dict:
|
|
"""서버 자격증명 복호화. ip/user/pw 외부 노출 금지."""
|
|
from core.crypto import decrypt_value
|
|
|
|
ip = server.ip_addr or ""
|
|
user = server.ssh_user or "opsagent"
|
|
port = server.port or 22
|
|
pw = None
|
|
if server.os_pw_enc:
|
|
try:
|
|
pw = decrypt_value(server.os_pw_enc)
|
|
except Exception:
|
|
pw = None
|
|
return {"ip": ip, "user": user, "port": port, "pw": pw,
|
|
"key_path": server.ssh_key_path, "method": server.ssh_method or "PASSWORD"}
|
|
|
|
|
|
def _collect_server_state(creds: dict) -> Dict[str, Any]:
|
|
"""
|
|
SSH로 서버 상태 수집.
|
|
실행 명령: top -bn1 / df -h / free -m / ss -tlnp
|
|
자격증명은 수집 내부에서만 사용 — 반환값에 미포함.
|
|
"""
|
|
try:
|
|
import paramiko # noqa: PLC0415
|
|
|
|
client = paramiko.SSHClient()
|
|
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
|
|
|
connect_kwargs: dict = {
|
|
"hostname": creds["ip"],
|
|
"port": creds["port"],
|
|
"username": creds["user"],
|
|
"timeout": 15,
|
|
}
|
|
if creds["method"] in ("KEY", "KEY_WITH_PASS") and creds.get("key_path"):
|
|
pk = paramiko.RSAKey.from_private_key_file(
|
|
creds["key_path"],
|
|
password=creds["pw"] if creds["method"] == "KEY_WITH_PASS" else None,
|
|
)
|
|
connect_kwargs["pkey"] = pk
|
|
else:
|
|
connect_kwargs["password"] = creds["pw"]
|
|
|
|
client.connect(**connect_kwargs)
|
|
|
|
def _run(cmd: str) -> str:
|
|
_, stdout, _ = client.exec_command(cmd, timeout=10)
|
|
return stdout.read().decode("utf-8", errors="replace").strip()
|
|
|
|
cpu_raw = _run("top -bn1 | grep 'Cpu(s)'")
|
|
disk_raw = _run("df -h --total 2>/dev/null | tail -1")
|
|
mem_raw = _run("free -m | awk '/Mem:/{print $2,$3,$4}'")
|
|
ports_raw = _run("ss -tlnp 2>/dev/null | awk 'NR>1{print $4}' | sort -u | head -20")
|
|
|
|
client.close()
|
|
|
|
# CPU 사용률 파싱
|
|
cpu_usage = 0.0
|
|
if cpu_raw:
|
|
for part in cpu_raw.split(","):
|
|
if "id" in part:
|
|
try:
|
|
idle = float(part.strip().split()[0].replace(",", "."))
|
|
cpu_usage = round(100.0 - idle, 1)
|
|
except (ValueError, IndexError):
|
|
pass
|
|
|
|
# 메모리 파싱 (total used free)
|
|
mem_info: dict = {}
|
|
if mem_raw:
|
|
parts = mem_raw.split()
|
|
if len(parts) >= 2:
|
|
try:
|
|
mem_info = {
|
|
"total_mb": int(parts[0]),
|
|
"used_mb": int(parts[1]),
|
|
"free_mb": int(parts[2]) if len(parts) > 2 else 0,
|
|
}
|
|
if mem_info["total_mb"] > 0:
|
|
mem_info["usage_pct"] = round(
|
|
mem_info["used_mb"] / mem_info["total_mb"] * 100, 1
|
|
)
|
|
except (ValueError, IndexError):
|
|
pass
|
|
|
|
# 디스크 파싱 (total used avail use%)
|
|
disk_info: dict = {}
|
|
if disk_raw:
|
|
parts = disk_raw.split()
|
|
if len(parts) >= 5:
|
|
disk_info = {
|
|
"total": parts[1],
|
|
"used": parts[2],
|
|
"avail": parts[3],
|
|
"use_pct": parts[4],
|
|
}
|
|
|
|
listening_ports = [p.strip() for p in ports_raw.splitlines() if p.strip()]
|
|
|
|
return {
|
|
"collected_at": datetime.utcnow().isoformat(),
|
|
"cpu_usage_pct": cpu_usage,
|
|
"memory": mem_info,
|
|
"disk": disk_info,
|
|
"listening_ports": listening_ports,
|
|
"ssh_reachable": True,
|
|
}
|
|
|
|
except Exception as exc:
|
|
logger.warning("SSH 수집 실패 (server=%s): %s", creds.get("ip", "?"), exc)
|
|
return {
|
|
"collected_at": datetime.utcnow().isoformat(),
|
|
"ssh_reachable": False,
|
|
"error_summary": "SSH 연결 실패",
|
|
}
|
|
|
|
|
|
def _compute_diff(twin_state: dict, real_state: dict) -> dict:
|
|
"""twin_state vs real_state 차이점 추출."""
|
|
keys = {"cpu_usage_pct", "memory", "disk", "listening_ports", "ssh_reachable"}
|
|
diff: dict = {}
|
|
for k in keys:
|
|
t_val = twin_state.get(k)
|
|
r_val = real_state.get(k)
|
|
if t_val != r_val:
|
|
diff[k] = {"twin": t_val, "real": r_val}
|
|
return diff
|
|
|
|
|
|
# ── Ollama 영향도 분석 ────────────────────────────────────────────────────────
|
|
|
|
def _ollama_analyze(prompt: str) -> str:
|
|
"""Ollama localhost:11434 호출 — 외부 API 절대 금지."""
|
|
try:
|
|
import urllib.request # noqa: PLC0415
|
|
|
|
payload = json.dumps({
|
|
"model": "llama3",
|
|
"prompt": prompt,
|
|
"stream": False,
|
|
"options": {"temperature": 0.2, "num_predict": 300},
|
|
}).encode()
|
|
|
|
req = urllib.request.Request(
|
|
"http://localhost:11434/api/generate",
|
|
data=payload,
|
|
headers={"Content-Type": "application/json"},
|
|
method="POST",
|
|
)
|
|
with urllib.request.urlopen(req, timeout=20) as resp:
|
|
data = json.loads(resp.read().decode())
|
|
return data.get("response", "").strip()
|
|
except Exception as exc:
|
|
logger.warning("Ollama 호출 실패: %s", exc)
|
|
return ""
|
|
|
|
|
|
# ── 요청/응답 스키마 ──────────────────────────────────────────────────────────
|
|
|
|
class FailureSimRequest(BaseModel):
|
|
server_name: str
|
|
failure_type: str # cpu_overload | memory_full | disk_full | service_down | network_partition
|
|
description: Optional[str] = None
|
|
|
|
|
|
class ChangeSimRequest(BaseModel):
|
|
server_name: str
|
|
change_description: str
|
|
affected_services: Optional[List[str]] = None
|
|
|
|
|
|
class SnapshotRequest(BaseModel):
|
|
label: str
|
|
server_ids: Optional[List[int]] = None # None이면 전체
|
|
|
|
|
|
# ── 엔드포인트 ────────────────────────────────────────────────────────────────
|
|
|
|
@router.get("/servers", response_model=List[DigitalTwinServerOut])
|
|
async def list_twin_servers(
|
|
keyword: Optional[str] = Query(None),
|
|
limit: int = Query(50, ge=1, le=200),
|
|
offset: int = Query(0, ge=0),
|
|
db: AsyncSession = Depends(get_db),
|
|
_u: User = Depends(get_current_user),
|
|
):
|
|
"""트윈 서버 목록 조회."""
|
|
q = select(DigitalTwinServer).order_by(desc(DigitalTwinServer.last_sync_at))
|
|
if keyword:
|
|
q = q.where(DigitalTwinServer.server_name.ilike(f"%{keyword}%"))
|
|
q = q.limit(limit).offset(offset)
|
|
rows = (await db.execute(q)).scalars().all()
|
|
return rows
|
|
|
|
|
|
@router.post("/sync/{server_id}")
|
|
async def sync_server(
|
|
server_id: int,
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(get_current_user),
|
|
):
|
|
"""
|
|
실제 서버 -> 트윈 동기화.
|
|
SSH로 top/df/free/ss 수집하여 real_state 갱신,
|
|
twin_state와 diff 계산.
|
|
"""
|
|
# 서버 조회
|
|
server = (await db.execute(
|
|
select(Server).where(Server.id == server_id)
|
|
)).scalars().first()
|
|
if not server:
|
|
raise HTTPException(404, "서버를 찾을 수 없습니다.")
|
|
|
|
# 자격증명 (응답에 절대 미포함)
|
|
creds = _get_server_credentials(server)
|
|
real_state = _collect_server_state(creds)
|
|
|
|
# 기존 트윈 조회 또는 신규 생성
|
|
twin = (await db.execute(
|
|
select(DigitalTwinServer).where(DigitalTwinServer.server_id == server_id)
|
|
)).scalars().first()
|
|
|
|
if twin is None:
|
|
twin = DigitalTwinServer(
|
|
server_id = server_id,
|
|
server_name = server.server_name,
|
|
)
|
|
db.add(twin)
|
|
|
|
# 기존 twin_state가 없으면 real_state를 초기값으로 사용
|
|
old_twin_state: dict = {}
|
|
if twin.twin_state:
|
|
try:
|
|
old_twin_state = json.loads(twin.twin_state)
|
|
except (json.JSONDecodeError, TypeError):
|
|
old_twin_state = {}
|
|
|
|
if not old_twin_state:
|
|
old_twin_state = real_state
|
|
|
|
diff = _compute_diff(old_twin_state, real_state)
|
|
|
|
twin.real_state = json.dumps(real_state, ensure_ascii=False)
|
|
twin.twin_state = json.dumps(old_twin_state, ensure_ascii=False)
|
|
twin.diff = json.dumps(diff, ensure_ascii=False)
|
|
twin.last_sync_at = datetime.utcnow()
|
|
|
|
await db.commit()
|
|
await db.refresh(twin)
|
|
|
|
return {
|
|
"twin_id": twin.id,
|
|
"server_name": twin.server_name,
|
|
"ssh_reachable": real_state.get("ssh_reachable", False),
|
|
"collected_at": real_state.get("collected_at"),
|
|
"diff_fields": list(diff.keys()),
|
|
"synced": True,
|
|
}
|
|
|
|
|
|
@router.post("/simulate/failure", response_model=TwinSimulationOut, status_code=201)
|
|
async def simulate_failure(
|
|
body: FailureSimRequest,
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(get_current_user),
|
|
):
|
|
"""
|
|
장애 시뮬레이션.
|
|
- CMDB 의존성 조회하여 영향 서버 목록 생성
|
|
- Ollama로 복구 시간 예측 + 위험도 점수 산출 (0.0~1.0)
|
|
"""
|
|
# CMDB에서 동일 이름/연관 서버 조회
|
|
related_rows = (await db.execute(
|
|
select(Server.server_name, Server.server_role)
|
|
.where(Server.is_active == True)
|
|
)).all()
|
|
|
|
affected_servers = []
|
|
for sname, srole in related_rows:
|
|
if sname != body.server_name:
|
|
affected_servers.append({"server_name": sname, "role": srole})
|
|
|
|
# 위험도 기본값 (장애 유형별)
|
|
base_risk = {
|
|
"cpu_overload": 0.6,
|
|
"memory_full": 0.75,
|
|
"disk_full": 0.8,
|
|
"service_down": 0.85,
|
|
"network_partition": 0.9,
|
|
}.get(body.failure_type, 0.5)
|
|
|
|
# 연관 서버가 많을수록 위험도 가중
|
|
risk_score = min(1.0, base_risk + len(affected_servers) * 0.02)
|
|
|
|
# Ollama 복구 예측
|
|
prompt = (
|
|
f"서버 장애 시뮬레이션 결과를 JSON으로만 답하시오.\n"
|
|
f"장애 서버: {body.server_name}\n"
|
|
f"장애 유형: {body.failure_type}\n"
|
|
f"영향 서버 수: {len(affected_servers)}\n"
|
|
f"출력 형식: {{\"estimated_recovery_min\": <숫자>, \"impact_summary\": \"<한 문장>\", "
|
|
f"\"recommended_action\": \"<한 문장>\"}}"
|
|
)
|
|
ai_raw = _ollama_analyze(prompt)
|
|
ai_result: dict = {}
|
|
try:
|
|
ai_result = json.loads(ai_raw) if ai_raw else {}
|
|
except (json.JSONDecodeError, ValueError):
|
|
ai_result = {"impact_summary": ai_raw} if ai_raw else {}
|
|
|
|
scenario = {
|
|
"failure_type": body.failure_type,
|
|
"description": body.description,
|
|
"affected_servers": affected_servers[:20], # 최대 20개
|
|
}
|
|
result = {
|
|
"risk_score": risk_score,
|
|
"affected_count": len(affected_servers),
|
|
"ai_analysis": ai_result,
|
|
}
|
|
|
|
sim = TwinSimulation(
|
|
sim_type = "failure",
|
|
target = body.server_name,
|
|
scenario = json.dumps(scenario, ensure_ascii=False),
|
|
result = json.dumps(result, ensure_ascii=False),
|
|
risk_score = risk_score,
|
|
)
|
|
db.add(sim)
|
|
await db.commit()
|
|
await db.refresh(sim)
|
|
return sim
|
|
|
|
|
|
@router.post("/simulate/change", response_model=TwinSimulationOut, status_code=201)
|
|
async def simulate_change(
|
|
body: ChangeSimRequest,
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(get_current_user),
|
|
):
|
|
"""
|
|
변경 영향도 분석.
|
|
- 변경 설명과 영향 서비스를 Ollama로 분석
|
|
- 위험도 점수 및 롤백 권고 생성
|
|
"""
|
|
affected_services = body.affected_services or []
|
|
|
|
# 관련 서버 CMDB 조회
|
|
related: list = []
|
|
if body.server_name:
|
|
rows = (await db.execute(
|
|
select(Server.server_name, Server.server_role)
|
|
.where(Server.server_name.ilike(f"%{body.server_name}%"), Server.is_active == True)
|
|
)).all()
|
|
related = [{"name": r[0], "role": r[1]} for r in rows]
|
|
|
|
# 위험도: 영향 서비스 수 + 관련 서버 수 기반 간이 계산
|
|
risk_score = min(1.0, 0.3 + len(affected_services) * 0.1 + len(related) * 0.05)
|
|
|
|
prompt = (
|
|
f"변경 영향도 분석 결과를 JSON으로만 답하시오.\n"
|
|
f"변경 대상 서버: {body.server_name}\n"
|
|
f"변경 내용: {body.change_description}\n"
|
|
f"영향 서비스: {', '.join(affected_services) if affected_services else '미지정'}\n"
|
|
f"출력 형식: {{\"risk_level\": \"low|medium|high\", \"rollback_recommended\": true|false, "
|
|
f"\"impact_summary\": \"<한 문장>\", \"precautions\": \"<한 문장>\"}}"
|
|
)
|
|
ai_raw = _ollama_analyze(prompt)
|
|
ai_result: dict = {}
|
|
try:
|
|
ai_result = json.loads(ai_raw) if ai_raw else {}
|
|
except (json.JSONDecodeError, ValueError):
|
|
ai_result = {"impact_summary": ai_raw} if ai_raw else {}
|
|
|
|
scenario = {
|
|
"change_description": body.change_description,
|
|
"affected_services": affected_services,
|
|
"related_servers": related[:10],
|
|
}
|
|
result = {
|
|
"risk_score": risk_score,
|
|
"ai_analysis": ai_result,
|
|
}
|
|
|
|
sim = TwinSimulation(
|
|
sim_type = "change",
|
|
target = body.server_name,
|
|
scenario = json.dumps(scenario, ensure_ascii=False),
|
|
result = json.dumps(result, ensure_ascii=False),
|
|
risk_score = risk_score,
|
|
)
|
|
db.add(sim)
|
|
await db.commit()
|
|
await db.refresh(sim)
|
|
return sim
|
|
|
|
|
|
@router.get("/diff/{server_id}")
|
|
async def get_diff(
|
|
server_id: int,
|
|
db: AsyncSession = Depends(get_db),
|
|
_u: User = Depends(get_current_user),
|
|
):
|
|
"""실제 서버 vs 트윈 차이점 조회."""
|
|
twin = (await db.execute(
|
|
select(DigitalTwinServer).where(DigitalTwinServer.server_id == server_id)
|
|
)).scalars().first()
|
|
|
|
if twin is None:
|
|
raise HTTPException(404, f"server_id={server_id}에 대한 트윈이 없습니다. /sync 먼저 실행하세요.")
|
|
|
|
diff: dict = {}
|
|
if twin.diff:
|
|
try:
|
|
diff = json.loads(twin.diff)
|
|
except (json.JSONDecodeError, TypeError):
|
|
diff = {}
|
|
|
|
return {
|
|
"twin_id": twin.id,
|
|
"server_name": twin.server_name,
|
|
"last_sync_at": twin.last_sync_at.isoformat() if twin.last_sync_at else None,
|
|
"diff": diff,
|
|
"diff_count": len(diff),
|
|
"in_sync": len(diff) == 0,
|
|
}
|
|
|
|
|
|
@router.post("/snapshot", response_model=TwinSnapshotOut, status_code=201)
|
|
async def create_snapshot(
|
|
body: SnapshotRequest,
|
|
db: AsyncSession = Depends(get_db),
|
|
current_user: User = Depends(get_current_user),
|
|
):
|
|
"""현재 트윈 상태 전체를 스냅샷으로 저장."""
|
|
q = select(DigitalTwinServer)
|
|
if body.server_ids:
|
|
q = q.where(DigitalTwinServer.server_id.in_(body.server_ids))
|
|
twins = (await db.execute(q)).scalars().all()
|
|
|
|
state_data = {
|
|
"snapshot_label": body.label,
|
|
"captured_at": datetime.utcnow().isoformat(),
|
|
"servers": [
|
|
{
|
|
"twin_id": t.id,
|
|
"server_id": t.server_id,
|
|
"server_name": t.server_name,
|
|
"twin_state": json.loads(t.twin_state) if t.twin_state else None,
|
|
"last_sync_at": t.last_sync_at.isoformat() if t.last_sync_at else None,
|
|
}
|
|
for t in twins
|
|
],
|
|
}
|
|
|
|
snap = TwinSnapshot(
|
|
label = body.label,
|
|
state = json.dumps(state_data, ensure_ascii=False),
|
|
)
|
|
db.add(snap)
|
|
await db.commit()
|
|
await db.refresh(snap)
|
|
return snap
|
|
|
|
|
|
@router.get("/snapshots", response_model=List[TwinSnapshotOut])
|
|
async def list_snapshots(
|
|
limit: int = Query(20, ge=1, le=100),
|
|
offset: int = Query(0, ge=0),
|
|
db: AsyncSession = Depends(get_db),
|
|
_u: User = Depends(get_current_user),
|
|
):
|
|
"""스냅샷 이력 조회."""
|
|
rows = (await db.execute(
|
|
select(TwinSnapshot)
|
|
.order_by(desc(TwinSnapshot.created_at))
|
|
.limit(limit).offset(offset)
|
|
)).scalars().all()
|
|
return rows
|