diff --git a/core/scheduler.py b/core/scheduler.py
index ed1ac59..dccdeed 100644
--- a/core/scheduler.py
+++ b/core/scheduler.py
@@ -622,6 +622,35 @@ def start_scheduler() -> None:
     except Exception as exc:
         logger.warning("SLA 스케줄 등록 실패 (무시): %s", exc)

+    # ── Scouter APM critical-server check (every 5 minutes) ──────────────────
+    try:
+        async def _scouter_alert_check():
+            """Read the Scouter summary and notify ops about critical servers."""
+            from core.scouter import get_summary
+            summary = await get_summary()
+            if not summary.get("enabled"):
+                return
+            # get_summary() lists servers with CPU > 80% or error rate > 5%
+            critical = summary.get("critical_servers", [])
+            if critical:
+                await _push_ops_notify(
+                    f"[Scouter 경보] {len(critical)}개 서버 위험",
+                    "CPU 80% 초과 또는 에러율 5% 초과:\n" + "\n".join(f" - {s}" for s in critical[:5])
+                )
+
+        _scheduler.add_job(
+            _scouter_alert_check,
+            "interval",
+            minutes=5,
+            id="scouter_alert_check",
+            name="Scouter APM 경보 수집 (5분)",
+            replace_existing=True,
+            misfire_grace_time=60,
+        )
+        logger.info("Scouter APM 경보 수집 스케줄 등록 완료")
+    except Exception as exc:
+        logger.warning("Scouter 스케줄 등록 실패 (무시): %s", exc)
+
     # ── G-3: 라이선스 만료 알림 (매일 09:00 KST) ─────────────────────────────
     try:
         _scheduler.add_job(
diff --git a/core/scouter.py b/core/scouter.py
new file mode 100644
index 0000000..0e2b8d3
--- /dev/null
+++ b/core/scouter.py
@@ -0,0 +1,217 @@
+"""
+Scouter APM integration — GUARDiA ITSM
+
+Scouter is an APM tool specialised for Java WAS (Tomcat/JBoss/JEUS).
+Real-time metrics are collected through the Scouter HTTP API and shown on
+the GUARDiA dashboard.
+
+Environment variables:
+  SCOUTER_HOST       : Scouter server host (default: localhost)
+  SCOUTER_HTTP_PORT  : Scouter HTTP API port (default: 6180)
+  SCOUTER_USER       : Scouter user (default: admin)
+  SCOUTER_PASSWORD   : Scouter password (default: admin)
+
+The integration is disabled (every call returns an empty result) unless
+SCOUTER_HOST is set.
+
+Scouter HTTP API docs: https://github.com/scouter-project/scouter/wiki/Scouter-HTTP-API
+NOTE(review): confirm the endpoint paths used below against that page.
+"""
+from __future__ import annotations
+
+import logging
+import os
+from typing import Optional
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+SCOUTER_HOST = os.getenv("SCOUTER_HOST", "localhost")
+SCOUTER_HTTP_PORT = int(os.getenv("SCOUTER_HTTP_PORT", "6180"))
+SCOUTER_USER = os.getenv("SCOUTER_USER", "admin")
+SCOUTER_PASSWORD = os.getenv("SCOUTER_PASSWORD", "admin")
+
+_BASE = f"http://{SCOUTER_HOST}:{SCOUTER_HTTP_PORT}/scouter/v1"
+_ENABLED = bool(os.getenv("SCOUTER_HOST"))  # disabled unless the env var is set
+_MIB = 1024 * 1024
+
+
+async def _get(path: str, params: Optional[dict] = None) -> Optional[dict]:
+    """Send a GET request to the Scouter HTTP API.
+
+    Returns the decoded JSON object, or None when the integration is
+    disabled, the request fails, the status is not 200, or the body is not
+    a JSON object. Errors are logged at debug level and never raised.
+    """
+    if not _ENABLED:
+        return None
+    try:
+        async with httpx.AsyncClient(timeout=5.0) as client:
+            r = await client.get(
+                f"{_BASE}{path}",
+                params=params or {},
+                auth=(SCOUTER_USER, SCOUTER_PASSWORD),
+            )
+        if r.status_code != 200:
+            logger.debug("Scouter API 오류 (%s): HTTP %s", path, r.status_code)
+            return None
+        payload = r.json()
+    except Exception as e:
+        logger.debug("Scouter API 오류 (%s): %s", path, str(e)[:80])
+        return None
+    # Every caller reads the payload with .get(), so only a JSON object is usable.
+    return payload if isinstance(payload, dict) else None
+
+
+def _result_of(payload: Optional[dict], default):
+    """Return payload["result"], or *default* if it is missing, null or of another type."""
+    if not payload:
+        return default
+    value = payload.get("result")
+    return value if isinstance(value, type(default)) else default
+
+
+async def is_available() -> bool:
+    """Return True when the Scouter server answers the version request."""
+    result = await _get("/info/version")
+    return result is not None
+
+
+async def get_object_list() -> list:
+    """Return the list of monitored objects (servers/services)."""
+    return _result_of(await _get("/object"), [])
+
+
+async def get_server_metrics(obj_hash: int) -> dict:
+    """
+    Return the real-time metrics of one server.
+
+    Returns an empty dict when the request fails, otherwise:
+        {
+            "cpu": float,            # CPU usage (%)
+            "heap_used": int,        # heap in use (MB)
+            "heap_max": int,         # heap maximum (MB)
+            "tps": float,            # transactions per second
+            "active_service": int,   # number of active services
+            "response_time": float,  # average response time (ms)
+            "error_rate": float,     # error rate (%)
+        }
+    """
+    result = await _get(f"/object/{obj_hash}/realtime/summary")
+    if not result:
+        return {}
+
+    data = _result_of(result, {})
+    # `or` also replaces explicit JSON nulls, which .get(key, default) would let through.
+    return {
+        "cpu": data.get("cpuPct") or 0.0,
+        "heap_used": int((data.get("heapUsed") or 0) // _MIB),
+        "heap_max": int((data.get("heapMax") or 0) // _MIB),
+        "tps": data.get("tps") or 0.0,
+        "active_service": data.get("activeService") or 0,
+        "response_time": data.get("elapsedTime") or 0.0,
+        "error_rate": data.get("errorRate") or 0.0,
+    }
+
+
+async def get_all_metrics() -> list:
+    """
+    Collect the real-time metrics of every monitored server.
+
+    Returns: [{"name": str, "type": str, "host": str, "metrics": {...}}, ...]
+    """
+    results = []
+    for obj in await get_object_list():
+        obj_hash = obj.get("objHash")
+        if obj_hash is None:
+            continue
+
+        results.append({
+            "name": obj.get("objName", "unknown"),
+            "type": obj.get("objType", "unknown"),
+            "host": obj.get("address", ""),
+            "metrics": await get_server_metrics(obj_hash),
+        })
+
+    return results
+
+
+async def get_active_services(obj_hash: int) -> list:
+    """Return the active services (requests currently being processed)."""
+    return _result_of(await _get(f"/object/{obj_hash}/activeService"), [])
+
+
+async def get_xlog_recent(obj_hash: int, limit: int = 20) -> list:
+    """Return the most recent transaction X-Log entries."""
+    result = await _get(f"/xlog/realtime/{obj_hash}", params={"limit": limit})
+    return _result_of(result, [])
+
+
+async def get_alert_list(obj_hash: Optional[int] = None) -> list:
+    """Return Scouter alerts, for one object or for all when obj_hash is None."""
+    # Compare with None so that a hash of 0 is not mistaken for "no filter".
+    path = f"/object/{obj_hash}/alert" if obj_hash is not None else "/alert"
+    return _result_of(await _get(path), [])
+
+
+def _empty_summary(enabled: bool) -> dict:
+    """Return a summary with every documented key present and zeroed."""
+    return {
+        "enabled": enabled,
+        "total_servers": 0,
+        "avg_cpu": 0,
+        "avg_tps": 0,
+        "avg_response_ms": 0,
+        "critical_servers": [],
+        "servers": [],
+    }
+
+
+async def get_summary() -> dict:
+    """
+    Summarise the overall monitoring state (for the GUARDiA dashboard).
+
+    Averages cover only servers whose metrics could be fetched, so an
+    unreachable server does not drag them towards zero.
+
+    Returns:
+        {
+            "enabled": bool,
+            "total_servers": int,
+            "avg_cpu": float,
+            "avg_tps": float,
+            "avg_response_ms": float,
+            "critical_servers": [str],  # CPU > 80% or error rate > 5%
+            "servers": [...]
+        }
+    """
+    if not _ENABLED:
+        return _empty_summary(False)
+
+    all_metrics = await get_all_metrics()
+    if not all_metrics:
+        return _empty_summary(True)
+
+    reporting = [m["metrics"] for m in all_metrics if m["metrics"]]
+
+    def _avg(key: str) -> float:
+        if not reporting:
+            return 0.0
+        return sum(m.get(key, 0) for m in reporting) / len(reporting)
+
+    critical = [
+        m["name"] for m in all_metrics
+        if m["metrics"].get("cpu", 0) > 80
+        or m["metrics"].get("error_rate", 0) > 5
+    ]
+
+    return {
+        "enabled": True,
+        "total_servers": len(all_metrics),
+        "avg_cpu": round(_avg("cpu"), 1),
+        "avg_tps": round(_avg("tps"), 2),
+        "avg_response_ms": round(_avg("response_time"), 1),
+        "critical_servers": critical,
+        "servers": all_metrics,
+    }
diff --git a/main.py b/main.py
index e351ab8..d195139 100644
--- a/main.py
+++ b/main.py
@@ -39,6 +39,7 @@ from routers import (
     license as license_router,
     learning,
     push as push_router,
+    scouter as scouter_router,
 )


@@ -226,6 +227,9 @@ app.include_router(license_router.router)
 # ── G-10: PWA Push 알림 ──────────────────────────────────────────────────
 app.include_router(push_router.router)

+# Scouter APM
+app.include_router(scouter_router.router)
+
 app.mount("/static", StaticFiles(directory="static"), name="static")


diff --git a/routers/scouter.py b/routers/scouter.py
new file mode 100644
index 0000000..12517bf
--- /dev/null
+++ b/routers/scouter.py
@@ -0,0 +1,246 @@
+"""
+Scouter APM integration API router
+
+Endpoints:
+  GET  /api/scouter/status                  — Scouter connection state + overall summary
+  GET  /api/scouter/servers                 — monitored server list
+  GET  /api/scouter/servers/{hash}/metrics  — real-time metrics of one server
+  GET  /api/scouter/servers/{hash}/services — active service list
+  GET  /api/scouter/servers/{hash}/xlog     — recent transaction X-Log
+  GET  /api/scouter/alerts                  — alert list
+  POST /api/scouter/agent/deploy            — deploy the Tomcat agent over SSH (ADMIN/ENGINEER)
+"""
+from __future__ import annotations
+
+import logging
+import os
+import re
+from typing import Optional
+
+from fastapi import APIRouter, Depends, HTTPException
+from pydantic import BaseModel
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from core.auth import get_current_user
+from database import get_db
+from models import User, UserRole
+
+logger = logging.getLogger(__name__)
+router = APIRouter(prefix="/api/scouter", tags=["scouter"])
+
+# Characters allowed in a collector host (host name, IPv4 or IPv6 literal). The value
+# is written to a file through a remote shell, so nothing else may get through.
+_HOST_RE = re.compile(r"[A-Za-z0-9._:-]{1,253}")
+
+
+# ── Schemas ──────────────────────────────────────────────────────────────────
+
+class AgentDeployRequest(BaseModel):
+    server_id: int                       # CMDB server ID
+    scouter_host: Optional[str] = None   # Scouter server host (default: environment variable)
+    scouter_port: int = 6100             # Scouter collector port (default: 6100)
+
+
+# ── Endpoints ────────────────────────────────────────────────────────────────
+
+@router.get("/status")
+async def scouter_status(
+    current_user: User = Depends(get_current_user),
+):
+    """Scouter connection state and overall monitoring summary."""
+    from core.scouter import get_summary, is_available
+    available = await is_available()
+    summary = await get_summary()
+    return {
+        "connected": available,
+        "host": os.getenv("SCOUTER_HOST", ""),
+        "port": os.getenv("SCOUTER_HTTP_PORT", "6180"),
+        **summary,
+    }
+
+
+@router.get("/servers")
+async def list_servers(
+    current_user: User = Depends(get_current_user),
+):
+    """List of monitored objects (servers/services)."""
+    from core.scouter import get_object_list
+    return {"servers": await get_object_list()}
+
+
+@router.get("/servers/{obj_hash}/metrics")
+async def server_metrics(
+    obj_hash: int,
+    current_user: User = Depends(get_current_user),
+):
+    """Real-time metrics of one server (CPU, heap, TPS, response time)."""
+    from core.scouter import get_server_metrics
+    metrics = await get_server_metrics(obj_hash)
+    if not metrics:
+        raise HTTPException(404, "서버 메트릭을 가져올 수 없습니다.")
+    return {"obj_hash": obj_hash, "metrics": metrics}
+
+
+@router.get("/servers/{obj_hash}/services")
+async def active_services(
+    obj_hash: int,
+    current_user: User = Depends(get_current_user),
+):
+    """Active services (requests currently being processed)."""
+    from core.scouter import get_active_services
+    return {"services": await get_active_services(obj_hash)}
+
+
+@router.get("/servers/{obj_hash}/xlog")
+async def xlog(
+    obj_hash: int,
+    limit: int = 20,
+    current_user: User = Depends(get_current_user),
+):
+    """Most recent transaction X-Log entries."""
+    from core.scouter import get_xlog_recent
+    return {"xlogs": await get_xlog_recent(obj_hash, limit)}
+
+
+@router.get("/alerts")
+async def alerts(
+    obj_hash: Optional[int] = None,
+    current_user: User = Depends(get_current_user),
+):
+    """Scouter alert list."""
+    from core.scouter import get_alert_list
+    return {"alerts": await get_alert_list(obj_hash)}
+
+
+@router.post("/agent/deploy")
+async def deploy_agent(
+    body: AgentDeployRequest,
+    db: AsyncSession = Depends(get_db),
+    current_user: User = Depends(get_current_user),
+):
+    """
+    Deploy the Scouter agent configuration to a Tomcat server over SSH (ADMIN/ENGINEER only).
+
+    Steps performed on the target server:
+      1. Create the remote agent directory
+      2. Write agent.conf pointing at the Scouter collector
+      3. Check that scouter-agent.jar is present in that directory
+      4. Add the -javaagent option to JAVA_OPTS in Tomcat's setenv.sh
+
+    Tomcat is not restarted; the response reports restart_required.
+    """
+    if current_user.role not in (UserRole.ADMIN, UserRole.ENGINEER):
+        raise HTTPException(403, "ADMIN 또는 ENGINEER만 에이전트를 배포할 수 있습니다.")
+
+    from models import Server
+
+    srv = await db.get(Server, body.server_id)
+    if not srv:
+        raise HTTPException(404, f"서버 ID {body.server_id}를 찾을 수 없습니다.")
+
+    scouter_host = body.scouter_host or os.getenv("SCOUTER_HOST", "localhost")
+    scouter_port = body.scouter_port
+
+    # Both values end up in a shell command on the target host: reject anything
+    # that is not a plain host name/IP and a valid port before going further.
+    if not _HOST_RE.fullmatch(scouter_host):
+        raise HTTPException(400, "scouter_host 형식이 올바르지 않습니다.")
+    if not 1 <= scouter_port <= 65535:
+        raise HTTPException(400, "scouter_port는 1~65535 범위여야 합니다.")
+
+    try:
+        result = await _deploy_agent_ssh(srv, scouter_host, scouter_port, current_user.username)
+    except Exception as e:
+        logger.exception("Scouter 에이전트 배포 실패 (server_id=%s)", body.server_id)
+        raise HTTPException(500, f"에이전트 배포 실패: {str(e)[:200]}") from e
+
+    return {
+        "message": f"Scouter 에이전트 배포 완료: {srv.server_name}",
+        "server": srv.server_name,
+        "scouter": f"{scouter_host}:{scouter_port}",
+        "deployed_at": result.get("deployed_at"),
+        "restart_required": True,
+    }
+
+
+async def _deploy_agent_ssh(srv, scouter_host: str, scouter_port: int, actor: str) -> dict:
+    """Write the Scouter agent configuration on *srv* and hook it into Tomcat.
+
+    Raises ValueError when the agent JAR is missing locally or remotely, or
+    when a remote command fails. Returns {"deployed_at": <UTC ISO timestamp>}.
+    """
+    from datetime import datetime, timezone
+    from core.ssh_exec import execute_ssh_command
+
+    # Look for the agent file under GUARDIA_ROOT/setup/scouter/ (and fallbacks)
+    agent_local = _find_agent_jar()
+    if not agent_local:
+        raise ValueError("scouter-agent.jar를 찾을 수 없습니다. setup/scouter/에 파일을 배치하세요.")
+
+    logger.info("Scouter 에이전트 배포 시작: server_id=%s actor=%s", srv.id, actor)
+
+    # A line break in the name could end the heredoc below early, so flatten it.
+    obj_name = re.sub(r"[\r\n]+", " ", str(srv.server_name)).strip()
+
+    # Scouter agent configuration file content
+    conf_content = f"""
+net.collector.ip={scouter_host}
+net.collector.udp_port={scouter_port}
+net.collector.tcp_port={scouter_port}
+obj_name={obj_name}
+trace_interservice_enabled=true
+hook_method_patterns=io.guardia.*,com.agency.*
+"""
+
+    # Remote deployment directory
+    remote_dir = "/app/scouter"
+    remote_jar = f"{remote_dir}/scouter-agent.jar"
+
+    async def _run(command: str, failure: str) -> None:
+        """Run one remote command and raise ValueError(failure) if it did not succeed."""
+        result = await execute_ssh_command(srv.id, command, timeout=30)
+        if not result.success:
+            raise ValueError(f"{failure}: {result.error}")
+
+    await _run(f"mkdir -p {remote_dir} && echo OK", "원격 디렉토리 생성 실패")
+    await _run(
+        f"cat > {remote_dir}/agent.conf << 'SCCONF'\n{conf_content}\nSCCONF",
+        "에이전트 설정 파일 생성 실패",
+    )
+    # This function does not upload the JAR. Pointing -javaagent at a missing file
+    # stops the JVM from starting, so refuse to touch setenv.sh without it.
+    await _run(f"test -f {remote_jar}", f"원격 서버에 {remote_jar} 파일이 없습니다")
+
+    # Add the javaagent option to JAVA_OPTS (Tomcat setenv.sh)
+    javaagent_opts = (
+        f'-javaagent:{remote_dir}/scouter-agent.jar '
+        f'-Dscouter.config={remote_dir}/agent.conf'
+    )
+    setenv_cmd = f"""
+SETENV=/app/tomcat/bin/setenv.sh
+if [ -f "$SETENV" ]; then
+    grep -q 'scouter' "$SETENV" || echo 'export JAVA_OPTS="$JAVA_OPTS {javaagent_opts}"' >> "$SETENV"
+else
+    echo '#!/bin/bash' > "$SETENV"
+    echo 'export JAVA_OPTS="$JAVA_OPTS {javaagent_opts}"' >> "$SETENV"
+    chmod +x "$SETENV"
+fi
+echo "setenv.sh 업데이트 완료"
+"""
+    await _run(setenv_cmd, "setenv.sh 업데이트 실패")
+
+    return {"deployed_at": datetime.now(timezone.utc).isoformat()}
+
+
+def _find_agent_jar() -> str:
+    """Return the path of the first Scouter agent JAR found, or "" if none exists."""
+    import pathlib
+    project_root = pathlib.Path(__file__).parent.parent  # routers/ -> project root
+    candidates = [
+        project_root / "setup" / "scouter" / "scouter-agent.jar",
+        # Also accept a setup/ directory one level above the project root.
+        project_root.parent / "setup" / "scouter" / "scouter-agent.jar",
+        pathlib.Path("/opt/scouter/agent/scouter-agent.jar"),
+        pathlib.Path("/app/scouter/scouter-agent.jar"),
+    ]
+    return next((str(p) for p in candidates if p.exists()), "")