""" Scouter APM 연동 — GUARDiA ITSM Scouter는 Java WAS(Tomcat/JBoss/JEUs) 전문 APM 도구입니다. Scouter HTTP API를 통해 실시간 메트릭을 수집하여 GUARDiA 대시보드에 표시합니다. 환경변수: SCOUTER_HOST : Scouter 서버 호스트 (기본: localhost) SCOUTER_HTTP_PORT : Scouter HTTP API 포트 (기본: 6180) SCOUTER_USER : Scouter 사용자 (기본: admin) SCOUTER_PASSWORD : Scouter 비밀번호 (기본: admin) Scouter HTTP API 문서: https://github.com/scouter-project/scouter/wiki/Scouter-HTTP-API """ from __future__ import annotations import logging import os from typing import Optional import httpx logger = logging.getLogger(__name__) SCOUTER_HOST = os.getenv("SCOUTER_HOST", "localhost") SCOUTER_HTTP_PORT = int(os.getenv("SCOUTER_HTTP_PORT", "6180")) SCOUTER_USER = os.getenv("SCOUTER_USER", "admin") SCOUTER_PASSWORD = os.getenv("SCOUTER_PASSWORD", "admin") _BASE = f"http://{SCOUTER_HOST}:{SCOUTER_HTTP_PORT}/scouter/v1" _ENABLED = bool(os.getenv("SCOUTER_HOST")) # 환경변수 없으면 비활성화 async def _get(path: str, params: dict = None) -> Optional[dict]: """Scouter HTTP API GET 요청.""" if not _ENABLED: return None try: async with httpx.AsyncClient(timeout=5.0) as client: r = await client.get( f"{_BASE}{path}", params=params or {}, auth=(SCOUTER_USER, SCOUTER_PASSWORD), ) if r.status_code == 200: return r.json() except Exception as e: logger.debug("Scouter API 오류 (%s): %s", path, str(e)[:80]) return None async def is_available() -> bool: """Scouter 서버 응답 여부 확인.""" result = await _get("/info/version") return result is not None async def get_object_list() -> list: """모니터링 대상 객체(서버/서비스) 목록 조회.""" result = await _get("/object") if not result: return [] return result.get("result", []) async def get_server_metrics(obj_hash: int) -> dict: """ 특정 서버의 실시간 메트릭 조회. Returns: { "cpu": float, # CPU 사용률 (%) "heap_used": int, # Heap 사용량 (MB) "heap_max": int, # Heap 최대 (MB) "tps": float, # 초당 트랜잭션 "active_service": int, # 활성 서비스 수 "response_time": float, # 평균 응답시간 (ms) "error_rate": float, # 에러율 (%) } """ result = await _get(f"/object/{obj_hash}/realtime/summary") if not result: return {} data = result.get("result", {}) return { "cpu": data.get("cpuPct", 0.0), "heap_used": data.get("heapUsed", 0) // (1024 * 1024), "heap_max": data.get("heapMax", 0) // (1024 * 1024), "tps": data.get("tps", 0.0), "active_service": data.get("activeService", 0), "response_time": data.get("elapsedTime", 0.0), "error_rate": data.get("errorRate", 0.0), } async def get_all_metrics() -> list: """ 모든 모니터링 대상 서버의 실시간 메트릭 수집. Returns: [{"name": str, "type": str, "metrics": {...}}, ...] """ objects = await get_object_list() if not objects: return [] results = [] for obj in objects: obj_hash = obj.get("objHash") if not obj_hash: continue metrics = await get_server_metrics(obj_hash) results.append({ "name": obj.get("objName", "unknown"), "type": obj.get("objType", "unknown"), "host": obj.get("address", ""), "metrics": metrics, }) return results async def get_active_services(obj_hash: int) -> list: """활성 서비스(현재 처리 중인 요청) 목록 조회.""" result = await _get(f"/object/{obj_hash}/activeService") if not result: return [] return result.get("result", []) async def get_xlog_recent(obj_hash: int, limit: int = 20) -> list: """최근 트랜잭션 X-Log 조회.""" result = await _get(f"/xlog/realtime/{obj_hash}", params={"limit": limit}) if not result: return [] return result.get("result", []) async def get_alert_list(obj_hash: int = None) -> list: """Scouter 경보 목록 조회.""" path = f"/object/{obj_hash}/alert" if obj_hash else "/alert" result = await _get(path) if not result: return [] return result.get("result", []) async def get_summary() -> dict: """ 전체 모니터링 현황 요약 (GUARDiA 대시보드용). Returns: { "enabled": bool, "total_servers": int, "avg_cpu": float, "avg_tps": float, "avg_response_ms": float, "critical_servers": [str], # CPU > 80% or 에러율 > 5% "servers": [...] } """ if not _ENABLED: return {"enabled": False, "total_servers": 0, "avg_cpu": 0, "avg_tps": 0} all_metrics = await get_all_metrics() if not all_metrics: return {"enabled": True, "total_servers": 0, "avg_cpu": 0, "avg_tps": 0, "servers": []} total = len(all_metrics) avg_cpu = sum(m["metrics"].get("cpu", 0) for m in all_metrics) / total avg_tps = sum(m["metrics"].get("tps", 0) for m in all_metrics) / total avg_resp = sum(m["metrics"].get("response_time", 0) for m in all_metrics) / total critical = [ m["name"] for m in all_metrics if m["metrics"].get("cpu", 0) > 80 or m["metrics"].get("error_rate", 0) > 5 ] return { "enabled": True, "total_servers": total, "avg_cpu": round(avg_cpu, 1), "avg_tps": round(avg_tps, 2), "avg_response_ms": round(avg_resp, 1), "critical_servers": critical, "servers": all_metrics, }