# [Specification] 에이전트리스 배포 엔진 (Agentless Deployment Engine)

---

## 1. 개요

대상 서버에 어떤 소프트웨어도 설치하지 않고, 표준 **SSH/SFTP 프로토콜**만으로 파일 배포 · 서비스 재기동 · 헬스체크 · 자동 롤백을 수행하는 핵심 실행 엔진.

---

## 2. SSH Executor 모듈 (`src/deploy/ssh_executor.py`)

```python
import hashlib
import shlex

import paramiko


class SSHExecutor:
    """Run shell commands and upload files on one remote host over SSH/SFTP.

    The SSH connection is opened in ``__init__``. Release it with ``close()``
    or by using the instance as a context manager.
    """

    def __init__(self, host: str, user: str, password: str, port: int = 22):
        """Connect to ``host:port`` with password authentication.

        Raises:
            Whatever ``paramiko.SSHClient.connect`` raises; the half-built
            client is closed before the exception propagates.
        """
        self.client = paramiko.SSHClient()
        # NOTE(review): AutoAddPolicy accepts any unknown host key, so the
        # first connection to a host is not protected against
        # man-in-the-middle attacks. Consider provisioning known_hosts and
        # switching to RejectPolicy.
        self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            self.client.connect(host, port=port, username=user, password=password, timeout=10)
        except Exception:
            self.client.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def execute_command(self, command: str, timeout: int = 300) -> dict:
        """Run ``command`` remotely and return its exit code and output.

        Callers must check ``exit_code``; a non-zero status is NOT raised.

        Args:
            command: Shell command line, passed to the remote shell as-is.
            timeout: Channel timeout in seconds, forwarded to
                ``exec_command``.

        Returns:
            ``{"exit_code": int, "stdout": str, "stderr": str}`` with both
            streams decoded as UTF-8 (undecodable bytes replaced).
        """
        _stdin, stdout, stderr = self.client.exec_command(command, timeout=timeout)
        # Drain the streams BEFORE asking for the exit status: paramiko
        # documents that recv_exit_status() can hang forever when the remote
        # output is larger than the channel window and nobody is reading it.
        # NOTE: a command that floods stderr while keeping stdout open can
        # still stall here; append `2>&1` to such commands.
        out = stdout.read().decode("utf-8", errors="replace")
        err = stderr.read().decode("utf-8", errors="replace")
        exit_code = stdout.channel.recv_exit_status()
        return {"exit_code": exit_code, "stdout": out, "stderr": err}

    def upload_file(self, local_path: str, remote_path: str) -> str:
        """Upload a file over SFTP and return the MD5 computed on the remote.

        The caller compares the returned digest with the local one to verify
        integrity.

        Raises:
            RuntimeError: ``md5sum`` failed or produced no output on the
                remote host.
        """
        sftp = self.client.open_sftp()
        try:
            sftp.put(local_path, remote_path)
        finally:
            # Always release the SFTP session, even when the transfer fails.
            sftp.close()

        # Quote the path so spaces or shell metacharacters cannot alter the
        # command that is executed.
        result = self.execute_command(f"md5sum {shlex.quote(remote_path)}")
        fields = result["stdout"].split()
        if result["exit_code"] != 0 or not fields:
            raise RuntimeError(
                f"원격 MD5 계산 실패: {remote_path}: {result['stderr'].strip()}"
            )
        return fields[0]  # remote MD5 hex digest

    def close(self):
        """Close the underlying SSH connection."""
        self.client.close()
```

---

## 3. 롤링 배포 엔진 (`src/deploy/rolling_deployer.py`)

```python
import hashlib
import os
import posixpath
import shlex
import time

import requests

from .ssh_executor import SSHExecutor
from ..db.audit import log_execution


class RollingDeployer:
    """Deploy one artifact to a list of nodes, strictly one node at a time."""

    HEALTH_TIMEOUT = 30    # maximum time to wait for the health check (seconds)
    DISK_USAGE_LIMIT = 90  # abort when /app is fuller than this (percent)

    def deploy(self, nodes: list, artifact_path: str, sr_id: str) -> bool:
        """Roll the artifact out node by node; stop at the first failure.

        Returns:
            True when every node was deployed, False as soon as one fails
            (remaining nodes are left untouched).
        """
        for node in nodes:
            if not self._deploy_single_node(node, artifact_path, sr_id):
                return False
        return True

    def _deploy_single_node(self, node: dict, artifact_path: str, sr_id: str) -> bool:
        """Pre-check, back up, upload, restart and health-check one node.

        Any failure is written to the audit log as ``DEPLOY_FAILED``. The
        node is rolled back only if a backup was actually taken, i.e. only
        if the deployment may have changed something.
        """
        ip = node["ip"]
        executor = None
        backup_ts = None  # set only once the backup copy has succeeded
        try:
            # Connect inside the try so connection failures are audited too.
            executor = SSHExecutor(ip, node["user"], node["password"], node.get("ssh_port", 22))
            deploy_path = node["deploy_path"]

            # 1. Pre-check: disk usage
            self._check_disk_usage(executor)

            # 2. Backup (timestamped)
            ts = time.strftime("%Y%m%d_%H%M%S")
            backup_cmd = (
                f"cp -rp {shlex.quote(deploy_path)} "
                f"{shlex.quote(f'{deploy_path}.bak_{ts}')}"
            )
            result = executor.execute_command(backup_cmd)
            if result["exit_code"] != 0:
                # Without a good backup there is nothing safe to roll back to.
                raise Exception(f"백업 실패: {result['stderr']}")
            backup_ts = ts

            # 3. SFTP transfer + integrity verification
            local_md5 = self._local_md5(artifact_path)
            remote_path = posixpath.join(deploy_path, os.path.basename(artifact_path))
            remote_md5 = executor.upload_file(artifact_path, remote_path)
            if local_md5 != remote_md5:
                raise Exception(f"MD5 불일치: {local_md5} vs {remote_md5}")

            # 4. Restart the WAS (only for dynamic resources)
            if node.get("requires_restart", True):
                # The artifact is already at its final location, so no extra
                # copy is needed (copying a file onto itself only errors).
                executor.execute_command("sh /app/scripts/shutdown.sh")
                result = executor.execute_command("sh /app/scripts/startup.sh")
                if result["exit_code"] != 0:
                    raise Exception(f"startup.sh 실패: {result['stderr']}")

            # 5. Health check
            if not self._health_check(ip, node.get("health_port", 8080)):
                raise Exception("헬스체크 실패 — 롤백 시작")

            log_execution(sr_id, ip, "DEPLOY_SUCCESS", 0, "")
            return True

        except Exception as e:
            detail = str(e)
            if executor is not None and backup_ts is not None:
                if not self._rollback(executor, node, backup_ts):
                    detail += " (롤백 실패)"
            log_execution(sr_id, ip, "DEPLOY_FAILED", 1, detail)
            return False

        finally:
            if executor is not None:
                executor.close()

    def _check_disk_usage(self, executor: SSHExecutor) -> None:
        """Raise if /app is fuller than ``DISK_USAGE_LIMIT`` percent."""
        # `df -P` forces the one-line-per-filesystem POSIX format; `df -h`
        # may wrap long device names, which would shift the awk columns.
        result = executor.execute_command("df -P /app | awk 'NR==2{print $5}' | tr -d '%'")
        raw = result["stdout"].strip()
        if not raw.isdigit():
            raise Exception(f"디스크 사용량 확인 실패: {raw!r} {result['stderr'].strip()}")
        if int(raw) > self.DISK_USAGE_LIMIT:
            raise Exception(f"디스크 사용량 {self.DISK_USAGE_LIMIT}% 초과 — 배포 중단")

    def _health_check(self, ip: str, port: int) -> bool:
        """Poll ``http://ip:port`` until it answers 200 or the timeout expires.

        Waits at most ``HEALTH_TIMEOUT`` seconds, retrying every 2 seconds.
        """
        # monotonic() is immune to wall-clock adjustments during the wait.
        deadline = time.monotonic() + self.HEALTH_TIMEOUT
        while time.monotonic() < deadline:
            try:
                r = requests.get(f"http://{ip}:{port}", timeout=3)
                if r.status_code == 200:
                    return True
            except requests.RequestException:
                pass  # service not reachable yet; retry until the deadline
            time.sleep(2)
        return False

    def _rollback(self, executor: SSHExecutor, node: dict, ts: str) -> bool:
        """Restore the timestamped backup and restart the service.

        Returns:
            True when the restore and the restart both succeeded; False
            otherwise. Errors are reported and never raised, because this
            already runs inside failure handling.
        """
        deploy_path = node["deploy_path"]
        live = shlex.quote(deploy_path)
        backup = shlex.quote(f"{deploy_path}.bak_{ts}")
        failed = shlex.quote(f"{deploy_path}.failed_{ts}")
        try:
            executor.execute_command("sh /app/scripts/shutdown.sh")
            # Move the failed deployment aside first: `cp -rp backup live`
            # onto an existing directory would nest the backup INSIDE it
            # instead of restoring it. The failed tree is kept for analysis.
            restore = executor.execute_command(f"mv {live} {failed} && cp -rp {backup} {live}")
            if restore["exit_code"] != 0:
                print(f"[ROLLBACK ERROR] {restore['stderr']}")
                return False
            start = executor.execute_command("sh /app/scripts/startup.sh")
            if start["exit_code"] != 0:
                print(f"[ROLLBACK ERROR] {start['stderr']}")
                return False
            return True
        except Exception as e:
            print(f"[ROLLBACK ERROR] {e}")
            return False

    @staticmethod
    def _local_md5(path: str) -> str:
        """Return the MD5 hex digest of a local file, read in 1 MiB chunks."""
        # MD5 is used only to detect transfer corruption, not tampering.
        digest = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1024 * 1024), b""):
                digest.update(chunk)
        return digest.hexdigest()
```

---

## 4. Command Sanitizer (보안 필터)

```python
import re


class SecurityViolationError(Exception):
    """Raised when a command matches a forbidden pattern.

    Defined here so the snippet is self-contained; if the project already
    has this exception, import that one instead.
    """


# NOTE(review): a blacklist is best-effort only. Variants such as
# `rm -rf ~` or `rm --recursive` are not matched by these patterns.
BLACKLIST_PATTERNS = [
    r"rm\s+-[rRf].*\/",  # rm -rf /
    r"\bmkfs\b",
    r"\bformat\b",
    r"\bdrop\s+table\b",
    r"\btruncate\b",
    r">\s*/dev/sda",
]


def sanitize_command(command: str) -> None:
    """Reject dangerous commands before they are executed.

    Raises:
        SecurityViolationError: ``command`` matches any pattern in
            ``BLACKLIST_PATTERNS`` (case-insensitive).
    """
    for pattern in BLACKLIST_PATTERNS:
        if re.search(pattern, command, re.IGNORECASE):
            raise SecurityViolationError(f"금지 명령어 패턴 탐지: {pattern}")
```

---

## 5. 티어드 롤링 배포 (1,000+ 사이트 대응)

```python
def tiered_rolling_deployment(sites: list, playbook: dict) -> bool:
    """Deploy to all sites tier by tier, aborting on a high failure rate.

    Tier 1 (mission critical): sequential, one site per batch
    Tier 2 (standard sites):   batches of 50 in parallel
    Tier 3 (test/dev):         all sites in one parallel batch

    Returns:
        True when every tier finished; False as soon as the cumulative
        failure rate inside a tier exceeds 10%.
    """
    tiers = _categorize_by_tier(sites)

    for tier_name, tier_sites in tiers.items():
        if not tier_sites:
            continue

        if tier_name == "TIER1":
            batch_size = 1
        elif tier_name == "TIER3":
            # Assumes the tier key is "TIER3", by analogy with "TIER1".
            batch_size = len(tier_sites)
        else:
            batch_size = 50

        results = []
        for i in range(0, len(tier_sites), batch_size):
            batch = tier_sites[i:i + batch_size]
            # Parallel execution (asyncio or ThreadPoolExecutor)
            results.extend(_deploy_batch(batch, playbook))

            if not results:
                continue  # nothing reported yet; avoid dividing by zero

            # Safety gate: abort when more than 10% of this tier has failed.
            fail_rate = sum(1 for r in results if not r) / len(results)
            if fail_rate > 0.1:
                print(f"[ABORT] {tier_name} 실패율 {fail_rate:.0%} > 10% — 전체 배포 중단")
                return False

    return True
```

---

## 6. 배포 파이프라인 API 엔드포인트

```python
import asyncio


@app.post("/api/deploy/trigger")
async def trigger_deployment(sr_id: str, approver_id: str):
    """Start the deployment engine for an approved service request.

    Returns:
        ``{"sr_id": ..., "success": bool}``; an ``"error"`` key is added
        when the deployment could not be started at all.
    """
    # NOTE(review): approver_id is accepted but never verified here. Confirm
    # that approval is enforced before this endpoint can be reached.
    task = db.get_task(sr_id)
    nodes = db.get_server_nodes(task["inst_id"], task["system_name"])
    artifacts = db.get_staged_artifacts(sr_id)
    if not artifacts:
        # Nothing staged: report it instead of failing with IndexError.
        return {"sr_id": sr_id, "success": False, "error": "배포할 아티팩트가 없습니다"}

    # NOTE(review): only the first staged artifact is deployed.
    deployer = RollingDeployer()
    # deploy() blocks on SSH, HTTP and sleep calls; run it in a worker
    # thread so the event loop keeps serving other requests.
    loop = asyncio.get_running_loop()
    success = await loop.run_in_executor(None, deployer.deploy, nodes, artifacts[0], sr_id)

    if success:
        db.update_task_status(sr_id, "PENDING_PM_VALIDATION")
        notify_pm_for_hitl_review(sr_id)
    else:
        db.update_task_status(sr_id, "FAILED_ROLLBACK")
        notify_failure(sr_id)

    return {"sr_id": sr_id, "success": success}
```