"""Data-governance API router.

Provides regex-based PII detection and masking helpers plus the
``/api/dg`` endpoints for scanning, masking, audit-log listing, data
lineage, a compliance checklist and retention-policy listing.
"""
from __future__ import annotations

import hashlib
import logging
import re
from datetime import datetime, timedelta
from typing import Any, Optional

import httpx
from fastapi import APIRouter, Depends, HTTPException, Query, status
from pydantic import BaseModel, Field
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession

from core.auth import get_current_user, require_admin_role
from database import get_db
from models import (
    User,
    AuditLog,
    DataLineage,
    DataRetentionPolicy,
    PIIScanResult,
)

logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/dg", tags=["Data Governance"])

_OLLAMA_URL = "http://localhost:11434/api/generate"

# ── PII detection regexes (Republic of Korea formats) ─────────────────────────
# Dict order matters: _mask_text applies the patterns in this order.
PII_PATTERNS: dict[str, re.Pattern] = {
    "SSN": re.compile(r"\b\d{6}-[1-4]\d{6}\b"),
    "PHONE": re.compile(r"\b01[016789][-.\s]?\d{3,4}[-.\s]?\d{4}\b"),
    "CARD": re.compile(r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b"),
    "ACCOUNT": re.compile(r"\b\d{2,6}-\d{2,6}-\d{2,7}\b"),
    "EMAIL": re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"),
    "IP": re.compile(r"\b(?:(?:25[0-5]|2[0-4]\d|1?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|1?\d?\d)\b"),
}

# Per-match weight used by _risk_level; types missing here count as 1.
PII_RISK_WEIGHT: dict[str, int] = {
    "SSN": 10,
    "CARD": 9,
    "ACCOUNT": 7,
    "PHONE": 5,
    "EMAIL": 4,
    "IP": 2,
}


def _tenant(user: User) -> str:
    """Return the tenant key for *user*: ``inst_code`` if set, else the user id as text."""
    return user.inst_code or str(user.id)


def _detect_pii(text: str) -> dict[str, list[str]]:
    """Find PII in *text*.

    Returns a mapping of PII type to the distinct matched strings, in
    first-seen order. Types with no match are omitted.
    """
    found: dict[str, list[str]] = {}
    for ptype, pattern in PII_PATTERNS.items():
        matches = pattern.findall(text)
        if matches:
            # dict.fromkeys de-duplicates while preserving match order.
            found[ptype] = list(dict.fromkeys(matches))
    return found


def _risk_level(found: dict[str, list[str]]) -> str:
    """Classify a detection result as ``HIGH``, ``MEDIUM``, ``LOW`` or ``NONE``.

    The score is the sum of ``weight * distinct match count`` per type.
    Any SSN or CARD hit is HIGH regardless of score.
    """
    score = sum(PII_RISK_WEIGHT.get(ptype, 1) * len(values) for ptype, values in found.items())
    if score >= 10 or "SSN" in found or "CARD" in found:
        return "HIGH"
    if score >= 4:
        return "MEDIUM"
    return "LOW" if found else "NONE"


def _mask_value(value: str, method: str) -> str:
    """Mask a single matched value.

    ``method == "hash"`` yields ``sha256:`` plus the first 16 hex digits of
    the SHA-256 digest; any other method keeps the first and last character
    and stars the rest (values of length <= 2 are starred entirely).
    """
    if method == "hash":
        # NOTE(review): unsalted, truncated digest of low-entropy values
        # (phone numbers, SSNs) can likely be brute-forced — confirm this is
        # acceptable as a pseudonymisation method.
        return "sha256:" + hashlib.sha256(value.encode()).hexdigest()[:16]
    if len(value) <= 2:
        return "*" * len(value)
    return value[0] + "*" * (len(value) - 2) + value[-1]


def _mask_text(text: str, method: str, pii_types: Optional[list[str]]) -> tuple[str, dict[str, int]]:
    """Mask every PII match in *text*.

    Args:
        text: Input text.
        method: Passed through to ``_mask_value``.
        pii_types: PII type names to mask; ``None`` or an empty list means
            all known types. Unknown names are ignored.

    Returns:
        ``(masked_text, counts)`` where *counts* maps each type that had at
        least one replacement to its replacement count.
    """

    # The replacement callback only depends on `method`, so build it once.
    def _repl(m: re.Match) -> str:
        return _mask_value(m.group(0), method)

    counts: dict[str, int] = {}
    masked = text
    targets = pii_types or list(PII_PATTERNS.keys())
    for ptype in targets:
        pattern = PII_PATTERNS.get(ptype)
        if pattern is None:
            continue
        # Each pattern runs on the output of the previous one.
        masked, n = pattern.subn(_repl, masked)
        if n:
            counts[ptype] = n
    return masked, counts


async def _ollama(prompt: str) -> Optional[str]:
    """Send *prompt* to the Ollama endpoint and return its ``response`` field.

    Best-effort: any failure (connection, timeout, non-2xx status, bad JSON)
    is logged and ``None`` is returned so callers can fall back.
    """
    try:
        async with httpx.AsyncClient(timeout=15) as client:
            resp = await client.post(
                _OLLAMA_URL,
                json={"model": "llama3", "prompt": prompt, "stream": False},
            )
            resp.raise_for_status()
            return resp.json().get("response")
    except Exception:
        # Deliberately broad: AI output is optional and must never fail a request.
        logger.warning("Ollama request failed; continuing without AI output", exc_info=True)
        return None


async def _audit(db: AsyncSession, user: User, action: str, detail: str, severity: str = "INFO") -> None:
    """Add a ``DATA_GOVERNANCE`` audit row to the session and flush it.

    Best-effort: failures are logged, not raised. The caller is responsible
    for committing the session.
    """
    try:
        db.add(AuditLog(
            actor=user.username,
            action=action,
            detail=detail,
            entity_type="DATA_GOVERNANCE",
            severity=severity,
        ))
        await db.flush()
    except Exception:
        # Previously swallowed silently; keep best-effort but leave a trace.
        logger.exception("Failed to record audit log entry (action=%s)", action)


# ── Pydantic schemas ──────────────────────────────────────────────────────────
# Schemas and route handlers are documented with `#` comments rather than
# docstrings, because docstrings there are published in the OpenAPI output.

# Request body for POST /scan.
class ScanIn(BaseModel):
    text: str
    target: str = "inline-text"
    use_ai: bool = False


# Request body for POST /mask; `method` is validated in the handler.
class MaskIn(BaseModel):
    text: str
    method: str = "redact"
    pii_types: Optional[list[str]] = None


# Request body for POST /lineage.
class LineageIn(BaseModel):
    source_system: str
    source_table: str = ""
    target_system: str
    target_table: str = ""
    transformation: str = ""
    pii_involved: bool = False


# Request body for POST /retention-policy; `action` is validated in the handler.
class RetentionPolicyIn(BaseModel):
    table_name: str
    retention_days: int = Field(..., gt=0)
    action: str = "DELETE"
    legal_basis: str = ""


# Request body for POST /retention-enforce.
class EnforceIn(BaseModel):
    policy_id: Optional[int] = None
    dry_run: bool = True


# ── Endpoints ─────────────────────────────────────────────────────────────────

# Scan the submitted text for PII, store a PIIScanResult for the caller's
# tenant, write an audit entry, and return the findings with masked samples.
@router.post("/scan", summary="개인정보 자동 탐지 스캔")
async def scan_pii(
    body: ScanIn,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    tid = _tenant(current_user)
    found = _detect_pii(body.text)
    ai_hint = None
    if body.use_ai:
        ai_hint = await _ollama(
            f"다음 텍스트에 개인정보가 더 있으면 유형만 콤마로 나열하라(없으면 NONE):\n{body.text[:1500]}"
        )
    types_found = list(found.keys())
    match_count = sum(len(v) for v in found.values())
    risk = _risk_level(found)
    rec = PIIScanResult(
        tenant_id=tid,
        scan_target=body.target[:500],
        pii_types_found=types_found,
        match_count=match_count,
        risk_level=risk,
    )
    db.add(rec)
    await db.flush()
    # Read the generated key before commit: with the default
    # expire_on_commit, touching rec.id afterwards would need implicit I/O,
    # which an AsyncSession cannot do.
    scan_id = rec.id
    await _audit(
        db, current_user, "PII_SCAN",
        f"target={body.target} types={types_found} risk={risk}",
        severity="WARN" if risk == "HIGH" else "INFO",
    )
    await db.commit()
    # Only masked forms of up to three values per type are returned.
    samples = {p: [_mask_value(v, "redact") for v in vals[:3]] for p, vals in found.items()}
    return {
        "scan_id": scan_id,
        "target": body.target,
        "pii_types_found": types_found,
        "match_count": match_count,
        "risk_level": risk,
        "masked_samples": samples,
        "ai_hint": ai_hint,
    }


# Mask PII in the submitted text and audit the per-type replacement counts.
# Responds 400 when `method` is neither "redact" nor "hash".
@router.post("/mask", summary="개인정보 마스킹 처리")
async def mask_pii(
    body: MaskIn,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    if body.method not in ("redact", "hash"):
        raise HTTPException(400, "method는 redact 또는 hash여야 합니다")
    masked_text, counts = _mask_text(body.text, body.method, body.pii_types)
    total = sum(counts.values())
    await _audit(db, current_user, "PII_MASK", f"method={body.method} masked={counts}")
    await db.commit()
    return {
        "method": body.method,
        "masked_text": masked_text,
        "masked_counts": counts,
        "total_masked": total,
    }


# Return the newest DATA_GOVERNANCE audit rows, up to `limit`.
# NOTE(review): unlike the other queries in this module there is no tenant
# filter here, so every authenticated user sees entries from all tenants —
# confirm that is intended.
@router.get("/audit-log", summary="개인정보처리방침 준수 감사 로그 조회")
async def dg_audit_log(
    limit: int = Query(50, ge=1, le=500),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    stmt = (
        select(AuditLog)
        .where(AuditLog.entity_type == "DATA_GOVERNANCE")
        .order_by(AuditLog.id.desc())
        .limit(limit)
    )
    rows = (await db.execute(stmt)).scalars().all()
    return {"total": len(rows), "items": [
        {
            "id": r.id,
            "actor": r.actor,
            "action": r.action,
            "detail": r.detail,
            "severity": r.severity,
            "created_at": r.created_at.isoformat() if r.created_at else None,
        }
        for r in rows
    ]}


# Return the lineage edges that write into `table` (upstream) and that read
# from it (downstream) for the caller's tenant.
@router.get("/lineage/{table}", summary="데이터 계보 추적")
async def get_lineage(
    table: str,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    tid = _tenant(current_user)
    upstream = (await db.execute(
        select(DataLineage).where(DataLineage.tenant_id == tid, DataLineage.target_table == table)
    )).scalars().all()
    downstream = (await db.execute(
        select(DataLineage).where(DataLineage.tenant_id == tid, DataLineage.source_table == table)
    )).scalars().all()

    def _ser(r: DataLineage) -> dict[str, Any]:
        return {
            "id": r.id,
            "source_system": r.source_system,
            "source_table": r.source_table,
            "target_system": r.target_system,
            "target_table": r.target_table,
            "transformation": r.transformation,
            "pii_involved": r.pii_involved,
            "created_at": r.created_at.isoformat() if r.created_at else None,
        }

    return {
        "table": table,
        "upstream": [_ser(r) for r in upstream],
        "downstream": [_ser(r) for r in downstream],
        "pii_in_flow": any(r.pii_involved for r in (*upstream, *downstream)),
    }


# Register one lineage edge for the caller's tenant and audit it.
@router.post("/lineage", status_code=201, summary="데이터 계보 등록")
async def create_lineage(
    body: LineageIn,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    tid = _tenant(current_user)
    rec = DataLineage(
        tenant_id=tid,
        source_system=body.source_system,
        source_table=body.source_table,
        target_system=body.target_system,
        target_table=body.target_table,
        transformation=body.transformation,
        pii_involved=body.pii_involved,
    )
    db.add(rec)
    await db.flush()
    # Build the response before commit so no expired attribute is read later.
    response = {
        "id": rec.id,
        "source_system": rec.source_system,
        "target_system": rec.target_system,
        "pii_involved": rec.pii_involved,
    }
    await _audit(
        db, current_user, "LINEAGE_REGISTER",
        f"{body.source_system}.{body.source_table} -> {body.target_system}.{body.target_table}",
    )
    await db.commit()
    return response


# Evaluate a fixed four-item checklist from the tenant's scan and
# retention-policy counts and return the pass rate.
@router.get("/compliance-check", summary="공공데이터법·개인정보보호법 준수 자동 감사")
async def compliance_check(
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    tid = _tenant(current_user)
    high_scans = (await db.execute(
        select(func.count()).select_from(PIIScanResult).where(
            PIIScanResult.tenant_id == tid, PIIScanResult.risk_level == "HIGH"
        )
    )).scalar() or 0
    total_scans = (await db.execute(
        select(func.count()).select_from(PIIScanResult).where(PIIScanResult.tenant_id == tid)
    )).scalar() or 0
    active_policies = (await db.execute(
        select(func.count()).select_from(DataRetentionPolicy).where(
            # `== True` is intentional: it builds a SQL expression.
            DataRetentionPolicy.tenant_id == tid, DataRetentionPolicy.is_active == True  # noqa: E712
        )
    )).scalar() or 0
    checklist = [
        {"law": "개인정보보호법", "item": "개인정보 탐지·스캔 체계 운영",
         "passed": total_scans > 0, "evidence": f"누적 스캔 {total_scans}건"},
        {"law": "개인정보보호법", "item": "보존 기간 정책 수립 및 자동 파기",
         "passed": active_policies > 0, "evidence": f"활성 보존정책 {active_policies}건"},
        {"law": "개인정보보호법", "item": "고위험 PII 미해소 잔존 여부",
         "passed": high_scans == 0, "evidence": f"HIGH 위험 스캔 {high_scans}건"},
        # This item is hard-coded as passed; it is not derived from any query.
        {"law": "전자정부법", "item": "데이터 거버넌스 감사 로그 기록",
         "passed": True, "evidence": "TB_AUDIT_LOG 불변 기록 운영"},
    ]
    passed = sum(1 for c in checklist if c["passed"])
    rate = round(passed / len(checklist) * 100, 1)
    return {
        "compliance_rate": rate,
        "passed": passed,
        "total": len(checklist),
        "checklist": checklist,
        # Naive UTC timestamp (no offset suffix in the ISO string).
        "audited_at": datetime.utcnow().isoformat(),
    }


# List the tenant's retention policies, newest first; only active ones
# unless `active_only` is false.
@router.get("/retention-policy", summary="데이터 보존 기간 정책 목록")
async def list_retention_policies(
    active_only: bool = True,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    tid = _tenant(current_user)
    q = select(DataRetentionPolicy).where(DataRetentionPolicy.tenant_id == tid)
    if active_only:
        q = q.where(DataRetentionPolicy.is_active == True)  # noqa: E712
    rows = (await db.execute(q.order_by(DataRetentionPolicy.id.desc()))).scalars().all()
    return {"total": len(rows), "items": [
        {
            "id": r.id,
            "table_name": r.table_name,
            "retention_days": r.retention_days,
            "action": r.action,
            "is_active": r.is_active,
            "last_enforced": r.last_enforced.isoformat() if r.last_enforced else None,
        }
        for r in rows
    ]}
# Route handlers below are documented with `#` comments rather than
# docstrings, because docstrings on them are published in the OpenAPI output.

# Create an active retention policy for the caller's tenant and audit it.
# Responds 400 when `action` is not DELETE, ANONYMIZE or ARCHIVE.
# NOTE(review): `body.legal_basis` is accepted by the schema but is not
# passed to DataRetentionPolicy here — confirm whether it should be stored.
@router.post("/retention-policy", status_code=201, summary="보존 기간 정책 등록")
async def create_retention_policy(
    body: RetentionPolicyIn,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    if body.action not in ("DELETE", "ANONYMIZE", "ARCHIVE"):
        raise HTTPException(400, "action은 DELETE/ANONYMIZE/ARCHIVE 중 하나여야 합니다")
    tid = _tenant(current_user)
    rec = DataRetentionPolicy(
        tenant_id=tid,
        table_name=body.table_name,
        retention_days=body.retention_days,
        action=body.action,
        is_active=True,
    )
    db.add(rec)
    await db.flush()
    # Build the response before commit: with the default expire_on_commit,
    # reading rec.* afterwards would need implicit I/O, which an
    # AsyncSession cannot do.
    response = {
        "id": rec.id,
        "table_name": rec.table_name,
        "retention_days": rec.retention_days,
        "action": rec.action,
        "is_active": rec.is_active,
    }
    await _audit(
        db, current_user, "RETENTION_POLICY_CREATE",
        f"table={body.table_name} days={body.retention_days} action={body.action}",
    )
    await db.commit()
    return response


# Compute the cutoff date for each active policy of the caller's tenant
# (optionally a single policy). On a real run (dry_run false) this stamps
# `last_enforced` and writes an audit entry; no rows of the governed tables
# are touched here. Responds 404 when no active policy matches.
@router.post("/retention-enforce", summary="만료 데이터 자동 삭제/익명화 실행")
async def enforce_retention(
    body: EnforceIn,
    db: AsyncSession = Depends(get_db),
    current_user=Depends(require_admin_role),
):
    tid = _tenant(current_user)
    q = select(DataRetentionPolicy).where(
        DataRetentionPolicy.tenant_id == tid,
        # `== True` is intentional: it builds a SQL expression.
        DataRetentionPolicy.is_active == True,  # noqa: E712
    )
    if body.policy_id is not None:
        q = q.where(DataRetentionPolicy.id == body.policy_id)
    policies = (await db.execute(q)).scalars().all()
    if not policies:
        raise HTTPException(404, "실행할 활성 보존 정책이 없습니다")

    # One naive-UTC timestamp for the whole request keeps every cutoff and
    # last_enforced value consistent with each other.
    now = datetime.utcnow()
    results = []
    for p in policies:
        cutoff = now - timedelta(days=p.retention_days)
        if not body.dry_run:
            p.last_enforced = now
        results.append({
            "policy_id": p.id,
            "table_name": p.table_name,
            "action": p.action,
            "cutoff_date": cutoff.isoformat(),
            "dry_run": body.dry_run,
        })
    if not body.dry_run:
        # Every other mutating endpoint in this module is audited; do the
        # same for a real enforcement run.
        await _audit(
            db, current_user, "RETENTION_ENFORCE",
            f"policies={[r['policy_id'] for r in results]}",
        )
    await db.commit()
    return {
        "dry_run": body.dry_run,
        "policies_processed": len(results),
        "results": results,
        "note": "실제 데이터 파기는 관리자 승인 후에만 수행됩니다",
    }


# Aggregate the tenant's 500 most recent scan results by risk level and PII
# type. The summary text comes from Ollama when `use_ai` is set and the call
# succeeds; otherwise a fixed fallback sentence is used.
@router.get("/pii-report", summary="PII 탐지 현황 보고서")
async def pii_report(
    use_ai: bool = True,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    tid = _tenant(current_user)
    stmt = (
        select(PIIScanResult)
        .where(PIIScanResult.tenant_id == tid)
        .order_by(PIIScanResult.id.desc())
        .limit(500)
    )
    rows = (await db.execute(stmt)).scalars().all()

    by_risk: dict[str, int] = {}
    by_type: dict[str, int] = {}
    total_matches = 0
    for row in rows:
        by_risk[row.risk_level] = by_risk.get(row.risk_level, 0) + 1
        total_matches += row.match_count or 0
        # by_type counts scans that contained the type, not individual matches.
        for ptype in (row.pii_types_found or []):
            by_type[ptype] = by_type.get(ptype, 0) + 1

    stats = {
        "total_scans": len(rows),
        "total_pii_matches": total_matches,
        "by_risk_level": by_risk,
        "by_pii_type": by_type,
    }
    ai_summary = None
    if use_ai:
        ai_summary = await _ollama(
            f"다음 PII 탐지 현황을 3문장 한국어로 요약하고 개선 권고를 1개 제시하라:\n{stats}"
        )
    if not ai_summary:
        # Fallback when AI is disabled, unavailable, or returned nothing.
        high = by_risk.get("HIGH", 0)
        ai_summary = (
            f"누적 {len(rows)}건 스캔, 총 PII {total_matches}건 탐지. "
            + ("고위험 항목 즉시 마스킹·파기 권고." if high else "현재 고위험 항목 없음. 정기 스캔 유지 권고.")
        )
    return {
        "report": stats,
        "ai_summary": ai_summary,
        "generated_at": datetime.utcnow().isoformat(),
    }