diff --git a/.claude/agents/scraping-bot.md b/.claude/agents/scraping-bot.md new file mode 100644 index 0000000..8fb62a0 --- /dev/null +++ b/.claude/agents/scraping-bot.md @@ -0,0 +1,42 @@ +--- +name: scraping-bot +description: "웹 스크랩핑 봇 에이전트. URL 스크랩 → DB 저장 → 상태 관리(DRAFT/PUBLISHED/DELETED) → 메신저 알림까지 담당. BeautifulSoup 기반 HTML 파싱, CSS 셀렉터 지원, 스케줄 스크랩, 원복 기능." +model: opus +--- + +# Scraping Bot — 웹 스크랩핑 자동화 에이전트 + +## 핵심 역할 + +- URL을 스크랩하여 제목·본문·메타를 추출, DB(tb_scraping_result)에 저장 +- 스크랩 결과 상태 관리: DRAFT → PUBLISHED(메신저 알림) / DELETED → 원복(DRAFT) +- 스케줄 스크랩: APScheduler 크론 연동 +- Manager UI에 결과 제공 (삭제·원복·게시) + +## 작업 원칙 + +1. **원본 보존**: 스크랩 시 source_html 전체 저장 → 원복 보장 +2. **중복 방지**: 동일 URL + 동일 일자 스크랩 중복 저장 차단 +3. **타임아웃**: 단일 URL 스크랩 최대 30초 +4. **Fail-Safe**: 스크랩 실패 시 status=FAILED 기록, 서비스 중단 없음 + +## 입력/출력 + +- **입력**: URL (필수), CSS 셀렉터 (선택), 스케줄 (cron) +- **출력**: ScrapingResult (id, title, content, status, scraped_at) + +## 봇 명령어 (messenger.py 연동) + +| 명령어 | 설명 | +|--------|------| +| `!scrap <url>` | URL 즉시 스크랩 | +| `!scrap list [n]` | 최근 n개 결과 목록 | +| `!scrap publish <id>` | 게시 + 메신저 알림 | +| `!scrap del <id>` | 삭제 (소프트) | +| `!scrap restore <id>` | 삭제→DRAFT 원복 | +| `!scrap status <id>` | 결과 상세 조회 | + +## 팀 통신 + +- 수신: guardia-orchestrator, rpa-bot +- 발신: incident-responder (스크랩 반복 실패 시) diff --git a/.claude/skills/scraping-orchestrator/SKILL.md b/.claude/skills/scraping-orchestrator/SKILL.md new file mode 100644 index 0000000..b7fea54 --- /dev/null +++ b/.claude/skills/scraping-orchestrator/SKILL.md @@ -0,0 +1,74 @@ +--- +name: scraping-orchestrator +description: "GUARDiA ITSM 웹 스크랩핑 봇 오케스트레이터. URL 스크랩, DB 저장, 상태관리(DRAFT/PUBLISHED/DELETED), 메신저 알림, Manager UI 연동을 조율한다. 다음 상황에서 반드시 사용: (1) '스크랩', '웹 수집', 'URL 수집', '스크랩핑 봇' 요청; (2) '게시', '원복', '스크랩 삭제' 요청; (3) '!scrap' 봇 명령어 처리; (4) 스크랩 결과 조회, 타겟 등록; (5) 다시 실행, 업데이트, 수정, 보완 요청." 
+--- + +# GUARDiA 스크랩핑 오케스트레이터 + +## 에이전트 팀 + +| 에이전트 | 역할 | +|---------|------| +| scraping-bot | URL 스크랩 실행, 상태 전환, 메신저 알림 | + +## 상태 흐름 + +``` +URL 등록(ScrapingTarget) + → 즉시 또는 스케줄 스크랩 + → DRAFT (저장됨) + → PUBLISHED (게시 + 메신저 알림) + → DELETED (소프트 삭제) + → DRAFT (원복) +``` + +## Phase 0: 요청 분류 + +- **타겟 등록** → `POST /api/scraping/targets` +- **즉시 스크랩** → `POST /api/scraping/run` +- **결과 조회** → `GET /api/scraping/results` +- **게시** → `POST /api/scraping/results/{id}/publish` +- **삭제** → `DELETE /api/scraping/results/{id}` +- **원복** → `POST /api/scraping/results/{id}/restore` + +## Phase 1: 스크랩 실행 + +``` +POST /api/scraping/run +{ "url": "...", "selector": ".content", "target_id": null } + +응답: { id, title, plain_text, status: "DRAFT", scraped_at } +``` + +## Phase 2: 게시 + +``` +POST /api/scraping/results/{id}/publish +{ "room": "ops", "custom_message": "커스텀 메시지 (선택)" } + +→ status: PUBLISHED +→ POST /api/messenger/webhook (scrap_published 이벤트) +``` + +## Phase 3: 삭제/원복 + +``` +DELETE /api/scraping/results/{id} → status: DELETED +POST /api/scraping/results/{id}/restore → status: DRAFT +``` + +## 봇 명령어 (messenger.py) + +| 명령어 | API 호출 | +|--------|---------| +| `!scrap <url>` | POST /api/scraping/run | +| `!scrap list [n]` | GET /api/scraping/results?size=n | +| `!scrap publish <id>` | POST /api/scraping/results/{id}/publish | +| `!scrap del <id>` | DELETE /api/scraping/results/{id} | +| `!scrap restore <id>` | POST /api/scraping/results/{id}/restore | +| `!scrap status <id>` | GET /api/scraping/results/{id} | + +## 테스트 시나리오 + +정상: POST run → DRAFT → publish → PUBLISHED → messenger 수신 +오류: 존재하지 않는 URL → status=FAILED, 서비스 무중단 diff --git a/core/scraping_engine.py b/core/scraping_engine.py new file mode 100644 index 0000000..4b019a5 --- /dev/null +++ b/core/scraping_engine.py @@ -0,0 +1,163 @@ +""" +Scraping Engine — BeautifulSoup 기반 웹 스크랩핑 +""" +from __future__ import annotations + +import re +from datetime import datetime +from typing import Any, Dict, List, Optional +from urllib.parse import 
async def scrape(
    url: str,
    selector: Optional[str] = None,
    timeout: int = 30,
) -> ScrapingResult:
    """Fetch ``url`` and extract its title, meta tags, body, links and images.

    Failures are never raised to the caller; they are reported through
    ``ScrapingResult.error``.

    Args:
        url: Page to fetch. Redirects are followed.
        selector: Optional CSS selector. When given, only the first matching
            element becomes the body. When omitted, the body is chosen from
            article > main > content-like ids/classes > body, taking the first
            candidate with more than 100 characters of text.
        timeout: Timeout in seconds handed to ``httpx.AsyncClient``.

    Returns:
        A ``ScrapingResult``. ``error`` is set when bs4 is not installed, the
        request fails or times out, or the selector is invalid or matches
        nothing. In the selector cases title, meta, links and images are still
        populated.
    """
    result = ScrapingResult()
    result.url = url

    if not _BS4_OK:
        result.error = "bs4 미설치. pip install beautifulsoup4 lxml"
        return result

    try:
        async with httpx.AsyncClient(
            timeout=timeout,
            headers=HEADERS,
            follow_redirects=True,
        ) as client:
            resp = await client.get(url)
            resp.raise_for_status()
            html = resp.text
            # Redirects are followed, so relative references must be resolved
            # against the URL that actually served the document.
            page_url = str(resp.url)
    except httpx.TimeoutException:
        result.error = f"타임아웃 ({timeout}초)"
        return result
    except Exception as e:
        result.error = str(e)[:200]
        return result

    result.source_html = html

    soup = BeautifulSoup(html, "lxml" if _lxml_ok() else "html.parser")

    # Title: og:title wins, then <title>, then the host name of the request.
    title_tag = soup.find("title")
    og_title = soup.find("meta", property="og:title")
    result.title = (
        (og_title.get("content", "") if og_title else "")
        or (title_tag.get_text(strip=True) if title_tag else "")
        or urlparse(url).netloc
    )

    # Meta tags keyed by name (or property); each value capped at 300 chars.
    for m in soup.find_all("meta"):
        name = m.get("name") or m.get("property", "")
        content = m.get("content", "")
        if name and content:
            result.meta[name] = content[:300]

    # Body: explicit selector, or automatic extraction.
    if selector:
        try:
            target = soup.select_one(selector)
        except Exception as e:
            # The selector is user input; a malformed one must surface as a
            # result error like every other failure here, not as an exception.
            target = None
            result.error = f"셀렉터 '{selector}' 오류: {str(e)[:150]}"
        else:
            if target is None:
                result.error = f"셀렉터 '{selector}' 미매칭"
        if target is not None:
            result.content = str(target)
            result.plain_text = target.get_text(separator="\n", strip=True)
    else:
        for tag in ("article", "main", '[id*="content"]', '[class*="content"]',
                    '[id*="article"]', "body"):
            node = soup.select_one(tag)
            if node and len(node.get_text(strip=True)) > 100:
                result.content = str(node)[:200000]
                result.plain_text = _clean_text(node.get_text(separator="\n", strip=True))
                break
        else:
            # No candidate had enough text: fall back to the whole document.
            result.plain_text = _clean_text(soup.get_text(separator="\n", strip=True))
            result.content = html[:200000]

    # Links: first 100 anchors, resolved against the page URL so that
    # document-relative hrefs ("page2.html", "../x") land on the right path.
    for a in soup.find_all("a", href=True)[:100]:
        href = urljoin(page_url, a["href"])
        if href.startswith("http"):
            result.links.append(href)

    # Images: first 30 <img src>, resolved the same way.
    for img in soup.find_all("img", src=True)[:30]:
        src = urljoin(page_url, img["src"])
        if src.startswith("http"):
            result.images.append(src)

    return result
# Request body for a scrape target. The scraping router uses it for both
# POST /targets (create) and PUT /targets/{target_id} (every field is written
# onto the existing row).
class ScrapingTargetCreate(BaseModel):
    name: str  # display name; stored in a String(100) column, length not checked here
    url: str  # page to scrape; stored in a String(500) column, not validated as a URL here
    selector: Optional[str] = None  # optional CSS selector limiting extraction to one element
    schedule: Optional[str] = None  # optional cron expression; the router reads its first five fields
    is_active: bool = True  # a cron job is registered on create only when this is true and schedule is set
    note: Optional[str] = None  # free-form memo
b/routers/messenger.py index 7d4fe09..2cbbc8d 100644 --- a/routers/messenger.py +++ b/routers/messenger.py @@ -138,6 +138,12 @@ def _format_event_message(event: MessengerEvent) -> str: f"SR: {event.sr_id or '—'}\n" f"{event.summary or ''}" ) + elif event.event == "scrap_published": + return ( + f"[스크랩 게시] {event.title or '제목 없음'}\n" + f"결과 ID: #{event.sr_id or '—'}\n" + f"{event.result_summary or ''}" + ) else: return f"[{event.event}] SR: {event.sr_id or '—'}" @@ -330,6 +336,55 @@ async def handle_bot_command( reply = await _cmd_oncall() return BotReply(room=cmd.room, text=reply) + # ── !scrap ─── 웹 스크랩핑 봇 ─────────────────────────────────────────── + elif keyword in ("!scrap", "/scrap"): + if len(parts) < 2: + return BotReply(room=cmd.room, text=( + "사용법:\n" + " !scrap → 즉시 스크랩\n" + " !scrap list [n] → 최근 n개 결과\n" + " !scrap publish → 게시 + 메신저 알림\n" + " !scrap del → 삭제\n" + " !scrap restore → 원복\n" + " !scrap status → 상세 조회" + )) + sub = parts[1].lower() + + if sub == "list": + n = int(parts[2]) if len(parts) >= 3 and parts[2].isdigit() else 5 + reply = await _cmd_scrap_list(n) + return BotReply(room=cmd.room, text=reply) + + elif sub == "publish": + if len(parts) < 3 or not parts[2].isdigit(): + return BotReply(room=cmd.room, text="사용법: !scrap publish ") + bg.add_task(_cmd_scrap_publish, cmd.room, cmd.user, int(parts[2])) + return BotReply(room=cmd.room, text=f"[스크랩 게시] #{parts[2]} 게시 처리 중...") + + elif sub in ("del", "delete"): + if len(parts) < 3 or not parts[2].isdigit(): + return BotReply(room=cmd.room, text="사용법: !scrap del ") + reply = await _cmd_scrap_delete(int(parts[2])) + return BotReply(room=cmd.room, text=reply) + + elif sub == "restore": + if len(parts) < 3 or not parts[2].isdigit(): + return BotReply(room=cmd.room, text="사용법: !scrap restore ") + reply = await _cmd_scrap_restore(int(parts[2])) + return BotReply(room=cmd.room, text=reply) + + elif sub == "status": + if len(parts) < 3 or not parts[2].isdigit(): + return BotReply(room=cmd.room, 
async def _cmd_scrap_url(room: str, actor: str, url: str) -> None:
    """Scrape ``url`` now, store the outcome, and post a summary to ``room``.

    A row is written whether or not the scrape succeeded: status is FAILED
    when the engine reports an error, otherwise DRAFT. Any exception raised
    along the way is reported to the room instead of propagating.

    Args:
        room: Messenger room that receives the reply; also stored on the row.
        actor: User who issued the command; stored as ``scraped_by``.
        url: Page to scrape.
    """
    from core.scraping_engine import scrape as _scrape
    try:
        eng = await _scrape(url)
        async with SessionLocal() as db:
            from models import ScrapingResult
            rec = ScrapingResult(
                # tb_scraping_result.title is String(300). The page title is
                # attacker-controlled and the fallback is the whole URL, so
                # cap it rather than let a strict database reject the insert.
                title=(eng.title or url)[:300],
                content=eng.content,
                plain_text=eng.plain_text,
                url=url,
                source_html=eng.source_html,
                status="FAILED" if eng.error else "DRAFT",
                meta=eng.meta,
                error_msg=eng.error,
                scraped_by=actor,
                messenger_room=room,
            )
            db.add(rec)
            await db.commit()
            await db.refresh(rec)
            # Copy what the reply needs while the session is still open.
            rid = rec.id
            title = rec.title
            status = rec.status
            err = rec.error_msg

        if err:
            msg = f"[스크랩 실패] #{rid}\n오류: {err}"
        else:
            summary = (eng.plain_text or "")[:200]
            msg = (
                f"[스크랩 완료] #{rid} — {title}\n"
                f"URL: {url}\n"
                f"요약: {summary}{'...' if len(eng.plain_text or '') > 200 else ''}\n"
                f"상태: {status}\n"
                f"게시: !scrap publish {rid}"
            )
    except Exception as e:
        msg = f"[스크랩 오류] {str(e)[:150]}"
    await _send_to_room(room, msg)
async def _cmd_scrap_status(result_id: int) -> str:
    """Return a multi-line detail view of one scrape result for the bot reply.

    Args:
        result_id: Primary key of the ScrapingResult row.

    Returns:
        The formatted detail text, a not-found message, or an error message
        when the lookup raises.
    """
    try:
        from models import ScrapingResult
        async with SessionLocal() as db:
            r = await db.get(ScrapingResult, result_id)
            if not r:
                return f"#{result_id} 결과를 찾을 수 없습니다."
            lines = [
                f"[스크랩 상세] #{r.id}",
                f"제목: {r.title or '—'}",
                f"URL: {r.url}",
                f"상태: {r.status}",
                f"수집일시: {r.scraped_at.strftime('%Y-%m-%d %H:%M:%S')}",
            ]
            if r.published_at:
                lines.append(f"게시일시: {r.published_at.strftime('%Y-%m-%d %H:%M:%S')}")
            if r.error_msg:
                lines.append(f"오류: {r.error_msg[:100]}")
            if r.plain_text:
                # Add the ellipsis only when the text really was cut, matching
                # how the other scrap replies build their summaries.
                ellipsis = "..." if len(r.plain_text) > 200 else ""
                lines.append(f"요약: {r.plain_text[:200]}{ellipsis}")
            return "\n".join(lines)
    except Exception as e:
        return f"조회 오류: {e}"
@router.delete("/targets/{target_id}")
async def delete_target(
    target_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Delete a scrape target and unregister its scheduled scrape job.

    Raises:
        HTTPException: 404 when the target does not exist.
    """
    t = await db.get(ScrapingTarget, target_id)
    if not t:
        raise HTTPException(404, "스크랩 타겟을 찾을 수 없습니다.")
    await db.delete(t)
    await db.commit()

    # create_target registers a cron job under this id. Without removing it,
    # the scheduler keeps scraping on behalf of a target that no longer exists.
    # Best-effort, like registration: the row is already gone, so a scheduler
    # problem is logged rather than turned into a failed request.
    try:
        from core.scheduler import scheduler
        job_id = f"scrape_target_{target_id}"
        if scheduler.get_job(job_id) is not None:
            scheduler.remove_job(job_id)
    except Exception as e:
        import logging
        logging.getLogger(__name__).warning("scrape cron 해제 실패: %s", e)

    return {"deleted": target_id}
@router.get("/results", response_model=List[ScrapingResultOut])
async def list_results(
    status: Optional[str] = Query(None),
    target_id: Optional[int] = Query(None),
    page: int = Query(1, ge=1),
    size: int = Query(20, ge=1, le=100),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """List scrape results, newest first, with optional filters and paging.

    Args:
        status: When non-empty, keep only results with exactly this status.
        target_id: When given, keep only results belonging to this target.
        page: 1-based page number.
        size: Page size (1-100).
    """
    q = select(ScrapingResult).order_by(desc(ScrapingResult.scraped_at))
    if status:
        q = q.where(ScrapingResult.status == status)
    if target_id is not None:
        q = q.where(ScrapingResult.target_id == target_id)
    # The response is a plain list, so no total is returned. The COUNT subquery
    # that used to run here was an extra round-trip whose result was discarded.
    q = q.offset((page - 1) * size).limit(size)
    return (await db.execute(q)).scalars().all()
@router.post("/results/{result_id}/publish")
async def publish_result(
    result_id: int,
    body: PublishRequest,
    bg: BackgroundTasks,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Move a DRAFT result to PUBLISHED and queue the messenger notification.

    Args:
        result_id: Primary key of the result to publish.
        body: Target room and optional custom notification text.

    Returns:
        ``{"id", "status", "published_at"}`` for the published row.

    Raises:
        HTTPException: 404 when the result does not exist; 400 when it is
            already PUBLISHED, is FAILED, or is DELETED.
    """
    r = await db.get(ScrapingResult, result_id)
    if not r:
        raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")
    if r.status == "PUBLISHED":
        raise HTTPException(400, "이미 게시된 결과입니다.")
    if r.status == "FAILED":
        raise HTTPException(400, "실패한 스크랩은 게시할 수 없습니다.")
    if r.status == "DELETED":
        # A soft-deleted row must be restored to DRAFT first. Publishing it
        # directly would leave deleted_at set on a PUBLISHED row.
        raise HTTPException(400, "삭제된 결과는 원복 후 게시할 수 있습니다.")

    r.status = "PUBLISHED"
    r.published_at = datetime.now()
    r.published_by = current_user.username
    r.messenger_room = body.room
    await db.commit()
    await db.refresh(r)

    # The notification runs after the response is sent.
    bg.add_task(_notify_publish, r.id, r.title, r.url,
                r.plain_text, body.room, body.custom_message, current_user.username)
    return {"id": r.id, "status": "PUBLISHED", "published_at": r.published_at.isoformat()}
@router.get("/stats")
async def scraping_stats(
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return result counts per status plus the number of registered targets.

    Returns:
        A dict with keys ``draft``, ``published``, ``deleted``, ``failed``
        (0 when no row has that status) and ``targets``.
    """
    tracked = ("DRAFT", "PUBLISHED", "DELETED", "FAILED")
    # One grouped query instead of one COUNT round-trip per status.
    rows = (await db.execute(
        select(ScrapingResult.status, func.count())
        .where(ScrapingResult.status.in_(tracked))
        .group_by(ScrapingResult.status)
    )).all()
    counts = {row_status: cnt for row_status, cnt in rows}

    # Statuses with no rows are absent from the grouped result; report them as 0.
    stats = {name.lower(): counts.get(name, 0) for name in tracked}
    stats["targets"] = (await db.execute(
        select(func.count()).select_from(ScrapingTarget)
    )).scalar_one()
    return stats
def _run_scrape_background(
    target_id: int, url: str, selector: Optional[str], actor: str,
) -> None:
    """Synchronous job body for a scheduled scrape of one target.

    Scrapes ``url``, stores the outcome as a new ScrapingResult (FAILED when
    the engine reports an error, otherwise DRAFT) and updates the target's
    ``last_scraped`` when the target still exists.

    Args:
        target_id: Target the result is attached to.
        url: Page to scrape.
        selector: Optional CSS selector passed through to the engine.
        actor: Stored as ``scraped_by`` on the result.
    """
    async def _inner() -> None:
        eng = await _scrape(url, selector)
        async with SessionLocal() as db:
            rec = ScrapingResult(
                target_id=target_id, title=eng.title or url,
                content=eng.content, plain_text=eng.plain_text,
                url=url, source_html=eng.source_html,
                status="FAILED" if eng.error else "DRAFT",
                meta=eng.meta, error_msg=eng.error, scraped_by=actor,
            )
            db.add(rec)
            t = await db.get(ScrapingTarget, target_id)
            if t:
                t.last_scraped = datetime.now()
            await db.commit()

    # asyncio.run creates a fresh loop and always closes it, even when the
    # coroutine raises. The previous new_event_loop/run_until_complete/close
    # sequence skipped close() on failure.
    # NOTE(review): this assumes the scheduler calls the job from a thread with
    # no running event loop, and that SessionLocal's engine tolerates being
    # used from a loop other than the application's. Confirm both.
    asyncio.run(_inner())