guardia-itsm/routers/scraping.py
DESKTOP-TKLFCPRython 9514379a96 feat(scraping): 웹 스크랩핑 봇 전체 구현 + 하네스 구성
[하네스]
- agents/scraping-bot.md: 스크랩 봇 에이전트
- skills/scraping-orchestrator/SKILL.md: E2E 워크플로우

[ITSM Backend]
- models.py: ScrapingTarget + ScrapingResult ORM + Pydantic 스키마
- core/scraping_engine.py: BeautifulSoup 기반 비동기 스크래퍼
- routers/scraping.py: 13개 API (타겟 CRUD + run + 게시/삭제/원복 + 통계)
- routers/messenger.py: !scrap 봇 명령어 6종 + scrap_published 이벤트
- main.py: scraping 라우터 등록

[Manager UI]
- ScrapingManager.tsx: 결과 목록/상세/게시/삭제/원복 + 타겟 관리
- Sidebar.tsx: 🕷️ 스크랩핑 봇 메뉴 추가
- App.tsx: /scraping 라우트 추가

[테스트 결과 - 전체 통과]
- T1 타겟 등록 OK
- T2 즉시 스크랩: zioinfo.co.kr → DRAFT
- T3 결과 목록 조회 OK
- T4 게시: DRAFT → PUBLISHED + 메신저 알림
- T5/T6/T7 두번째 스크랩 → 삭제 → 원복 OK
- T8 통계: draft:1, published:1, deleted:0
- T9 !scrap list 봇 명령어 OK
- T10 !scrap status 봇 명령어 OK

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-31 16:52:49 +09:00

379 lines
13 KiB
Python

"""
스크랩핑 봇 라우터
- ScrapingTarget CRUD (스크랩 대상 등록)
- 즉시/스케줄 스크랩 실행
- 결과 관리: DRAFT → PUBLISHED(메신저 알림) / DELETED / 원복
"""
from __future__ import annotations
import asyncio
from datetime import datetime
from typing import Any, Dict, List, Optional
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import select, func, desc
from sqlalchemy.ext.asyncio import AsyncSession
from core.auth import get_current_user
from core.scraping_engine import scrape as _scrape
from database import get_db, SessionLocal
from models import (
ScrapingTarget, ScrapingResult,
ScrapingTargetOut, ScrapingTargetCreate,
ScrapingResultOut, User,
)
router = APIRouter(prefix="/api/scraping", tags=["scraping"])
# ── ScrapingTarget CRUD ───────────────────────────────────────────────────────
@router.post("/targets", response_model=ScrapingTargetOut)
async def create_target(
    body: ScrapingTargetCreate,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Register a new scrape target URL.

    The row is committed and refreshed before any cron registration so the
    target's ``id`` can be used to build the scheduler job. A cron job is
    registered only when the request carries a schedule and the target is
    flagged active.
    """
    owner = current_user.username
    new_target = ScrapingTarget(
        name=body.name,
        url=body.url,
        selector=body.selector,
        schedule=body.schedule,
        is_active=body.is_active,
        note=body.note,
        created_by=owner,
    )
    db.add(new_target)
    await db.commit()
    await db.refresh(new_target)

    # Nothing to schedule for targets without a schedule or inactive ones.
    if not (body.schedule and body.is_active):
        return new_target

    _register_scrape_cron(
        new_target.id,
        new_target.url,
        body.schedule,
        body.selector,
        owner,
    )
    return new_target
@router.get("/targets", response_model=List[ScrapingTargetOut])
async def list_targets(
    is_active: Optional[bool] = Query(None),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return scrape targets ordered by ``created_at`` descending.

    When ``is_active`` is supplied, only targets whose ``is_active`` flag
    equals it are returned; otherwise all targets are listed.
    """
    stmt = select(ScrapingTarget)
    if is_active is not None:
        stmt = stmt.where(ScrapingTarget.is_active == is_active)
    stmt = stmt.order_by(desc(ScrapingTarget.created_at))

    rows = await db.execute(stmt)
    return rows.scalars().all()
@router.get("/targets/{target_id}", response_model=ScrapingTargetOut)
async def get_target(
    target_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Fetch a single scrape target by primary key.

    Raises:
        HTTPException: 404 when no target with ``target_id`` exists.
    """
    target = await db.get(ScrapingTarget, target_id)
    if target:
        return target
    raise HTTPException(404, "스크랩 타겟을 찾을 수 없습니다.")
@router.put("/targets/{target_id}", response_model=ScrapingTargetOut)
async def update_target(
    target_id: int,
    body: ScrapingTargetCreate,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Overwrite a scrape target's fields and keep its cron job in sync.

    Every field of ``body`` is copied onto the stored target. After the
    commit, the scheduler job for this target is re-registered when the
    target has a schedule and is active, and removed otherwise, so that a
    changed or cleared schedule does not leave a stale job behind.

    Raises:
        HTTPException: 404 when no target with ``target_id`` exists.
    """
    t = await db.get(ScrapingTarget, target_id)
    if not t:
        raise HTTPException(404, "스크랩 타겟을 찾을 수 없습니다.")

    for field, value in body.model_dump().items():
        setattr(t, field, value)
    await db.commit()
    await db.refresh(t)

    if t.schedule and t.is_active:
        # The registration helper uses replace_existing=True, so calling it
        # again swaps the old job for one carrying the new settings.
        _register_scrape_cron(
            t.id, t.url, t.schedule, t.selector, current_user.username,
        )
    else:
        # Best-effort removal: a scheduler problem must not fail the update.
        # NOTE(review): assumes core.scheduler.scheduler exposes the
        # APScheduler get_job/remove_job API - confirm.
        try:
            from core.scheduler import scheduler

            job_id = f"scrape_target_{t.id}"
            if scheduler.get_job(job_id) is not None:
                scheduler.remove_job(job_id)
        except Exception as exc:
            import logging

            logging.getLogger(__name__).warning("scrape cron 해제 실패: %s", exc)
    return t
@router.delete("/targets/{target_id}")
async def delete_target(
    target_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Delete a scrape target and unschedule its cron job, if any.

    Returns:
        ``{"deleted": target_id}`` on success.

    Raises:
        HTTPException: 404 when no target with ``target_id`` exists.
    """
    t = await db.get(ScrapingTarget, target_id)
    if not t:
        raise HTTPException(404, "스크랩 타겟을 찾을 수 없습니다.")

    await db.delete(t)
    await db.commit()

    # Without this the job registered at creation time would keep firing for
    # a target that no longer exists. Best-effort: the row is already gone,
    # so a scheduler error is logged rather than raised.
    # NOTE(review): assumes core.scheduler.scheduler exposes the APScheduler
    # get_job/remove_job API - confirm.
    try:
        from core.scheduler import scheduler

        job_id = f"scrape_target_{target_id}"
        if scheduler.get_job(job_id) is not None:
            scheduler.remove_job(job_id)
    except Exception as exc:
        import logging

        logging.getLogger(__name__).warning("scrape cron 해제 실패: %s", exc)

    return {"deleted": target_id}
# ── 스크랩 실행 ──────────────────────────────────────────────────────────────
class RunRequest(BaseModel):
    # Request body for an immediate scrape run.
    # URL to scrape.
    url: str
    # Optional selector forwarded to the scraping engine.
    selector: Optional[str] = None
    # Optional ScrapingTarget id to associate the result with.
    target_id: Optional[int] = None
    # Messenger room stored on the resulting record.
    messenger_room: str = "ops"
@router.post("/run", response_model=ScrapingResultOut)
async def run_scrape(
    body: RunRequest,
    bg: BackgroundTasks,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Scrape a URL immediately and persist the outcome.

    The record is stored with status ``FAILED`` when the engine reports an
    error and ``DRAFT`` otherwise. If ``body.target_id`` refers to an
    existing target, that target's ``last_scraped`` timestamp is updated in
    the same commit.
    """
    outcome = await _scrape(body.url, body.selector)

    initial_status = "DRAFT"
    if outcome.error:
        initial_status = "FAILED"

    record = ScrapingResult(
        target_id=body.target_id,
        # Fall back to the URL when the page yields no title.
        title=outcome.title or body.url,
        content=outcome.content,
        plain_text=outcome.plain_text,
        url=body.url,
        source_html=outcome.source_html,
        status=initial_status,
        meta=outcome.meta,
        error_msg=outcome.error,
        messenger_room=body.messenger_room,
        scraped_by=current_user.username,
    )
    db.add(record)

    linked_target = None
    if body.target_id:
        linked_target = await db.get(ScrapingTarget, body.target_id)
    if linked_target:
        linked_target.last_scraped = datetime.now()

    await db.commit()
    await db.refresh(record)
    return record
# ── 결과 조회 ─────────────────────────────────────────────────────────────────
@router.get("/results", response_model=List[ScrapingResultOut])
async def list_results(
    status: Optional[str] = Query(None),
    target_id: Optional[int] = Query(None),
    page: int = Query(1, ge=1),
    size: int = Query(20, ge=1, le=100),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return one page of scrape results, newest ``scraped_at`` first.

    Args:
        status: When given, keep only results with exactly this status.
        target_id: When given (and non-zero), keep only results of this target.
        page: 1-based page number.
        size: Page size, between 1 and 100.

    Returns:
        The list of result rows for the requested page. The response is a
        plain list, so only the page query is executed.
    """
    q = select(ScrapingResult).order_by(desc(ScrapingResult.scraped_at))
    if status:
        q = q.where(ScrapingResult.status == status)
    if target_id:
        q = q.where(ScrapingResult.target_id == target_id)

    q = q.offset((page - 1) * size).limit(size)
    return (await db.execute(q)).scalars().all()
@router.get("/results/{result_id}", response_model=ScrapingResultOut)
async def get_result(
    result_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Fetch a single scrape result by primary key.

    Raises:
        HTTPException: 404 when no result with ``result_id`` exists.
    """
    found = await db.get(ScrapingResult, result_id)
    if found:
        return found
    raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")
@router.get("/results/{result_id}/html")
async def get_result_html(
    result_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return the stored source HTML of a result (used for restore preview).

    Returns:
        A dict with ``id``, ``url`` and ``source_html``; ``source_html`` is
        an empty string when nothing was stored.

    Raises:
        HTTPException: 404 when no result with ``result_id`` exists.
    """
    r = await db.get(ScrapingResult, result_id)
    if not r:
        # Same detail message as the other result endpoints.
        raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")
    return {"id": r.id, "url": r.url, "source_html": r.source_html or ""}
# ── 상태 전환: 게시 ───────────────────────────────────────────────────────────
class PublishRequest(BaseModel):
    # Request body for publishing a scrape result.
    # Messenger room the publish notification is sent to.
    room: str = "ops"
    # When set, replaces the generated notification text.
    custom_message: Optional[str] = None
@router.post("/results/{result_id}/publish")
async def publish_result(
    result_id: int,
    body: PublishRequest,
    bg: BackgroundTasks,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Publish a draft result (DRAFT -> PUBLISHED) and notify the messenger.

    The status change is committed first; the messenger notification is
    queued as a background task afterwards.

    Returns:
        A dict with the result ``id``, the new ``status`` and the ISO
        formatted ``published_at`` timestamp.

    Raises:
        HTTPException: 404 when the result does not exist; 400 when it is
            already published, failed, or soft-deleted.
    """
    r = await db.get(ScrapingResult, result_id)
    if not r:
        raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")
    if r.status == "PUBLISHED":
        raise HTTPException(400, "이미 게시된 결과입니다.")
    if r.status == "FAILED":
        raise HTTPException(400, "실패한 스크랩은 게시할 수 없습니다.")
    if r.status == "DELETED":
        # A soft-deleted result must be restored before it can be published;
        # otherwise it would become PUBLISHED with deleted_at still set.
        raise HTTPException(400, "삭제된 결과는 게시할 수 없습니다. 먼저 원복하세요.")

    r.status = "PUBLISHED"
    r.published_at = datetime.now()
    r.published_by = current_user.username
    r.messenger_room = body.room
    await db.commit()
    await db.refresh(r)

    bg.add_task(
        _notify_publish,
        r.id, r.title, r.url, r.plain_text,
        body.room, body.custom_message, current_user.username,
    )
    return {
        "id": r.id,
        "status": "PUBLISHED",
        "published_at": r.published_at.isoformat(),
    }
# ── 상태 전환: 삭제 ───────────────────────────────────────────────────────────
@router.delete("/results/{result_id}")
async def delete_result(
    result_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Soft-delete a result: mark it DELETED while keeping the stored row.

    Raises:
        HTTPException: 404 when the result does not exist; 400 when it is
            already marked DELETED.
    """
    record = await db.get(ScrapingResult, result_id)
    if not record:
        raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")

    already_deleted = record.status == "DELETED"
    if already_deleted:
        raise HTTPException(400, "이미 삭제된 결과입니다.")

    record.status = "DELETED"
    record.deleted_at = datetime.now()
    await db.commit()

    response = {"id": record.id, "status": "DELETED"}
    return response
# ── 상태 전환: 원복 ───────────────────────────────────────────────────────────
@router.post("/results/{result_id}/restore")
async def restore_result(
    result_id: int,
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Undo a soft delete.

    A deleted result normally returns to DRAFT. A result that carries an
    ``error_msg`` (i.e. it was stored as a failed scrape) returns to FAILED
    instead, so that deleting and restoring cannot turn a failed scrape into
    a publishable draft.

    Returns:
        A dict with the result ``id``, the restored ``status`` and an ISO
        formatted ``restored_at`` timestamp (not persisted).

    Raises:
        HTTPException: 404 when the result does not exist; 400 when it is
            not currently DELETED.
    """
    r = await db.get(ScrapingResult, result_id)
    if not r:
        raise HTTPException(404, "스크랩 결과를 찾을 수 없습니다.")
    if r.status != "DELETED":
        raise HTTPException(400, "삭제된 결과만 원복할 수 있습니다.")

    # Records are created with status FAILED exactly when error_msg is set,
    # so error_msg tells us which pre-delete state to go back to.
    restored_status = "FAILED" if r.error_msg else "DRAFT"
    r.status = restored_status
    r.deleted_at = None
    await db.commit()
    return {
        "id": r.id,
        "status": restored_status,
        "restored_at": datetime.now().isoformat(),
    }
# ── 통계 ─────────────────────────────────────────────────────────────────────
@router.get("/stats")
async def scraping_stats(
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return result counts per status plus the total number of targets.

    Returns:
        A dict with keys ``draft``, ``published``, ``deleted``, ``failed``
        (0 when no row has that status) and ``targets``.
    """
    tracked = ("DRAFT", "PUBLISHED", "DELETED", "FAILED")
    # Pre-fill so statuses with no rows still report 0, in a stable key order.
    stats = {name.lower(): 0 for name in tracked}

    # One grouped query instead of one COUNT round-trip per status.
    grouped = await db.execute(
        select(ScrapingResult.status, func.count()).group_by(ScrapingResult.status)
    )
    for status_value, cnt in grouped.all():
        if status_value in tracked:
            stats[status_value.lower()] = cnt

    stats["targets"] = (await db.execute(
        select(func.count()).select_from(ScrapingTarget)
    )).scalar_one()
    return stats
# ── 내부 헬퍼 ────────────────────────────────────────────────────────────────
async def _notify_publish(
    result_id: int, title: str, url: str,
    plain_text: str, room: str, custom_msg: Optional[str], publisher: str,
) -> None:
    """Send the "scrap_published" webhook to the messenger endpoint.

    The notification is best-effort: any failure (connection error, timeout,
    non-2xx response) is logged as a warning and never raised to the caller.

    Args:
        result_id: Id of the published result.
        title: Result title, sent as the payload title.
        url: Scraped URL, included in the generated message.
        plain_text: Result text; its first 300 characters form the summary.
        room: Messenger room to post to.
        custom_msg: When truthy, used verbatim instead of the generated text.
        publisher: Username shown as the publisher.
    """
    import logging
    import os

    import httpx

    base = os.getenv("ITSM_BASE_URL", "http://127.0.0.1:9001")
    text = plain_text or ""
    summary = text[:300].replace("\n", " ")
    ellipsis = "..." if len(text) > 300 else ""
    msg = custom_msg or (
        f"[스크랩 게시] {title}\n"
        f"URL: {url}\n"
        f"요약: {summary}{ellipsis}\n"
        f"게시자: {publisher}\n"
        f"결과 ID: #{result_id}"
    )
    payload = {
        "event": "scrap_published",
        "room": room,
        "title": title,
        "summary": msg,
        "result_id": result_id,
    }
    try:
        async with httpx.AsyncClient(timeout=5) as client:
            resp = await client.post(f"{base}/api/messenger/webhook", json=payload)
            # Surface 4xx/5xx replies instead of treating them as delivered.
            resp.raise_for_status()
    except Exception as exc:
        # Runs as a background task after the publish is committed, so a
        # failed notification is reported but must not raise.
        logging.getLogger(__name__).warning(
            "scrap_published webhook 전송 실패 (result_id=%s): %s", result_id, exc,
        )
def _register_scrape_cron(
target_id: int, url: str, schedule: str,
selector: Optional[str], actor: str,
) -> None:
"""APScheduler에 스크랩 크론 등록."""
try:
from core.scheduler import scheduler
parts = schedule.split()
if len(parts) < 5:
return
minute, hour, day, month, dow = parts[:5]
job_id = f"scrape_target_{target_id}"
scheduler.add_job(
_run_scrape_background,
trigger="cron",
id=job_id,
replace_existing=True,
minute=minute, hour=hour, day=day, month=month, day_of_week=dow,
args=[target_id, url, selector, actor],
)
except Exception as e:
import logging
logging.getLogger(__name__).warning("scrape cron 등록 실패: %s", e)
def _run_scrape_background(
    target_id: int, url: str, selector: Optional[str], actor: str,
) -> None:
    """Scheduler entry point: scrape ``url`` and store the result.

    This is a synchronous function, so it drives the async scrape and DB
    work on a private event loop. The stored record gets status ``FAILED``
    when the engine reports an error and ``DRAFT`` otherwise; the target's
    ``last_scraped`` timestamp is updated when the target still exists.
    """
    async def _inner() -> None:
        eng = await _scrape(url, selector)
        async with SessionLocal() as db:
            rec = ScrapingResult(
                target_id=target_id, title=eng.title or url,
                content=eng.content, plain_text=eng.plain_text,
                url=url, source_html=eng.source_html,
                status="FAILED" if eng.error else "DRAFT",
                meta=eng.meta, error_msg=eng.error, scraped_by=actor,
            )
            db.add(rec)
            t = await db.get(ScrapingTarget, target_id)
            if t:
                t.last_scraped = datetime.now()
            await db.commit()

    # asyncio.run creates a fresh loop and always closes it, including when
    # _inner() raises, so a failed run does not leak an open event loop.
    asyncio.run(_inner())