[하네스]
- agents/scraping-bot.md: 스크랩 봇 에이전트
- skills/scraping-orchestrator/SKILL.md: E2E 워크플로우
[ITSM Backend]
- models.py: ScrapingTarget + ScrapingResult ORM + Pydantic 스키마
- core/scraping_engine.py: BeautifulSoup 기반 비동기 스크래퍼
- routers/scraping.py: 13개 API (타겟 CRUD + run + 게시/삭제/원복 + 통계)
- routers/messenger.py: !scrap 봇 명령어 6종 + scrap_published 이벤트
- main.py: scraping 라우터 등록
[Manager UI]
- ScrapingManager.tsx: 결과 목록/상세/게시/삭제/원복 + 타겟 관리
- Sidebar.tsx: 🕷️ 스크랩핑 봇 메뉴 추가
- App.tsx: /scraping 라우트 추가
[테스트 결과 - 전체 통과]
- T1 타겟 등록 OK
- T2 즉시 스크랩: zioinfo.co.kr → DRAFT
- T3 결과 목록 조회 OK
- T4 게시: DRAFT → PUBLISHED + 메신저 알림
- T5/T6/T7 두번째 스크랩 → 삭제 → 원복 OK
- T8 통계: draft:1, published:1, deleted:0
- T9 !scrap list 봇 명령어 OK
- T10 !scrap status 봇 명령어 OK
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
164 lines
4.7 KiB
Python
164 lines
4.7 KiB
Python
"""
|
|
Scraping Engine — BeautifulSoup-based web scraping.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from datetime import datetime
|
|
from typing import Any, Dict, List, Optional
|
|
from urllib.parse import urljoin, urlparse
|
|
|
|
import httpx
|
|
|
|
# BeautifulSoup is an optional dependency: record whether it could be
# imported so scrape() can report a readable error instead of failing
# with a NameError at call time.
try:
    from bs4 import BeautifulSoup
except ImportError:
    _BS4_OK = False
else:
    _BS4_OK = True
|
|
|
|
|
|
# Default request headers passed to the HTTP client in scrape():
# a desktop Chrome User-Agent plus Korean-first language preferences.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
    "Accept-Language": "ko-KR,ko;q=0.9,en;q=0.8",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
|
|
|
|
|
|
class ScrapingResult:
    """Mutable container for the outcome of scraping a single URL.

    A fresh instance starts with empty fields, ``error`` set to ``None`` and
    ``scraped_at`` set to the ISO-8601 string of the local time at
    construction.
    """

    __slots__ = ("url", "title", "content", "plain_text", "source_html",
                 "meta", "links", "images", "scraped_at", "error")

    def __init__(self):
        # Plain string fields all start empty.
        self.url = self.title = self.content = ""
        self.plain_text = self.source_html = ""
        # Each instance gets its own fresh containers.
        self.meta: Dict[str, str] = {}
        self.links: List[str] = []
        self.images: List[str] = []
        self.error: Optional[str] = None
        self.scraped_at = datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a dict, truncating the large ones.

        ``plain_text`` is cut to 5000 characters (for DB storage),
        ``source_html`` to 500000 characters, ``links`` to 50 entries and
        ``images`` to 20 entries. All other fields are passed through as-is.
        """
        # The slot order is also the key order of the resulting dict.
        payload: Dict[str, Any] = {
            field: getattr(self, field) for field in ScrapingResult.__slots__
        }
        payload["plain_text"] = payload["plain_text"][:5000]
        payload["source_html"] = payload["source_html"][:500000]
        payload["links"] = payload["links"][:50]
        payload["images"] = payload["images"][:20]
        return payload
|
|
|
|
|
|
def _document_base(soup, page_url: str) -> str:
    """Return the URL that relative references in the document resolve against.

    This is ``page_url`` unless the document has a ``<base href>`` tag, in
    which case that href (itself resolved against ``page_url``) is used.
    """
    base_tag = soup.find("base", href=True)
    if base_tag is None:
        return page_url
    try:
        return urljoin(page_url, base_tag["href"].strip())
    except ValueError:
        # A malformed <base href> should not abort the scrape.
        return page_url


def _absolute_http_urls(nodes, attr: str, base_url: str) -> List[str]:
    """Resolve ``node[attr]`` against ``base_url`` and keep only http(s) URLs."""
    urls: List[str] = []
    for node in nodes:
        try:
            resolved = urljoin(base_url, node[attr].strip())
        except ValueError:
            # urljoin rejects some malformed URLs (e.g. a broken IPv6 host);
            # skip that one reference instead of failing the whole scrape.
            continue
        if resolved.startswith(("http://", "https://")):
            urls.append(resolved)
    return urls


async def scrape(
    url: str,
    selector: Optional[str] = None,
    timeout: int = 30,
) -> ScrapingResult:
    """Fetch ``url`` and extract its title, meta tags, body, links and images.

    Args:
        url: Page to fetch. Redirects are followed.
        selector: Optional CSS selector. When given, only the first matching
            element is used as the content. When omitted, the body is picked
            automatically from a list of common container selectors.
        timeout: Timeout passed to the HTTP client, in seconds.

    Returns:
        A ``ScrapingResult``. This function reports failures through
        ``result.error`` rather than raising: bs4 missing, timeout, any other
        fetch error, or a selector that is invalid or matches nothing.
        On a selector problem the title, meta, links and images are still
        populated.
    """
    result = ScrapingResult()
    result.url = url

    if not _BS4_OK:
        result.error = "bs4 미설치. pip install beautifulsoup4 lxml"
        return result

    try:
        async with httpx.AsyncClient(
            timeout=timeout,
            headers=HEADERS,
            follow_redirects=True,
        ) as client:
            resp = await client.get(url)
            resp.raise_for_status()
            html = resp.text
            # After redirects the document may live at a different URL;
            # relative references are relative to that final location.
            page_url = str(resp.url)
    except httpx.TimeoutException:
        result.error = f"타임아웃 ({timeout}초)"
        return result
    except Exception as e:
        # Deliberately broad: any fetch failure (HTTP status, DNS, invalid
        # URL, ...) is reported to the caller via result.error.
        result.error = str(e)[:200]
        return result

    result.source_html = html

    soup = BeautifulSoup(html, "lxml" if _lxml_ok() else "html.parser")

    # Title: og:title first, then <title>, then the host name.
    title_tag = soup.find("title")
    og_title = soup.find("meta", property="og:title")
    result.title = (
        (og_title.get("content", "") if og_title else "")
        or (title_tag.get_text(strip=True) if title_tag else "")
        or urlparse(url).netloc
    )

    # Meta tags keyed by their name (or property); values capped at 300 chars.
    for m in soup.find_all("meta"):
        name = m.get("name") or m.get("property", "")
        content = m.get("content", "")
        if name and content:
            result.meta[name] = content[:300]

    # Body: explicit selector, or automatic extraction.
    if selector:
        try:
            target = soup.select_one(selector)
        except Exception as exc:
            # An invalid CSS selector raises inside the selector engine;
            # report it instead of crashing the scrape.
            target = None
            result.error = f"셀렉터 '{selector}' 오류: {str(exc)[:200]}"
        else:
            if target is None:
                result.error = f"셀렉터 '{selector}' 미매칭"
        if target is not None:
            result.content = str(target)
            result.plain_text = target.get_text(separator="\n", strip=True)
    else:
        # Automatic extraction priority: article > main > *content* > body.
        # The first candidate with more than 100 characters of text wins.
        for tag in ("article", "main", '[id*="content"]', '[class*="content"]',
                    '[id*="article"]', "body"):
            node = soup.select_one(tag)
            if node and len(node.get_text(strip=True)) > 100:
                result.content = str(node)[:200000]
                result.plain_text = _clean_text(node.get_text(separator="\n", strip=True))
                break
        else:
            # No candidate had enough text: fall back to the whole document.
            result.plain_text = _clean_text(soup.get_text(separator="\n", strip=True))
            result.content = html[:200000]

    # Relative hrefs/srcs must be resolved against the page itself (or its
    # <base href>), not against the site root.
    base_url = _document_base(soup, page_url)

    # Links: only the first 100 anchors are considered.
    result.links.extend(
        _absolute_http_urls(soup.find_all("a", href=True)[:100], "href", base_url)
    )

    # Images: only the first 30 <img> tags are considered.
    result.images.extend(
        _absolute_http_urls(soup.find_all("img", src=True)[:30], "src", base_url)
    )

    return result
|
|
|
|
|
|
def _clean_text(text: str) -> str:
    """Normalise whitespace in ``text`` and cap its length.

    Runs of three or more newlines become exactly two, runs of two or more
    spaces/tabs become a single space, leading and trailing whitespace is
    stripped, and the result is cut to at most 10000 characters.
    """
    squeezed_lines = re.compile(r"\n{3,}").sub("\n\n", text)
    squeezed_spaces = re.compile(r"[ \t]{2,}").sub(" ", squeezed_lines)
    trimmed = squeezed_spaces.strip()
    return trimmed[:10000]
|
|
|
|
|
|
def _lxml_ok() -> bool:
    """Return True when the ``lxml`` package can be imported, else False."""
    try:
        import lxml  # noqa: F401
    except ImportError:
        return False
    else:
        return True
|