zioinfo-mail/workspace/guardia-itsm/core/scraping_engine.py
DESKTOP-TKLFCPR\ython cfe2901a55 refactor(structure): consolidate all projects under workspace/
- itsm/    -> workspace/guardia-itsm/
- manager/ -> workspace/guardia-manager/
- app/     -> workspace/guardia-messenger/
- manual/  -> workspace/guardia-docs/

workspace/zioinfo-web/ unchanged.
git mv preserves full commit history.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-31 23:50:56 +09:00

164 lines
4.7 KiB
Python

"""
Scraping Engine — BeautifulSoup 기반 웹 스크랩핑
"""
from __future__ import annotations
import re
from datetime import datetime
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse
import httpx
_BS4_OK = False
try:
from bs4 import BeautifulSoup
_BS4_OK = True
except ImportError:
pass
HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/124.0 Safari/537.36"
),
"Accept-Language": "ko-KR,ko;q=0.9,en;q=0.8",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
class ScrapingResult:
    """Mutable record of what was extracted from one scraped page.

    The attribute set is fixed by ``__slots__``. ``error`` stays ``None``
    unless something went wrong; ``scraped_at`` is the ISO-8601 timestamp
    taken when the instance was created.
    """

    __slots__ = (
        "url", "title", "content", "plain_text", "source_html",
        "meta", "links", "images", "scraped_at", "error",
    )

    def __init__(self) -> None:
        # Text fields start empty.
        self.url = ""
        self.title = ""
        self.content = ""
        self.plain_text = ""
        self.source_html = ""
        # Every instance gets its own fresh containers.
        self.meta: Dict[str, str] = {}
        self.links: List[str] = []
        self.images: List[str] = []
        # Creation time from datetime.now() (no timezone attached).
        self.scraped_at = datetime.now().isoformat()
        self.error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dict with large fields truncated.

        ``plain_text`` is cut to 5,000 characters, ``source_html`` to 500,000,
        ``links`` to 50 entries and ``images`` to 20 entries; the remaining
        fields are passed through unchanged.
        """
        short_text = self.plain_text[:5000]  # trimmed for DB storage
        short_html = self.source_html[:500000]
        first_links = self.links[:50]
        first_images = self.images[:20]
        return dict(
            url=self.url,
            title=self.title,
            content=self.content,
            plain_text=short_text,
            source_html=short_html,
            meta=self.meta,
            links=first_links,
            images=first_images,
            scraped_at=self.scraped_at,
            error=self.error,
        )
def _resolve_http_url(base: str, ref: str) -> Optional[str]:
    """Resolve *ref* against *base*, keeping it only if it starts with ``http``.

    Returns ``None`` when the reference cannot be joined or resolves to some
    other scheme (``mailto:``, ``javascript:``, ``data:`` ...).
    """
    try:
        resolved = urljoin(base, ref)
    except ValueError:
        # urljoin rejects some malformed references (e.g. a broken IPv6
        # host); one bad attribute must not abort the whole scrape.
        return None
    return resolved if resolved.startswith("http") else None


async def scrape(
    url: str,
    selector: Optional[str] = None,
    timeout: int = 30,
) -> ScrapingResult:
    """Fetch *url* and extract its title, meta tags, body, links and images.

    Args:
        url: Page to download. Redirects are followed.
        selector: Optional CSS selector. When given, only the first matching
            element is used as the body; otherwise the body is picked
            automatically from a list of common container selectors.
        timeout: Timeout passed to the HTTP client, in seconds.

    Returns:
        A ScrapingResult. Failures are never raised for the download or the
        selector: they are reported through ``result.error`` (missing bs4,
        timeout, HTTP/network error, invalid or unmatched selector).
    """
    result = ScrapingResult()
    result.url = url
    if not _BS4_OK:
        result.error = "bs4 미설치. pip install beautifulsoup4 lxml"
        return result

    try:
        async with httpx.AsyncClient(
            timeout=timeout,
            headers=HEADERS,
            follow_redirects=True,
        ) as client:
            resp = await client.get(url)
            resp.raise_for_status()
            html = resp.text
            # URL of the response actually received (after any redirects);
            # relative references in the page are relative to this one.
            final_url = str(resp.url)
    except httpx.TimeoutException:
        result.error = f"타임아웃 ({timeout}초)"
        return result
    except Exception as e:
        # Boundary handler: any other download failure becomes an error string.
        result.error = str(e)[:200]
        return result

    result.source_html = html
    soup = BeautifulSoup(html, "lxml" if _lxml_ok() else "html.parser")

    # Title: og:title first, then <title>, then the host of the requested URL.
    title_tag = soup.find("title")
    og_title = soup.find("meta", property="og:title")
    result.title = (
        (og_title.get("content", "") if og_title else "")
        or (title_tag.get_text(strip=True) if title_tag else "")
        or urlparse(url).netloc
    )

    # Meta tags keyed by `name` (or `property`), values capped at 300 chars.
    for m in soup.find_all("meta"):
        name = m.get("name") or m.get("property", "")
        content = m.get("content", "")
        if name and content:
            result.meta[name] = content[:300]

    # Base for relative references: the final page URL, overridden by a
    # <base href> element when the document declares one. Using only
    # scheme://host here would mis-resolve page-relative paths like "foo.html".
    base = final_url
    base_tag = soup.find("base", href=True)
    if base_tag is not None:
        base = _resolve_http_url(final_url, base_tag["href"]) or final_url

    # Body: explicit selector, or automatic extraction.
    if selector:
        try:
            target = soup.select_one(selector)
        except Exception as exc:
            # A malformed selector makes the selector engine raise; report it
            # like every other failure instead of letting it escape.
            target = None
            result.error = f"셀렉터 '{selector}' 구문 오류: {exc}"[:200]
        else:
            if target is None:
                result.error = f"셀렉터 '{selector}' 미매칭"
        if target is not None:
            # Stored verbatim: no truncation or whitespace cleanup here,
            # unlike the automatic path below.
            result.content = str(target)
            result.plain_text = target.get_text(separator="\n", strip=True)
    else:
        # Candidates are tried in this order; the first one holding more than
        # 100 characters of text wins.
        for tag in ("article", "main", '[id*="content"]', '[class*="content"]',
                    '[id*="article"]', "body"):
            node = soup.select_one(tag)
            if node and len(node.get_text(strip=True)) > 100:
                result.content = str(node)[:200000]
                result.plain_text = _clean_text(node.get_text(separator="\n", strip=True))
                break
        else:
            # Nothing substantial found: fall back to the whole document.
            result.plain_text = _clean_text(soup.get_text(separator="\n", strip=True))
            result.content = html[:200000]

    # Links: first 100 anchors, absolutized, http(s) only.
    for a in soup.find_all("a", href=True)[:100]:
        href = _resolve_http_url(base, a["href"])
        if href:
            result.links.append(href)

    # Images: first 30 <img> tags, absolutized, http(s) only.
    for img in soup.find_all("img", src=True)[:30]:
        src = _resolve_http_url(base, img["src"])
        if src:
            result.images.append(src)

    return result
def _clean_text(text: str) -> str:
text = re.sub(r"\n{3,}", "\n\n", text)
text = re.sub(r"[ \t]{2,}", " ", text)
return text.strip()[:10000]
def _lxml_ok() -> bool:
try:
import lxml # noqa
return True
except ImportError:
return False