Files
knowledge-inbox/core/web.py
joungmin 86a4104ae3 feat: initial knowledge-inbox pipeline implementation
- Oracle ADB queue table (sql/schema.sql)
- Queue CRUD: core/queue_db.py
- YouTube transcript: core/youtube.py
- Web page fetch: core/web.py
- LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py
- Text chunker: core/chunker.py
- Obsidian note writer: core/obsidian.py
- Oracle vector store insertion: core/vector.py
- Polling daemon: daemon/worker.py
- Telegram bot: bot/telegram_bot.py
- Main runner: main.py
2026-02-28 08:16:11 +09:00

60 lines
1.5 KiB
Python

"""URL fetching and HTML-to-text extraction."""
import logging
import re
from html.parser import HTMLParser

import httpx
class _TextExtractor(HTMLParser):
_SKIP_TAGS = {"script", "style", "head", "nav", "footer", "noscript"}
def __init__(self) -> None:
super().__init__()
self._buf: list[str] = []
self._skip = 0
def handle_starttag(self, tag: str, attrs: list) -> None:
if tag in self._SKIP_TAGS:
self._skip += 1
def handle_endtag(self, tag: str) -> None:
if tag in self._SKIP_TAGS and self._skip:
self._skip -= 1
def handle_data(self, data: str) -> None:
if not self._skip:
text = data.strip()
if text:
self._buf.append(text)
def get_text(self) -> str:
return " ".join(self._buf)
def _html_to_text(html: str) -> str:
    """Convert an HTML string to plain text.

    The markup is run through ``_TextExtractor``; in the extracted text, any
    run of three or more whitespace characters is replaced by a single space.

    Args:
        html: The HTML markup to convert.

    Returns:
        The extracted text.
    """
    parser = _TextExtractor()
    parser.feed(html)
    # feed() can hold back trailing input while waiting for more data (for
    # example text ending in an unterminated "&..." reference). close()
    # forces that remainder through the handlers so it is not lost.
    parser.close()
    return re.sub(r"\s{3,}", " ", parser.get_text())
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}
def fetch_page_text(url: str, max_chars: int = 8000) -> str:
    """Fetch a URL and return stripped plain text, truncated to max_chars.

    The request follows redirects, uses a 15-second timeout and sends the
    module's ``_HEADERS``. Non-success HTTP statuses count as failures.

    Args:
        url: The URL to fetch.
        max_chars: Maximum characters to return. Negative values are treated
            as zero.

    Returns:
        Extracted plain text, or empty string on failure.
    """
    # A negative bound would make the slice below count from the end of the
    # text instead of limiting its length.
    limit = max(max_chars, 0)
    try:
        r = httpx.get(url, timeout=15, follow_redirects=True, headers=_HEADERS)
        r.raise_for_status()
        return _html_to_text(r.text)[:limit]
    except Exception as exc:
        # Deliberately best-effort: callers receive "" on any failure, but the
        # cause is logged so failed fetches can be diagnosed.
        logging.getLogger(__name__).warning("Failed to fetch %s: %s", url, exc)
        return ""