feat: initial knowledge-inbox pipeline implementation
- Oracle ADB queue table (sql/schema.sql) - Queue CRUD: core/queue_db.py - YouTube transcript: core/youtube.py - Web page fetch: core/web.py - LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py - Text chunker: core/chunker.py - Obsidian note writer: core/obsidian.py - Oracle vector store insertion: core/vector.py - Polling daemon: daemon/worker.py - Telegram bot: bot/telegram_bot.py - Main runner: main.py
This commit is contained in:
59
core/web.py
Normal file
59
core/web.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""URL fetching and HTML-to-text extraction."""
|
||||
|
||||
import re
|
||||
from html.parser import HTMLParser
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
class _TextExtractor(HTMLParser):
|
||||
_SKIP_TAGS = {"script", "style", "head", "nav", "footer", "noscript"}
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._buf: list[str] = []
|
||||
self._skip = 0
|
||||
|
||||
def handle_starttag(self, tag: str, attrs: list) -> None:
|
||||
if tag in self._SKIP_TAGS:
|
||||
self._skip += 1
|
||||
|
||||
def handle_endtag(self, tag: str) -> None:
|
||||
if tag in self._SKIP_TAGS and self._skip:
|
||||
self._skip -= 1
|
||||
|
||||
def handle_data(self, data: str) -> None:
|
||||
if not self._skip:
|
||||
text = data.strip()
|
||||
if text:
|
||||
self._buf.append(text)
|
||||
|
||||
def get_text(self) -> str:
|
||||
return " ".join(self._buf)
|
||||
|
||||
|
||||
def _html_to_text(html: str) -> str:
    """Convert HTML markup to plain text.

    The markup is run through ``_TextExtractor``; afterwards every run of
    three or more whitespace characters is collapsed into a single space
    (shorter runs are left untouched).

    Args:
        html: The markup to convert.

    Returns:
        The extracted text.
    """
    parser = _TextExtractor()
    parser.feed(html)
    # HTMLParser can hold back trailing input (for example text ending in an
    # unterminated "&") while waiting for more data. close() forces that
    # buffered remainder through the handlers so the end of the page is kept.
    parser.close()
    return re.sub(r"\s{3,}", " ", parser.get_text())
|
||||
|
||||
|
||||
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}
|
||||
|
||||
|
||||
def fetch_page_text(url: str, max_chars: int = 8000) -> str:
    """Fetch a URL and return stripped plain text, truncated to max_chars.

    The request follows redirects and uses a 15 second timeout. Any failure
    (request error, error HTTP status, or an error while extracting text) is
    logged as a warning and reported to the caller as an empty string.

    Args:
        url: The URL to fetch.
        max_chars: Maximum characters to return.

    Returns:
        Extracted plain text, or empty string on failure.
    """
    # Function-scope import so this block does not depend on a file-level one.
    import logging

    try:
        r = httpx.get(url, timeout=15, follow_redirects=True, headers=_HEADERS)
        r.raise_for_status()
        return _html_to_text(r.text)[:max_chars]
    except Exception as exc:
        # Best-effort by contract: callers get "" rather than an exception,
        # but the cause is logged so failed fetches can be diagnosed.
        logging.getLogger(__name__).warning("Failed to fetch %s: %s", url, exc)
        return ""
|
||||
Reference in New Issue
Block a user