feat: initial knowledge-inbox pipeline implementation

- Oracle ADB queue table (sql/schema.sql)
- Queue CRUD: core/queue_db.py
- YouTube transcript: core/youtube.py
- Web page fetch: core/web.py
- LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py
- Text chunker: core/chunker.py
- Obsidian note writer: core/obsidian.py
- Oracle vector store insertion: core/vector.py
- Polling daemon: daemon/worker.py
- Telegram bot: bot/telegram_bot.py
- Main runner: main.py
2026-02-28 08:16:11 +09:00
commit 86a4104ae3
18 changed files with 926 additions and 0 deletions
--- a/core/web.py
+++ b/core/web.py
@@ -0,0 +1,59 @@
+"""URL fetching and HTML-to-text extraction."""
+
+import logging
+import re
+from html.parser import HTMLParser
+
+import httpx
+
+
+class _TextExtractor(HTMLParser):
+    """Collect the visible text fragments of an HTML document.
+
+    Text that appears inside any element listed in ``_SKIP_TAGS`` is dropped.
+    Nesting of skipped elements is tracked with a depth counter, so text is
+    collected again only once every open skipped element has been closed.
+    """
+
+    # Elements whose text content is never emitted.
+    _SKIP_TAGS = {"script", "style", "head", "nav", "footer", "noscript"}
+
+    def __init__(self) -> None:
+        super().__init__()
+        # Non-empty, stripped text fragments in document order.
+        self._buf: list[str] = []
+        # Number of currently open skipped elements; 0 means "collect text".
+        self._skip = 0
+
+    def handle_starttag(self, tag: str, attrs: list) -> None:
+        """Track entry into skipped elements."""
+        if tag == "body":
+            # The </head> end tag is optional in HTML. Without this reset a
+            # page that omits it would leave the counter above zero forever
+            # and the entire body would be discarded.
+            self._skip = 0
+        elif tag in self._SKIP_TAGS:
+            self._skip += 1
+
+    def handle_endtag(self, tag: str) -> None:
+        """Track exit from skipped elements, never dropping below zero."""
+        if tag in self._SKIP_TAGS and self._skip:
+            self._skip -= 1
+
+    def handle_data(self, data: str) -> None:
+        """Store a text fragment unless it is blank or inside a skipped element."""
+        if not self._skip:
+            text = data.strip()
+            if text:
+                self._buf.append(text)
+
+    def get_text(self) -> str:
+        """Return all collected fragments joined by single spaces."""
+        return " ".join(self._buf)
+
+
+def _html_to_text(html: str) -> str:
+    """Convert an HTML string to plain text.
+
+    The markup is parsed with ``_TextExtractor``; afterwards every run of
+    three or more whitespace characters is collapsed to two spaces.
+
+    Args:
+        html: The HTML source to convert.
+
+    Returns:
+        The extracted text.
+    """
+    parser = _TextExtractor()
+    parser.feed(html)
+    # feed() may hold back trailing input, e.g. text ending in an unterminated
+    # "&..." sequence such as "R&D", or an incomplete tag. close() forces the
+    # parser to process whatever is still buffered.
+    parser.close()
+    return re.sub(r"\s{3,}", "  ", parser.get_text())
+
+
+# HTTP headers passed to httpx.get() by fetch_page_text.
+_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}
+
+
+def fetch_page_text(url: str, max_chars: int = 8000) -> str:
+    """Fetch a URL and return stripped plain text, truncated to max_chars.
+
+    This is a best-effort helper: any failure while requesting the page
+    (including a 4xx/5xx status) or converting it to text is logged and
+    reported to the caller as an empty string instead of being raised.
+
+    Args:
+        url: The URL to fetch.
+        max_chars: Maximum characters to return.
+
+    Returns:
+        Extracted plain text, or empty string on failure.
+    """
+    try:
+        r = httpx.get(url, timeout=15, follow_redirects=True, headers=_HEADERS)
+        r.raise_for_status()
+        return _html_to_text(r.text)[:max_chars]
+    except Exception as exc:
+        # Deliberately broad: callers rely on "" meaning "could not fetch".
+        # Log the cause so failures are diagnosable rather than silent.
+        logging.getLogger(__name__).warning(
+            "fetch_page_text failed for %s: %s", url, exc
+        )
+        return ""