feat: initial knowledge-inbox pipeline implementation
- Oracle ADB queue table (sql/schema.sql)
- Queue CRUD: core/queue_db.py
- YouTube transcript: core/youtube.py
- Web page fetch: core/web.py
- LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py
- Text chunker: core/chunker.py
- Obsidian note writer: core/obsidian.py
- Oracle vector store insertion: core/vector.py
- Polling daemon: daemon/worker.py
- Telegram bot: bot/telegram_bot.py
- Main runner: main.py
This commit is contained in:
96
core/enricher.py
Normal file
96
core/enricher.py
Normal file
@@ -0,0 +1,96 @@
|
||||
"""LLM-based content enrichment via OCI GenAI Gemini Flash."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
import oci
|
||||
from oci.generative_ai_inference import GenerativeAiInferenceClient
|
||||
from oci.generative_ai_inference.models import (
|
||||
ChatDetails,
|
||||
GenericChatRequest,
|
||||
OnDemandServingMode,
|
||||
TextContent,
|
||||
UserMessage,
|
||||
)
|
||||
|
||||
_PROMPT = """\
|
||||
You are a knowledge extraction assistant. Analyze the content below and return ONLY a valid JSON object with these fields:
|
||||
- "title": concise descriptive title for this content (string)
|
||||
- "summary": 3-5 sentence summary capturing key insights (string)
|
||||
- "tags": list of 3-7 relevant keywords or topics (string[])
|
||||
- "author": author or creator name, or null if not found (string | null)
|
||||
- "date": publication date in ISO 8601 format (YYYY-MM-DD), or null if not found (string | null)
|
||||
- "content_type": one of "youtube", "article", "documentation", "news", "forum", "code", "other" (string)
|
||||
|
||||
Content type: {content_type}
|
||||
Source URL: {url}
|
||||
Content:
|
||||
{text}
|
||||
|
||||
Return only the JSON object, no markdown, no explanation."""
|
||||
|
||||
|
||||
def _get_client() -> GenerativeAiInferenceClient:
    """Construct an OCI GenAI inference client.

    Credentials come from the default OCI config file; the service endpoint
    is read from the ``OCI_GENAI_ENDPOINT`` environment variable (a missing
    variable raises ``KeyError``).
    """
    cfg = oci.config.from_file()
    endpoint = os.environ["OCI_GENAI_ENDPOINT"]
    return GenerativeAiInferenceClient(cfg, service_endpoint=endpoint)
|
||||
|
||||
|
||||
def enrich(content_type: str, title: str, url: str, text: str) -> dict:
|
||||
"""Extract structured metadata from content using Gemini Flash.
|
||||
|
||||
Args:
|
||||
content_type: One of 'youtube', 'url', 'text'.
|
||||
title: Initial title hint (may be empty).
|
||||
url: Source URL (empty for plain text).
|
||||
text: The full content text to analyze.
|
||||
|
||||
Returns:
|
||||
Dict with keys: title, summary, tags, author, date, content_type.
|
||||
Falls back to minimal defaults on LLM failure.
|
||||
"""
|
||||
prompt = _PROMPT.format(
|
||||
content_type=content_type,
|
||||
url=url or "(none)",
|
||||
text=text[:6000],
|
||||
)
|
||||
|
||||
try:
|
||||
client = _get_client()
|
||||
req = GenericChatRequest(
|
||||
messages=[UserMessage(content=[TextContent(text=prompt)])],
|
||||
max_tokens=1024,
|
||||
temperature=0,
|
||||
)
|
||||
det = ChatDetails(
|
||||
compartment_id=os.environ["OCI_COMPARTMENT_ID"],
|
||||
serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
|
||||
chat_request=req,
|
||||
)
|
||||
response = client.chat(det)
|
||||
raw = response.data.chat_response.choices[0].message.content[0].text.strip()
|
||||
raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE)
|
||||
metadata = json.loads(raw)
|
||||
except Exception as exc:
|
||||
metadata = {
|
||||
"title": title or url or text[:80],
|
||||
"summary": text[:300],
|
||||
"tags": [],
|
||||
"author": None,
|
||||
"date": None,
|
||||
"content_type": content_type,
|
||||
"_error": str(exc),
|
||||
}
|
||||
|
||||
# Ensure required keys exist
|
||||
metadata.setdefault("title", title or url or text[:80])
|
||||
metadata.setdefault("summary", "")
|
||||
metadata.setdefault("tags", [])
|
||||
metadata.setdefault("author", None)
|
||||
metadata.setdefault("date", None)
|
||||
metadata.setdefault("content_type", content_type)
|
||||
|
||||
return metadata
|
||||
Reference in New Issue
Block a user