feat: initial knowledge-inbox pipeline implementation

- Oracle ADB queue table (sql/schema.sql)
- Queue CRUD: core/queue_db.py
- YouTube transcript: core/youtube.py
- Web page fetch: core/web.py
- LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py
- Text chunker: core/chunker.py
- Obsidian note writer: core/obsidian.py
- Oracle vector store insertion: core/vector.py
- Polling daemon: daemon/worker.py
- Telegram bot: bot/telegram_bot.py
- Main runner: main.py
This commit is contained in:
joungmin
2026-02-28 08:16:11 +09:00
commit 86a4104ae3
18 changed files with 926 additions and 0 deletions

0
core/__init__.py Normal file
View File

28
core/chunker.py Normal file
View File

@@ -0,0 +1,28 @@
"""Simple sliding-window text chunking."""
def chunk_text(text: str, size: int = 2000, overlap: int = 200) -> list[str]:
    """Split text into overlapping sliding-window chunks.

    Args:
        text: The full text to split.
        size: Maximum characters per chunk. Must be positive.
        overlap: Characters of overlap between consecutive chunks.
            Must satisfy ``0 <= overlap < size``.

    Returns:
        List of text chunks. Returns a single-item list for short text.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is not in
            ``[0, size)`` — such values would make the window step
            non-positive and loop forever.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(f"overlap must be in [0, {size}), got {overlap}")
    if len(text) <= size:
        return [text]
    chunks: list[str] = []
    step = size - overlap
    start = 0
    while start < len(text):
        end = start + size
        chunks.append(text[start:end])
        if end >= len(text):
            break
        start += step
    return chunks

96
core/enricher.py Normal file
View File

@@ -0,0 +1,96 @@
"""LLM-based content enrichment via OCI GenAI Gemini Flash."""
import json
import os
import re
import oci
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
ChatDetails,
GenericChatRequest,
OnDemandServingMode,
TextContent,
UserMessage,
)
# Prompt template sent to the chat model; the {content_type}, {url} and
# {text} placeholders are filled via str.format() inside enrich().
_PROMPT = """\
You are a knowledge extraction assistant. Analyze the content below and return ONLY a valid JSON object with these fields:
- "title": concise descriptive title for this content (string)
- "summary": 3-5 sentence summary capturing key insights (string)
- "tags": list of 3-7 relevant keywords or topics (string[])
- "author": author or creator name, or null if not found (string | null)
- "date": publication date in ISO 8601 format (YYYY-MM-DD), or null if not found (string | null)
- "content_type": one of "youtube", "article", "documentation", "news", "forum", "code", "other" (string)
Content type: {content_type}
Source URL: {url}
Content:
{text}
Return only the JSON object, no markdown, no explanation."""
def _get_client() -> GenerativeAiInferenceClient:
    """Build an OCI GenAI inference client from the default OCI config file.

    The service endpoint comes from the OCI_GENAI_ENDPOINT environment
    variable; a missing variable raises KeyError.
    """
    oci_config = oci.config.from_file()
    endpoint = os.environ["OCI_GENAI_ENDPOINT"]
    return GenerativeAiInferenceClient(oci_config, service_endpoint=endpoint)
def enrich(content_type: str, title: str, url: str, text: str) -> dict:
    """Extract structured metadata from content using Gemini Flash.

    Args:
        content_type: One of 'youtube', 'url', 'text'.
        title: Initial title hint (may be empty).
        url: Source URL (empty for plain text).
        text: The full content text to analyze.

    Returns:
        Dict with keys: title, summary, tags, author, date, content_type.
        Falls back to minimal defaults on any LLM/parse failure; the fallback
        dict carries an extra "_error" key describing what went wrong.
    """
    prompt = _PROMPT.format(
        content_type=content_type,
        url=url or "(none)",
        text=text[:6000],  # cap prompt size to stay within a safe token budget
    )
    try:
        client = _get_client()
        req = GenericChatRequest(
            messages=[UserMessage(content=[TextContent(text=prompt)])],
            max_tokens=1024,
            temperature=0,  # deterministic extraction
        )
        det = ChatDetails(
            compartment_id=os.environ["OCI_COMPARTMENT_ID"],
            serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
            chat_request=req,
        )
        response = client.chat(det)
        raw = response.data.chat_response.choices[0].message.content[0].text.strip()
        # Strip a ```json ... ``` fence in case the model ignored the
        # "no markdown" instruction.
        raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE)
        metadata = json.loads(raw)
        # json.loads can legally yield a list/str/number; only an object is
        # usable here. Raising inside the try routes such replies to the
        # fallback instead of crashing on metadata.setdefault below.
        if not isinstance(metadata, dict):
            raise ValueError(
                f"expected JSON object, got {type(metadata).__name__}"
            )
    except Exception as exc:
        metadata = {
            "title": title or url or text[:80],
            "summary": text[:300],
            "tags": [],
            "author": None,
            "date": None,
            "content_type": content_type,
            "_error": str(exc),
        }
    # Ensure required keys exist even if the model omitted some fields.
    metadata.setdefault("title", title or url or text[:80])
    metadata.setdefault("summary", "")
    metadata.setdefault("tags", [])
    metadata.setdefault("author", None)
    metadata.setdefault("date", None)
    metadata.setdefault("content_type", content_type)
    return metadata

86
core/obsidian.py Normal file
View File

@@ -0,0 +1,86 @@
"""Save processed knowledge items as Obsidian markdown notes."""
import os
import re
from datetime import datetime
from pathlib import Path
def _slugify(text: str, max_len: int = 50) -> str:
"""Convert text to a filesystem-safe slug."""
text = re.sub(r"[^\w\s-]", "", text, flags=re.UNICODE)
text = re.sub(r"[\s_]+", "-", text).strip("-")
return text[:max_len].lower()
def save_note(
    content_type: str,
    title: str,
    summary: str,
    body: str,
    tags: list[str],
    source_url: str = "",
    author: str = "",
    date: str = "",
) -> Path:
    """Save a processed knowledge item as an Obsidian markdown file.

    Args:
        content_type: One of 'youtube', 'url', 'text'.
        title: The note title.
        summary: LLM-generated summary.
        body: Full content text.
        tags: List of topic tags.
        source_url: Original URL (empty for plain text).
        author: Author name (may be empty).
        date: Publication date in ISO 8601 format (may be empty).

    Returns:
        Path of the created markdown file. When a note with the same date
        and slug already exists, a numeric suffix (-2, -3, ...) is appended
        instead of silently overwriting the existing note.
    """
    vault = os.environ.get("OBSIDIAN_VAULT", "/Users/joungmin/Documents/Obsidian Vault")
    # Single timestamp so the date in the filename and the "Saved" footer
    # can never disagree (e.g. across a midnight boundary).
    now = datetime.now()
    today = now.strftime("%Y-%m-%d")
    now_str = now.strftime("%Y-%m-%d %H:%M:%S")
    slug = _slugify(title) or "untitled"
    # Route each content type into its own vault subfolder.
    subfolder_map = {
        "youtube": "20 Sources/YouTube",
        "url": "20 Sources/Web",
        "text": "20 Sources/Notes",
    }
    subfolder = subfolder_map.get(content_type, "20 Sources/Notes")
    note_dir = Path(vault) / subfolder
    note_dir.mkdir(parents=True, exist_ok=True)
    # Never clobber an existing note: probe -2, -3, ... until a free name.
    note_path = note_dir / f"{today}-{slug}.md"
    counter = 2
    while note_path.exists():
        note_path = note_dir / f"{today}-{slug}-{counter}.md"
        counter += 1
    # Build YAML frontmatter tags
    tags_yaml = ", ".join(tags) if tags else ""
    content = f"""---
title: {title}
source_type: {content_type}
url: {source_url}
author: {author}
date: {date}
tags: [{tags_yaml}]
created: {today}
---
# {title}
## 요약
{summary}
## 원문
{body}
---
*Source: {source_url}*
*Saved: {now_str}*
"""
    note_path.write_text(content, encoding="utf-8")
    return note_path

185
core/queue_db.py Normal file
View File

@@ -0,0 +1,185 @@
"""Oracle ADB connection pool and CRUD operations for knowledge_queue."""
import json
import os
from contextlib import contextmanager
from typing import Generator
import oracledb
# Lazily-created module-level pool shared by all functions in this module.
_pool: oracledb.ConnectionPool | None = None


def _get_pool() -> oracledb.ConnectionPool:
    """Return the module-level connection pool, creating it on first use."""
    global _pool
    if _pool is not None:
        return _pool
    pool_args: dict = {
        "user": os.environ["ORACLE_USER"],
        "password": os.environ["ORACLE_PASSWORD"],
        "dsn": os.environ["ORACLE_DSN"],
        "min": 1,
        "max": 5,
        "increment": 1,
    }
    wallet_dir = os.environ.get("ORACLE_WALLET")
    if wallet_dir:
        # tnsnames.ora + cwallet.sso live in the wallet directory.
        pool_args["config_dir"] = wallet_dir
    _pool = oracledb.create_pool(**pool_args)
    return _pool
@contextmanager
def _conn() -> Generator[oracledb.Connection, None, None]:
    """Yield a pooled connection; commit on success, roll back on error.

    The connection is always returned to the pool, even when the body or
    the commit itself raises.
    """
    pool = _get_pool()
    connection = pool.acquire()
    try:
        yield connection
        connection.commit()
    except Exception:
        connection.rollback()
        raise
    finally:
        pool.release(connection)
def insert_item(input_type: str, content: str, chat_id: str = "") -> str:
    """Insert a new queue item and return its generated UUID.

    Args:
        input_type: One of 'youtube', 'url', 'text'.
        content: The URL or raw text to process.
        chat_id: Telegram chat ID for future notification support.

    Returns:
        The UUID of the newly inserted row (via DML RETURNING).
    """
    sql = """
        INSERT INTO knowledge_queue (input_type, content, telegram_chat_id)
        VALUES (:input_type, :content, :chat_id)
        RETURNING id INTO :out_id
    """
    with _conn() as connection:
        cur = connection.cursor()
        new_id = cur.var(oracledb.STRING)
        binds = {
            "input_type": input_type,
            "content": content,
            "chat_id": chat_id,
            "out_id": new_id,
        }
        cur.execute(sql, binds)
        # RETURNING binds yield a list of values; single-row insert -> [0].
        return new_id.getvalue()[0]
def fetch_pending(limit: int = 5) -> list[dict]:
    """Fetch the oldest pending items, up to ``limit`` rows.

    Args:
        limit: Maximum number of rows to return.

    Returns:
        List of dicts with keys: id, input_type, content, telegram_chat_id.
    """
    sql = """
        SELECT id, input_type, content, telegram_chat_id
        FROM knowledge_queue
        WHERE status = 'pending'
        ORDER BY created_at
        FETCH FIRST :n ROWS ONLY
    """
    items: list[dict] = []
    with _conn() as connection:
        cur = connection.cursor()
        cur.execute(sql, {"n": limit})
        for row_id, input_type, content, chat_id in cur.fetchall():
            # Long content comes back as a LOB; materialize it to str.
            if hasattr(content, "read"):
                content = content.read()
            items.append(
                {
                    "id": row_id,
                    "input_type": input_type,
                    "content": content,
                    "telegram_chat_id": chat_id,
                }
            )
    return items
def set_processing(row_id: str) -> None:
    """Mark a queue item as processing.

    Args:
        row_id: The UUID of the row to update.
    """
    sql = """
        UPDATE knowledge_queue
        SET status = 'processing', updated_at = SYSTIMESTAMP
        WHERE id = :id
    """
    with _conn() as connection:
        cur = connection.cursor()
        cur.execute(sql, {"id": row_id})
def set_done(row_id: str, title: str, metadata: dict) -> None:
    """Mark a queue item as done and store its extracted metadata.

    Args:
        row_id: The UUID of the row to update.
        title: LLM-extracted title; truncated to 500 chars to fit the column.
        metadata: Dict of enrichment results, stored as a JSON string.
    """
    sql = """
        UPDATE knowledge_queue
        SET status = 'done',
            title = :title,
            metadata_json = :meta_json,
            updated_at = SYSTIMESTAMP
        WHERE id = :id
    """
    safe_title = title[:500] if title else ""
    binds = {
        "id": row_id,
        "title": safe_title,
        "meta_json": json.dumps(metadata, ensure_ascii=False),
    }
    with _conn() as connection:
        connection.cursor().execute(sql, binds)
def set_error(row_id: str, error_msg: str) -> None:
    """Mark a queue item as error with a message.

    Args:
        row_id: The UUID of the row to update.
        error_msg: Description of the error.
    """
    sql = """
        UPDATE knowledge_queue
        SET status = 'error', error_msg = :error_msg, updated_at = SYSTIMESTAMP
        WHERE id = :id
    """
    binds = {"id": row_id, "error_msg": error_msg}
    with _conn() as connection:
        connection.cursor().execute(sql, binds)
def get_status_counts() -> dict:
    """Return the number of queue rows per status.

    Returns:
        Dict like {'pending': 3, 'processing': 1, 'done': 42, 'error': 0}.
        The four known statuses are always present, defaulting to 0; any
        unknown status value in the table is included as an extra key.
    """
    sql = """
        SELECT status, COUNT(*) FROM knowledge_queue GROUP BY status
    """
    counts = {name: 0 for name in ("pending", "processing", "done", "error")}
    with _conn() as connection:
        cur = connection.cursor()
        cur.execute(sql)
        for status, n in cur.fetchall():
            counts[status] = n
    return counts

114
core/vector.py Normal file
View File

@@ -0,0 +1,114 @@
"""Embedding generation and Oracle vector store insertion."""
import array
import os
from contextlib import contextmanager
from typing import Generator
import oci
import oracledb
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
EmbedTextDetails,
OnDemandServingMode,
)
# NOTE(review): this duplicates queue_db's pool setup. It is a *separate*
# pool object connecting to the same ADB instance — consider extracting a
# shared helper module.
_pool: oracledb.ConnectionPool | None = None


def _get_pool() -> oracledb.ConnectionPool:
    """Return the module-level connection pool, creating it on first use."""
    global _pool
    if _pool is not None:
        return _pool
    pool_args: dict = {
        "user": os.environ["ORACLE_USER"],
        "password": os.environ["ORACLE_PASSWORD"],
        "dsn": os.environ["ORACLE_DSN"],
        "min": 1,
        "max": 5,
        "increment": 1,
    }
    wallet_dir = os.environ.get("ORACLE_WALLET")
    if wallet_dir:
        pool_args["config_dir"] = wallet_dir  # wallet + tnsnames.ora location
    _pool = oracledb.create_pool(**pool_args)
    return _pool
@contextmanager
def _conn() -> Generator[oracledb.Connection, None, None]:
    """Acquire a pooled connection; commit on success, roll back on failure.

    The connection is returned to the pool in all cases.
    """
    pool = _get_pool()
    connection = pool.acquire()
    try:
        yield connection
        connection.commit()
    except Exception:
        connection.rollback()
        raise
    finally:
        pool.release(connection)
def _to_vector_param(embedding: list[float]) -> array.array:
return array.array("f", embedding)
def _embed_texts(texts: list[str]) -> list[list[float]]:
    """Generate embeddings via OCI GenAI (Cohere Embed v4 by default).

    The OCI embed-text API limits the number of inputs per request (96),
    so long input lists are sent in batches and the results concatenated.

    Args:
        texts: Texts to embed.

    Returns:
        One embedding vector per input text, in input order. Empty input
        returns an empty list without calling the service.
    """
    if not texts:
        return []
    config = oci.config.from_file()
    client = GenerativeAiInferenceClient(
        config,
        service_endpoint=os.environ["OCI_GENAI_ENDPOINT"],
    )
    model_id = os.environ.get("OCI_EMBED_MODEL_ID", "cohere.embed-v4.0")
    compartment_id = os.environ["OCI_COMPARTMENT_ID"]
    embeddings: list[list[float]] = []
    batch_size = 96  # service-side maximum inputs per EmbedTextDetails call
    for i in range(0, len(texts), batch_size):
        details = EmbedTextDetails(
            inputs=texts[i : i + batch_size],
            serving_mode=OnDemandServingMode(model_id=model_id),
            compartment_id=compartment_id,
            input_type="SEARCH_DOCUMENT",
        )
        response = client.embed_text(details)
        embeddings.extend(response.data.embeddings)
    return embeddings
def save_to_vector(doc_id: str, chunks: list[str]) -> list[str]:
    """Embed chunks and insert them into the Oracle vector store.

    Args:
        doc_id: Document identifier (e.g. 'youtube:abc12345').
        chunks: List of text chunks to embed and store.

    Returns:
        List of inserted row UUIDs (empty when ``chunks`` is empty).
    """
    if not chunks:
        return []
    sql = """
        INSERT INTO vector_store (doc_id, chunk_text, embedding)
        VALUES (:doc_id, :chunk_text, :embedding)
        RETURNING id INTO :out_id
    """
    vectors = _embed_texts(chunks)
    new_ids: list[str] = []
    with _conn() as connection:
        cur = connection.cursor()
        for chunk, vector in zip(chunks, vectors):
            id_var = cur.var(oracledb.STRING)
            cur.execute(
                sql,
                {
                    "doc_id": doc_id,
                    "chunk_text": chunk,
                    "embedding": _to_vector_param(vector),
                    "out_id": id_var,
                },
            )
            new_ids.append(id_var.getvalue()[0])
    return new_ids

59
core/web.py Normal file
View File

@@ -0,0 +1,59 @@
"""URL fetching and HTML-to-text extraction."""
import re
from html.parser import HTMLParser
import httpx
class _TextExtractor(HTMLParser):
_SKIP_TAGS = {"script", "style", "head", "nav", "footer", "noscript"}
def __init__(self) -> None:
super().__init__()
self._buf: list[str] = []
self._skip = 0
def handle_starttag(self, tag: str, attrs: list) -> None:
if tag in self._SKIP_TAGS:
self._skip += 1
def handle_endtag(self, tag: str) -> None:
if tag in self._SKIP_TAGS and self._skip:
self._skip -= 1
def handle_data(self, data: str) -> None:
if not self._skip:
text = data.strip()
if text:
self._buf.append(text)
def get_text(self) -> str:
return " ".join(self._buf)
def _html_to_text(html: str) -> str:
    """Extract visible text from HTML, collapsing runs of 3+ whitespace chars."""
    extractor = _TextExtractor()
    extractor.feed(html)
    text = extractor.get_text()
    return re.sub(r"\s{3,}", " ", text)
# User-Agent sent with every fetch.
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; knowledge-inbox/1.0)"}


def fetch_page_text(url: str, max_chars: int = 8000) -> str:
    """Fetch a URL and return stripped plain text, truncated to max_chars.

    Args:
        url: The URL to fetch.
        max_chars: Maximum characters to return.

    Returns:
        Extracted plain text, or empty string on any failure (best-effort:
        network errors, HTTP error statuses and parse errors are swallowed).
    """
    try:
        response = httpx.get(url, timeout=15, follow_redirects=True, headers=_HEADERS)
        response.raise_for_status()
        text = _html_to_text(response.text)
        return text[:max_chars]
    except Exception:
        return ""

50
core/youtube.py Normal file
View File

@@ -0,0 +1,50 @@
"""YouTube transcript extraction via youtube-transcript-api."""
import re
from youtube_transcript_api import YouTubeTranscriptApi
def _extract_video_id(url: str) -> str:
"""Extract YouTube video ID from a URL.
Args:
url: YouTube URL (watch?v= or youtu.be/ formats).
Returns:
The video ID string.
Raises:
ValueError: If no video ID can be found in the URL.
"""
match = re.search(r"(?:v=|youtu\.be/)([^&?/\s]+)", url)
if not match:
raise ValueError(f"Cannot extract video ID from URL: {url}")
return match.group(1)
def get_transcript(url: str) -> dict:
    """Fetch transcript text for a YouTube video.

    Args:
        url: YouTube video URL.

    Returns:
        Dict with keys: video_id, title, text, url.
        title falls back to video_id when the transcript object exposes
        no title attribute (it typically does not).

    Raises:
        ValueError: If the video ID cannot be parsed from the URL.
    """
    video_id = _extract_video_id(url)
    # youtube-transcript-api >= 1.0 exposes fetch() as an *instance* method.
    # The previous class-level call (YouTubeTranscriptApi.fetch(video_id))
    # passed video_id as `self` and failed; instantiate the API first.
    fetched = YouTubeTranscriptApi().fetch(video_id, languages=["ko", "en"])
    segments = list(fetched)
    text = " ".join(seg.text for seg in segments)
    # Try to get title from fetched transcript metadata; hedge with getattr
    # since FetchedTranscript is not documented to carry a title.
    title = getattr(fetched, "title", None) or video_id
    return {
        "video_id": video_id,
        "title": title,
        "text": text,
        "url": url,
    }