feat: initial knowledge-inbox pipeline implementation

- Oracle ADB queue table (sql/schema.sql) - Queue CRUD: core/queue_db.py - YouTube transcript: core/youtube.py - Web page fetch: core/web.py - LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py - Text chunker: core/chunker.py - Obsidian note writer: core/obsidian.py - Oracle vector store insertion: core/vector.py - Polling daemon: daemon/worker.py - Telegram bot: bot/telegram_bot.py - Main runner: main.py
2026-02-28 08:16:11 +09:00
commit 86a4104ae3
18 changed files with 926 additions and 0 deletions
--- a/core/chunker.py
+++ b/core/chunker.py
@@ -0,0 +1,28 @@
+"""Simple sliding-window text chunking."""
+
+
+def chunk_text(text: str, size: int = 2000, overlap: int = 200) -> list[str]:
+    """Split text into overlapping, fixed-size character windows.
+
+    Args:
+        text: The full text to split.
+        size: Maximum characters per chunk; must be positive for long text.
+        overlap: Characters shared by consecutive chunks; must be below size.
+
+    Returns:
+        The chunks in order; [text] unchanged when len(text) <= size.
+
+    Raises:
+        ValueError: If text exceeds size and size <= 0 or overlap >= size.
+    """
+    if len(text) <= size:
+        return [text]
+
+    step = size - overlap
+    if size <= 0 or step <= 0:
+        # A step <= 0 would never advance the window across the text.
+        raise ValueError("size must be positive and overlap smaller than size")
+    # Windows start every `step` characters; the last one is the first to
+    # reach the end of the text (or the last start still inside the text).
+    stop = min(len(text), len(text) - size + step)
+    return [text[i:i + size] for i in range(0, stop, step)]