feat: initial knowledge-inbox pipeline implementation

- Oracle ADB queue table: sql/schema.sql
- Queue CRUD: core/queue_db.py
- YouTube transcript: core/youtube.py
- Web page fetch: core/web.py
- LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py
- Text chunker: core/chunker.py
- Obsidian note writer: core/obsidian.py
- Oracle vector store insertion: core/vector.py
- Polling daemon: daemon/worker.py
- Telegram bot: bot/telegram_bot.py
- Main runner: main.py
2026-02-28 08:16:11 +09:00
commit 86a4104ae3
18 changed files with 926 additions and 0 deletions
--- a/core/youtube.py
+++ b/core/youtube.py
@@ -0,0 +1,50 @@
+"""YouTube transcript extraction via youtube-transcript-api."""
+
+import re
+
+from youtube_transcript_api import YouTubeTranscriptApi
+
+
+def _extract_video_id(url: str) -> str:
+    """Extract the YouTube video ID from a URL.
+
+    Args:
+        url: YouTube URL (watch?v=, youtu.be/, /shorts/, /embed/ or /live/).
+
+    Returns:
+        The video ID, without query parameters or a ``#`` fragment.
+
+    Raises:
+        ValueError: If no video ID can be found in the URL.
+    """
+    match = re.search(r"(?:v=|youtu\.be/|/(?:shorts|embed|live)/)([^&?#/\s]+)", url)
+    if not match:
+        raise ValueError(f"Cannot extract video ID from URL: {url}")
+    return match.group(1)
+
+
+def get_transcript(url: str) -> dict:
+    """Fetch transcript text for a YouTube video.
+
+    Args:
+        url: YouTube video URL.
+
+    Returns:
+        Dict with keys: video_id, title, text, url.
+        title falls back to video_id if unavailable.
+
+    Raises:
+        ValueError: If no video ID can be found in the URL.
+    """
+    video_id = _extract_video_id(url)
+    # fetch() is an instance method, so a client must be created first.
+    fetched = YouTubeTranscriptApi().fetch(video_id, languages=["ko", "en"])
+    text = " ".join(seg.text for seg in fetched)
+    # Use the transcript's title when it exposes one, else the video ID.
+    title = getattr(fetched, "title", None) or video_id
+    return {
+        "video_id": video_id,
+        "title": title,
+        "text": text,
+        "url": url,
+    }