feat: initial knowledge-inbox pipeline implementation

- Oracle ADB queue table (sql/schema.sql)
- Queue CRUD: core/queue_db.py
- YouTube transcript: core/youtube.py
- Web page fetch: core/web.py
- LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py
- Text chunker: core/chunker.py
- Obsidian note writer: core/obsidian.py
- Oracle vector store insertion: core/vector.py
- Polling daemon: daemon/worker.py
- Telegram bot: bot/telegram_bot.py
- Main runner: main.py
This commit is contained in:
joungmin
2026-02-28 08:16:11 +09:00
commit 86a4104ae3
18 changed files with 926 additions and 0 deletions

50
core/youtube.py Normal file
View File

@@ -0,0 +1,50 @@
"""YouTube transcript extraction via youtube-transcript-api."""
import re
from youtube_transcript_api import YouTubeTranscriptApi
def _extract_video_id(url: str) -> str:
    """Extract the YouTube video ID from a URL.

    Recognised forms are the ``v=`` query parameter (``watch?v=<id>``),
    ``youtu.be/<id>`` short links, and the ``/shorts/<id>``, ``/embed/<id>``
    and ``/live/<id>`` path styles.

    Args:
        url: YouTube URL in one of the recognised forms.

    Returns:
        The video ID string.

    Raises:
        ValueError: If no video ID can be found in the URL.
    """
    match = re.search(
        # The lookbehind keeps "v=" from matching the tail of another query
        # parameter name (e.g. "rev=") that happens to precede the real one.
        r"(?:(?<![A-Za-z0-9_])v=|youtu\.be/|/(?:shorts|embed|live)/)"
        # The ID ends at the next query/path/fragment delimiter; "#" is
        # excluded so a trailing "#t=10" fragment is not captured.
        r"([^&?/#\s]+)",
        url,
    )
    if not match:
        raise ValueError(f"Cannot extract video ID from URL: {url}")
    return match.group(1)
def get_transcript(url: str) -> dict:
    """Fetch transcript text for a YouTube video.

    Args:
        url: YouTube video URL.

    Returns:
        Dict with keys: video_id, title, text, url.
        ``text`` is every transcript segment's text joined with single
        spaces. ``title`` falls back to video_id if unavailable.

    Raises:
        ValueError: If no video ID can be extracted from ``url``.

    Errors raised by youtube-transcript-api while fetching are not caught
    here and propagate to the caller.
    """
    video_id = _extract_video_id(url)
    # fetch() is an instance method of YouTubeTranscriptApi; calling it on the
    # class would bind video_id to ``self`` and fail with a TypeError, so a
    # client instance is created first. "ko" and "en" are the requested
    # transcript languages (presumably tried in that order - confirm against
    # the library documentation).
    fetched = YouTubeTranscriptApi().fetch(video_id, languages=["ko", "en"])
    text = " ".join(seg.text for seg in fetched)
    # Use a title from the fetched transcript object when it exposes one;
    # otherwise (missing or empty) the video ID stands in for it.
    title = getattr(fetched, "title", None) or video_id
    return {
        "video_id": video_id,
        "title": title,
        "text": text,
        "url": url,
    }