feat: initial knowledge-inbox pipeline implementation
- Oracle ADB queue table (sql/schema.sql)
- Queue CRUD: core/queue_db.py
- YouTube transcript: core/youtube.py
- Web page fetch: core/web.py
- LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py
- Text chunker: core/chunker.py
- Obsidian note writer: core/obsidian.py
- Oracle vector store insertion: core/vector.py
- Polling daemon: daemon/worker.py
- Telegram bot: bot/telegram_bot.py
- Main runner: main.py
This commit is contained in:
50
core/youtube.py
Normal file
50
core/youtube.py
Normal file
@@ -0,0 +1,50 @@
|
||||
"""YouTube transcript extraction via youtube-transcript-api."""
|
||||
|
||||
import re
|
||||
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
||||
|
||||
def _extract_video_id(url: str) -> str:
    """Extract the YouTube video ID from a URL.

    Recognised forms are ``watch?v=<id>`` (the ``v`` parameter may appear
    anywhere in the query string), ``youtu.be/<id>``, and the ``/embed/<id>``,
    ``/shorts/<id>`` and ``/live/<id>`` paths. Trailing query parameters and
    ``#fragment`` parts are not included in the returned ID.

    Args:
        url: YouTube URL.

    Returns:
        The video ID string.

    Raises:
        ValueError: If no video ID can be found in the URL.
    """
    # ``\bv=`` keeps ``v=`` from matching the tail of another parameter name
    # (e.g. ``sv=``); ``#`` is excluded from the ID so a URL fragment such as
    # ``#t=10`` is not glued onto it.
    match = re.search(
        r"(?:\bv=|youtu\.be/|/(?:embed|shorts|live)/)([^&?/#\s]+)",
        url,
    )
    if not match:
        raise ValueError(f"Cannot extract video ID from URL: {url}")
    return match.group(1)
|
||||
|
||||
|
||||
def get_transcript(url: str) -> dict:
    """Fetch transcript text for a YouTube video.

    The transcript is requested with ``languages=["ko", "en"]`` and its
    segments are joined with single spaces into one string.

    Args:
        url: YouTube video URL.

    Returns:
        Dict with keys: video_id, title, text, url.
        title falls back to video_id if unavailable.

    Raises:
        ValueError: If no video ID can be extracted from ``url``.
        Errors raised by youtube-transcript-api while fetching are not
        caught here and propagate to the caller.
    """
    video_id = _extract_video_id(url)

    # fetch() is an instance method in youtube-transcript-api 1.x; calling it
    # on the class binds video_id as ``self`` and raises TypeError.
    fetched = YouTubeTranscriptApi().fetch(video_id, languages=["ko", "en"])
    text = " ".join(snippet.text for snippet in fetched)

    # NOTE(review): the fetched transcript object does not appear to expose a
    # ``title`` attribute, so this normally falls back to video_id - confirm
    # against the installed library version.
    title = getattr(fetched, "title", None) or video_id

    return {
        "video_id": video_id,
        "title": title,
        "text": text,
        "url": url,
    }
|
||||
Reference in New Issue
Block a user