- Oracle ADB queue table (sql/schema.sql) - Queue CRUD: core/queue_db.py - YouTube transcript: core/youtube.py - Web page fetch: core/web.py - LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py - Text chunker: core/chunker.py - Obsidian note writer: core/obsidian.py - Oracle vector store insertion: core/vector.py - Polling daemon: daemon/worker.py - Telegram bot: bot/telegram_bot.py - Main runner: main.py
51 lines
1.2 KiB
Python
51 lines
1.2 KiB
Python
"""YouTube transcript extraction via youtube-transcript-api."""
|
|
|
|
import re
|
|
|
|
from youtube_transcript_api import YouTubeTranscriptApi
|
|
|
|
|
|
def _extract_video_id(url: str) -> str:
|
|
"""Extract YouTube video ID from a URL.
|
|
|
|
Args:
|
|
url: YouTube URL (watch?v= or youtu.be/ formats).
|
|
|
|
Returns:
|
|
The video ID string.
|
|
|
|
Raises:
|
|
ValueError: If no video ID can be found in the URL.
|
|
"""
|
|
match = re.search(r"(?:v=|youtu\.be/)([^&?/\s]+)", url)
|
|
if not match:
|
|
raise ValueError(f"Cannot extract video ID from URL: {url}")
|
|
return match.group(1)
|
|
|
|
|
|
def get_transcript(url: str) -> dict:
    """Fetch transcript text for a YouTube video.

    Args:
        url: YouTube video URL.

    Returns:
        Dict with keys: video_id, title, text, url.
        title falls back to video_id if unavailable.

    Raises:
        ValueError: If no video ID can be found in the URL.
        Errors raised by youtube-transcript-api while fetching are not
        caught here and propagate to the caller.
    """
    video_id = _extract_video_id(url)

    # fetch() is an instance method, so the API object must be created
    # first; calling it on the class binds video_id to ``self`` and fails.
    fetched = YouTubeTranscriptApi().fetch(video_id, languages=["ko", "en"])

    # The fetched transcript is iterated once; each segment exposes its
    # caption through ``.text``.
    text = " ".join(seg.text for seg in fetched)

    # Use a title if the fetched object happens to carry one; otherwise
    # the video ID stands in so callers always get a non-empty title.
    title = getattr(fetched, "title", None) or video_id

    return {
        "video_id": video_id,
        "title": title,
        "text": text,
        "url": url,
    }
|