"""YouTube transcript extraction via youtube-transcript-api.""" import re from youtube_transcript_api import YouTubeTranscriptApi def _extract_video_id(url: str) -> str: """Extract YouTube video ID from a URL. Args: url: YouTube URL (watch?v= or youtu.be/ formats). Returns: The video ID string. Raises: ValueError: If no video ID can be found in the URL. """ match = re.search(r"(?:v=|youtu\.be/)([^&?/\s]+)", url) if not match: raise ValueError(f"Cannot extract video ID from URL: {url}") return match.group(1) def get_transcript(url: str) -> dict: """Fetch transcript text for a YouTube video. Args: url: YouTube video URL. Returns: Dict with keys: video_id, title, text, url. title falls back to video_id if unavailable. """ video_id = _extract_video_id(url) fetched = YouTubeTranscriptApi.fetch(video_id, languages=["ko", "en"]) segments = list(fetched) text = " ".join(seg.text for seg in segments) # Try to get title from fetched transcript metadata title = getattr(fetched, "title", None) or video_id return { "video_id": video_id, "title": title, "text": text, "url": url, }