# knowledge-inbox/core/youtube.py

"""YouTube transcript extraction via youtube-transcript-api."""

import re

import httpx
from youtube_transcript_api import YouTubeTranscriptApi


def _extract_video_id(url: str) -> str:
    match = re.search(r"(?:v=|youtu\.be/)([^&?/\s]+)", url)
    if not match:
        raise ValueError(f"Cannot extract video ID from URL: {url}")
    return match.group(1)


def _get_video_title(video_id: str) -> str:
    """Fetch video title via YouTube oEmbed API (no auth required)."""
    try:
        r = httpx.get(
            "https://www.youtube.com/oembed",
            params={"url": f"https://www.youtube.com/watch?v={video_id}", "format": "json"},
            timeout=5,
        )
        r.raise_for_status()
        return r.json().get("title") or video_id
    except Exception:
        return video_id


def _paragraphize(text: str, sentences_per_para: int = 4) -> str:
    """Group flat transcript text into readable paragraphs.

    Splits on Korean/English sentence endings and groups by sentence count.
    """
    # Split on sentence-ending punctuation followed by whitespace
    parts = re.split(r'(?<=[다요죠습니까요\.!?])\s+', text)
    parts = [p.strip() for p in parts if p.strip()]

    paras = []
    for i in range(0, len(parts), sentences_per_para):
        paras.append(" ".join(parts[i:i + sentences_per_para]))

    return "\n\n".join(paras) if paras else text


def get_transcript(url: str) -> dict:
    """Fetch a YouTube video's transcript as paragraph-formatted text.

    The transcript is requested with ``languages=["ko", "en"]``, its segments
    are joined into one string, and that string is grouped into paragraphs.
    The title is looked up separately and falls back to the video ID.

    Args:
        url: YouTube video URL.

    Returns:
        Dict with keys: video_id, title, text, url (``url`` is the argument
        as given).

    Raises:
        ValueError: If no video ID can be extracted from ``url``.
        Errors raised by the transcript fetch are not caught here.
    """
    vid = _extract_video_id(url)

    # Transcript first: if it cannot be fetched, the error propagates before
    # any title request is made.
    segments = YouTubeTranscriptApi().fetch(vid, languages=["ko", "en"])
    flat_text = " ".join([segment.text for segment in segments])

    video_title = _get_video_title(vid)

    return dict(
        video_id=vid,
        title=video_title,
        text=_paragraphize(flat_text),
        url=url,
    )