"""YouTube channel scanner + transcript extraction. Uses YouTube Data API v3 for channel video listing, youtube-transcript-api for transcript extraction. """ from __future__ import annotations import logging import os import re from datetime import datetime import httpx from youtube_transcript_api import YouTubeTranscriptApi from core.db import conn logger = logging.getLogger(__name__) def _api_key() -> str: return os.environ["YOUTUBE_DATA_API_KEY"] def extract_video_id(url: str) -> str: match = re.search(r"(?:v=|youtu\.be/)([^&?/\s]+)", url) if not match: raise ValueError(f"Cannot extract video ID from URL: {url}") return match.group(1) # -- Channel operations ------------------------------------------------------- def add_channel(channel_id: str, channel_name: str) -> str: """Register a YouTube channel. Returns DB row id.""" sql = """ INSERT INTO channels (channel_id, channel_name, channel_url) VALUES (:cid, :cname, :curl) RETURNING id INTO :out_id """ with conn() as c: cur = c.cursor() import oracledb out_id = cur.var(oracledb.STRING) cur.execute(sql, { "cid": channel_id, "cname": channel_name, "curl": f"https://www.youtube.com/channel/{channel_id}", "out_id": out_id, }) return out_id.getvalue()[0] def get_active_channels() -> list[dict]: sql = "SELECT id, channel_id, channel_name FROM channels WHERE is_active = 1" with conn() as c: cur = c.cursor() cur.execute(sql) return [ {"id": r[0], "channel_id": r[1], "channel_name": r[2]} for r in cur.fetchall() ] # -- Video listing via YouTube Data API v3 ------------------------------------ def fetch_channel_videos( channel_id: str, max_results: int = 50, published_after: str | None = None, ) -> list[dict]: """Fetch video list from a YouTube channel via Data API v3. Returns list of dicts: video_id, title, published_at, url. """ params: dict = { "key": _api_key(), "channelId": channel_id, "part": "snippet", "order": "date", "maxResults": min(max_results, 50), "type": "video", } if published_after: params["publishedAfter"] = published_after videos: list[dict] = [] next_page = None while True: if next_page: params["pageToken"] = next_page r = httpx.get( "https://www.googleapis.com/youtube/v3/search", params=params, timeout=15, ) r.raise_for_status() data = r.json() for item in data.get("items", []): snippet = item["snippet"] vid = item["id"]["videoId"] videos.append({ "video_id": vid, "title": snippet["title"], "published_at": snippet["publishedAt"], "url": f"https://www.youtube.com/watch?v={vid}", }) next_page = data.get("nextPageToken") if not next_page or len(videos) >= max_results: break return videos[:max_results] # -- Transcript extraction ---------------------------------------------------- def get_transcript(video_id: str) -> str | None: """Fetch transcript text for a video. Returns None if unavailable.""" try: fetched = YouTubeTranscriptApi().fetch(video_id, languages=["ko", "en"]) return " ".join(seg.text for seg in fetched) except Exception as e: logger.warning("Transcript unavailable for %s: %s", video_id, e) return None # -- DB operations for videos ------------------------------------------------- def save_video(channel_db_id: str, video: dict) -> str | None: """Insert a video row if not exists. Returns row id or None if duplicate.""" sql = """ INSERT INTO videos (channel_id, video_id, title, url, published_at, status) VALUES (:ch_id, :vid, :title, :url, :pub_at, 'pending') RETURNING id INTO :out_id """ with conn() as c: cur = c.cursor() import oracledb out_id = cur.var(oracledb.STRING) try: pub_at = None if video.get("published_at"): pub_at = datetime.fromisoformat( video["published_at"].replace("Z", "+00:00") ) cur.execute(sql, { "ch_id": channel_db_id, "vid": video["video_id"], "title": video["title"], "url": video["url"], "pub_at": pub_at, "out_id": out_id, }) return out_id.getvalue()[0] except Exception as e: if "UQ_VIDEOS_VID" in str(e).upper(): return None # duplicate raise def get_pending_videos(limit: int = 10) -> list[dict]: sql = """ SELECT id, video_id, title, url FROM videos WHERE status = 'pending' ORDER BY created_at FETCH FIRST :n ROWS ONLY """ with conn() as c: cur = c.cursor() cur.execute(sql, {"n": limit}) return [ {"id": r[0], "video_id": r[1], "title": r[2], "url": r[3]} for r in cur.fetchall() ] def update_video_status( video_db_id: str, status: str, transcript: str | None = None, llm_raw: str | None = None, ) -> None: sets = ["status = :st", "processed_at = SYSTIMESTAMP"] params: dict = {"st": status, "vid": video_db_id} if transcript: sets.append("transcript_text = :txt") params["txt"] = transcript if llm_raw: sets.append("llm_raw_response = :llm_resp") params["llm_resp"] = llm_raw sql = f"UPDATE videos SET {', '.join(sets)} WHERE id = :vid" with conn() as c: c.cursor().execute(sql, params) # -- Scan: fetch new videos for all active channels --------------------------- def scan_all_channels(max_per_channel: int = 50) -> int: """Scan all active channels for new videos. Returns count of new videos.""" channels = get_active_channels() total_new = 0 for ch in channels: try: videos = fetch_channel_videos(ch["channel_id"], max_per_channel) for v in videos: row_id = save_video(ch["id"], v) if row_id: total_new += 1 logger.info( "Channel %s: fetched %d videos, %d new", ch["channel_name"], len(videos), total_new, ) except Exception as e: logger.error("Failed to scan channel %s: %s", ch["channel_name"], e) return total_new