Initial commit: Tasteby - YouTube restaurant map service
Backend (FastAPI + Oracle ADB), Frontend (Next.js), daemon worker. Features: channel/video/restaurant management, semantic search, Google OAuth, user reviews. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
221
backend/core/youtube.py
Normal file
221
backend/core/youtube.py
Normal file
@@ -0,0 +1,221 @@
|
||||
"""YouTube channel scanner + transcript extraction.
|
||||
|
||||
Uses YouTube Data API v3 for channel video listing,
|
||||
youtube-transcript-api for transcript extraction.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
import httpx
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
||||
from core.db import conn
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _api_key() -> str:
|
||||
return os.environ["YOUTUBE_DATA_API_KEY"]
|
||||
|
||||
|
||||
def extract_video_id(url: str) -> str:
    """Extract the video ID from a YouTube URL.

    Supports watch URLs (``?v=...``), ``youtu.be`` short links, and
    ``/shorts/``, ``/embed/``, ``/live/`` path forms.

    Args:
        url: Any YouTube video URL.

    Returns:
        The raw video ID string.

    Raises:
        ValueError: if no video ID can be found in *url*.
    """
    match = re.search(
        r"(?:v=|youtu\.be/|/shorts/|/embed/|/live/)([^&?/\s]+)", url
    )
    if not match:
        raise ValueError(f"Cannot extract video ID from URL: {url}")
    return match.group(1)
|
||||
|
||||
|
||||
# -- Channel operations -------------------------------------------------------
|
||||
|
||||
def add_channel(channel_id: str, channel_name: str) -> str:
    """Register a YouTube channel. Returns DB row id."""
    import oracledb

    sql = """
        INSERT INTO channels (channel_id, channel_name, channel_url)
        VALUES (:cid, :cname, :curl)
        RETURNING id INTO :out_id
    """
    channel_url = f"https://www.youtube.com/channel/{channel_id}"
    with conn() as c:
        cur = c.cursor()
        # Oracle RETURNING ... INTO needs an output bind variable.
        row_id = cur.var(oracledb.STRING)
        bind = {
            "cid": channel_id,
            "cname": channel_name,
            "curl": channel_url,
            "out_id": row_id,
        }
        cur.execute(sql, bind)
        return row_id.getvalue()[0]
|
||||
|
||||
|
||||
def get_active_channels() -> list[dict]:
    """Return id / channel_id / channel_name dicts for every active channel."""
    keys = ("id", "channel_id", "channel_name")
    with conn() as c:
        cur = c.cursor()
        cur.execute(
            "SELECT id, channel_id, channel_name FROM channels WHERE is_active = 1"
        )
        return [dict(zip(keys, row)) for row in cur.fetchall()]
|
||||
|
||||
|
||||
# -- Video listing via YouTube Data API v3 ------------------------------------
|
||||
|
||||
def fetch_channel_videos(
    channel_id: str,
    max_results: int = 50,
    published_after: str | None = None,
) -> list[dict]:
    """Fetch a channel's recent videos via the Data API v3 search endpoint.

    Args:
        channel_id: YouTube channel ID (the ``UC...`` form).
        max_results: Upper bound on videos returned; pages of at most 50
            are fetched until this bound or the last page is reached.
        published_after: Optional RFC 3339 timestamp filter.

    Returns:
        List of dicts with keys: video_id, title, published_at, url.

    Raises:
        httpx.HTTPStatusError: on a non-2xx API response.
    """
    base_params: dict = {
        "key": _api_key(),
        "channelId": channel_id,
        "part": "snippet",
        "order": "date",
        "type": "video",
    }
    if published_after:
        base_params["publishedAfter"] = published_after

    videos: list[dict] = []
    next_page: str | None = None

    # One Client reuses the TCP/TLS connection across pagination requests.
    with httpx.Client(timeout=15) as client:
        while len(videos) < max_results:
            params = dict(base_params)
            # The API caps page size at 50; never ask for more than we still need.
            params["maxResults"] = min(max_results - len(videos), 50)
            if next_page:
                params["pageToken"] = next_page

            r = client.get(
                "https://www.googleapis.com/youtube/v3/search",
                params=params,
            )
            r.raise_for_status()
            data = r.json()

            for item in data.get("items", []):
                vid = item.get("id", {}).get("videoId")
                if not vid:
                    # Defensive: skip any non-video result the API slips in.
                    continue
                snippet = item["snippet"]
                videos.append({
                    "video_id": vid,
                    "title": snippet["title"],
                    "published_at": snippet["publishedAt"],
                    "url": f"https://www.youtube.com/watch?v={vid}",
                })

            next_page = data.get("nextPageToken")
            if not next_page:
                break

    return videos[:max_results]
|
||||
|
||||
|
||||
# -- Transcript extraction ----------------------------------------------------
|
||||
|
||||
def get_transcript(video_id: str) -> str | None:
    """Fetch transcript text for a video. Returns None if unavailable."""
    try:
        segments = YouTubeTranscriptApi().fetch(video_id, languages=["ko", "en"])
        parts = [segment.text for segment in segments]
        return " ".join(parts)
    except Exception as exc:  # best-effort: transcript disabled/missing, or API failure
        logger.warning("Transcript unavailable for %s: %s", video_id, exc)
        return None
|
||||
|
||||
|
||||
# -- DB operations for videos -------------------------------------------------
|
||||
|
||||
def save_video(channel_db_id: str, video: dict) -> str | None:
    """Insert a video row if not exists. Returns row id or None if duplicate.

    Args:
        channel_db_id: DB row id of the owning channel.
        video: Dict with keys ``video_id``, ``title``, ``url`` and optional
            ``published_at`` (RFC 3339 string, e.g. "2024-01-01T00:00:00Z").

    Returns:
        The new row id, or None when the video_id already exists (detected
        via the UQ_VIDEOS_VID unique-constraint name in the DB error).

    Raises:
        ValueError: if ``published_at`` is present but not ISO-8601.
    """
    import oracledb

    sql = """
        INSERT INTO videos (channel_id, video_id, title, url, published_at, status)
        VALUES (:ch_id, :vid, :title, :url, :pub_at, 'pending')
        RETURNING id INTO :out_id
    """
    # Parse the timestamp BEFORE the DB try-block so a malformed date raises
    # a plain ValueError instead of being inspected as a DB duplicate error.
    pub_at = None
    if video.get("published_at"):
        # fromisoformat on older Pythons rejects a trailing "Z"; normalize it.
        pub_at = datetime.fromisoformat(
            video["published_at"].replace("Z", "+00:00")
        )

    with conn() as c:
        cur = c.cursor()
        out_id = cur.var(oracledb.STRING)
        try:
            cur.execute(sql, {
                "ch_id": channel_db_id,
                "vid": video["video_id"],
                "title": video["title"],
                "url": video["url"],
                "pub_at": pub_at,
                "out_id": out_id,
            })
        except Exception as e:
            # Only a violation of the video_id unique constraint means "duplicate".
            if "UQ_VIDEOS_VID" in str(e).upper():
                return None
            raise
        return out_id.getvalue()[0]
|
||||
|
||||
|
||||
def get_pending_videos(limit: int = 10) -> list[dict]:
    """Return up to *limit* of the oldest videos still in 'pending' status."""
    sql = """
        SELECT id, video_id, title, url
        FROM videos
        WHERE status = 'pending'
        ORDER BY created_at
        FETCH FIRST :n ROWS ONLY
    """
    keys = ("id", "video_id", "title", "url")
    with conn() as c:
        cur = c.cursor()
        cur.execute(sql, {"n": limit})
        return [dict(zip(keys, row)) for row in cur.fetchall()]
|
||||
|
||||
|
||||
def update_video_status(
    video_db_id: str,
    status: str,
    transcript: str | None = None,
    llm_raw: str | None = None,
) -> None:
    """Set a video's status and, when provided, its transcript / raw LLM output.

    Only non-empty optional values are written; the SET clause is assembled
    from fixed fragments with all values parameter-bound (no injection risk).
    """
    assignments = ["status = :st", "processed_at = SYSTIMESTAMP"]
    params: dict = {"st": status, "vid": video_db_id}
    optional_columns = [
        (transcript, "transcript_text = :txt", "txt"),
        (llm_raw, "llm_raw_response = :llm_resp", "llm_resp"),
    ]
    for value, clause, bind_name in optional_columns:
        if value:
            assignments.append(clause)
            params[bind_name] = value
    sql = f"UPDATE videos SET {', '.join(assignments)} WHERE id = :vid"
    with conn() as c:
        c.cursor().execute(sql, params)
|
||||
|
||||
|
||||
# -- Scan: fetch new videos for all active channels ---------------------------
|
||||
|
||||
def scan_all_channels(max_per_channel: int = 50) -> int:
    """Scan all active channels for new videos.

    Args:
        max_per_channel: Max videos fetched per channel in one scan.

    Returns:
        Total count of newly inserted videos across all channels.
    """
    total_new = 0
    for ch in get_active_channels():
        try:
            videos = fetch_channel_videos(ch["channel_id"], max_per_channel)
            # Count inserts for THIS channel so the log line is accurate;
            # the original logged the running total across all channels.
            channel_new = sum(1 for v in videos if save_video(ch["id"], v))
            total_new += channel_new
            logger.info(
                "Channel %s: fetched %d videos, %d new",
                ch["channel_name"], len(videos), channel_new,
            )
        except Exception as e:
            # Best-effort per channel: one failing channel must not stop the scan.
            logger.error("Failed to scan channel %s: %s", ch["channel_name"], e)
    return total_new
|
||||
Reference in New Issue
Block a user