Add admin features, responsive UI, user reviews, visit stats, and channel-colored markers
- Admin: video management with Google Maps match status, manual restaurant mapping, restaurant remap on name change
- Admin: user management tab with favorites/reviews detail
- Admin: channel deletion fix for IDs with slashes
- Frontend: responsive mobile layout (map top, list bottom, 2-row header)
- Frontend: channel-colored map markers with legend
- Frontend: my reviews list, favorites toggle, visit counter overlay
- Frontend: force light mode for dark theme devices
- Backend: visit tracking (site_visits table), user reviews endpoint
- Backend: bulk transcript/extract streaming, geocode key fixes
- Nginx config for production deployment

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -56,7 +56,7 @@ def _parse_json(raw: str) -> dict | list:
|
||||
return json.JSONDecoder(strict=False).decode(raw)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
# recover truncated array
|
||||
# recover truncated array — extract complete objects one by one
|
||||
if raw.lstrip().startswith("["):
|
||||
decoder = json.JSONDecoder(strict=False)
|
||||
items: list = []
|
||||
@@ -71,8 +71,19 @@ def _parse_json(raw: str) -> dict | list:
|
||||
items.append(obj)
|
||||
idx = end
|
||||
except json.JSONDecodeError:
|
||||
# Try to recover truncated last object by closing braces
|
||||
remainder = raw[idx:]
|
||||
for fix in ["}", "}]", '"}', '"}' , '"}]', "null}", "null}]"]:
|
||||
try:
|
||||
patched = remainder.rstrip().rstrip(",") + fix
|
||||
obj = json.loads(patched)
|
||||
if isinstance(obj, dict) and obj.get("name"):
|
||||
items.append(obj)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
continue
|
||||
break
|
||||
if items:
|
||||
logger.info("Recovered %d restaurants from truncated JSON", len(items))
|
||||
return items
|
||||
raise ValueError(f"JSON parse failed: {raw[:80]!r}")
|
||||
|
||||
@@ -104,7 +115,7 @@ _EXTRACT_PROMPT = """\
|
||||
JSON 배열:"""
|
||||
|
||||
|
||||
def extract_restaurants(title: str, transcript: str) -> tuple[list[dict], str]:
|
||||
def extract_restaurants(title: str, transcript: str, custom_prompt: str | None = None) -> tuple[list[dict], str]:
|
||||
"""Extract restaurant info from a video transcript using LLM.
|
||||
|
||||
Returns (list of restaurant dicts, raw LLM response text).
|
||||
@@ -113,10 +124,11 @@ def extract_restaurants(title: str, transcript: str) -> tuple[list[dict], str]:
|
||||
if len(transcript) > 8000:
|
||||
transcript = transcript[:7000] + "\n...(중략)...\n" + transcript[-1000:]
|
||||
|
||||
prompt = _EXTRACT_PROMPT.format(title=title, transcript=transcript)
|
||||
template = custom_prompt if custom_prompt else _EXTRACT_PROMPT
|
||||
prompt = template.format(title=title, transcript=transcript)
|
||||
|
||||
try:
|
||||
raw = _llm(prompt, max_tokens=4096)
|
||||
raw = _llm(prompt, max_tokens=8192)
|
||||
result = _parse_json(raw)
|
||||
if isinstance(result, list):
|
||||
return result, raw
|
||||
|
||||
@@ -57,17 +57,53 @@ def _places_text_search(query: str) -> dict | None:
|
||||
if data.get("status") == "OK" and data.get("results"):
|
||||
place = data["results"][0]
|
||||
loc = place["geometry"]["location"]
|
||||
return {
|
||||
result = {
|
||||
"latitude": loc["lat"],
|
||||
"longitude": loc["lng"],
|
||||
"formatted_address": place.get("formatted_address", ""),
|
||||
"google_place_id": place.get("place_id", ""),
|
||||
"business_status": place.get("business_status"),
|
||||
"rating": place.get("rating"),
|
||||
"rating_count": place.get("user_ratings_total"),
|
||||
}
|
||||
# Fetch phone/website from Place Details
|
||||
place_id = place.get("place_id")
|
||||
if place_id:
|
||||
details = _place_details(place_id)
|
||||
if details:
|
||||
result.update(details)
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.warning("Places text search failed for '%s': %s", query, e)
|
||||
return None
|
||||
|
||||
|
||||
def _place_details(place_id: str) -> dict | None:
    """Fetch phone and website from Google Place Details API.

    Args:
        place_id: Google Place ID returned by a text search.

    Returns:
        Dict with "phone" and "website" keys (values may be None), or
        None when the lookup fails or the API reports a non-OK status.
    """
    try:
        resp = httpx.get(
            "https://maps.googleapis.com/maps/api/place/details/json",
            params={
                "place_id": place_id,
                "key": _api_key(),
                "language": "ko",
                # Request only the two fields we need to keep quota cost low.
                "fields": "formatted_phone_number,website",
            },
            timeout=10,
        )
        resp.raise_for_status()
        payload = resp.json()
        detail = payload.get("result")
        if payload.get("status") == "OK" and detail:
            return {
                "phone": detail.get("formatted_phone_number"),
                "website": detail.get("website"),
            }
    except Exception as exc:
        # Best-effort enrichment: log and fall through to None on any failure.
        logger.warning("Place details failed for '%s': %s", place_id, exc)
    return None
|
||||
|
||||
|
||||
def _geocode(query: str) -> dict | None:
|
||||
"""Geocode an address string."""
|
||||
try:
|
||||
|
||||
@@ -28,7 +28,7 @@ def process_video(video: dict) -> int:
|
||||
|
||||
try:
|
||||
# 1. Transcript
|
||||
transcript = youtube.get_transcript(video_id)
|
||||
transcript, _src = youtube.get_transcript(video_id)
|
||||
if not transcript:
|
||||
logger.warning("No transcript for %s, marking done", video_id)
|
||||
youtube.update_video_status(video_db_id, "done")
|
||||
@@ -72,6 +72,11 @@ def process_video(video: dict) -> int:
|
||||
cuisine_type=rest_data.get("cuisine_type"),
|
||||
price_range=rest_data.get("price_range"),
|
||||
google_place_id=place_id,
|
||||
phone=geo.get("phone") if geo else None,
|
||||
website=geo.get("website") if geo else None,
|
||||
business_status=geo.get("business_status") if geo else None,
|
||||
rating=geo.get("rating") if geo else None,
|
||||
rating_count=geo.get("rating_count") if geo else None,
|
||||
)
|
||||
|
||||
# Link video <-> restaurant
|
||||
@@ -101,6 +106,76 @@ def process_video(video: dict) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
def process_video_extract(video: dict, transcript: str, custom_prompt: str | None = None) -> int:
    """Run LLM extraction + geocode + save on an existing transcript.

    Args:
        video: DB row dict; reads "id", "title" and (on error) "video_id".
        transcript: Transcript text to feed to the LLM extractor.
        custom_prompt: Optional prompt template overriding the default.

    Returns number of restaurants found (0 on error or when none extracted).
    """
    video_db_id = video["id"]
    title = video["title"]

    logger.info("Extracting restaurants from video: %s", title)

    try:
        restaurants, llm_raw = extractor.extract_restaurants(title, transcript, custom_prompt=custom_prompt)
        if not restaurants:
            # Nothing extracted: still mark done and keep the raw LLM output for debugging.
            youtube.update_video_status(video_db_id, "done", llm_raw=llm_raw)
            return 0

        count = 0
        for rest_data in restaurants:
            name = rest_data.get("name")
            if not name:
                # Skip malformed LLM entries without a restaurant name.
                continue

            # Geocode via Google Places; may return None when no match is found.
            geo = geocoding.geocode_restaurant(
                name,
                address=rest_data.get("address"),
                region=rest_data.get("region"),
            )

            lat = geo["latitude"] if geo else None
            lng = geo["longitude"] if geo else None
            # Prefer the geocoded formatted address; fall back to the LLM-extracted one.
            addr = geo["formatted_address"] if geo else rest_data.get("address")
            place_id = geo["google_place_id"] if geo else None

            rest_id = restaurant.upsert(
                name=name,
                address=addr,
                region=rest_data.get("region"),
                latitude=lat,
                longitude=lng,
                cuisine_type=rest_data.get("cuisine_type"),
                price_range=rest_data.get("price_range"),
                google_place_id=place_id,
                phone=geo.get("phone") if geo else None,
                website=geo.get("website") if geo else None,
                business_status=geo.get("business_status") if geo else None,
                rating=geo.get("rating") if geo else None,
                rating_count=geo.get("rating_count") if geo else None,
            )

            # Record this video's mention of the restaurant (foods, evaluation, guests).
            restaurant.link_video_restaurant(
                video_db_id=video_db_id,
                restaurant_id=rest_id,
                foods=rest_data.get("foods_mentioned"),
                evaluation=rest_data.get("evaluation"),
                guests=rest_data.get("guests"),
            )

            # Build embedding chunks for semantic search, if there is any text.
            chunks = _build_chunks(name, rest_data, title)
            if chunks:
                vector.save_restaurant_vectors(rest_id, chunks)

            count += 1
            logger.info("Saved restaurant: %s (geocoded=%s)", name, bool(geo))

        youtube.update_video_status(video_db_id, "done", llm_raw=llm_raw)
        return count

    except Exception as e:
        # Swallow and report 0 so a single bad video does not abort a bulk run.
        logger.error("Extract error for %s: %s", video["video_id"], e, exc_info=True)
        return 0
|
||||
|
||||
|
||||
def _build_chunks(name: str, data: dict, video_title: str) -> list[str]:
|
||||
"""Build text chunks for vector embedding."""
|
||||
parts = [f"식당: {name}"]
|
||||
|
||||
@@ -9,6 +9,16 @@ import oracledb
|
||||
from core.db import conn
|
||||
|
||||
|
||||
def _truncate_bytes(val: str | None, max_bytes: int) -> str | None:
|
||||
"""Truncate a string to fit within max_bytes when encoded as UTF-8."""
|
||||
if not val:
|
||||
return val
|
||||
encoded = val.encode("utf-8")
|
||||
if len(encoded) <= max_bytes:
|
||||
return val
|
||||
return encoded[:max_bytes].decode("utf-8", errors="ignore").rstrip()
|
||||
|
||||
|
||||
def find_by_name(name: str) -> dict | None:
|
||||
"""Find a restaurant by exact name match."""
|
||||
sql = "SELECT id, name, address, region, latitude, longitude FROM restaurants WHERE name = :n"
|
||||
@@ -33,8 +43,19 @@ def upsert(
|
||||
cuisine_type: str | None = None,
|
||||
price_range: str | None = None,
|
||||
google_place_id: str | None = None,
|
||||
phone: str | None = None,
|
||||
website: str | None = None,
|
||||
business_status: str | None = None,
|
||||
rating: float | None = None,
|
||||
rating_count: int | None = None,
|
||||
) -> str:
|
||||
"""Insert or update a restaurant. Returns row id."""
|
||||
# Truncate fields to fit DB column byte limits (VARCHAR2 is byte-based)
|
||||
price_range = _truncate_bytes(price_range, 50)
|
||||
cuisine_type = _truncate_bytes(cuisine_type, 100)
|
||||
region = _truncate_bytes(region, 100)
|
||||
website = _truncate_bytes(website, 500)
|
||||
|
||||
existing = find_by_name(name)
|
||||
if existing:
|
||||
sql = """
|
||||
@@ -46,6 +67,11 @@ def upsert(
|
||||
cuisine_type = COALESCE(:cuisine, cuisine_type),
|
||||
price_range = COALESCE(:price, price_range),
|
||||
google_place_id = COALESCE(:gid, google_place_id),
|
||||
phone = COALESCE(:phone, phone),
|
||||
website = COALESCE(:web, website),
|
||||
business_status = COALESCE(:bstatus, business_status),
|
||||
rating = COALESCE(:rating, rating),
|
||||
rating_count = COALESCE(:rcnt, rating_count),
|
||||
updated_at = SYSTIMESTAMP
|
||||
WHERE id = :id
|
||||
"""
|
||||
@@ -54,14 +80,18 @@ def upsert(
|
||||
"addr": address, "reg": region,
|
||||
"lat": latitude, "lng": longitude,
|
||||
"cuisine": cuisine_type, "price": price_range,
|
||||
"gid": google_place_id, "id": existing["id"],
|
||||
"gid": google_place_id, "phone": phone, "web": website,
|
||||
"bstatus": business_status, "rating": rating, "rcnt": rating_count,
|
||||
"id": existing["id"],
|
||||
})
|
||||
return existing["id"]
|
||||
|
||||
sql = """
|
||||
INSERT INTO restaurants (name, address, region, latitude, longitude,
|
||||
cuisine_type, price_range, google_place_id)
|
||||
VALUES (:name, :addr, :reg, :lat, :lng, :cuisine, :price, :gid)
|
||||
cuisine_type, price_range, google_place_id,
|
||||
phone, website, business_status, rating, rating_count)
|
||||
VALUES (:name, :addr, :reg, :lat, :lng, :cuisine, :price, :gid,
|
||||
:phone, :web, :bstatus, :rating, :rcnt)
|
||||
RETURNING id INTO :out_id
|
||||
"""
|
||||
with conn() as c:
|
||||
@@ -71,7 +101,9 @@ def upsert(
|
||||
"name": name, "addr": address, "reg": region,
|
||||
"lat": latitude, "lng": longitude,
|
||||
"cuisine": cuisine_type, "price": price_range,
|
||||
"gid": google_place_id, "out_id": out_id,
|
||||
"gid": google_place_id, "phone": phone, "web": website,
|
||||
"bstatus": business_status, "rating": rating, "rcnt": rating_count,
|
||||
"out_id": out_id,
|
||||
})
|
||||
return out_id.getvalue()[0]
|
||||
|
||||
@@ -116,38 +148,83 @@ def get_all(
|
||||
offset: int = 0,
|
||||
cuisine: str | None = None,
|
||||
region: str | None = None,
|
||||
channel: str | None = None,
|
||||
) -> list[dict]:
|
||||
"""List restaurants with optional filters."""
|
||||
conditions = ["latitude IS NOT NULL"]
|
||||
conditions = [
|
||||
"r.latitude IS NOT NULL",
|
||||
"EXISTS (SELECT 1 FROM video_restaurants vr0 WHERE vr0.restaurant_id = r.id)",
|
||||
]
|
||||
params: dict = {"lim": limit, "off": offset}
|
||||
|
||||
if cuisine:
|
||||
conditions.append("cuisine_type = :cuisine")
|
||||
conditions.append("r.cuisine_type = :cuisine")
|
||||
params["cuisine"] = cuisine
|
||||
if region:
|
||||
conditions.append("region LIKE :region")
|
||||
conditions.append("r.region LIKE :region")
|
||||
params["region"] = f"%{region}%"
|
||||
|
||||
join_clause = ""
|
||||
if channel:
|
||||
join_clause = """
|
||||
JOIN video_restaurants vr_f ON vr_f.restaurant_id = r.id
|
||||
JOIN videos v_f ON v_f.id = vr_f.video_id
|
||||
JOIN channels c_f ON c_f.id = v_f.channel_id
|
||||
"""
|
||||
conditions.append("c_f.channel_name = :channel")
|
||||
params["channel"] = channel
|
||||
|
||||
where = " AND ".join(conditions)
|
||||
sql = f"""
|
||||
SELECT id, name, address, region, latitude, longitude,
|
||||
cuisine_type, price_range, google_place_id
|
||||
FROM restaurants
|
||||
SELECT DISTINCT r.id, r.name, r.address, r.region, r.latitude, r.longitude,
|
||||
r.cuisine_type, r.price_range, r.google_place_id,
|
||||
r.business_status, r.rating, r.rating_count, r.updated_at
|
||||
FROM restaurants r
|
||||
{join_clause}
|
||||
WHERE {where}
|
||||
ORDER BY updated_at DESC
|
||||
ORDER BY r.updated_at DESC
|
||||
OFFSET :off ROWS FETCH NEXT :lim ROWS ONLY
|
||||
"""
|
||||
with conn() as c:
|
||||
cur = c.cursor()
|
||||
cur.execute(sql, params)
|
||||
cols = [d[0].lower() for d in cur.description]
|
||||
return [dict(zip(cols, row)) for row in cur.fetchall()]
|
||||
restaurants = [dict(zip(cols, row)) for row in cur.fetchall()]
|
||||
for r in restaurants:
|
||||
r.pop("updated_at", None)
|
||||
|
||||
if not restaurants:
|
||||
return restaurants
|
||||
|
||||
# Attach channel names for each restaurant
|
||||
ids = [r["id"] for r in restaurants]
|
||||
placeholders = ", ".join(f":id{i}" for i in range(len(ids)))
|
||||
ch_sql = f"""
|
||||
SELECT DISTINCT vr.restaurant_id, c.channel_name
|
||||
FROM video_restaurants vr
|
||||
JOIN videos v ON v.id = vr.video_id
|
||||
JOIN channels c ON c.id = v.channel_id
|
||||
WHERE vr.restaurant_id IN ({placeholders})
|
||||
"""
|
||||
ch_params = {f"id{i}": rid for i, rid in enumerate(ids)}
|
||||
ch_map: dict[str, list[str]] = {}
|
||||
with conn() as c:
|
||||
cur = c.cursor()
|
||||
cur.execute(ch_sql, ch_params)
|
||||
for row in cur.fetchall():
|
||||
ch_map.setdefault(row[0], []).append(row[1])
|
||||
|
||||
for r in restaurants:
|
||||
r["channels"] = ch_map.get(r["id"], [])
|
||||
|
||||
return restaurants
|
||||
|
||||
|
||||
def get_by_id(restaurant_id: str) -> dict | None:
|
||||
sql = """
|
||||
SELECT r.id, r.name, r.address, r.region, r.latitude, r.longitude,
|
||||
r.cuisine_type, r.price_range, r.phone, r.website, r.google_place_id
|
||||
r.cuisine_type, r.price_range, r.phone, r.website, r.google_place_id,
|
||||
r.business_status, r.rating, r.rating_count
|
||||
FROM restaurants r
|
||||
WHERE r.id = :id
|
||||
"""
|
||||
@@ -165,9 +242,11 @@ def get_video_links(restaurant_id: str) -> list[dict]:
|
||||
"""Get all video appearances for a restaurant."""
|
||||
sql = """
|
||||
SELECT v.video_id, v.title, v.url, v.published_at,
|
||||
vr.foods_mentioned, vr.evaluation, vr.guests
|
||||
vr.foods_mentioned, vr.evaluation, vr.guests,
|
||||
c.channel_name, c.channel_id
|
||||
FROM video_restaurants vr
|
||||
JOIN videos v ON v.id = vr.video_id
|
||||
JOIN channels c ON c.id = v.channel_id
|
||||
WHERE vr.restaurant_id = :rid
|
||||
ORDER BY v.published_at DESC
|
||||
"""
|
||||
@@ -187,6 +266,8 @@ def get_video_links(restaurant_id: str) -> list[dict]:
|
||||
"foods_mentioned": _parse_json_field(foods_raw, []),
|
||||
"evaluation": _parse_json_field(eval_raw, {}),
|
||||
"guests": _parse_json_field(guests_raw, []),
|
||||
"channel_name": r[7],
|
||||
"channel_id": r[8],
|
||||
})
|
||||
return results
|
||||
|
||||
|
||||
@@ -131,13 +131,15 @@ def get_user_reviews(
|
||||
limit: int = 20,
|
||||
offset: int = 0,
|
||||
) -> list[dict]:
|
||||
"""List reviews by a specific user."""
|
||||
"""List reviews by a specific user, including restaurant name."""
|
||||
sql = """
|
||||
SELECT r.id, r.user_id, r.restaurant_id, r.rating, r.review_text,
|
||||
r.visited_at, r.created_at, r.updated_at,
|
||||
u.nickname, u.avatar_url
|
||||
u.nickname, u.avatar_url,
|
||||
rest.name AS restaurant_name
|
||||
FROM user_reviews r
|
||||
JOIN tasteby_users u ON u.id = r.user_id
|
||||
LEFT JOIN restaurants rest ON rest.id = r.restaurant_id
|
||||
WHERE r.user_id = :user_id
|
||||
ORDER BY r.created_at DESC
|
||||
OFFSET :off ROWS FETCH NEXT :lim ROWS ONLY
|
||||
@@ -149,7 +151,12 @@ def get_user_reviews(
|
||||
"off": offset,
|
||||
"lim": limit,
|
||||
})
|
||||
return [_row_to_dict(row) for row in cur.fetchall()]
|
||||
rows = []
|
||||
for row in cur.fetchall():
|
||||
d = _row_to_dict(row)
|
||||
d["restaurant_name"] = row[10]
|
||||
rows.append(d)
|
||||
return rows
|
||||
|
||||
|
||||
def get_restaurant_avg_rating(restaurant_id: str) -> dict:
|
||||
|
||||
@@ -32,11 +32,11 @@ def extract_video_id(url: str) -> str:
|
||||
|
||||
# -- Channel operations -------------------------------------------------------
|
||||
|
||||
def add_channel(channel_id: str, channel_name: str) -> str:
|
||||
def add_channel(channel_id: str, channel_name: str, title_filter: str | None = None) -> str:
|
||||
"""Register a YouTube channel. Returns DB row id."""
|
||||
sql = """
|
||||
INSERT INTO channels (channel_id, channel_name, channel_url)
|
||||
VALUES (:cid, :cname, :curl)
|
||||
INSERT INTO channels (channel_id, channel_name, channel_url, title_filter)
|
||||
VALUES (:cid, :cname, :curl, :tf)
|
||||
RETURNING id INTO :out_id
|
||||
"""
|
||||
with conn() as c:
|
||||
@@ -47,45 +47,77 @@ def add_channel(channel_id: str, channel_name: str) -> str:
|
||||
"cid": channel_id,
|
||||
"cname": channel_name,
|
||||
"curl": f"https://www.youtube.com/channel/{channel_id}",
|
||||
"tf": title_filter,
|
||||
"out_id": out_id,
|
||||
})
|
||||
return out_id.getvalue()[0]
|
||||
|
||||
|
||||
def deactivate_channel(channel_id: str) -> bool:
    """Deactivate a channel by channel_id. Returns True if found."""
    stmt = "UPDATE channels SET is_active = 0 WHERE channel_id = :cid AND is_active = 1"
    with conn() as db:
        cursor = db.cursor()
        cursor.execute(stmt, {"cid": channel_id})
        # rowcount > 0 means an active row matched and was flipped off.
        return cursor.rowcount > 0
|
||||
|
||||
|
||||
def deactivate_channel_by_db_id(db_id: str) -> bool:
    """Deactivate a channel by DB id. Returns True if found."""
    stmt = "UPDATE channels SET is_active = 0 WHERE id = :did AND is_active = 1"
    with conn() as db:
        cursor = db.cursor()
        cursor.execute(stmt, {"did": db_id})
        # True only when an active row existed for this primary key.
        return cursor.rowcount > 0
|
||||
|
||||
|
||||
def get_active_channels() -> list[dict]:
    """Return all active channels as dicts: id, channel_id, channel_name, title_filter."""
    query = "SELECT id, channel_id, channel_name, title_filter FROM channels WHERE is_active = 1"
    channels: list[dict] = []
    with conn() as db:
        cursor = db.cursor()
        cursor.execute(query)
        for row in cursor.fetchall():
            channels.append({
                "id": row[0],
                "channel_id": row[1],
                "channel_name": row[2],
                "title_filter": row[3],
            })
    return channels
|
||||
|
||||
|
||||
# -- Video listing via YouTube Data API v3 ------------------------------------
|
||||
|
||||
def fetch_channel_videos(
|
||||
channel_id: str,
|
||||
max_results: int = 50,
|
||||
published_after: str | None = None,
|
||||
) -> list[dict]:
|
||||
"""Fetch video list from a YouTube channel via Data API v3.
|
||||
def get_latest_video_date(channel_db_id: str) -> str | None:
    """Get the latest published_at for a channel's videos in ISO 8601 format.

    Returns None when the channel has no videos with a publish date.
    Used as the `publishedAfter` cursor for incremental YouTube fetches.
    """
    stmt = """
        SELECT MAX(published_at) FROM videos
        WHERE channel_id = :ch_id AND published_at IS NOT NULL
    """
    with conn() as db:
        cursor = db.cursor()
        cursor.execute(stmt, {"ch_id": channel_db_id})
        latest = cursor.fetchone()
        if latest and latest[0]:
            # Column comes back as a datetime; render as UTC-style ISO 8601.
            return latest[0].strftime("%Y-%m-%dT%H:%M:%SZ")
    return None
|
||||
|
||||
Returns list of dicts: video_id, title, published_at, url.
|
||||
|
||||
def fetch_channel_videos_iter(
|
||||
channel_id: str,
|
||||
published_after: str | None = None,
|
||||
):
|
||||
"""Yield pages of videos from a YouTube channel via Data API v3.
|
||||
|
||||
Each yield is a list of dicts for one API page (up to 50).
|
||||
"""
|
||||
params: dict = {
|
||||
"key": _api_key(),
|
||||
"channelId": channel_id,
|
||||
"part": "snippet",
|
||||
"order": "date",
|
||||
"maxResults": min(max_results, 50),
|
||||
"maxResults": 50,
|
||||
"type": "video",
|
||||
}
|
||||
if published_after:
|
||||
params["publishedAfter"] = published_after
|
||||
|
||||
videos: list[dict] = []
|
||||
next_page = None
|
||||
|
||||
while True:
|
||||
@@ -100,33 +132,337 @@ def fetch_channel_videos(
|
||||
r.raise_for_status()
|
||||
data = r.json()
|
||||
|
||||
page_videos = []
|
||||
for item in data.get("items", []):
|
||||
snippet = item["snippet"]
|
||||
vid = item["id"]["videoId"]
|
||||
videos.append({
|
||||
page_videos.append({
|
||||
"video_id": vid,
|
||||
"title": snippet["title"],
|
||||
"published_at": snippet["publishedAt"],
|
||||
"url": f"https://www.youtube.com/watch?v={vid}",
|
||||
})
|
||||
|
||||
if page_videos:
|
||||
yield page_videos
|
||||
|
||||
next_page = data.get("nextPageToken")
|
||||
if not next_page or len(videos) >= max_results:
|
||||
if not next_page:
|
||||
break
|
||||
|
||||
return videos[:max_results]
|
||||
|
||||
def fetch_channel_videos(
    channel_id: str,
    max_results: int = 0,
    published_after: str | None = None,
) -> list[dict]:
    """Fetch video list from a YouTube channel via Data API v3.

    Args:
        max_results: 0 means fetch all available videos.
    """
    collected: list[dict] = []
    unlimited = max_results <= 0
    for batch in fetch_channel_videos_iter(channel_id, published_after=published_after):
        collected.extend(batch)
        # Stop paging once we have enough; trim below in case the last
        # page overshot the requested count.
        if not unlimited and len(collected) >= max_results:
            break
    if unlimited:
        return collected
    return collected[:max_results]
|
||||
|
||||
|
||||
# -- Transcript extraction ----------------------------------------------------
|
||||
|
||||
def get_transcript(video_id: str, mode: str = "auto") -> tuple[str | None, str | None]:
    """Fetch a video transcript, trying the API first, then a headless browser.

    Args:
        mode: "manual" = manual only, "generated" = auto-generated only,
            "auto" = try API first, fallback to browser transcript panel.

    Returns:
        (transcript_text, source) where source describes origin, or (None, None).
    """
    api_text, api_source = _get_transcript_api(video_id, mode)
    if api_text:
        return api_text, api_source

    # API path failed (often IP-blocked) — fall back to driving a real browser.
    logger.warning("API failed for %s, trying Playwright browser", video_id)
    print(f"[TRANSCRIPT] API failed for {video_id}, trying Playwright browser", flush=True)
    return _get_transcript_browser(video_id)
|
||||
|
||||
|
||||
def _make_ytt() -> YouTubeTranscriptApi:
    """Create YouTubeTranscriptApi, attaching Netscape-format cookies when present.

    Looks for cookies.txt one directory above this module; when found, the
    cookies ride on a requests.Session so YouTube treats us as logged in.
    """
    cookie_path = os.path.join(os.path.dirname(__file__), "..", "cookies.txt")
    if not os.path.exists(cookie_path):
        return YouTubeTranscriptApi()
    # Deferred imports: only needed on the cookie path.
    import http.cookiejar
    import requests
    cookie_jar = http.cookiejar.MozillaCookieJar(cookie_path)
    cookie_jar.load(ignore_discard=True, ignore_expires=True)
    session = requests.Session()
    session.cookies = cookie_jar
    return YouTubeTranscriptApi(http_client=session)
|
||||
|
||||
|
||||
def _get_transcript_api(video_id: str, mode: str = "auto") -> tuple[str | None, str | None]:
    """Try youtube-transcript-api (fast but may be IP-blocked).

    Args:
        video_id: YouTube video ID.
        mode: "manual" = human-uploaded tracks only, "generated" = auto-generated
            only, anything else = prefer manual, fall back to generated.

    Returns:
        (text, source) where source is e.g. "manual (ko)", or (None, None).
    """
    ytt = _make_ytt()
    # Language preference order for track selection (Korean first).
    prefer = ["ko", "en"]

    try:
        transcript_list = ytt.list(video_id)
    except Exception as e:
        logger.warning("Cannot list transcripts for %s: %s", video_id, e)
        return None, None

    # Split available tracks into manually-uploaded vs auto-generated.
    all_transcripts = list(transcript_list)
    manual = [t for t in all_transcripts if not t.is_generated]
    generated = [t for t in all_transcripts if t.is_generated]

    def _pick(candidates):
        # First track matching a preferred language, else the first available.
        for lang in prefer:
            for t in candidates:
                if t.language_code == lang:
                    return t
        return candidates[0] if candidates else None

    def _fetch(t):
        # Download segments and join into one text blob; (None, None) on failure.
        try:
            return " ".join(seg.text for seg in t.fetch()), t.language_code
        except Exception:
            return None, None

    if mode == "manual":
        t = _pick(manual)
        if t:
            text, lang = _fetch(t)
            return (text, f"manual ({lang})") if text else (None, None)
        return None, None
    elif mode == "generated":
        t = _pick(generated)
        if t:
            text, lang = _fetch(t)
            return (text, f"generated ({lang})") if text else (None, None)
        return None, None
    else:
        # "auto": manual track preferred, generated as fallback.
        t = _pick(manual)
        if t:
            text, lang = _fetch(t)
            if text:
                return text, f"manual ({lang})"
        t = _pick(generated)
        if t:
            text, lang = _fetch(t)
            if text:
                return text, f"generated ({lang})"
        return None, None
|
||||
|
||||
|
||||
def _get_transcript_browser(video_id: str) -> tuple[str | None, str | None]:
    """Fetch transcript via Playwright browser (bypasses IP blocks).

    Drives a real Chromium window through the YouTube watch page: waits out
    ads, opens the transcript panel, prefers the Korean track, scrolls the
    virtualized segment list to load everything, then joins the segment text.

    Returns:
        (transcript_text, "browser") on success, otherwise (None, None).
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        logger.error("playwright not installed")
        return None, None

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(
                # NOTE(review): headless=False — presumably headless mode is
                # detected/blocked by YouTube; confirm before changing.
                headless=False,
                args=["--disable-blink-features=AutomationControlled"],
            )
            ctx = browser.new_context(locale="ko-KR", viewport={"width": 1280, "height": 900})

            # Load YouTube cookies if available (same cookies.txt used by the API path).
            cookie_file = os.path.join(os.path.dirname(__file__), "..", "cookies.txt")
            if os.path.exists(cookie_file):
                import http.cookiejar
                jar = http.cookiejar.MozillaCookieJar(cookie_file)
                jar.load(ignore_discard=True, ignore_expires=True)
                pw_cookies = []
                for c in jar:
                    # Only YouTube/Google cookies are relevant here.
                    if "youtube" in c.domain or "google" in c.domain:
                        pw_cookies.append({
                            "name": c.name, "value": c.value,
                            "domain": c.domain, "path": c.path,
                            "secure": c.secure, "httpOnly": False,
                        })
                if pw_cookies:
                    ctx.add_cookies(pw_cookies)
                    print(f"[TRANSCRIPT] Loaded {len(pw_cookies)} cookies", flush=True)

            page = ctx.new_page()
            # Hide the webdriver flag that YouTube uses for bot detection.
            page.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => false})")

            print(f"[TRANSCRIPT] Opening YouTube page for {video_id}", flush=True)
            page.goto(
                f"https://www.youtube.com/watch?v={video_id}",
                wait_until="domcontentloaded",
                timeout=30000,
            )
            page.wait_for_timeout(5000)

            # Skip ads if present
            for ad_wait in range(12):  # up to ~60s for ads
                ad_status = page.evaluate("""() => {
                    const skipBtn = document.querySelector('.ytp-skip-ad-button, .ytp-ad-skip-button, .ytp-ad-skip-button-modern, button.ytp-ad-skip-button-modern');
                    if (skipBtn) { skipBtn.click(); return 'skipped'; }
                    const adOverlay = document.querySelector('.ytp-ad-player-overlay, .ad-showing');
                    if (adOverlay) return 'playing';
                    const adBadge = document.querySelector('.ytp-ad-text');
                    if (adBadge && adBadge.textContent) return 'badge';
                    return 'none';
                }""")
                if ad_status == "none":
                    break
                print(f"[TRANSCRIPT] Ad detected: {ad_status}, waiting...", flush=True)
                if ad_status == "skipped":
                    page.wait_for_timeout(2000)
                    break
                page.wait_for_timeout(5000)

            page.wait_for_timeout(2000)
            print(f"[TRANSCRIPT] Page loaded, looking for transcript button", flush=True)

            # Click the "more" (더보기) expander in the description area first,
            # which reveals the transcript option.
            page.evaluate("""
                () => {
                    // Try clicking the "...더보기" button in description area
                    const moreBtn = document.querySelector('tp-yt-paper-button#expand');
                    if (moreBtn) moreBtn.click();
                }
            """)
            page.wait_for_timeout(2000)

            # Click the "Show transcript" (스크립트 표시) button via JS,
            # trying three increasingly loose selector strategies.
            clicked = page.evaluate("""
                () => {
                    // Method 1: aria-label
                    for (const label of ['스크립트 표시', 'Show transcript']) {
                        const btns = document.querySelectorAll(`button[aria-label="${label}"]`);
                        for (const b of btns) { b.click(); return 'aria-label: ' + label; }
                    }
                    // Method 2: search all buttons by text content
                    const allBtns = document.querySelectorAll('button');
                    for (const b of allBtns) {
                        const text = b.textContent.trim();
                        if (text === '스크립트 표시' || text === 'Show transcript') {
                            b.click();
                            return 'text: ' + text;
                        }
                    }
                    // Method 3: look for transcript button in engagement panel
                    const engBtns = document.querySelectorAll('ytd-button-renderer button, ytd-button-renderer a');
                    for (const b of engBtns) {
                        const text = b.textContent.trim().toLowerCase();
                        if (text.includes('transcript') || text.includes('스크립트')) {
                            b.click();
                            return 'engagement: ' + text;
                        }
                    }
                    return false;
                }
            """)
            print(f"[TRANSCRIPT] Clicked transcript button: {clicked}", flush=True)
            if not clicked:
                # Dump available buttons for debugging
                btn_labels = page.evaluate("""
                    () => {
                        const btns = document.querySelectorAll('button[aria-label]');
                        return Array.from(btns).map(b => b.getAttribute('aria-label')).slice(0, 30);
                    }
                """)
                print(f"[TRANSCRIPT] Available buttons: {btn_labels}", flush=True)
                browser.close()
                return None, None

            # Wait for transcript panel segments to appear (max ~40s)
            page.wait_for_timeout(3000)  # initial wait for panel to render
            for attempt in range(12):
                page.wait_for_timeout(3000)
                count = page.evaluate(
                    "() => document.querySelectorAll('ytd-transcript-segment-renderer').length"
                )
                print(f"[TRANSCRIPT] Wait {(attempt+1)*3+3}s: {count} segments", flush=True)
                if count > 0:
                    break

            # Select Korean if available (language selector in transcript panel)
            page.evaluate("""
                () => {
                    // Open language dropdown and pick Korean if available
                    const menu = document.querySelector('ytd-transcript-renderer ytd-menu-renderer yt-dropdown-menu');
                    if (!menu) return;
                    const trigger = menu.querySelector('button, tp-yt-paper-button');
                    if (trigger) trigger.click();
                }
            """)
            page.wait_for_timeout(1000)
            page.evaluate("""
                () => {
                    const items = document.querySelectorAll('tp-yt-paper-listbox a, tp-yt-paper-listbox tp-yt-paper-item');
                    for (const item of items) {
                        const text = item.textContent.trim();
                        if (text.includes('한국어') || text.includes('Korean')) {
                            item.click();
                            return;
                        }
                    }
                }
            """)
            page.wait_for_timeout(2000)

            # Scroll transcript panel to load all segments (the list is virtualized,
            # so segments only exist in the DOM once scrolled into view).
            segments = page.evaluate("""
                async () => {
                    const container = document.querySelector(
                        'ytd-transcript-segment-list-renderer #segments-container, ' +
                        'ytd-transcript-renderer #body'
                    );
                    if (!container) {
                        // Fallback: just grab what's there
                        const segs = document.querySelectorAll('ytd-transcript-segment-renderer');
                        return Array.from(segs).map(s => {
                            const txt = s.querySelector('.segment-text, yt-formatted-string.segment-text');
                            return txt ? txt.textContent.trim() : '';
                        }).filter(t => t);
                    }

                    // Scroll to bottom repeatedly to load all virtual segments
                    let prevCount = 0;
                    for (let i = 0; i < 50; i++) {
                        container.scrollTop = container.scrollHeight;
                        await new Promise(r => setTimeout(r, 300));
                        const segs = document.querySelectorAll('ytd-transcript-segment-renderer');
                        if (segs.length === prevCount && i > 3) break;
                        prevCount = segs.length;
                    }

                    const segs = document.querySelectorAll('ytd-transcript-segment-renderer');
                    return Array.from(segs).map(s => {
                        const txt = s.querySelector('.segment-text, yt-formatted-string.segment-text');
                        return txt ? txt.textContent.trim() : '';
                    }).filter(t => t);
                }
            """)

            browser.close()

            print(f"[TRANSCRIPT] Got {len(segments) if segments else 0} segments for {video_id}", flush=True)
            if segments:
                text = " ".join(segments)
                print(f"[TRANSCRIPT] Success: {len(text)} chars from {len(segments)} segments", flush=True)
                return text, "browser"
            return None, None
    except Exception as e:
        logger.error("Playwright transcript failed for %s: %s", video_id, e)
        print(f"[TRANSCRIPT] Playwright FAILED for {video_id}: {e}", flush=True)
        import traceback
        traceback.print_exc()
        return None, None
|
||||
|
||||
|
||||
# -- DB operations for videos -------------------------------------------------
|
||||
@@ -163,6 +499,48 @@ def save_video(channel_db_id: str, video: dict) -> str | None:
|
||||
raise
|
||||
|
||||
|
||||
def get_existing_video_ids(channel_db_id: str) -> set[str]:
    """Return the set of video_ids already stored for the given channel.

    Lets the scanner cheaply skip videos that were ingested on a
    previous run.

    Args:
        channel_db_id: DB primary key of the channel row.

    Returns:
        Set of YouTube video_id strings present in the videos table.
    """
    with conn() as db:
        cursor = db.cursor()
        cursor.execute(
            "SELECT video_id FROM videos WHERE channel_id = :cid",
            {"cid": channel_db_id},
        )
        rows = cursor.fetchall()
    return set(row[0] for row in rows)
|
||||
|
||||
|
||||
def save_videos_batch(channel_db_id: str, videos: list[dict]) -> int:
    """Insert multiple videos in a single DB connection.

    Duplicate videos (unique-constraint UQ_VIDEOS_VID violations) are
    skipped silently; any other database error is re-raised.

    Args:
        channel_db_id: DB primary key of the owning channel.
        videos: dicts with keys ``video_id``, ``title``, ``url`` and an
            optional ``published_at`` ISO-8601 string ('Z' suffix allowed).

    Returns:
        Count of newly inserted videos.
    """
    if not videos:
        return 0
    sql = """
        INSERT INTO videos (channel_id, video_id, title, url, published_at, status)
        VALUES (:ch_id, :vid, :title, :url, :pub_at, 'pending')
    """
    new_count = 0
    with conn() as c:
        cur = c.cursor()
        for video in videos:
            pub_at = None
            if video.get("published_at"):
                # The driver expects a datetime; normalize the trailing
                # 'Z' to a UTC offset that fromisoformat accepts.
                pub_at = datetime.fromisoformat(
                    video["published_at"].replace("Z", "+00:00")
                )
            # Keep the try body minimal: only the INSERT can raise the
            # unique-constraint error we want to treat as "already saved".
            try:
                cur.execute(sql, {
                    "ch_id": channel_db_id,
                    "vid": video["video_id"],
                    "title": video["title"],
                    "url": video["url"],
                    "pub_at": pub_at,
                })
                new_count += 1
            except Exception as e:
                if "UQ_VIDEOS_VID" in str(e).upper():
                    continue
                raise
    return new_count
|
||||
|
||||
|
||||
def get_pending_videos(limit: int = 10) -> list[dict]:
|
||||
sql = """
|
||||
SELECT id, video_id, title, url
|
||||
@@ -201,20 +579,28 @@ def update_video_status(
|
||||
|
||||
# -- Scan: fetch new videos for all active channels ---------------------------
|
||||
|
||||
def scan_all_channels(max_per_channel: int = 50) -> int:
|
||||
def scan_all_channels() -> int:
|
||||
"""Scan all active channels for new videos. Returns count of new videos."""
|
||||
channels = get_active_channels()
|
||||
total_new = 0
|
||||
for ch in channels:
|
||||
try:
|
||||
videos = fetch_channel_videos(ch["channel_id"], max_per_channel)
|
||||
for v in videos:
|
||||
row_id = save_video(ch["id"], v)
|
||||
if row_id:
|
||||
total_new += 1
|
||||
after = get_latest_video_date(ch["id"])
|
||||
title_filter = ch.get("title_filter")
|
||||
new_count = 0
|
||||
fetched = 0
|
||||
for page in fetch_channel_videos_iter(ch["channel_id"], published_after=after):
|
||||
fetched += len(page)
|
||||
for v in page:
|
||||
if title_filter and title_filter not in v["title"]:
|
||||
continue
|
||||
row_id = save_video(ch["id"], v)
|
||||
if row_id:
|
||||
new_count += 1
|
||||
total_new += new_count
|
||||
logger.info(
|
||||
"Channel %s: fetched %d videos, %d new",
|
||||
ch["channel_name"], len(videos), total_new,
|
||||
"Channel %s: fetched %d videos (after=%s), %d new (filter=%s)",
|
||||
ch["channel_name"], fetched, after or "all", new_count, title_filter or "none",
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("Failed to scan channel %s: %s", ch["channel_name"], e)
|
||||
|
||||
Reference in New Issue
Block a user