- Admin: video management with Google Maps match status, manual restaurant mapping, restaurant remap on name change - Admin: user management tab with favorites/reviews detail - Admin: channel deletion fix for IDs with slashes - Frontend: responsive mobile layout (map top, list bottom, 2-row header) - Frontend: channel-colored map markers with legend - Frontend: my reviews list, favorites toggle, visit counter overlay - Frontend: force light mode for dark theme devices - Backend: visit tracking (site_visits table), user reviews endpoint - Backend: bulk transcript/extract streaming, geocode key fixes - Nginx config for production deployment Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
141 lines
4.6 KiB
Python
141 lines
4.6 KiB
Python
"""LLM-based restaurant info extraction from video transcripts.

Uses OCI GenAI (Gemini Flash) to extract structured restaurant data.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
|
|
import oci
|
|
from oci.generative_ai_inference import GenerativeAiInferenceClient
|
|
from oci.generative_ai_inference.models import (
|
|
ChatDetails,
|
|
GenericChatRequest,
|
|
OnDemandServingMode,
|
|
TextContent,
|
|
UserMessage,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _get_client() -> GenerativeAiInferenceClient:
    """Build an OCI GenAI inference client from the default OCI config file.

    The service endpoint is taken from OCI_CHAT_ENDPOINT when set, otherwise
    from OCI_GENAI_ENDPOINT (which must then exist, or KeyError is raised).
    """
    oci_config = oci.config.from_file()
    service_endpoint = (
        os.environ.get("OCI_CHAT_ENDPOINT") or os.environ["OCI_GENAI_ENDPOINT"]
    )
    return GenerativeAiInferenceClient(oci_config, service_endpoint=service_endpoint)
|
|
|
|
|
|
def _llm(prompt: str, max_tokens: int = 4096) -> str:
    """Send *prompt* to the configured OCI chat model and return its reply text.

    Uses temperature 0 for deterministic extraction. The model and compartment
    come from OCI_CHAT_MODEL_ID / OCI_COMPARTMENT_ID environment variables.
    """
    client = _get_client()

    chat_request = GenericChatRequest(
        messages=[UserMessage(content=[TextContent(text=prompt)])],
        max_tokens=max_tokens,
        temperature=0,
    )
    details = ChatDetails(
        compartment_id=os.environ["OCI_COMPARTMENT_ID"],
        serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
        chat_request=chat_request,
    )

    response = client.chat(details)
    # First choice, first content part of the assistant message.
    first_message = response.data.chat_response.choices[0].message
    return first_message.content[0].text.strip()
|
|
|
|
|
|
def _parse_json(raw: str) -> dict | list:
|
|
raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE).strip()
|
|
raw = re.sub(r",\s*([}\]])", r"\1", raw)
|
|
try:
|
|
return json.loads(raw)
|
|
except json.JSONDecodeError:
|
|
pass
|
|
try:
|
|
return json.JSONDecoder(strict=False).decode(raw)
|
|
except json.JSONDecodeError:
|
|
pass
|
|
# recover truncated array — extract complete objects one by one
|
|
if raw.lstrip().startswith("["):
|
|
decoder = json.JSONDecoder(strict=False)
|
|
items: list = []
|
|
idx = raw.index("[") + 1
|
|
while idx < len(raw):
|
|
while idx < len(raw) and raw[idx] in " \t\n\r,":
|
|
idx += 1
|
|
if idx >= len(raw) or raw[idx] == "]":
|
|
break
|
|
try:
|
|
obj, end = decoder.raw_decode(raw, idx)
|
|
items.append(obj)
|
|
idx = end
|
|
except json.JSONDecodeError:
|
|
# Try to recover truncated last object by closing braces
|
|
remainder = raw[idx:]
|
|
for fix in ["}", "}]", '"}', '"}' , '"}]', "null}", "null}]"]:
|
|
try:
|
|
patched = remainder.rstrip().rstrip(",") + fix
|
|
obj = json.loads(patched)
|
|
if isinstance(obj, dict) and obj.get("name"):
|
|
items.append(obj)
|
|
except (json.JSONDecodeError, ValueError):
|
|
continue
|
|
break
|
|
if items:
|
|
logger.info("Recovered %d restaurants from truncated JSON", len(items))
|
|
return items
|
|
raise ValueError(f"JSON parse failed: {raw[:80]!r}")
|
|
|
|
|
|
_EXTRACT_PROMPT = """\
|
|
다음은 유튜브 먹방/맛집 영상의 자막입니다.
|
|
이 영상에서 언급된 모든 식당 정보를 추출하세요.
|
|
|
|
규칙:
|
|
- 식당이 없으면 빈 배열 [] 반환
|
|
- 각 식당에 대해 아래 필드를 JSON 배열로 반환
|
|
- 확실하지 않은 정보는 null
|
|
- 추가 설명 없이 JSON만 반환
|
|
|
|
필드:
|
|
- name: 식당 이름 (string, 필수)
|
|
- address: 주소 또는 위치 힌트 (string | null)
|
|
- region: 지역 (예: 서울 강남, 부산 해운대) (string | null)
|
|
- cuisine_type: 음식 종류 (예: 한식, 일식, 중식, 양식, 카페) (string | null)
|
|
- price_range: 가격대 (예: 1만원대, 2-3만원) (string | null)
|
|
- foods_mentioned: 언급된 메뉴들 (string[])
|
|
- evaluation: 평가 내용 (string | null)
|
|
- guests: 함께한 게스트 (string[])
|
|
|
|
영상 제목: {title}
|
|
자막:
|
|
{transcript}
|
|
|
|
JSON 배열:"""
|
|
|
|
|
|
def extract_restaurants(title: str, transcript: str, custom_prompt: str | None = None) -> tuple[list[dict], str]:
    """Extract restaurant info from a video transcript using the LLM.

    Args:
        title: Video title, interpolated into the prompt.
        transcript: Transcript text. Very long transcripts are truncated to
            the first 7000 + last 1000 characters to fit the model context.
        custom_prompt: Optional template with ``{title}`` and ``{transcript}``
            placeholders; defaults to ``_EXTRACT_PROMPT``.

    Returns:
        ``(restaurants, raw)`` — a (possibly empty) list of restaurant dicts
        and the raw LLM response text. ``raw`` is ``""`` only when the LLM
        call itself failed; on a parse failure the raw text is still returned
        so callers can inspect or store it.
    """
    # Truncate very long transcripts, keeping the head and tail.
    if len(transcript) > 8000:
        transcript = transcript[:7000] + "\n...(중략)...\n" + transcript[-1000:]

    template = custom_prompt if custom_prompt else _EXTRACT_PROMPT
    prompt = template.format(title=title, transcript=transcript)

    try:
        raw = _llm(prompt, max_tokens=8192)
    except Exception:
        # logger.exception keeps the traceback, unlike logger.error(…, e).
        logger.exception("Restaurant extraction failed: LLM call error")
        return [], ""

    try:
        result = _parse_json(raw)
    except Exception:
        logger.exception("Restaurant extraction failed: unparseable response")
        # Preserve the raw response for debugging instead of discarding it.
        return [], raw

    if isinstance(result, list):
        return result, raw
    if isinstance(result, dict):
        # Model occasionally returns a single object instead of an array.
        return [result], raw
    return [], raw
|