Files
tasteby/backend/core/extractor.py
joungmin 54d21afd52 Add food tag remap feature and show menu tags in restaurant cards
- LLM extraction prompt: foods_mentioned max 10, Korean only, prioritized
- New /remap-foods API endpoint for bulk LLM re-extraction
- Admin UI: "메뉴태그 재생성" button with SSE progress bar
- Backend: attach foods_mentioned to restaurant list API response
- Restaurant cards: display food tags (orange, max 5 visible)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 11:21:05 +09:00

147 lines
5.2 KiB
Python

"""LLM-based restaurant info extraction from video transcripts.
Uses OCI GenAI (Gemini Flash) to extract structured restaurant data.
"""
from __future__ import annotations
import json
import logging
import os
import re
import oci
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
ChatDetails,
GenericChatRequest,
OnDemandServingMode,
TextContent,
UserMessage,
)
from core.cuisine import CUISINE_LIST_TEXT
logger = logging.getLogger(__name__)
def _get_client() -> GenerativeAiInferenceClient:
    """Construct an OCI GenAI inference client.

    Credentials are read from the default OCI config file. The service
    endpoint comes from OCI_CHAT_ENDPOINT, falling back to
    OCI_GENAI_ENDPOINT (which must then be set, or KeyError is raised).
    """
    cfg = oci.config.from_file()
    chat_endpoint = os.environ.get("OCI_CHAT_ENDPOINT")
    if not chat_endpoint:
        chat_endpoint = os.environ["OCI_GENAI_ENDPOINT"]
    return GenerativeAiInferenceClient(cfg, service_endpoint=chat_endpoint)
def _llm(prompt: str, max_tokens: int = 4096) -> str:
    """Send a single-turn user prompt to the configured chat model.

    Uses temperature 0 for deterministic extraction and returns the
    stripped text of the first choice's first content part.
    """
    chat_request = GenericChatRequest(
        messages=[UserMessage(content=[TextContent(text=prompt)])],
        max_tokens=max_tokens,
        temperature=0,
    )
    details = ChatDetails(
        compartment_id=os.environ["OCI_COMPARTMENT_ID"],
        serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
        chat_request=chat_request,
    )
    response = _get_client().chat(details)
    first_message = response.data.chat_response.choices[0].message
    return first_message.content[0].text.strip()
def _parse_json(raw: str) -> dict | list:
raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE).strip()
raw = re.sub(r",\s*([}\]])", r"\1", raw)
try:
return json.loads(raw)
except json.JSONDecodeError:
pass
try:
return json.JSONDecoder(strict=False).decode(raw)
except json.JSONDecodeError:
pass
# recover truncated array — extract complete objects one by one
if raw.lstrip().startswith("["):
decoder = json.JSONDecoder(strict=False)
items: list = []
idx = raw.index("[") + 1
while idx < len(raw):
while idx < len(raw) and raw[idx] in " \t\n\r,":
idx += 1
if idx >= len(raw) or raw[idx] == "]":
break
try:
obj, end = decoder.raw_decode(raw, idx)
items.append(obj)
idx = end
except json.JSONDecodeError:
# Try to recover truncated last object by closing braces
remainder = raw[idx:]
for fix in ["}", "}]", '"}', '"}' , '"}]', "null}", "null}]"]:
try:
patched = remainder.rstrip().rstrip(",") + fix
obj = json.loads(patched)
if isinstance(obj, dict) and obj.get("name"):
items.append(obj)
except (json.JSONDecodeError, ValueError):
continue
break
if items:
logger.info("Recovered %d restaurants from truncated JSON", len(items))
return items
raise ValueError(f"JSON parse failed: {raw[:80]!r}")
_EXTRACT_PROMPT = """\
다음은 유튜브 먹방/맛집 영상의 자막입니다.
이 영상에서 언급된 모든 식당 정보를 추출하세요.
규칙:
- 식당이 없으면 빈 배열 [] 반환
- 각 식당에 대해 아래 필드를 JSON 배열로 반환
- 확실하지 않은 정보는 null
- 추가 설명 없이 JSON만 반환
필드:
- name: 식당 이름 (string, 필수)
- address: 주소 또는 위치 힌트 (string | null)
- region: 지역을 "나라|시/도|구/군/시" 파이프(|) 구분 형식으로 작성 (string | null)
- 한국 예시: "한국|서울|강남구", "한국|부산|해운대구", "한국|제주", "한국|강원|강릉시"
- 해외 예시: "일본|도쿄", "일본|오사카", "싱가포르", "미국|뉴욕", "태국|방콕"
- 나라는 한글로, 해외 도시도 한글로 표기
- cuisine_type: 아래 목록에서 가장 적합한 것을 선택 (string, 필수). 반드시 아래 목록 중 하나를 사용:
{cuisine_types}
- price_range: 가격대 (예: 1만원대, 2-3만원) (string | null)
- foods_mentioned: 언급된 대표 메뉴 (string[], 최대 10개, 우선순위 높은 순, 반드시 한글로 작성)
- evaluation: 평가 내용 (string | null)
- guests: 함께한 게스트 (string[])
영상 제목: {{title}}
자막:
{{transcript}}
JSON 배열:""".format(cuisine_types=CUISINE_LIST_TEXT)
def extract_restaurants(title: str, transcript: str, custom_prompt: str | None = None) -> tuple[list[dict], str]:
    """Extract restaurant info from a video transcript using the LLM.

    Args:
        title: Video title, interpolated into the prompt.
        transcript: Transcript text. Very long transcripts are truncated to
            the first 7000 + last 1000 characters to fit the context window.
        custom_prompt: Optional template overriding _EXTRACT_PROMPT; must
            contain {title} and {transcript} placeholders. A malformed
            template raises from .format() (not swallowed), so config errors
            surface to the caller.

    Returns:
        (restaurants, raw): a possibly-empty list of restaurant dicts and the
        raw LLM response text. LLM/parse failures are logged and yield
        ([], "") rather than raising.
    """
    # Truncate very long transcripts, keeping the head and the tail.
    if len(transcript) > 8000:
        transcript = transcript[:7000] + "\n...(중략)...\n" + transcript[-1000:]
    template = custom_prompt or _EXTRACT_PROMPT
    prompt = template.format(title=title, transcript=transcript)
    try:
        raw = _llm(prompt, max_tokens=8192)
        result = _parse_json(raw)
    except Exception:
        # Best-effort boundary: log with full traceback, return empty result.
        logger.exception("Restaurant extraction failed")
        return [], ""
    # The prompt asks for an array, but the model may emit a bare object.
    if isinstance(result, list):
        return result, raw
    if isinstance(result, dict):
        return [result], raw
    return [], raw