Files
knowledge-inbox/core/enricher.py
joungmin a9db6a8771 feat: add English vocab extraction and Anki card registration
- core/vocab.py: extract B1-B2 level vocabulary from English content via Gemini Flash
- core/anki.py: register vocab cards to AnkiConnect (English::Vocabulary deck)
- core/enricher.py: add language detection field + summary_ko (Korean summary)
- core/obsidian.py: render Korean + English summary in note
- daemon/worker.py: call vocab extraction and Anki registration for English content
2026-02-28 08:39:58 +09:00

100 lines
3.5 KiB
Python

"""LLM-based content enrichment via OCI GenAI Gemini Flash."""
import json
import os
import re
import oci
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
ChatDetails,
GenericChatRequest,
OnDemandServingMode,
TextContent,
UserMessage,
)
_PROMPT = """\
You are a knowledge extraction assistant. Analyze the content below and return ONLY a valid JSON object with these fields:
- "title": concise descriptive title for this content (string)
- "summary": 3-5 sentence summary capturing key insights, written in English (string)
- "summary_ko": the same summary translated into Korean (string)
- "tags": list of 3-7 relevant keywords or topics (string[])
- "author": author or creator name, or null if not found (string | null)
- "date": publication date in ISO 8601 format (YYYY-MM-DD), or null if not found (string | null)
- "content_type": one of "youtube", "article", "documentation", "news", "forum", "code", "other" (string)
- "language": primary language of the content, ISO 639-1 code, e.g. "en", "ko", "ja" (string)
Content type: {content_type}
Source URL: {url}
Content:
{text}
Return only the JSON object, no markdown, no explanation."""


def _get_client() -> "GenerativeAiInferenceClient":
    """Build an OCI GenAI inference client from the default OCI config file."""
    config = oci.config.from_file()
    # Gemini models live in us-ashburn-1; use OCI_CHAT_ENDPOINT if set,
    # otherwise fall back to OCI_GENAI_ENDPOINT.
    endpoint = os.environ.get("OCI_CHAT_ENDPOINT") or os.environ["OCI_GENAI_ENDPOINT"]
    return GenerativeAiInferenceClient(config, service_endpoint=endpoint)


def enrich(content_type: str, title: str, url: str, text: str) -> dict:
    """Extract structured metadata from content using Gemini Flash.

    Args:
        content_type: One of 'youtube', 'url', 'text'.
        title: Initial title hint (may be empty).
        url: Source URL (empty for plain text).
        text: The full content text to analyze.

    Returns:
        Dict with keys: title, summary, summary_ko, tags, author, date,
        content_type, language. On any LLM/parse failure, falls back to
        minimal defaults and adds an "_error" key with the failure text;
        this function never raises.
    """
    prompt = _PROMPT.format(
        content_type=content_type,
        url=url or "(none)",
        # Truncate to keep the prompt within the model's context window.
        text=text[:6000],
    )
    try:
        client = _get_client()
        req = GenericChatRequest(
            messages=[UserMessage(content=[TextContent(text=prompt)])],
            max_tokens=1024,
            temperature=0,  # deterministic extraction
        )
        det = ChatDetails(
            compartment_id=os.environ["OCI_COMPARTMENT_ID"],
            serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
            chat_request=req,
        )
        response = client.chat(det)
        raw = response.data.chat_response.choices[0].message.content[0].text.strip()
        # Strip markdown code fences the model sometimes emits despite instructions.
        raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE)
        parsed = json.loads(raw)
        # The model may return a JSON array or scalar; only an object is usable.
        # Raising here routes such responses through the fallback below instead
        # of letting .setdefault() blow up with AttributeError at the caller.
        if not isinstance(parsed, dict):
            raise ValueError(f"expected JSON object, got {type(parsed).__name__}")
        metadata = parsed
    except Exception as exc:  # config, network, auth, or parse failure — degrade gracefully
        metadata = {
            "title": title or url or text[:80],
            "summary": text[:300],
            "summary_ko": "",
            "tags": [],
            "author": None,
            "date": None,
            "content_type": content_type,
            "language": "en",
            "_error": str(exc),
        }
    # Ensure required keys exist even when the model omitted some fields.
    metadata.setdefault("title", title or url or text[:80])
    metadata.setdefault("summary", "")
    metadata.setdefault("summary_ko", "")
    metadata.setdefault("tags", [])
    metadata.setdefault("author", None)
    metadata.setdefault("date", None)
    metadata.setdefault("content_type", content_type)
    metadata.setdefault("language", "en")
    return metadata