# knowledge-inbox/core/vocab.py

"""Extract vocabulary from English content using Gemini Flash."""

import json
import logging
import os
import re

import oci
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
    ChatDetails,
    GenericChatRequest,
    OnDemandServingMode,
    TextContent,
    UserMessage,
)

# Prompt template sent to the chat model by extract_vocab().
# The `{title}` and `{text}` placeholders are filled with str.format(), so any
# literal brace added to this template in the future must be doubled
# (`{{` / `}}`). The template asks for a bare JSON array whose elements carry
# the keys: word, pos, definition_en, definition_ko, example.
_PROMPT = """\
You are an English vocabulary instructor. Analyze the English text below and extract words or phrases that an intermediate English learner (B1-B2 CEFR level) might not know.

Rules:
- Extract 5 to 10 items maximum
- Skip very basic words (go, see, love, etc.) and overly academic/rare words
- Focus on useful vocabulary: idioms, collocations, phrasal verbs, and mid-level words
- Each item must appear in the source text

Return ONLY a valid JSON array. Each element must have:
- "word": the word or phrase as it appears (string)
- "pos": part of speech, e.g. "verb", "noun", "adjective", "phrase", "phrasal verb" (string)
- "definition_en": concise English definition (string)
- "definition_ko": Korean translation of the definition (string)
- "example": the sentence from the source text that contains this word/phrase (string)

Source title: {title}
Text:
{text}

Return only the JSON array, no markdown, no explanation."""


def _get_client() -> GenerativeAiInferenceClient:
    """Build an OCI Generative AI inference client.

    The OCI configuration is loaded with ``oci.config.from_file()`` (no
    arguments, so the SDK defaults apply). The service endpoint is taken from
    the ``OCI_CHAT_ENDPOINT`` environment variable; when that is unset or
    empty, ``OCI_GENAI_ENDPOINT`` is used instead.

    Returns:
        A client bound to the resolved service endpoint.

    Raises:
        KeyError: If ``OCI_CHAT_ENDPOINT`` is unset or empty and
            ``OCI_GENAI_ENDPOINT`` is not set.
    """
    # Load the config first so failures surface in the same order as before.
    oci_config = oci.config.from_file()

    service_endpoint = os.environ.get("OCI_CHAT_ENDPOINT")
    if not service_endpoint:
        # Fall back to the general GenAI endpoint; this one is mandatory.
        service_endpoint = os.environ["OCI_GENAI_ENDPOINT"]

    return GenerativeAiInferenceClient(oci_config, service_endpoint=service_endpoint)


def _parse_vocab_json(raw: str) -> list[dict]:
    """Decode the model's reply into a list of vocab dicts.

    Markdown code fences are stripped first. If the remaining text is not
    valid JSON on its own (for example the model added prose around the
    array), the span from the first ``[`` to the last ``]`` is decoded instead.

    Args:
        raw: Text of the model's reply.

    Returns:
        The dict elements of the decoded array. An empty list if the decoded
        value is not a list.

    Raises:
        json.JSONDecodeError: If no JSON value can be decoded from the reply.
    """
    cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw.strip(), flags=re.MULTILINE)
    try:
        items = json.loads(cleaned)
    except json.JSONDecodeError:
        # Fallback: pull out the outermost bracketed span and retry once.
        start, end = cleaned.find("["), cleaned.rfind("]")
        if start == -1 or end <= start:
            raise
        items = json.loads(cleaned[start : end + 1])

    if not isinstance(items, list):
        return []
    # The declared return type is list[dict]; drop anything else the model emitted.
    return [item for item in items if isinstance(item, dict)]


def extract_vocab(text: str, title: str = "") -> list[dict]:
    """Extract intermediate-level English vocabulary from text.

    Sends the prompt built from ``_PROMPT`` to the OCI Generative AI chat
    model named by the ``OCI_CHAT_MODEL_ID`` environment variable and parses
    the JSON array it returns. Only the first 5000 characters of ``text`` are
    included in the prompt.

    Args:
        text: English source text to analyze.
        title: Content title for context.

    Returns:
        List of vocab dicts; the prompt requests the keys word, pos,
        definition_en, definition_ko, example. Returns an empty list when
        ``text`` is empty or whitespace-only, when the request or parsing
        fails (the failure is logged), or when the reply is not a JSON array.
    """
    # Nothing to analyze: skip the remote call entirely.
    if not text or not text.strip():
        return []

    prompt = _PROMPT.format(title=title, text=text[:5000])

    try:
        client = _get_client()
        req = GenericChatRequest(
            messages=[UserMessage(content=[TextContent(text=prompt)])],
            max_tokens=2048,
            temperature=0,
        )
        det = ChatDetails(
            compartment_id=os.environ["OCI_COMPARTMENT_ID"],
            serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
            chat_request=req,
        )
        response = client.chat(det)
        raw = response.data.chat_response.choices[0].message.content[0].text
        return _parse_vocab_json(raw)
    except Exception:
        # Best-effort contract: callers get [] on any failure, but the cause
        # (missing env var, service error, malformed reply) is logged rather
        # than silently discarded.
        logging.getLogger(__name__).exception(
            "Vocabulary extraction failed for title %r", title
        )
        return []