feat: add English vocab extraction and Anki card registration
- core/vocab.py: extract B1-B2 level vocabulary from English content via Gemini Flash
- core/anki.py: register vocab cards via AnkiConnect (English::Vocabulary deck)
- core/enricher.py: add language detection field + summary_ko (Korean summary)
- core/obsidian.py: render Korean + English summary in note
- daemon/worker.py: call vocab extraction and Anki registration for English content
This commit is contained in:
79
core/vocab.py
Normal file
79
core/vocab.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""Extract vocabulary from English content using Gemini Flash."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
import oci
|
||||
from oci.generative_ai_inference import GenerativeAiInferenceClient
|
||||
from oci.generative_ai_inference.models import (
|
||||
ChatDetails,
|
||||
GenericChatRequest,
|
||||
OnDemandServingMode,
|
||||
TextContent,
|
||||
UserMessage,
|
||||
)
|
||||
|
||||
# Prompt template for the vocabulary-extraction chat request. It is rendered with
# str.format(), which fills the {title} and {text} placeholders, so any literal
# brace added to the template later must be doubled ({{ or }}).
# The model is asked to answer with a bare JSON array of objects carrying the
# keys: word, pos, definition_en, definition_ko, example.
_PROMPT = """\
|
||||
You are an English vocabulary instructor. Analyze the English text below and extract words or phrases that an intermediate English learner (B1-B2 CEFR level) might not know.
|
||||
|
||||
Rules:
|
||||
- Extract 5 to 10 items maximum
|
||||
- Skip very basic words (go, see, love, etc.) and overly academic/rare words
|
||||
- Focus on useful vocabulary: idioms, collocations, phrasal verbs, and mid-level words
|
||||
- Each item must appear in the source text
|
||||
|
||||
Return ONLY a valid JSON array. Each element must have:
|
||||
- "word": the word or phrase as it appears (string)
|
||||
- "pos": part of speech, e.g. "verb", "noun", "adjective", "phrase", "phrasal verb" (string)
|
||||
- "definition_en": concise English definition (string)
|
||||
- "definition_ko": Korean translation of the definition (string)
|
||||
- "example": the sentence from the source text that contains this word/phrase (string)
|
||||
|
||||
Source title: {title}
|
||||
Text:
|
||||
{text}
|
||||
|
||||
Return only the JSON array, no markdown, no explanation."""
|
||||
|
||||
|
||||
def _get_client() -> GenerativeAiInferenceClient:
    """Build the OCI Generative AI inference client used for chat requests.

    The SDK configuration is loaded with ``oci.config.from_file()`` using its
    default arguments. The service endpoint is taken from ``OCI_CHAT_ENDPOINT``;
    when that variable is unset or empty, ``OCI_GENAI_ENDPOINT`` is used instead.

    Raises:
        KeyError: If ``OCI_CHAT_ENDPOINT`` is unset/empty and
            ``OCI_GENAI_ENDPOINT`` is not defined.
    """
    oci_config = oci.config.from_file()
    service_endpoint = os.environ.get("OCI_CHAT_ENDPOINT")
    if not service_endpoint:
        # Fall back to the general GenAI endpoint; a missing variable raises here.
        service_endpoint = os.environ["OCI_GENAI_ENDPOINT"]
    return GenerativeAiInferenceClient(oci_config, service_endpoint=service_endpoint)
|
||||
|
||||
|
||||
def extract_vocab(text: str, title: str = "") -> list[dict]:
    """Extract intermediate-level English vocabulary from text.

    The first 5000 characters of ``text`` are inserted into the module's prompt
    template and sent as a single user message through the OCI Generative AI
    chat API, using the model named by ``OCI_CHAT_MODEL_ID`` (Gemini Flash
    according to the module docstring). The reply is expected to be a JSON
    array, optionally wrapped in Markdown code fences.

    Args:
        text: English source text to analyze.
        title: Content title for context.

    Returns:
        List of vocab dicts with keys: word, pos, definition_en, definition_ko,
        example. Entries that are not dicts or that lack any of those keys are
        dropped. Returns an empty list when ``text`` is empty or blank, on any
        failure (the error is logged), or if no suitable vocab is found.
    """
    # Local import keeps this block self-contained; the module does not import
    # logging at the top of the file.
    import logging

    required_keys = ("word", "pos", "definition_en", "definition_ko", "example")

    # Nothing to analyze: skip the remote call entirely.
    if not text or not text.strip():
        return []

    prompt = _PROMPT.format(title=title, text=text[:5000])

    try:
        client = _get_client()
        req = GenericChatRequest(
            messages=[UserMessage(content=[TextContent(text=prompt)])],
            max_tokens=2048,
            temperature=0,
        )
        det = ChatDetails(
            compartment_id=os.environ["OCI_COMPARTMENT_ID"],
            serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
            chat_request=req,
        )
        response = client.chat(det)
        raw = response.data.chat_response.choices[0].message.content[0].text.strip()
        # Strip Markdown code fences in case the model ignores the
        # "no markdown" instruction.
        raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE)
        items = json.loads(raw)
    except Exception:
        # Deliberately best-effort: callers get an empty list rather than an
        # exception, but the failure is logged instead of vanishing silently.
        logging.getLogger(__name__).warning(
            "Vocabulary extraction failed", exc_info=True
        )
        return []

    if not isinstance(items, list):
        return []

    # Enforce the documented return shape so callers can index the keys safely.
    return [
        item
        for item in items
        if isinstance(item, dict) and all(key in item for key in required_keys)
    ]
|
||||
Reference in New Issue
Block a user