feat: add English vocab extraction and Anki card registration

- core/vocab.py: extract B1-B2 level vocabulary from English content via Gemini Flash
- core/anki.py: register vocab cards to AnkiConnect (English::Vocabulary deck)
- core/enricher.py: add language detection field + summary_ko (Korean summary)
- core/obsidian.py: render Korean + English summary in note
- daemon/worker.py: call vocab extraction and Anki registration for English content
This commit is contained in:
joungmin
2026-02-28 08:39:58 +09:00
parent 86a4104ae3
commit a9db6a8771
5 changed files with 208 additions and 8 deletions

79
core/vocab.py Normal file
View File

@@ -0,0 +1,79 @@
"""Extract vocabulary from English content using Gemini Flash."""
import json
import logging
import os
import re

import oci
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
    ChatDetails,
    GenericChatRequest,
    OnDemandServingMode,
    TextContent,
    UserMessage,
)
# Prompt template for the vocabulary-extraction request. `extract_vocab` fills
# the `{title}` and `{text}` placeholders with `str.format`, so any literal
# brace added to this template later must be doubled (`{{` / `}}`).
# The model is asked to reply with a bare JSON array whose elements carry the
# keys: word, pos, definition_en, definition_ko, example.
_PROMPT = """\
You are an English vocabulary instructor. Analyze the English text below and extract words or phrases that an intermediate English learner (B1-B2 CEFR level) might not know.
Rules:
- Extract 5 to 10 items maximum
- Skip very basic words (go, see, love, etc.) and overly academic/rare words
- Focus on useful vocabulary: idioms, collocations, phrasal verbs, and mid-level words
- Each item must appear in the source text
Return ONLY a valid JSON array. Each element must have:
- "word": the word or phrase as it appears (string)
- "pos": part of speech, e.g. "verb", "noun", "adjective", "phrase", "phrasal verb" (string)
- "definition_en": concise English definition (string)
- "definition_ko": Korean translation of the definition (string)
- "example": the sentence from the source text that contains this word/phrase (string)
Source title: {title}
Text:
{text}
Return only the JSON array, no markdown, no explanation."""
def _get_client() -> GenerativeAiInferenceClient:
    """Build a Generative AI inference client from local OCI settings.

    Credentials are loaded with ``oci.config.from_file()`` (called without
    arguments). The service endpoint is read from ``OCI_CHAT_ENDPOINT`` when
    that variable is set and non-empty; otherwise ``OCI_GENAI_ENDPOINT`` is
    required and a ``KeyError`` is raised if it is missing.
    """
    # Load the config first so failures surface in the same order as before.
    oci_config = oci.config.from_file()

    service_endpoint = os.environ.get("OCI_CHAT_ENDPOINT")
    if not service_endpoint:
        service_endpoint = os.environ["OCI_GENAI_ENDPOINT"]

    return GenerativeAiInferenceClient(
        oci_config,
        service_endpoint=service_endpoint,
    )
def _parse_vocab_items(raw: str) -> list[dict]:
    """Turn the model's raw reply into a list of usable vocab dicts.

    Raises ``ValueError`` (``json.JSONDecodeError``) when no JSON can be
    recovered from the reply.
    """
    # Models often wrap JSON in a ```json ... ``` fence despite instructions.
    cleaned = re.sub(
        r"^```(?:json)?\s*|\s*```$", "", raw.strip(), flags=re.MULTILINE
    )
    try:
        items = json.loads(cleaned)
    except json.JSONDecodeError:
        # The reply may carry prose around the array ("Here is the list: [...]");
        # retry on the outermost bracketed span before giving up.
        start, end = cleaned.find("["), cleaned.rfind("]")
        if start == -1 or end <= start:
            raise
        items = json.loads(cleaned[start : end + 1])

    if not isinstance(items, list):
        return []
    # Keep only well-formed entries: a dict with a non-empty string "word".
    # Anything else cannot be turned into a card and would break the
    # declared ``list[dict]`` contract.
    return [
        item
        for item in items
        if isinstance(item, dict)
        and isinstance(item.get("word"), str)
        and item["word"].strip()
    ]


def extract_vocab(text: str, title: str = "") -> list[dict]:
    """Extract intermediate-level English vocabulary from text.

    Sends the first 5000 characters of ``text`` to the OCI Generative AI chat
    model named by ``OCI_CHAT_MODEL_ID`` and parses the JSON array it returns.

    Args:
        text: English source text to analyze.
        title: Content title for context.

    Returns:
        List of vocab dicts; the prompt requests the keys word, pos,
        definition_en, definition_ko, example. Every returned entry is a
        dict with a non-empty string ``word``. Returns an empty list when
        ``text`` is empty/blank, when the request or parsing fails (the
        failure is logged), or when the model yields no usable items.
    """
    # Nothing to analyze: skip the remote call entirely.
    if not text or not text.strip():
        return []

    prompt = _PROMPT.format(title=title, text=text[:5000])
    try:
        client = _get_client()
        req = GenericChatRequest(
            messages=[UserMessage(content=[TextContent(text=prompt)])],
            max_tokens=2048,
            temperature=0,
        )
        det = ChatDetails(
            compartment_id=os.environ["OCI_COMPARTMENT_ID"],
            serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
            chat_request=req,
        )
        response = client.chat(det)
        raw = response.data.chat_response.choices[0].message.content[0].text
        return _parse_vocab_items(raw)
    except Exception:
        # Best-effort by design: vocab extraction must not break the caller.
        # Log instead of swallowing silently so misconfiguration (missing env
        # vars, auth errors) is distinguishable from "no vocab found".
        logging.getLogger(__name__).warning(
            "Vocab extraction failed for %r", title, exc_info=True
        )
        return []