diff --git a/core/anki.py b/core/anki.py new file mode 100644 index 0000000..b21aa73 --- /dev/null +++ b/core/anki.py @@ -0,0 +1,106 @@ +"""AnkiConnect HTTP client for adding vocabulary cards.""" + +import json +import logging +from urllib import request, error + +logger = logging.getLogger(__name__) + +ANKI_URL = "http://localhost:8765" +DECK_NAME = "English::Vocabulary" +MODEL_NAME = "Basic" + + +def _invoke(action: str, **params) -> object: + """Call AnkiConnect API. + + Args: + action: AnkiConnect action name. + **params: Action parameters. + + Returns: + The result field from AnkiConnect response. + + Raises: + RuntimeError: If AnkiConnect returns an error. + """ + payload = json.dumps({"action": action, "version": 6, "params": params}).encode() + req = request.Request(ANKI_URL, payload, {"Content-Type": "application/json"}) + with request.urlopen(req, timeout=5) as resp: + data = json.loads(resp.read()) + if data.get("error"): + raise RuntimeError(f"AnkiConnect error: {data['error']}") + return data["result"] + + +def _ensure_deck() -> None: + """Create deck if it doesn't exist.""" + _invoke("createDeck", deck=DECK_NAME) + + +def _build_front(word: str, pos: str, example: str) -> str: + return f"{word} ({pos})
<br><br>
{example}" + + +def _build_back(definition_en: str, definition_ko: str) -> str: + return f"{definition_en}
<br><br>
한국어: {definition_ko}" + + +def add_vocab_cards(vocab_list: list[dict], source_title: str = "") -> list[int]: + """Add vocabulary cards to Anki. + + Skips duplicates silently (AnkiConnect returns null for existing notes). + + Args: + vocab_list: List of vocab dicts from extract_vocab(). + source_title: Content title, added as a tag on each card. + + Returns: + List of created note IDs (excludes skipped duplicates). + """ + if not vocab_list: + return [] + + try: + _ensure_deck() + except error.URLError: + logger.warning("AnkiConnect unreachable — skipping vocab card creation") + return [] + + tag = source_title[:50].replace(" ", "_") if source_title else "knowledge-inbox" + + notes = [ + { + "deckName": DECK_NAME, + "modelName": MODEL_NAME, + "fields": { + "Front": _build_front( + item.get("word", ""), + item.get("pos", ""), + item.get("example", ""), + ), + "Back": _build_back( + item.get("definition_en", ""), + item.get("definition_ko", ""), + ), + }, + "options": {"allowDuplicate": False}, + "tags": ["knowledge-inbox", tag], + } + for item in vocab_list + if item.get("word") + ] + + try: + results = _invoke("addNotes", notes=notes) + except Exception as exc: + logger.error("Failed to add Anki cards: %s", exc) + return [] + + created = [note_id for note_id in results if note_id is not None] + skipped = len(results) - len(created) + logger.info( + "Anki: added %d card(s), skipped %d duplicate(s) for '%s'", + len(created), skipped, source_title[:40], + ) + return created diff --git a/core/enricher.py b/core/enricher.py index 293cde2..2acab5a 100644 --- a/core/enricher.py +++ b/core/enricher.py @@ -17,11 +17,13 @@ from oci.generative_ai_inference.models import ( _PROMPT = """\ You are a knowledge extraction assistant. 
Analyze the content below and return ONLY a valid JSON object with these fields: - "title": concise descriptive title for this content (string) -- "summary": 3-5 sentence summary capturing key insights (string) +- "summary": 3-5 sentence summary capturing key insights, written in English (string) +- "summary_ko": the same summary translated into Korean (string) - "tags": list of 3-7 relevant keywords or topics (string[]) - "author": author or creator name, or null if not found (string | null) - "date": publication date in ISO 8601 format (YYYY-MM-DD), or null if not found (string | null) - "content_type": one of "youtube", "article", "documentation", "news", "forum", "code", "other" (string) +- "language": primary language of the content, ISO 639-1 code, e.g. "en", "ko", "ja" (string) Content type: {content_type} Source URL: {url} @@ -33,10 +35,10 @@ Return only the JSON object, no markdown, no explanation.""" def _get_client() -> GenerativeAiInferenceClient: config = oci.config.from_file() - return GenerativeAiInferenceClient( - config, - service_endpoint=os.environ["OCI_GENAI_ENDPOINT"], - ) + # Gemini models live in us-ashburn-1; use OCI_CHAT_ENDPOINT if set, + # otherwise fall back to OCI_GENAI_ENDPOINT. 
+ endpoint = os.environ.get("OCI_CHAT_ENDPOINT") or os.environ["OCI_GENAI_ENDPOINT"] + return GenerativeAiInferenceClient(config, service_endpoint=endpoint) def enrich(content_type: str, title: str, url: str, text: str) -> dict: @@ -92,5 +94,6 @@ def enrich(content_type: str, title: str, url: str, text: str) -> dict: metadata.setdefault("author", None) metadata.setdefault("date", None) metadata.setdefault("content_type", content_type) + metadata.setdefault("language", "en") return metadata diff --git a/core/obsidian.py b/core/obsidian.py index ddec6ad..e73166f 100644 --- a/core/obsidian.py +++ b/core/obsidian.py @@ -22,18 +22,20 @@ def save_note( source_url: str = "", author: str = "", date: str = "", + summary_ko: str = "", ) -> Path: """Save a processed knowledge item as an Obsidian markdown file. Args: content_type: One of 'youtube', 'url', 'text'. title: The note title. - summary: LLM-generated summary. + summary: LLM-generated summary in English. body: Full content text. tags: List of topic tags. source_url: Original URL (empty for plain text). author: Author name (may be empty). date: Publication date in ISO 8601 format (may be empty). + summary_ko: Korean translation of the summary (may be empty). Returns: Path of the created markdown file. 
@@ -59,6 +61,8 @@ def save_note( # Build YAML frontmatter tags tags_yaml = ", ".join(tags) if tags else "" + summary_section = f"## 요약\n{summary_ko}\n\n*(English)*\n{summary}" if summary_ko else f"## 요약\n{summary}" + content = f"""--- title: {title} source_type: {content_type} @@ -71,8 +75,7 @@ created: {today} # {title} -## 요약 -{summary} +{summary_section} ## 원문 {body} diff --git a/core/vocab.py b/core/vocab.py new file mode 100644 index 0000000..c4c1523 --- /dev/null +++ b/core/vocab.py @@ -0,0 +1,79 @@ +"""Extract vocabulary from English content using Gemini Flash.""" + +import json +import os +import re + +import oci +from oci.generative_ai_inference import GenerativeAiInferenceClient +from oci.generative_ai_inference.models import ( + ChatDetails, + GenericChatRequest, + OnDemandServingMode, + TextContent, + UserMessage, +) + +_PROMPT = """\ +You are an English vocabulary instructor. Analyze the English text below and extract words or phrases that an intermediate English learner (B1-B2 CEFR level) might not know. + +Rules: +- Extract 5 to 10 items maximum +- Skip very basic words (go, see, love, etc.) and overly academic/rare words +- Focus on useful vocabulary: idioms, collocations, phrasal verbs, and mid-level words +- Each item must appear in the source text + +Return ONLY a valid JSON array. Each element must have: +- "word": the word or phrase as it appears (string) +- "pos": part of speech, e.g. 
"verb", "noun", "adjective", "phrase", "phrasal verb" (string) +- "definition_en": concise English definition (string) +- "definition_ko": Korean translation of the definition (string) +- "example": the sentence from the source text that contains this word/phrase (string) + +Source title: {title} +Text: +{text} + +Return only the JSON array, no markdown, no explanation.""" + + +def _get_client() -> GenerativeAiInferenceClient: + config = oci.config.from_file() + endpoint = os.environ.get("OCI_CHAT_ENDPOINT") or os.environ["OCI_GENAI_ENDPOINT"] + return GenerativeAiInferenceClient(config, service_endpoint=endpoint) + + +def extract_vocab(text: str, title: str = "") -> list[dict]: + """Extract intermediate-level English vocabulary from text using Gemini Flash. + + Args: + text: English source text to analyze. + title: Content title for context. + + Returns: + List of vocab dicts with keys: word, pos, definition_en, definition_ko, example. + Returns empty list on failure or if no suitable vocab found. 
+ """ + prompt = _PROMPT.format(title=title, text=text[:5000]) + + try: + client = _get_client() + req = GenericChatRequest( + messages=[UserMessage(content=[TextContent(text=prompt)])], + max_tokens=2048, + temperature=0, + ) + det = ChatDetails( + compartment_id=os.environ["OCI_COMPARTMENT_ID"], + serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]), + chat_request=req, + ) + response = client.chat(det) + raw = response.data.chat_response.choices[0].message.content[0].text.strip() + raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE) + items = json.loads(raw) + if not isinstance(items, list): + return [] + return items + except Exception: + return [] diff --git a/daemon/worker.py b/daemon/worker.py index 18190fd..ae1250f 100644 --- a/daemon/worker.py +++ b/daemon/worker.py @@ -4,11 +4,13 @@ import logging import os import time +from core.anki import add_vocab_cards from core.chunker import chunk_text from core.enricher import enrich from core.obsidian import save_note from core.queue_db import fetch_pending, set_done, set_error, set_processing from core.vector import save_to_vector +from core.vocab import extract_vocab from core.web import fetch_page_text from core.youtube import get_transcript @@ -62,6 +64,7 @@ def process_item(item: dict) -> None: source_url=url, author=meta.get("author") or "", date=meta.get("date") or "", + summary_ko=meta.get("summary_ko", ""), ) logger.info("Obsidian note saved: %s", note_path) @@ -70,6 +73,12 @@ def process_item(item: dict) -> None: inserted = save_to_vector(doc_id, chunks) logger.info("Vector store: inserted %d chunks for doc_id=%s", len(inserted), doc_id) + # Add Anki vocabulary cards for English content + if meta.get("language", "").startswith("en"): + vocab = extract_vocab(text, title) + if vocab: + add_vocab_cards(vocab, source_title=title) + set_done(row_id, title, meta) logger.info("Done: %s → %s", row_id[:8], title[:60])