feat: initial knowledge-inbox pipeline implementation
- Oracle ADB queue table (sql/schema.sql)
- Queue CRUD: core/queue_db.py
- YouTube transcript: core/youtube.py
- Web page fetch: core/web.py
- LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py
- Text chunker: core/chunker.py
- Obsidian note writer: core/obsidian.py
- Oracle vector store insertion: core/vector.py
- Polling daemon: daemon/worker.py
- Telegram bot: bot/telegram_bot.py
- Main runner: main.py
This commit is contained in:
96
core/enricher.py
Normal file
96
core/enricher.py
Normal file
@@ -0,0 +1,96 @@
|
||||
"""LLM-based content enrichment via OCI GenAI Gemini Flash."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
import oci
|
||||
from oci.generative_ai_inference import GenerativeAiInferenceClient
|
||||
from oci.generative_ai_inference.models import (
|
||||
ChatDetails,
|
||||
GenericChatRequest,
|
||||
OnDemandServingMode,
|
||||
TextContent,
|
||||
UserMessage,
|
||||
)
|
||||
|
||||
_PROMPT = """\
|
||||
You are a knowledge extraction assistant. Analyze the content below and return ONLY a valid JSON object with these fields:
|
||||
- "title": concise descriptive title for this content (string)
|
||||
- "summary": 3-5 sentence summary capturing key insights (string)
|
||||
- "tags": list of 3-7 relevant keywords or topics (string[])
|
||||
- "author": author or creator name, or null if not found (string | null)
|
||||
- "date": publication date in ISO 8601 format (YYYY-MM-DD), or null if not found (string | null)
|
||||
- "content_type": one of "youtube", "article", "documentation", "news", "forum", "code", "other" (string)
|
||||
|
||||
Content type: {content_type}
|
||||
Source URL: {url}
|
||||
Content:
|
||||
{text}
|
||||
|
||||
Return only the JSON object, no markdown, no explanation."""
|
||||
|
||||
|
||||
def _get_client() -> GenerativeAiInferenceClient:
    """Construct an OCI GenAI inference client.

    Credentials come from the default OCI config file; the service endpoint
    is read from the ``OCI_GENAI_ENDPOINT`` environment variable (a missing
    variable raises ``KeyError``).
    """
    cfg = oci.config.from_file()
    endpoint = os.environ["OCI_GENAI_ENDPOINT"]
    return GenerativeAiInferenceClient(cfg, service_endpoint=endpoint)
|
||||
|
||||
|
||||
def enrich(content_type: str, title: str, url: str, text: str) -> dict:
|
||||
"""Extract structured metadata from content using Gemini Flash.
|
||||
|
||||
Args:
|
||||
content_type: One of 'youtube', 'url', 'text'.
|
||||
title: Initial title hint (may be empty).
|
||||
url: Source URL (empty for plain text).
|
||||
text: The full content text to analyze.
|
||||
|
||||
Returns:
|
||||
Dict with keys: title, summary, tags, author, date, content_type.
|
||||
Falls back to minimal defaults on LLM failure.
|
||||
"""
|
||||
prompt = _PROMPT.format(
|
||||
content_type=content_type,
|
||||
url=url or "(none)",
|
||||
text=text[:6000],
|
||||
)
|
||||
|
||||
try:
|
||||
client = _get_client()
|
||||
req = GenericChatRequest(
|
||||
messages=[UserMessage(content=[TextContent(text=prompt)])],
|
||||
max_tokens=1024,
|
||||
temperature=0,
|
||||
)
|
||||
det = ChatDetails(
|
||||
compartment_id=os.environ["OCI_COMPARTMENT_ID"],
|
||||
serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
|
||||
chat_request=req,
|
||||
)
|
||||
response = client.chat(det)
|
||||
raw = response.data.chat_response.choices[0].message.content[0].text.strip()
|
||||
raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE)
|
||||
metadata = json.loads(raw)
|
||||
except Exception as exc:
|
||||
metadata = {
|
||||
"title": title or url or text[:80],
|
||||
"summary": text[:300],
|
||||
"tags": [],
|
||||
"author": None,
|
||||
"date": None,
|
||||
"content_type": content_type,
|
||||
"_error": str(exc),
|
||||
}
|
||||
|
||||
# Ensure required keys exist
|
||||
metadata.setdefault("title", title or url or text[:80])
|
||||
metadata.setdefault("summary", "")
|
||||
metadata.setdefault("tags", [])
|
||||
metadata.setdefault("author", None)
|
||||
metadata.setdefault("date", None)
|
||||
metadata.setdefault("content_type", content_type)
|
||||
|
||||
return metadata
|
||||
Reference in New Issue
Block a user