- Oracle ADB queue table (sql/schema.sql)
- Queue CRUD: core/queue_db.py
- YouTube transcript: core/youtube.py
- Web page fetch: core/web.py
- LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py
- Text chunker: core/chunker.py
- Obsidian note writer: core/obsidian.py
- Oracle vector store insertion: core/vector.py
- Polling daemon: daemon/worker.py
- Telegram bot: bot/telegram_bot.py
- Main runner: main.py
97 lines
3.1 KiB
Python
97 lines
3.1 KiB
Python
"""LLM-based content enrichment via OCI GenAI Gemini Flash."""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
|
|
import oci
|
|
from oci.generative_ai_inference import GenerativeAiInferenceClient
|
|
from oci.generative_ai_inference.models import (
|
|
ChatDetails,
|
|
GenericChatRequest,
|
|
OnDemandServingMode,
|
|
TextContent,
|
|
UserMessage,
|
|
)
|
|
|
|
_PROMPT = """\
|
|
You are a knowledge extraction assistant. Analyze the content below and return ONLY a valid JSON object with these fields:
|
|
- "title": concise descriptive title for this content (string)
|
|
- "summary": 3-5 sentence summary capturing key insights (string)
|
|
- "tags": list of 3-7 relevant keywords or topics (string[])
|
|
- "author": author or creator name, or null if not found (string | null)
|
|
- "date": publication date in ISO 8601 format (YYYY-MM-DD), or null if not found (string | null)
|
|
- "content_type": one of "youtube", "article", "documentation", "news", "forum", "code", "other" (string)
|
|
|
|
Content type: {content_type}
|
|
Source URL: {url}
|
|
Content:
|
|
{text}
|
|
|
|
Return only the JSON object, no markdown, no explanation."""
|
|
|
|
|
|
def _get_client() -> GenerativeAiInferenceClient:
    """Build an OCI GenAI inference client.

    Credentials come from the default OCI config file (~/.oci/config);
    the service endpoint is taken from the OCI_GENAI_ENDPOINT environment
    variable (KeyError if unset).
    """
    endpoint = os.environ["OCI_GENAI_ENDPOINT"]
    oci_config = oci.config.from_file()
    return GenerativeAiInferenceClient(oci_config, service_endpoint=endpoint)
|
|
|
|
|
|
def enrich(content_type: str, title: str, url: str, text: str) -> dict:
|
|
"""Extract structured metadata from content using Gemini Flash.
|
|
|
|
Args:
|
|
content_type: One of 'youtube', 'url', 'text'.
|
|
title: Initial title hint (may be empty).
|
|
url: Source URL (empty for plain text).
|
|
text: The full content text to analyze.
|
|
|
|
Returns:
|
|
Dict with keys: title, summary, tags, author, date, content_type.
|
|
Falls back to minimal defaults on LLM failure.
|
|
"""
|
|
prompt = _PROMPT.format(
|
|
content_type=content_type,
|
|
url=url or "(none)",
|
|
text=text[:6000],
|
|
)
|
|
|
|
try:
|
|
client = _get_client()
|
|
req = GenericChatRequest(
|
|
messages=[UserMessage(content=[TextContent(text=prompt)])],
|
|
max_tokens=1024,
|
|
temperature=0,
|
|
)
|
|
det = ChatDetails(
|
|
compartment_id=os.environ["OCI_COMPARTMENT_ID"],
|
|
serving_mode=OnDemandServingMode(model_id=os.environ["OCI_CHAT_MODEL_ID"]),
|
|
chat_request=req,
|
|
)
|
|
response = client.chat(det)
|
|
raw = response.data.chat_response.choices[0].message.content[0].text.strip()
|
|
raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE)
|
|
metadata = json.loads(raw)
|
|
except Exception as exc:
|
|
metadata = {
|
|
"title": title or url or text[:80],
|
|
"summary": text[:300],
|
|
"tags": [],
|
|
"author": None,
|
|
"date": None,
|
|
"content_type": content_type,
|
|
"_error": str(exc),
|
|
}
|
|
|
|
# Ensure required keys exist
|
|
metadata.setdefault("title", title or url or text[:80])
|
|
metadata.setdefault("summary", "")
|
|
metadata.setdefault("tags", [])
|
|
metadata.setdefault("author", None)
|
|
metadata.setdefault("date", None)
|
|
metadata.setdefault("content_type", content_type)
|
|
|
|
return metadata
|