feat: initial knowledge-inbox pipeline implementation

- Oracle ADB queue table (sql/schema.sql)
- Queue CRUD: core/queue_db.py
- YouTube transcript: core/youtube.py
- Web page fetch: core/web.py
- LLM enrichment via OCI GenAI Gemini Flash: core/enricher.py
- Text chunker: core/chunker.py
- Obsidian note writer: core/obsidian.py
- Oracle vector store insertion: core/vector.py
- Polling daemon: daemon/worker.py
- Telegram bot: bot/telegram_bot.py
- Main runner: main.py
This commit is contained in:
joungmin
2026-02-28 08:16:11 +09:00
commit 86a4104ae3
18 changed files with 926 additions and 0 deletions

114
core/vector.py Normal file
View File

@@ -0,0 +1,114 @@
"""Embedding generation and Oracle vector store insertion."""
import array
import os
from contextlib import contextmanager
from typing import Generator
import oci
import oracledb
from oci.generative_ai_inference import GenerativeAiInferenceClient
from oci.generative_ai_inference.models import (
EmbedTextDetails,
OnDemandServingMode,
)
# NOTE(review): this is a *separate* pool from queue_db's (each module holds
# its own `_pool`); both presumably connect to the same ADB instance via the
# same ORACLE_* env vars — confirm against core/queue_db.py.
_pool: oracledb.ConnectionPool | None = None
def _get_pool() -> oracledb.ConnectionPool:
    """Lazily create and return the module-wide Oracle connection pool.

    Connection parameters come from the ORACLE_USER / ORACLE_PASSWORD /
    ORACLE_DSN environment variables; ORACLE_WALLET (optional) points at a
    wallet directory used as ``config_dir``.
    """
    global _pool
    if _pool is not None:
        return _pool

    pool_kwargs: dict = {
        "user": os.environ["ORACLE_USER"],
        "password": os.environ["ORACLE_PASSWORD"],
        "dsn": os.environ["ORACLE_DSN"],
        "min": 1,
        "max": 5,
        "increment": 1,
    }
    wallet_dir = os.environ.get("ORACLE_WALLET")
    if wallet_dir:
        pool_kwargs["config_dir"] = wallet_dir

    _pool = oracledb.create_pool(**pool_kwargs)
    return _pool
@contextmanager
def _conn() -> Generator[oracledb.Connection, None, None]:
    """Yield a pooled connection; commit on success, roll back on error.

    The connection is always returned to the pool, even when the body (or
    the commit itself) raises.
    """
    pool = _get_pool()
    connection = pool.acquire()
    try:
        yield connection
        # Commit inside the try so a failed commit is also rolled back.
        connection.commit()
    except Exception:
        connection.rollback()
        raise
    finally:
        pool.release(connection)
def _to_vector_param(embedding: list[float]) -> array.array:
return array.array("f", embedding)
def _embed_texts(texts: list[str]) -> list[list[float]]:
    """Generate embeddings for *texts* with Cohere Embed via OCI GenAI.

    Model id defaults to ``cohere.embed-v4.0`` but can be overridden with
    the OCI_EMBED_MODEL_ID environment variable. Uses the default OCI
    config file for credentials.
    """
    # A fresh client per call; credentials come from the standard OCI config.
    client = GenerativeAiInferenceClient(
        oci.config.from_file(),
        service_endpoint=os.environ["OCI_GENAI_ENDPOINT"],
    )
    request = EmbedTextDetails(
        inputs=texts,
        serving_mode=OnDemandServingMode(
            model_id=os.environ.get("OCI_EMBED_MODEL_ID", "cohere.embed-v4.0"),
        ),
        compartment_id=os.environ["OCI_COMPARTMENT_ID"],
        input_type="SEARCH_DOCUMENT",
    )
    return client.embed_text(request).data.embeddings
def save_to_vector(doc_id: str, chunks: list[str]) -> list[str]:
    """Embed chunks and insert them into the Oracle vector store.

    Args:
        doc_id: Document identifier (e.g. 'youtube:abc12345').
        chunks: List of text chunks to embed and store.

    Returns:
        List of inserted row UUIDs (empty when *chunks* is empty).

    Raises:
        ValueError: If the embedding service returns a different number of
            vectors than chunks supplied (surfaced by ``zip(strict=True)``).
    """
    if not chunks:
        return []
    embeddings = _embed_texts(chunks)
    inserted_ids: list[str] = []
    sql = """
        INSERT INTO vector_store (doc_id, chunk_text, embedding)
        VALUES (:doc_id, :chunk_text, :embedding)
        RETURNING id INTO :out_id
    """
    with _conn() as conn:
        # Context-managed cursor: close deterministically instead of
        # leaking it until garbage collection.
        with conn.cursor() as cursor:
            # strict=True guards against the embedding API returning fewer
            # vectors than chunks, which plain zip() would silently truncate.
            for chunk, embedding in zip(chunks, embeddings, strict=True):
                out_id_var = cursor.var(oracledb.STRING)
                cursor.execute(
                    sql,
                    {
                        "doc_id": doc_id,
                        "chunk_text": chunk,
                        "embedding": _to_vector_param(embedding),
                        "out_id": out_id_var,
                    },
                )
                # RETURNING ... INTO binds yield a list of values; a single
                # inserted row produces exactly one id.
                inserted_ids.append(out_id_var.getvalue()[0])
    return inserted_ids