Backend (FastAPI + Oracle ADB), Frontend (Next.js), daemon worker. Features: channel/video/restaurant management, semantic search, Google OAuth, user reviews. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
98 lines · 2.7 KiB · Python
"""Vector embedding generation and storage for restaurant semantic search."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import array
|
|
import os
|
|
|
|
import oci
|
|
from oci.generative_ai_inference import GenerativeAiInferenceClient
|
|
from oci.generative_ai_inference.models import (
|
|
EmbedTextDetails,
|
|
OnDemandServingMode,
|
|
)
|
|
|
|
from core.db import conn
|
|
|
|
|
|
def _embed_texts(
    texts: list[str],
    input_type: str = "SEARCH_DOCUMENT",
) -> list[list[float]]:
    """Embed *texts* with the OCI Generative AI inference service.

    Args:
        texts: Strings to embed (the service expects a non-empty batch).
        input_type: Cohere embedding input type. Defaults to
            "SEARCH_DOCUMENT" (for stored chunks); callers embedding a
            user query should pass "SEARCH_QUERY" so the model produces
            query-side vectors — previously this was hard-coded.

    Returns:
        One embedding vector (list of floats) per input text, in order.

    Raises:
        KeyError: if OCI_GENAI_ENDPOINT or OCI_COMPARTMENT_ID is unset.
    """
    # NOTE(review): config + client are rebuilt on every call. Fine for
    # batch/daemon use; consider caching the client if this becomes hot.
    config = oci.config.from_file()
    client = GenerativeAiInferenceClient(
        config,
        service_endpoint=os.environ["OCI_GENAI_ENDPOINT"],
    )
    model_id = os.environ.get("OCI_EMBED_MODEL_ID", "cohere.embed-v4.0")
    compartment_id = os.environ["OCI_COMPARTMENT_ID"]

    details = EmbedTextDetails(
        inputs=texts,
        serving_mode=OnDemandServingMode(model_id=model_id),
        compartment_id=compartment_id,
        input_type=input_type,
    )
    response = client.embed_text(details)
    return response.data.embeddings
|
|
|
|
|
|
def _to_vec(embedding: list[float]) -> array.array:
|
|
return array.array("f", embedding)
|
|
|
|
|
|
def save_restaurant_vectors(restaurant_id: str, chunks: list[str]) -> list[str]:
    """Embed and store text chunks for a restaurant.

    Each chunk is embedded and inserted as one row in
    ``restaurant_vectors``; the generated row id is captured via
    ``RETURNING ... INTO``.

    Returns list of inserted row IDs (empty if *chunks* is empty).
    """
    if not chunks:
        return []

    vectors = _embed_texts(chunks)

    insert_sql = """
    INSERT INTO restaurant_vectors (restaurant_id, chunk_text, embedding)
    VALUES (:rid, :chunk, :emb)
    RETURNING id INTO :out_id
    """

    import oracledb

    row_ids: list[str] = []
    with conn() as db:
        cursor = db.cursor()
        for text, vector in zip(chunks, vectors):
            # Out-bind variable that receives the generated row id.
            id_var = cursor.var(oracledb.STRING)
            binds = {
                "rid": restaurant_id,
                "chunk": text,
                "emb": _to_vec(vector),
                "out_id": id_var,
            }
            cursor.execute(insert_sql, binds)
            # RETURNING INTO yields a list per execute; take the single value.
            row_ids.append(id_var.getvalue()[0])
    return row_ids
|
|
|
|
|
|
def search_similar(query: str, top_k: int = 10) -> list[dict]:
    """Semantic search: find restaurants similar to query text.

    Embeds *query*, then ranks stored chunks by cosine distance using
    Oracle's ``VECTOR_DISTANCE``.

    Returns list of dicts: restaurant_id, chunk_text, distance.
    """
    query_vec = _to_vec(_embed_texts([query])[0])

    search_sql = """
    SELECT rv.restaurant_id, rv.chunk_text,
    VECTOR_DISTANCE(rv.embedding, :qvec, COSINE) AS dist
    FROM restaurant_vectors rv
    ORDER BY dist
    FETCH FIRST :k ROWS ONLY
    """

    results: list[dict] = []
    with conn() as db:
        cursor = db.cursor()
        cursor.execute(search_sql, {"qvec": query_vec, "k": top_k})
        for rid, chunk, dist in cursor.fetchall():
            # chunk_text may come back as a LOB locator; materialize it.
            text = chunk.read() if hasattr(chunk, "read") else chunk
            results.append(
                {"restaurant_id": rid, "chunk_text": text, "distance": dist}
            )
    return results
|