feat: add Gemini 2.5 Pro enrichment to web_search results
Fetch full page content per result, then call Gemini 2.5 Pro (via OCI GenAI) to extract: abstract, date, author, tags, content_type, and uuid. Enrichment runs in parallel (ThreadPoolExecutor). enrich=False flag available to skip for raw/fast results.
This commit is contained in:
@@ -1 +1,6 @@
|
|||||||
SEARXNG_URL=https://searxng.cloud-handson.com
|
SEARXNG_URL=https://searxng.cloud-handson.com
|
||||||
|
|
||||||
|
# OCI GenAI (auth via ~/.oci/config)
|
||||||
|
OCI_COMPARTMENT_ID=
|
||||||
|
OCI_GENAI_ENDPOINT= # e.g. https://inference.generativeai.us-ashburn-1.oci.oraclecloud.com
|
||||||
|
OCI_CHAT_MODEL_ID= # OCID of the model (use google.gemini-2.5-pro OCID)
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
mcp[cli]>=1.0.0
|
mcp[cli]>=1.0.0
|
||||||
httpx>=0.27.0
|
httpx>=0.27.0
|
||||||
python-dotenv>=1.0.0
|
python-dotenv>=1.0.0
|
||||||
|
oci>=2.100.0
|
||||||
|
|||||||
164
src/enricher.py
Normal file
164
src/enricher.py
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
"""Page fetching and LLM-based enrichment via Gemini 2.5 Pro on OCI GenAI."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import uuid
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import oci
|
||||||
|
from oci.generative_ai_inference import GenerativeAiInferenceClient
|
||||||
|
from oci.generative_ai_inference.models import (
|
||||||
|
ChatDetails,
|
||||||
|
GenericChatRequest,
|
||||||
|
OnDemandServingMode,
|
||||||
|
TextContent,
|
||||||
|
UserMessage,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# HTML stripping
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class _TextExtractor(HTMLParser):
|
||||||
|
_SKIP_TAGS = {"script", "style", "head", "nav", "footer", "noscript"}
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__()
|
||||||
|
self._buf: list[str] = []
|
||||||
|
self._skip = 0
|
||||||
|
|
||||||
|
def handle_starttag(self, tag: str, attrs: list) -> None:
|
||||||
|
if tag in self._SKIP_TAGS:
|
||||||
|
self._skip += 1
|
||||||
|
|
||||||
|
def handle_endtag(self, tag: str) -> None:
|
||||||
|
if tag in self._SKIP_TAGS and self._skip:
|
||||||
|
self._skip -= 1
|
||||||
|
|
||||||
|
def handle_data(self, data: str) -> None:
|
||||||
|
if not self._skip:
|
||||||
|
text = data.strip()
|
||||||
|
if text:
|
||||||
|
self._buf.append(text)
|
||||||
|
|
||||||
|
def get_text(self) -> str:
|
||||||
|
return " ".join(self._buf)
|
||||||
|
|
||||||
|
|
||||||
|
def _html_to_text(html: str) -> str:
|
||||||
|
parser = _TextExtractor()
|
||||||
|
parser.feed(html)
|
||||||
|
return re.sub(r"\s{3,}", " ", parser.get_text())
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Page fetching
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Some sites reject requests without a browser-like User-Agent.
_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; searxng-mcp/1.0)"}


def fetch_page_text(url: str, max_chars: int = 5000) -> str:
    """Fetch a URL and return stripped plain text, truncated to max_chars.

    Best-effort by design: any failure (DNS, timeout, HTTP error, parse
    error) yields "" so enrichment can fall back to the search snippet.
    """
    try:
        resp = httpx.get(url, timeout=10, follow_redirects=True, headers=_HEADERS)
        resp.raise_for_status()
        text = _html_to_text(resp.text)
        return text[:max_chars]
    except Exception:
        return ""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# OCI GenAI (Gemini 2.5 Pro) enrichment
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_PROMPT = """\
|
||||||
|
You are an information extractor. Analyze the web page content below and return ONLY a valid JSON object with these fields:
|
||||||
|
- "abstract": 2-3 sentence summary of the page content (string)
|
||||||
|
- "date": publication or last-modified date in ISO 8601 format, or null if not found (string | null)
|
||||||
|
- "author": author or organization name, or null if not found (string | null)
|
||||||
|
- "tags": list of 3-5 relevant keywords (string[])
|
||||||
|
- "content_type": one of "article", "documentation", "news", "forum", "code", "video", "other" (string)
|
||||||
|
|
||||||
|
Title: {title}
|
||||||
|
URL: {url}
|
||||||
|
Content:
|
||||||
|
{content}
|
||||||
|
|
||||||
|
Return only the JSON object, no markdown, no explanation."""
|
||||||
|
|
||||||
|
|
||||||
|
def _get_llm_client() -> GenerativeAiInferenceClient:
    """Build an OCI GenAI inference client (auth read from ~/.oci/config)."""
    endpoint = os.environ["OCI_GENAI_ENDPOINT"]
    cfg = oci.config.from_file()
    return GenerativeAiInferenceClient(cfg, service_endpoint=endpoint)
|
||||||
|
|
||||||
|
|
||||||
|
def enrich_result(title: str, url: str, snippet: str) -> dict:
    """Fetch the page, then call Gemini 2.5 Pro to extract structured metadata.

    Args:
        title: Result title from the search engine.
        url: Result URL; its page text is fetched best-effort.
        snippet: Search-engine snippet, used as content fallback and as the
            abstract in the error fallback.

    Returns:
        dict with keys abstract, date, author, tags, content_type, uuid.
        On any failure (fetch, API call, non-JSON or non-object model
        output) a fallback dict built from *snippet* is returned, carrying
        an "_error" key — callers always get a usable result.
    """
    page_text = fetch_page_text(url)
    # Fall back to the search-engine snippet when the page is unreachable.
    content = page_text if page_text else snippet

    prompt = _PROMPT.format(title=title, url=url, content=content[:4000])

    try:
        client = _get_llm_client()
        req = GenericChatRequest(
            messages=[UserMessage(content=[TextContent(text=prompt)])],
            max_tokens=512,
            temperature=0,
        )
        det = ChatDetails(
            compartment_id=os.environ["OCI_COMPARTMENT_ID"],
            serving_mode=OnDemandServingMode(
                model_id=os.environ["OCI_CHAT_MODEL_ID"]
            ),
            chat_request=req,
        )
        response = client.chat(det)
        raw = response.data.chat_response.choices[0].message.content[0].text.strip()
        # Strip markdown code fences if present
        raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE)
        parsed = json.loads(raw)
        # json.loads can legally return a list/str/number; only a dict is
        # usable downstream — anything else would make the uuid assignment
        # below blow up OUTSIDE this try. Route it into the fallback instead.
        if not isinstance(parsed, dict):
            raise ValueError(f"expected JSON object, got {type(parsed).__name__}")
        metadata = parsed
    except Exception as exc:
        metadata = {
            "abstract": snippet,
            "date": None,
            "author": None,
            "tags": [],
            "content_type": "other",
            "_error": str(exc),
        }

    metadata["uuid"] = str(uuid.uuid4())
    return metadata
|
||||||
|
|
||||||
|
|
||||||
|
def enrich_results_parallel(
    results: list[dict],
    max_workers: int = 5,
) -> list[dict]:
    """Enrich search results concurrently, merging metadata into each dict.

    Args:
        results: Dicts with at least "title", "url" and "content" keys.
        max_workers: Size of the thread pool (default 5).

    Returns:
        A new list, same order as *results*; each item is the original dict
        overlaid with the enrichment metadata (or a minimal uuid/_error dict
        when that result's enrichment raised).
    """
    by_index: dict[int, dict] = {}

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for position, item in enumerate(results):
            fut = pool.submit(enrich_result, item["title"], item["url"], item["content"])
            pending[fut] = position
        for done in as_completed(pending):
            position = pending[done]
            try:
                by_index[position] = done.result()
            except Exception as exc:
                # Never let one bad result sink the batch.
                by_index[position] = {"uuid": str(uuid.uuid4()), "_error": str(exc)}

    merged = []
    for position, item in enumerate(results):
        merged.append({**item, **by_index.get(position, {})})
    return merged
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
"""MCP server wrapping the self-hosted SearXNG instance for free web search."""
|
"""MCP server wrapping self-hosted SearXNG with Gemini 2.5 Pro enrichment."""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
@@ -14,24 +14,38 @@ load_dotenv(os.path.join(_project_root, ".env"))
|
|||||||
import httpx
|
import httpx
|
||||||
from mcp.server.fastmcp import FastMCP
|
from mcp.server.fastmcp import FastMCP
|
||||||
|
|
||||||
|
from src.enricher import enrich_results_parallel
|
||||||
|
|
||||||
mcp = FastMCP(
|
mcp = FastMCP(
|
||||||
name="searxng",
|
name="searxng",
|
||||||
instructions="Web search via self-hosted SearXNG. Use this instead of built-in WebSearch.",
|
instructions=(
|
||||||
|
"Web search via self-hosted SearXNG + Gemini 2.5 Pro enrichment. "
|
||||||
|
"Use this instead of the built-in WebSearch tool."
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
SEARXNG_URL = os.environ.get("SEARXNG_URL", "https://searxng.cloud-handson.com")
|
SEARXNG_URL = os.environ.get("SEARXNG_URL", "https://searxng.cloud-handson.com")
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
def web_search(query: str, max_results: int = 10) -> list[dict]:
|
def web_search(query: str, max_results: int = 10, enrich: bool = True) -> list[dict]:
|
||||||
"""Search the web using the self-hosted SearXNG instance.
|
"""Search the web using the self-hosted SearXNG instance.
|
||||||
|
|
||||||
|
Each result is enriched by Gemini 2.5 Pro (via OCI GenAI) with:
|
||||||
|
- abstract: 2-3 sentence summary of the page
|
||||||
|
- date: publication date (ISO 8601) or null
|
||||||
|
- author: author/org name or null
|
||||||
|
- tags: 3-5 relevant keywords
|
||||||
|
- content_type: article / documentation / news / forum / code / video / other
|
||||||
|
- uuid: unique identifier for this result
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
query: The search query string.
|
query: The search query string.
|
||||||
max_results: Maximum number of results to return (default 10).
|
max_results: Maximum number of results to return (default 10).
|
||||||
|
enrich: Set to False to skip LLM enrichment and return raw results faster.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of result dicts with keys: ``title``, ``url``, ``content`` (snippet).
|
List of enriched result dicts.
|
||||||
"""
|
"""
|
||||||
response = httpx.get(
|
response = httpx.get(
|
||||||
f"{SEARXNG_URL}/search",
|
f"{SEARXNG_URL}/search",
|
||||||
@@ -41,7 +55,7 @@ def web_search(query: str, max_results: int = 10) -> list[dict]:
|
|||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
data = response.json()
|
data = response.json()
|
||||||
|
|
||||||
return [
|
results = [
|
||||||
{
|
{
|
||||||
"title": r.get("title", ""),
|
"title": r.get("title", ""),
|
||||||
"url": r.get("url", ""),
|
"url": r.get("url", ""),
|
||||||
@@ -50,6 +64,11 @@ def web_search(query: str, max_results: int = 10) -> list[dict]:
|
|||||||
for r in data.get("results", [])[:max_results]
|
for r in data.get("results", [])[:max_results]
|
||||||
]
|
]
|
||||||
|
|
||||||
|
if enrich and results:
|
||||||
|
results = enrich_results_parallel(results)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
mcp.run()
|
mcp.run()
|
||||||
|
|||||||
Reference in New Issue
Block a user