diff --git a/.env.example b/.env.example
index ce41fd4..623e873 100644
--- a/.env.example
+++ b/.env.example
@@ -1 +1,6 @@
 SEARXNG_URL=https://searxng.cloud-handson.com
+
+# OCI GenAI (auth via ~/.oci/config)
+OCI_COMPARTMENT_ID=
+OCI_GENAI_ENDPOINT= # e.g. https://inference.generativeai.us-ashburn-1.oci.oraclecloud.com
+OCI_CHAT_MODEL_ID= # OCID of the model (use google.gemini-2.5-pro OCID)
diff --git a/requirements.txt b/requirements.txt
index b32f357..99697cf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 mcp[cli]>=1.0.0
 httpx>=0.27.0
 python-dotenv>=1.0.0
+oci>=2.100.0
diff --git a/src/enricher.py b/src/enricher.py
new file mode 100644
index 0000000..8c22c7d
--- /dev/null
+++ b/src/enricher.py
@@ -0,0 +1,164 @@
+"""Page fetching and LLM-based enrichment via Gemini 2.5 Pro on OCI GenAI."""
+
+import json
+import os
+import re
+import uuid
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from html.parser import HTMLParser
+
+import httpx
+import oci
+from oci.generative_ai_inference import GenerativeAiInferenceClient
+from oci.generative_ai_inference.models import (
+    ChatDetails,
+    GenericChatRequest,
+    OnDemandServingMode,
+    TextContent,
+    UserMessage,
+)
+
+# ---------------------------------------------------------------------------
+# HTML stripping
+# ---------------------------------------------------------------------------
+
+class _TextExtractor(HTMLParser):
+    _SKIP_TAGS = {"script", "style", "head", "nav", "footer", "noscript"}
+
+    def __init__(self) -> None:
+        super().__init__()
+        self._buf: list[str] = []
+        self._skip = 0
+
+    def handle_starttag(self, tag: str, attrs: list) -> None:
+        if tag in self._SKIP_TAGS:
+            self._skip += 1
+
+    def handle_endtag(self, tag: str) -> None:
+        if tag in self._SKIP_TAGS and self._skip:
+            self._skip -= 1
+
+    def handle_data(self, data: str) -> None:
+        if not self._skip:
+            text = data.strip()
+            if text:
+                self._buf.append(text)
+
+    def get_text(self) -> str:
+        return " ".join(self._buf)
+
+
+def _html_to_text(html: str) -> str:
+    parser = _TextExtractor()
+    parser.feed(html)
+    return re.sub(r"\s{3,}", " ", parser.get_text())
+
+
+# ---------------------------------------------------------------------------
+# Page fetching
+# ---------------------------------------------------------------------------
+
+_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; searxng-mcp/1.0)"}
+
+
+def fetch_page_text(url: str, max_chars: int = 5000) -> str:
+    """Fetch a URL and return stripped plain text, truncated to max_chars."""
+    try:
+        r = httpx.get(url, timeout=10, follow_redirects=True, headers=_HEADERS)
+        r.raise_for_status()
+        return _html_to_text(r.text)[:max_chars]
+    except Exception:
+        return ""
+
+
+# ---------------------------------------------------------------------------
+# OCI GenAI (Gemini 2.5 Pro) enrichment
+# ---------------------------------------------------------------------------
+
+_PROMPT = """\
+You are an information extractor. Analyze the web page content below and return ONLY a valid JSON object with these fields:
+- "abstract": 2-3 sentence summary of the page content (string)
+- "date": publication or last-modified date in ISO 8601 format, or null if not found (string | null)
+- "author": author or organization name, or null if not found (string | null)
+- "tags": list of 3-5 relevant keywords (string[])
+- "content_type": one of "article", "documentation", "news", "forum", "code", "video", "other" (string)
+
+Title: {title}
+URL: {url}
+Content:
+{content}
+
+Return only the JSON object, no markdown, no explanation."""
+
+
+def _get_llm_client() -> GenerativeAiInferenceClient:
+    config = oci.config.from_file()
+    return GenerativeAiInferenceClient(
+        config,
+        service_endpoint=os.environ["OCI_GENAI_ENDPOINT"],
+    )
+
+
+def enrich_result(title: str, url: str, snippet: str) -> dict:
+    """Fetch page, then call Gemini 2.5 Pro to extract structured metadata."""
+    page_text = fetch_page_text(url)
+    content = page_text if page_text else snippet
+
+    prompt = _PROMPT.format(title=title, url=url, content=content[:4000])
+
+    try:
+        client = _get_llm_client()
+        req = GenericChatRequest(
+            messages=[UserMessage(content=[TextContent(text=prompt)])],
+            max_tokens=512,
+            temperature=0,
+        )
+        det = ChatDetails(
+            compartment_id=os.environ["OCI_COMPARTMENT_ID"],
+            serving_mode=OnDemandServingMode(
+                model_id=os.environ["OCI_CHAT_MODEL_ID"]
+            ),
+            chat_request=req,
+        )
+        response = client.chat(det)
+        raw = response.data.chat_response.choices[0].message.content[0].text.strip()
+        # Strip markdown code fences if present
+        raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE)
+        metadata = json.loads(raw)
+    except Exception as exc:
+        metadata = {
+            "abstract": snippet,
+            "date": None,
+            "author": None,
+            "tags": [],
+            "content_type": "other",
+            "_error": str(exc),
+        }
+
+    metadata["uuid"] = str(uuid.uuid4())
+    return metadata
+
+
+def enrich_results_parallel(
+    results: list[dict],
+    max_workers: int = 5,
+) -> list[dict]:
+    """Enrich a list of search results in parallel, adding metadata to each."""
+    enriched: dict[int, dict] = {}
+
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        futures = {
+            pool.submit(enrich_result, r["title"], r["url"], r["content"]): i
+            for i, r in enumerate(results)
+        }
+        for future in as_completed(futures):
+            idx = futures[future]
+            try:
+                enriched[idx] = future.result()
+            except Exception as exc:
+                enriched[idx] = {"uuid": str(uuid.uuid4()), "_error": str(exc)}
+
+    return [
+        {**results[i], **enriched.get(i, {})}
+        for i in range(len(results))
+    ]
diff --git a/src/server.py b/src/server.py
index 0e1a113..68e5a2f 100644
--- a/src/server.py
+++ b/src/server.py
@@ -1,4 +1,4 @@
-"""MCP server wrapping the self-hosted SearXNG instance for free web search."""
+"""MCP server wrapping self-hosted SearXNG with Gemini 2.5 Pro enrichment."""
 
 import os
 import sys
@@ -14,24 +14,38 @@ load_dotenv(os.path.join(_project_root, ".env"))
 import httpx
 from mcp.server.fastmcp import FastMCP
 
+from src.enricher import enrich_results_parallel
+
 mcp = FastMCP(
     name="searxng",
-    instructions="Web search via self-hosted SearXNG. Use this instead of built-in WebSearch.",
+    instructions=(
+        "Web search via self-hosted SearXNG + Gemini 2.5 Pro enrichment. "
+        "Use this instead of the built-in WebSearch tool."
+    ),
 )
 
 SEARXNG_URL = os.environ.get("SEARXNG_URL", "https://searxng.cloud-handson.com")
 
 
 @mcp.tool()
-def web_search(query: str, max_results: int = 10) -> list[dict]:
+def web_search(query: str, max_results: int = 10, enrich: bool = True) -> list[dict]:
     """Search the web using the self-hosted SearXNG instance.
 
+    Each result is enriched by Gemini 2.5 Pro (via OCI GenAI) with:
+    - abstract: 2-3 sentence summary of the page
+    - date: publication date (ISO 8601) or null
+    - author: author/org name or null
+    - tags: 3-5 relevant keywords
+    - content_type: article / documentation / news / forum / code / video / other
+    - uuid: unique identifier for this result
+
     Args:
         query: The search query string.
         max_results: Maximum number of results to return (default 10).
+        enrich: Set to False to skip LLM enrichment and return raw results faster.
 
     Returns:
-        List of result dicts with keys: ``title``, ``url``, ``content`` (snippet).
+        List of enriched result dicts.
     """
     response = httpx.get(
         f"{SEARXNG_URL}/search",
@@ -41,7 +55,7 @@ def web_search(query: str, max_results: int = 10) -> list[dict]:
     response.raise_for_status()
     data = response.json()
 
-    return [
+    results = [
         {
             "title": r.get("title", ""),
             "url": r.get("url", ""),
@@ -50,6 +64,11 @@
         for r in data.get("results", [])[:max_results]
     ]
 
+    if enrich and results:
+        results = enrich_results_parallel(results)
+
+    return results
+
 
 def main() -> None:
     mcp.run()