Source code for tooluniverse.database_setup.search

"""
SearchEngine: unified keyword / embedding / hybrid search over a SQLite+FAISS datastore.

Composes:
- SQLiteStore.search_keyword(...)
- Embedder for query-time vectors
- VectorStore.search_embeddings(...)
- A simple hybrid combiner to mix keyword and embedding scores

Scoring
-------
- Keyword scores are always 1.0.
- Embedding scores are FAISS IP (assume vectors are L2-normalized upstream).
- Hybrid: score = alpha * embed_score + (1 - alpha) * keyword_score  (alpha in [0,1]).

Return shape
------------
Each API returns a list of dicts:
{ "doc_id", "doc_key", "text", "metadata", "score" }

See also
--------
- pipeline.py for high-level build & search helpers
- cli.py for command-line usage
"""

from typing import Any, Dict, List, Optional

import numpy as np

from tooluniverse.database_setup.embedder import Embedder
from tooluniverse.database_setup.provider_resolver import (
    resolve_model,
    resolve_provider,
)
from tooluniverse.database_setup.sqlite_store import SQLiteStore
from tooluniverse.database_setup.vector_store import VectorStore


class SearchEngine:
    """
    Unified keyword + embedding + hybrid search for a given DB path.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database file that also anchors <collection>.faiss files.
    provider : Optional[str]
        Default embedder provider. When omitted, it is resolved with
        ``resolve_provider()``. May be overridden per-call.
    model : Optional[str]
        Default embedding model. When omitted, it is resolved with
        ``resolve_model(provider)``. May be overridden per-call.

    Use
    ---
    Provides consistent records ``{doc_id, doc_key, text, metadata, score}``.
    Keyword results get a fixed ``score=1.0``; hybrid combines embedding/keyword
    scores as ``alpha*emb + (1-alpha)*kw``.

    Notes
    -----
    - If a collection's `embedding_model` is "precomputed", you MUST pass
      (provider, model) when calling `embedding_search` or `hybrid_search`.
    """

    def __init__(
        self,
        db_path: str = "embeddings.db",
        provider: Optional[str] = None,
        model: Optional[str] = None,
    ):
        """Open the SQLite and vector stores at `db_path` and build the default embedder."""
        self.sqlite = SQLiteStore(db_path)
        self.vectors = VectorStore(db_path)
        # Explicit arguments win; anything left unset (None or empty) falls
        # back to the resolver, exactly as the no-argument constructor did.
        prov = provider or resolve_provider()
        mdl = model or resolve_model(prov)
        self.embedder = Embedder(provider=prov, model=mdl)

    def _get_collection_meta(self, collection: str) -> tuple:
        """
        Look up the embedding configuration recorded for `collection`.

        Returns
        -------
        tuple
            ``(embedding_model, embedding_dimensions)`` read from the
            `collections` table, or ``(None, None)`` when no row matches.
        """
        cur = self.sqlite.conn.execute(
            "SELECT embedding_model, embedding_dimensions FROM collections WHERE name=? LIMIT 1",
            (collection,),
        )
        row = cur.fetchone()
        return (row[0], row[1]) if row else (None, None)

    # ---- Keyword search ----

    # ---- Embedding search ----

    # ---- Hybrid search ---- (embedding + keyword)

    # NOTE(review): `keyword_search`, `embedding_search` and `hybrid_search`
    # are called by `search_collection` below, but their definitions are not
    # visible in this listing -- confirm they exist in the full source.

    # ---- Collection + Doc Access ----

    def list_collections(self) -> List[str]:
        """Return the list of collection names registered in the SQLite `collections` table."""
        cur = self.sqlite.conn.execute("SELECT name FROM collections")
        return [r[0] for r in cur.fetchall()]

    def fetch_docs(
        self,
        collection: str,
        doc_keys: Optional[List[str]] = None,
        limit: int = 10,
    ):
        """Fetch raw docs by doc_key using SQLiteStore.fetch_docs (for inspection or tooling)."""
        return self.sqlite.fetch_docs(collection, doc_keys=doc_keys, limit=limit)

    def fetch_random_docs(self, collection: str, n: int = 5):
        """Return `n` random documents from a collection (for sampling/demo)."""
        return self.sqlite.fetch_random_docs(collection, n=n)

    # ---- Unified search ----

    def search_collection(
        self,
        collection: str,
        query: str,
        method: str = "hybrid",
        top_k: int = 5,
        alpha: float = 0.5,
    ) -> List[Dict[str, Any]]:
        """
        Dispatch to keyword/embedding/hybrid search for a single collection.

        Parameters
        ----------
        collection : str
            Name of the collection to search.
        query : str
            Query text handed to the selected search method.
        method : str
            One of ``"keyword"``, ``"embedding"`` or ``"hybrid"``.
        top_k : int
            Forwarded as ``top_k`` to the selected search method.
        alpha : float
            Forwarded to `hybrid_search` only; ignored by the other methods.

        Raises
        ------
        ValueError
            If `method` is not one of the three supported names.

        Notes
        -----
        NOTE(review): no provider/model is forwarded here, so collections that
        need them per-call presumably have to go through `embedding_search` /
        `hybrid_search` directly -- confirm against those methods.
        """
        if method == "keyword":
            return self.keyword_search(collection, query, top_k=top_k)
        if method == "embedding":
            return self.embedding_search(collection, query, top_k=top_k)
        if method == "hybrid":
            return self.hybrid_search(collection, query, top_k=top_k, alpha=alpha)
        raise ValueError(f"Unknown method: {method}")