# Source code for tooluniverse.database_setup.cli

"""
tu-datastore: CLI for building, searching, and syncing embedding datastores.

Subcommands
-----------
build
    Upsert a collection, insert documents (with de-dup), embed texts, and write FAISS.
quickbuild
    Build a collection from a folder of text files (.txt/.md).
search
    Query an existing collection by keyword, embedding, or hybrid.
sync-hf upload|download
    Upload/download <collection>.db and <collection>.faiss to/from Hugging Face.
    On upload, optionally include tool JSON files via --tool-json <file1.json> [file2.json ...].


Environment
-----------
Set EMBED_PROVIDER, EMBED_MODEL, and provider-specific keys (OPENAI / AZURE_* / HF_TOKEN).
All datastore files default to <user_cache_dir>/embeddings/<collection>.db unless overridden.

Exit codes
----------
0 on success; non-zero on I/O, validation, or runtime errors.
"""

import argparse
import json
import os
from .pipeline import build_collection, search
from .hf.sync_hf import upload as sync_upload, download as sync_download
from .packager import pack_folder
from tooluniverse.utils import get_user_cache_dir


def resolve_db_path(db_arg, collection):
    """Pick the SQLite database path for a collection.

    Args:
        db_arg: Path supplied by the user, or a falsy value to use the default.
        collection: Collection name; used as the file stem of the default path.

    Returns:
        ``db_arg`` with ``~`` expanded when it is given; otherwise
        ``<user_cache_dir>/embeddings/<collection>.db``. The ``embeddings``
        directory is created on demand in the default case only.
    """
    if not db_arg:
        # No explicit path: fall back to the per-user cache location and make
        # sure the directory exists before handing the path back.
        embeddings_dir = os.path.join(get_user_cache_dir(), "embeddings")
        os.makedirs(embeddings_dir, exist_ok=True)
        db_filename = f"{collection}.db"
        return os.path.join(embeddings_dir, db_filename)

    return os.path.expanduser(db_arg)
def resolve_provider_model(provider_arg, model_arg):
    """Determine the embedding provider and model to use.

    Each value is taken from its CLI argument when that is truthy, and from
    the ``EMBED_PROVIDER`` / ``EMBED_MODEL`` environment variable otherwise.

    Args:
        provider_arg: Provider given on the command line, or a falsy value.
        model_arg: Model given on the command line, or a falsy value.

    Returns:
        A ``(provider, model)`` tuple.

    Raises:
        SystemExit: If either value is still missing after the fallback.
    """
    provider = provider_arg if provider_arg else os.getenv("EMBED_PROVIDER")
    model = model_arg if model_arg else os.getenv("EMBED_MODEL")

    if provider and model:
        return provider, model

    raise SystemExit(
        "Missing embedding provider or model. "
        "Use --provider/--model or set EMBED_PROVIDER/EMBED_MODEL in your .env."
    )
def _build_parser():
    """Construct the ``tu-datastore`` argument parser with all subcommands."""
    p = argparse.ArgumentParser(
        "tu-datastore", description="Manage local searchable datastores."
    )
    sub = p.add_subparsers(dest="cmd")

    # --------------------------------------------------------------------------
    # build
    # --------------------------------------------------------------------------
    b = sub.add_parser("build", help="Build or extend a collection from JSON docs")
    b.add_argument("--collection", required=True, help="Collection name (e.g. toy)")
    b.add_argument("--docs-json", required=True, help="Path to JSON list of docs")
    b.add_argument("--db", required=False, help="Optional path to SQLite DB")
    b.add_argument(
        "--provider", help="Embedding provider (openai, azure, huggingface, local)"
    )
    b.add_argument("--model", help="Embedding model name or deployment")
    b.add_argument(
        "--overwrite", action="store_true", help="Rebuild FAISS index if exists"
    )

    # --------------------------------------------------------------------------
    # quickbuild
    # --------------------------------------------------------------------------
    qb = sub.add_parser(
        "quickbuild", help="Build from a folder of text files (.txt/.md)"
    )
    qb.add_argument("--name", required=True, help="Collection name (e.g. mydata)")
    qb.add_argument("--from-folder", required=True, help="Folder containing text files")
    qb.add_argument(
        "--provider", help="Embedding provider (openai, azure, huggingface, local)"
    )
    qb.add_argument("--model", help="Embedding model name or deployment")
    qb.add_argument(
        "--overwrite", action="store_true", help="Rebuild FAISS index if exists"
    )

    # --------------------------------------------------------------------------
    # search
    # --------------------------------------------------------------------------
    s = sub.add_parser("search", help="Query an existing collection")
    s.add_argument("--collection", required=True, help="Collection name (e.g. toy)")
    s.add_argument("--query", required=True, help="Search query text")
    s.add_argument("--db", required=False, help="Optional path to SQLite DB")
    s.add_argument(
        "--method",
        default="hybrid",
        choices=["keyword", "embedding", "hybrid"],
        help="Search method",
    )
    s.add_argument("--top-k", default=10, type=int, help="Number of results")
    s.add_argument("--alpha", default=0.5, type=float, help="Hybrid mix weight")
    s.add_argument("--provider", help="Embedding provider (optional)")
    s.add_argument("--model", help="Embedding model (optional)")

    # --------------------------------------------------------------------------
    # sync-hf
    # --------------------------------------------------------------------------
    sh = sub.add_parser(
        "sync-hf", help="Upload/download datastore artifacts to/from Hugging Face"
    )
    sh_sub = sh.add_subparsers(dest="action", required=True)

    up = sh_sub.add_parser("upload", help="Upload collection artifacts to HF")
    up.add_argument("--collection", required=True)
    up.add_argument(
        "--repo", help="HF dataset repo ID (defaults to <username>/<collection>)"
    )
    up.add_argument(
        "--private",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Make dataset private (default True). Use --no-private to make it public.",
    )
    up.add_argument(
        "--tool-json",
        nargs="*",
        default=None,
        help="Path(s) to Tool JSON file(s) to upload with the datastore.",
    )

    down = sh_sub.add_parser("download", help="Download collection artifacts from HF")
    down.add_argument("--repo", required=True)
    down.add_argument("--collection", required=True)
    down.add_argument(
        "--overwrite", action="store_true", help="Overwrite existing index"
    )
    down.add_argument(
        "--include-tools", action="store_true", help="Also download tool JSON files"
    )
    return p


def _load_docs(path):
    """Read the ``--docs-json`` file and normalise every entry to a tuple.

    Dict entries become ``(doc_key, text, metadata, text_hash)`` (missing
    ``metadata`` defaults to ``{}``, other missing keys to ``None``); JSON
    array entries are converted with ``tuple()`` unchanged.

    Raises:
        SystemExit: If the top-level JSON value is not a list, or an entry is
            neither an object nor an array.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(path, encoding="utf-8") as f:
        raw = json.load(f)

    if not isinstance(raw, list):
        raise SystemExit(
            f"--docs-json must contain a JSON list of docs, "
            f"got {type(raw).__name__}: {path}"
        )

    docs = []
    for index, d in enumerate(raw):
        if isinstance(d, dict):
            docs.append(
                (
                    d.get("doc_key"),
                    d.get("text"),
                    d.get("metadata", {}),
                    d.get("text_hash"),
                )
            )
        elif isinstance(d, (list, tuple)):
            docs.append(tuple(d))
        else:
            # A bare string/number would otherwise be exploded by tuple() or
            # crash with an unhelpful TypeError.
            raise SystemExit(
                f"--docs-json entry {index} must be an object or array, "
                f"got {type(d).__name__}: {path}"
            )
    return docs


def main(argv=None):
    """Entry point for the ``tu-datastore`` command-line tool.

    Parses the command line and dispatches to the ``build``, ``quickbuild``,
    ``search`` or ``sync-hf`` handler. With no subcommand, prints the help
    text.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``, so existing ``main()`` callers
            are unaffected.

    Raises:
        SystemExit: On argparse usage errors, an invalid ``--docs-json``
            payload, an empty ``quickbuild`` folder, or a missing embedding
            provider/model.
    """
    p = _build_parser()
    args = p.parse_args(argv)

    if args.cmd == "build":
        docs = _load_docs(args.docs_json)
        provider, model = resolve_provider_model(args.provider, args.model)
        db_path = resolve_db_path(args.db, args.collection)
        build_collection(
            db_path=db_path,
            collection=args.collection,
            docs=docs,
            embed_provider=provider,
            embed_model=model,
            overwrite=args.overwrite,
        )
        print(f"[INFO] Collection '{args.collection}' written to {db_path}")

    elif args.cmd == "quickbuild":
        docs = pack_folder(args.from_folder)
        if not docs:
            raise SystemExit(
                "No supported files found. Put .txt or .md in the folder."
            )
        provider, model = resolve_provider_model(args.provider, args.model)
        # quickbuild has no --db option, so it always uses the default location.
        db_path = resolve_db_path(None, args.name)
        build_collection(
            db_path=db_path,
            collection=args.name,
            docs=docs,
            embed_provider=provider,
            embed_model=model,
            overwrite=args.overwrite,
        )
        print(
            f"[INFO] Built collection '{args.name}' with {len(docs)} docs at {db_path}"
        )

    elif args.cmd == "search":
        db_path = resolve_db_path(args.db, args.collection)
        # Only require provider/model when embeddings are needed
        if args.method == "keyword":
            provider = model = None
        else:
            provider, model = resolve_provider_model(args.provider, args.model)
        res = search(
            db_path=db_path,
            collection=args.collection,
            query=args.query,
            method=args.method,
            top_k=args.top_k,
            alpha=args.alpha,
            embed_provider=provider,
            embed_model=model,
        )
        print(json.dumps(res, indent=2))

    elif args.cmd == "sync-hf":
        if args.action == "upload":
            sync_upload(
                collection=args.collection,
                repo=args.repo,
                private=args.private,
                tool_json=args.tool_json,
            )
        elif args.action == "download":
            sync_download(
                repo=args.repo,
                collection=args.collection,
                overwrite=args.overwrite,
                include_tools=args.include_tools,
            )

    else:
        # No subcommand given: show usage instead of failing.
        p.print_help()