"""
ToolUniverse Profile Configuration Loader
Simplified loader supporting HuggingFace, local files, and HTTP/HTTPS.
"""
import hashlib
import json
import zipfile
import io
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import yaml
import requests
from huggingface_hub import hf_hub_download
from ..utils import get_user_cache_dir
from .validator import validate_with_schema
def _is_github_repo_url(uri: str) -> bool:
"""Return True if ``uri`` looks like a GitHub repository URL (not a raw file)."""
if not (uri.startswith("https://github.com/") or uri.startswith("http://github.com/")):
return False
# A repo URL has 2-3 path segments after the host: /user/repo[/tree/branch]
# A raw file URL has more: /user/repo/raw/branch/file or /user/repo/blob/...
parts = uri.rstrip("/").split("/")
# ['https:', '', 'github.com', 'user', 'repo', ...]
if len(parts) < 5:
return False
# If there's a 6th segment and it's 'raw', 'blob', 'releases', 'archive', it's a file URL
if len(parts) >= 7 and parts[5] in ("raw", "blob", "releases", "archive"):
return False
return True
[docs]
class ProfileLoader:
"""Simplified loader for ToolUniverse Profile configurations."""
[docs]
def __init__(self, cache_dir: Optional[Union[str, Path]] = None):
"""
Initialize the Profile loader.
Args:
cache_dir: Directory for caching downloaded configurations
"""
if cache_dir:
self.cache_dir = Path(cache_dir)
else:
self.cache_dir = Path(get_user_cache_dir()) / "profiles"
self.cache_dir.mkdir(parents=True, exist_ok=True)
[docs]
def load(self, uri: str) -> Dict[str, Any]:
    """
    Load Profile configuration from URI.

    Args:
        uri: Profile URI (e.g., "hf:user/repo", "./config.yaml",
            "https://example.com/config.yaml")

    Returns:
        Loaded configuration dictionary.

    Raises:
        ValueError: If URI is unsupported or configuration is invalid.
    """
    raw_config = self._load_raw(uri)
    # Resolve extends on the raw config before filling schema defaults.
    # Filling defaults first would cause absent child fields (e.g. tags=[])
    # to shadow values inherited from the base.
    merged = self._resolve_extends(raw_config)
    # Validate and fill defaults on the fully merged config.
    serialized = yaml.dump(merged, default_flow_style=False, allow_unicode=True)
    is_valid, errors, processed = validate_with_schema(serialized, fill_defaults_flag=True)
    if is_valid:
        return processed
    raise ValueError(
        "Configuration validation failed:\n" + "\n".join(f" - {e}" for e in errors)
    )
[docs]
def _load_from_hf(self, uri: str) -> Dict[str, Any]:
    """Load configuration from HuggingFace Hub.

    Tries ``profile.yaml`` first, then falls back to ``profile.json``.

    Args:
        uri: URI of the form ``hf:user/repo``.

    Returns:
        Parsed configuration dictionary.

    Raises:
        ValueError: If neither file can be downloaded and parsed; the first
            (YAML) failure is reported and chained as the cause.
    """
    repo_id = uri[3:]  # Remove 'hf:' prefix
    first_error: Optional[Exception] = None
    # One download path for both candidate filenames instead of two
    # copy-pasted try blocks; remember the first failure for reporting.
    for filename in ("profile.yaml", "profile.json"):
        try:
            config_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                cache_dir=self.cache_dir,
                local_files_only=False,
            )
            return self._load_from_file(config_path)
        except Exception as e:
            if first_error is None:
                first_error = e
    # Chain the original failure so its traceback is not lost.
    raise ValueError(
        f"Failed to load Profile from HuggingFace {repo_id}: {first_error}"
    ) from first_error
[docs]
def _load_from_url(self, uri: str) -> Dict[str, Any]:
    """Load configuration from an HTTP/HTTPS URL.

    The response body is parsed as YAML first; if that fails, it is
    parsed as JSON.

    Raises:
        ValueError: On network errors or unparseable content.
    """
    try:
        resp = requests.get(uri, timeout=30)
        resp.raise_for_status()
        try:
            # Prefer YAML; JSON is the fallback format.
            return yaml.safe_load(resp.text)
        except yaml.YAMLError:
            return resp.json()
    except Exception as e:
        raise ValueError(f"Failed to load Profile from URL {uri}: {e}")
[docs]
def _load_from_file(self, file_path: str) -> Dict[str, Any]:
"""Load configuration from local file."""
path = Path(file_path)
if not path.exists():
raise ValueError(f"Profile file not found: {file_path}")
try:
with open(path, "r", encoding="utf-8") as f:
if path.suffix.lower() in [".yaml", ".yml"]:
return yaml.safe_load(f)
elif path.suffix.lower() == ".json":
return json.load(f)
else:
# Try YAML first, then JSON
try:
f.seek(0)
return yaml.safe_load(f)
except yaml.YAMLError:
f.seek(0)
return json.load(f)
except Exception as e:
raise ValueError(f"Failed to load Profile from file {file_path}: {e}")
[docs]
def _resolve_extends(
self, config: Dict[str, Any], _chain: Optional[frozenset] = None
) -> Dict[str, Any]:
"""
Resolve the ``extends`` field by loading the base Profile and merging.
The base Profile is loaded first; the current config is then deep-merged
on top so that every field in the current config overrides the base.
The ``extends`` key itself is removed from the final result.
``_chain`` tracks the URIs already being resolved in the current call
stack and is used to detect circular references before they cause
infinite recursion.
"""
base_uri = config.get("extends")
if not base_uri:
return config
# Guard against non-string extends (e.g. ``extends: [a, b]``).
# validate_with_schema would catch this later, but _resolve_extends runs
# before validation, so emit a clear error instead of a TypeError.
if not isinstance(base_uri, str):
raise ValueError(
f"Profile 'extends' must be a string URI, "
f"got {type(base_uri).__name__}: {base_uri!r}"
)
if _chain is None:
_chain = frozenset()
if base_uri in _chain:
raise ValueError(
f"Circular Profile extends detected: '{base_uri}' appears more than "
f"once in the inheritance chain {sorted(_chain)}"
)
try:
raw = self._load_raw(base_uri)
raw = self._resolve_extends(raw, _chain | {base_uri})
except Exception as e:
if isinstance(e, ValueError):
raise
raise ValueError(f"Failed to load base Profile '{base_uri}': {e}") from e
merged = self._deep_merge(raw, config)
merged.pop("extends", None)
return merged
# ------------------------------------------------------------------
# Phase 2 – URI → local directory resolution
# ------------------------------------------------------------------
[docs]
def resolve_to_local_dir(self, uri: str) -> Path:
    """
    Resolve any URI to a local directory that contains Profile tool files.

    Supported URI forms:

    - Local path (file or directory): returned as a ``Path`` (file → parent)
    - ``hf:user/repo``: full repo downloaded via ``snapshot_download``
    - ``https://github.com/user/repo[/tree/branch]``: zip downloaded & extracted
    - Other ``http(s)://`` URL: single file downloaded; its parent dir returned

    Returns:
        ``Path`` to an existing local directory.
    """
    # Ordered dispatch: first matching predicate wins; local path is the
    # default when nothing else matches.
    routes = (
        (lambda u: u.startswith("hf:"), self._resolve_hf_to_dir),
        (_is_github_repo_url, self._resolve_github_to_dir),
        (lambda u: u.startswith(("http://", "https://")), self._resolve_http_file_to_dir),
    )
    for matches, handler in routes:
        if matches(uri):
            return handler(uri)
    return self._resolve_local_to_dir(uri)
[docs]
def _resolve_local_to_dir(self, uri: str) -> Path:
"""Return a local path as a directory (file → parent)."""
p = Path(uri)
if not p.exists():
raise ValueError(f"Profile path not found: {uri}")
return p if p.is_dir() else p.parent
[docs]
def _resolve_hf_to_dir(self, uri: str) -> Path:
    """Download an entire HuggingFace repo and return its local snapshot path.

    Raises:
        ValueError: If the snapshot download fails.
    """
    # Imported lazily, matching the original: only needed on this code path.
    from huggingface_hub import snapshot_download

    repo_id = uri[3:]  # strip "hf:"
    hf_cache = self.cache_dir / "hf"
    hf_cache.mkdir(parents=True, exist_ok=True)
    try:
        snapshot_path = snapshot_download(
            repo_id=repo_id,
            cache_dir=str(hf_cache),
        )
    except Exception as e:
        raise ValueError(f"Failed to download HuggingFace repo '{repo_id}': {e}")
    return Path(snapshot_path)
[docs]
def _resolve_github_to_dir(self, uri: str) -> Path:
"""Download a GitHub repo as a ZIP and extract it; return extracted dir."""
# Parse: https://github.com/user/repo[/tree/branch]
parts = uri.rstrip("/").split("/")
# parts: ['https:', '', 'github.com', 'user', 'repo', ...]
try:
user, repo = parts[3], parts[4]
except IndexError:
raise ValueError(f"Cannot parse GitHub URL: {uri}")
# Determine branch
branch = "main"
if len(parts) >= 7 and parts[5] == "tree":
branch = parts[6]
zip_url = f"https://github.com/{user}/{repo}/archive/refs/heads/{branch}.zip"
# Use a stable cache key
cache_key = hashlib.md5(f"{user}/{repo}@{branch}".encode()).hexdigest()[:10]
extract_dir = self.cache_dir / "github" / cache_key
marker = extract_dir / ".downloaded"
if not marker.exists():
extract_dir.mkdir(parents=True, exist_ok=True)
try:
resp = requests.get(zip_url, timeout=60)
resp.raise_for_status()
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
zf.extractall(extract_dir)
marker.touch()
except Exception as e:
raise ValueError(
f"Failed to download GitHub repo {user}/{repo}@{branch}: {e}"
)
# The extracted root is typically "{repo}-{branch}/"
candidates = list(extract_dir.iterdir())
if len(candidates) == 1 and candidates[0].is_dir():
return candidates[0]
return extract_dir
[docs]
def _resolve_http_file_to_dir(self, uri: str) -> Path:
"""Download a single config file from an HTTP(S) URL into cache."""
cache_key = hashlib.md5(uri.encode()).hexdigest()[:12]
# Preserve the filename extension
filename = uri.rstrip("/").split("/")[-1] or "profile.yaml"
cached_file = self.cache_dir / "http" / cache_key / filename
cached_file.parent.mkdir(parents=True, exist_ok=True)
if not cached_file.exists():
try:
resp = requests.get(uri, timeout=30)
resp.raise_for_status()
cached_file.write_bytes(resp.content)
except Exception as e:
raise ValueError(f"Failed to download Profile from URL {uri}: {e}")
return cached_file.parent
# ------------------------------------------------------------------
# Raw loading and merge helpers (no validation, no extends resolution)
# ------------------------------------------------------------------
[docs]
def _load_raw(self, uri: str) -> Dict[str, Any]:
"""Load and parse a Profile URI without validation or extends resolution."""
if uri.startswith("hf:"):
result = self._load_from_hf(uri)
elif uri.startswith(("http://", "https://")):
result = self._load_from_url(uri)
else:
result = self._load_from_file(uri)
if not isinstance(result, dict):
raise ValueError(
f"Profile configuration at '{uri}' must be a YAML/JSON mapping, "
f"got {type(result).__name__}. Check the file is not empty."
)
return result
[docs]
@staticmethod
def _deep_merge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]:
"""
Recursively merge two dicts.
For keys present in both: dicts are merged recursively; all other
types are replaced by the override value. Lists are replaced, not
concatenated, so ``include_tools`` in the child fully replaces the
parent list rather than extending it.
"""
result = base.copy()
for key, value in override.items():
if key in result and isinstance(result[key], dict) and isinstance(value, dict):
result[key] = ProfileLoader._deep_merge(result[key], value)
else:
result[key] = value
return result