Source code for tooluniverse.nextstrain_tool

# nextstrain_tool.py
"""
Nextstrain REST API tool for ToolUniverse.

Nextstrain is an open-source project to harness the scientific and
public health potential of pathogen genome data. It provides real-time
tracking of evolving pathogens through phylogenetic analysis.

API: https://nextstrain.org/charon
No authentication required. Free for all use.
"""

import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool

NEXTSTRAIN_BASE_URL = "https://nextstrain.org/charon"


[docs] @register_tool("NextstrainTool") class NextstrainTool(BaseTool): """ Tool for querying Nextstrain, the pathogen evolution tracker. Provides access to phylogenetic datasets for various pathogens including influenza, SARS-CoV-2, Zika, Ebola, Dengue, and more. Returns metadata and phylogenetic tree data. No authentication required. """
[docs] def __init__(self, tool_config: Dict[str, Any]): super().__init__(tool_config) self.timeout = tool_config.get("timeout", 45) self.endpoint_type = tool_config.get("fields", {}).get( "endpoint_type", "list_datasets" )
[docs] def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Execute the Nextstrain API call.""" try: return self._dispatch(arguments) except requests.exceptions.Timeout: return { "error": f"Nextstrain API request timed out after {self.timeout} seconds" } except requests.exceptions.ConnectionError: return { "error": "Failed to connect to Nextstrain API. Check network connectivity." } except requests.exceptions.HTTPError as e: return {"error": f"Nextstrain API HTTP error: {e.response.status_code}"} except Exception as e: return {"error": f"Unexpected error querying Nextstrain: {str(e)}"}
[docs] def _dispatch(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Route to appropriate endpoint based on config.""" if self.endpoint_type == "list_datasets": return self._list_datasets(arguments) elif self.endpoint_type == "get_dataset": return self._get_dataset(arguments) else: return {"error": f"Unknown endpoint_type: {self.endpoint_type}"}
[docs] def _list_datasets(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """List available Nextstrain pathogen datasets.""" pathogen_filter = arguments.get("pathogen", "").lower() url = f"{NEXTSTRAIN_BASE_URL}/getAvailable" response = requests.get(url, timeout=self.timeout) response.raise_for_status() raw = response.json() all_datasets = raw.get("datasets", []) # Group by pathogen (first segment of the request path) pathogen_groups = {} for ds in all_datasets: request_path = ds.get("request", "") if not request_path: continue pathogen = request_path.split("/")[0] pathogen_groups.setdefault(pathogen, []).append(request_path) # Filter by pathogen if specified if pathogen_filter: filtered = {} for p, paths in pathogen_groups.items(): if pathogen_filter in p.lower(): filtered[p] = paths pathogen_groups = filtered # Build response results = [] for pathogen, paths in sorted(pathogen_groups.items()): results.append( { "pathogen": pathogen, "dataset_count": len(paths), "datasets": sorted(paths)[:10], } ) return { "data": results, "metadata": { "source": "Nextstrain", "total_pathogens": len(results), "total_datasets": sum(len(r["datasets"]) for r in results), "filter": pathogen_filter or "(none)", "endpoint": "list_datasets", }, }
[docs] def _get_dataset(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get metadata and tree summary for a Nextstrain dataset.""" dataset = arguments.get("dataset", "") if not dataset: return { "error": "dataset parameter is required (e.g., 'zika', 'ebola', 'flu/seasonal/h3n2/ha/2y')" } url = f"{NEXTSTRAIN_BASE_URL}/getDataset" params = {"prefix": dataset} response = requests.get(url, params=params, timeout=self.timeout) response.raise_for_status() raw = response.json() meta = raw.get("meta", {}) tree = raw.get("tree", {}) # Count sequences (leaves in tree) def count_leaves(node): if not isinstance(node, dict): return 0 children = node.get("children", []) if not children: return 1 return sum(count_leaves(c) for c in children) num_sequences = count_leaves(tree) # Extract tree root attributes root_attrs = tree.get("node_attrs", {}) root_info = {} for key, val in root_attrs.items(): if isinstance(val, dict) and "value" in val: root_info[key] = val["value"] elif not isinstance(val, dict): root_info[key] = val # Data provenance provenance = meta.get("data_provenance", []) prov_names = [] for p in provenance: if isinstance(p, dict): prov_names.append(p.get("name", "")) # Maintainers maintainers = [] for m in meta.get("maintainers", []): if isinstance(m, dict): maintainers.append(m.get("name", "")) result = { "dataset": dataset, "title": meta.get("title", ""), "updated": meta.get("updated", ""), "build_url": meta.get("build_url", ""), "num_sequences": num_sequences, "data_provenance": prov_names, "maintainers": maintainers, "root_attributes": root_info, } # Color-by options colorings = meta.get("colorings", []) if colorings: result["available_colorings"] = [ c.get("key", "") for c in colorings if isinstance(c, dict) ][:15] return { "data": result, "metadata": { "source": "Nextstrain", "query": dataset, "version": raw.get("version", ""), "endpoint": "get_dataset", }, }