Source code for tooluniverse.nextstrain_tool
# nextstrain_tool.py
"""
Nextstrain REST API tool for ToolUniverse.
Nextstrain is an open-source project to harness the scientific and
public health potential of pathogen genome data. It provides real-time
tracking of evolving pathogens through phylogenetic analysis.
API: https://nextstrain.org/charon
No authentication required. Free for all use.
"""
import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool
# Base URL of the Nextstrain "charon" REST API. Endpoint names used in this
# module (getAvailable, getDataset) are appended to it.
NEXTSTRAIN_BASE_URL = "https://nextstrain.org/charon"
[docs]
@register_tool("NextstrainTool")
class NextstrainTool(BaseTool):
    """
    Tool for querying Nextstrain, the pathogen evolution tracker.

    Provides access to phylogenetic datasets for various pathogens
    including influenza, SARS-CoV-2, Zika, Ebola, Dengue, and more.
    Returns metadata and phylogenetic tree data.
    No authentication required.

    The behaviour of an instance is selected by ``fields.endpoint_type`` in
    the tool configuration: ``"list_datasets"`` (default) or
    ``"get_dataset"``.
    """

    # Maximum number of dataset paths listed per pathogen by list_datasets.
    _MAX_DATASETS_PER_PATHOGEN = 10
    # Maximum number of coloring keys reported by get_dataset.
    _MAX_COLORINGS = 15

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the request timeout and endpoint type from ``tool_config``.

        Args:
            tool_config: Tool configuration. Reads the optional ``timeout``
                key (default 45, passed to ``requests.get``) and the optional
                ``fields.endpoint_type`` key (default ``"list_datasets"``).
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 45)
        # ``or {}`` tolerates a config that carries ``"fields": None``.
        fields = tool_config.get("fields") or {}
        self.endpoint_type = fields.get("endpoint_type", "list_datasets")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the Nextstrain API call.

        Args:
            arguments: Call arguments; ``None`` is treated as no arguments.

        Returns:
            The endpoint result (a dict with ``data`` and ``metadata``), or a
            dict with a single ``error`` key when the request or its
            processing fails. Exceptions are not propagated to the caller.
        """
        try:
            return self._dispatch(arguments or {})
        except requests.exceptions.Timeout:
            return {
                "error": f"Nextstrain API request timed out after {self.timeout} seconds"
            }
        except requests.exceptions.ConnectionError:
            return {
                "error": "Failed to connect to Nextstrain API. Check network connectivity."
            }
        except requests.exceptions.HTTPError as e:
            # HTTPError.response can be None when the error was not raised
            # from a received response; avoid a secondary AttributeError.
            status = getattr(e.response, "status_code", "unknown")
            return {"error": f"Nextstrain API HTTP error: {status}"}
        except Exception as e:
            # Boundary handler: callers always get a dict, never a traceback.
            return {"error": f"Unexpected error querying Nextstrain: {str(e)}"}

    def _dispatch(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to the handler matching ``self.endpoint_type``.

        Returns an ``error`` dict for an unrecognised endpoint type.
        """
        if self.endpoint_type == "list_datasets":
            return self._list_datasets(arguments)
        if self.endpoint_type == "get_dataset":
            return self._get_dataset(arguments)
        return {"error": f"Unknown endpoint_type: {self.endpoint_type}"}

    def _list_datasets(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """List available Nextstrain pathogen datasets, grouped by pathogen.

        Args:
            arguments: May contain ``pathogen``, a case-insensitive substring
                used to filter pathogen names. Missing, ``None`` or blank
                means no filtering.

        Returns:
            ``data`` is a list (sorted by pathogen) of dicts with
            ``pathogen``, ``dataset_count`` (the full count) and ``datasets``
            (the first sorted paths, capped per pathogen). ``metadata``
            carries the totals and the filter that was applied.

        Raises:
            requests.exceptions.RequestException: On network or HTTP failure
                (handled by ``run``).
        """
        pathogen_filter = str(arguments.get("pathogen") or "").strip().lower()

        url = f"{NEXTSTRAIN_BASE_URL}/getAvailable"
        response = requests.get(url, timeout=self.timeout)
        response.raise_for_status()
        raw = response.json()
        all_datasets = raw.get("datasets") or []

        # Group by pathogen (first segment of the request path).
        pathogen_groups = {}
        for ds in all_datasets:
            request_path = ds.get("request", "") if isinstance(ds, dict) else ""
            if not request_path:
                continue
            pathogen = request_path.split("/")[0]
            pathogen_groups.setdefault(pathogen, []).append(request_path)

        # Filter by pathogen if specified.
        if pathogen_filter:
            pathogen_groups = {
                name: paths
                for name, paths in pathogen_groups.items()
                if pathogen_filter in name.lower()
            }

        results = [
            {
                "pathogen": name,
                "dataset_count": len(paths),
                "datasets": sorted(paths)[: self._MAX_DATASETS_PER_PATHOGEN],
            }
            for name, paths in sorted(pathogen_groups.items())
        ]

        return {
            "data": results,
            "metadata": {
                "source": "Nextstrain",
                "total_pathogens": len(results),
                # Sum the real counts, not the truncated ``datasets`` lists,
                # so the total is correct for pathogens above the cap.
                "total_datasets": sum(r["dataset_count"] for r in results),
                "filter": pathogen_filter or "(none)",
                "endpoint": "list_datasets",
            },
        }

    def _get_dataset(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get metadata and a tree summary for one Nextstrain dataset.

        Args:
            arguments: Must contain ``dataset``, the dataset path sent as the
                ``prefix`` query parameter (e.g. ``"zika"``).

        Returns:
            An ``error`` dict when ``dataset`` is missing or blank; otherwise
            ``data`` with title, update date, build URL, leaf count,
            provenance and maintainer names, root node attributes and
            (when present) a capped list of coloring keys, plus ``metadata``.

        Raises:
            requests.exceptions.RequestException: On network or HTTP failure
                (handled by ``run``).
        """
        dataset = arguments.get("dataset") or ""
        if isinstance(dataset, str):
            dataset = dataset.strip()
        if not dataset:
            return {
                "error": "dataset parameter is required (e.g., 'zika', 'ebola', 'flu/seasonal/h3n2/ha/2y')"
            }

        url = f"{NEXTSTRAIN_BASE_URL}/getDataset"
        params = {"prefix": dataset}
        response = requests.get(url, params=params, timeout=self.timeout)
        response.raise_for_status()
        raw = response.json()

        # ``or {}`` also covers keys that are present but null.
        meta = raw.get("meta") or {}
        tree = raw.get("tree") or {}

        result = {
            "dataset": dataset,
            "title": meta.get("title", ""),
            "updated": meta.get("updated", ""),
            "build_url": meta.get("build_url", ""),
            "num_sequences": self._count_leaves(tree),
            "data_provenance": self._collect_names(meta.get("data_provenance")),
            "maintainers": self._collect_names(meta.get("maintainers")),
            "root_attributes": self._root_attributes(tree),
        }

        # Color-by options.
        colorings = meta.get("colorings", [])
        if colorings:
            result["available_colorings"] = [
                c.get("key", "") for c in colorings if isinstance(c, dict)
            ][: self._MAX_COLORINGS]

        return {
            "data": result,
            "metadata": {
                "source": "Nextstrain",
                "query": dataset,
                "version": raw.get("version", ""),
                "endpoint": "get_dataset",
            },
        }

    @staticmethod
    def _count_leaves(tree: Any) -> int:
        """Count nodes without children (sequences) in ``tree``.

        Uses an explicit stack instead of recursion so that very deep trees
        cannot exceed Python's recursion limit. A missing/empty tree counts
        as 0 leaves.
        """
        if not tree:
            return 0
        # NOTE(review): the dataset JSON may carry a list of subtrees rather
        # than a single root -- confirm against the Auspice schema. A list is
        # treated here as several roots.
        pending = list(tree) if isinstance(tree, list) else [tree]
        leaves = 0
        while pending:
            node = pending.pop()
            if not isinstance(node, dict):
                continue
            children = node.get("children")
            if children:
                pending.extend(children)
            else:
                leaves += 1
        return leaves

    @staticmethod
    def _root_attributes(tree: Any) -> Dict[str, Any]:
        """Flatten the root node's ``node_attrs`` into plain key/value pairs.

        ``{"value": x, ...}`` entries are reduced to ``x``; non-dict entries
        are kept as-is; dict entries without ``value`` are dropped.
        """
        # For a list of subtrees, report the first root (assumption -- see
        # the note in ``_count_leaves``).
        root = tree[0] if isinstance(tree, list) and tree else tree
        if not isinstance(root, dict):
            return {}
        attrs = root.get("node_attrs") or {}
        if not isinstance(attrs, dict):
            return {}

        root_info = {}
        for key, val in attrs.items():
            if isinstance(val, dict):
                if "value" in val:
                    root_info[key] = val["value"]
            else:
                root_info[key] = val
        return root_info

    @staticmethod
    def _collect_names(entries: Any) -> Any:
        """Return the ``name`` of every dict in ``entries`` ('' if absent).

        ``None`` is treated as an empty collection; non-dict items are
        skipped.
        """
        return [e.get("name", "") for e in (entries or []) if isinstance(e, dict)]