# Source code for tooluniverse.cellxgene_discovery_tool
# cellxgene_discovery_tool.py
"""
CellxGene Discovery API tool for ToolUniverse.
Provides access to the CZI CellxGene Discovery API for browsing and searching
single-cell RNA-seq datasets and curated collections. Contains 2,000+ datasets
across tissues, diseases, and organisms with cell-type annotations.
API: https://api.cellxgene.cziscience.com/
No authentication required. Free public access.
"""
import requests
from typing import Dict, Any, List
from .base_tool import BaseTool
CXG_BASE_URL = "https://api.cellxgene.cziscience.com"
# [docs]
class CellxGeneDiscoveryTool(BaseTool):
    """
    Tool for the CZI CellxGene Discovery API, providing access to single-cell
    RNA-seq datasets and curated collections.

    No authentication required.

    The operation performed by :meth:`run` is chosen by
    ``tool_config["fields"]["endpoint"]``: ``"list_collections"`` (the
    default), ``"get_collection"`` or ``"search_datasets"``. Every method
    returns a plain dict: ``{"data": ..., "metadata": ...}`` on success or
    ``{"error": "<message>"}`` on failure.
    """

    # Page size used when the caller supplies no usable ``limit``.
    DEFAULT_LIMIT = 20
    # Upper bound on the number of items returned by a single call.
    MAX_LIMIT = 100
    # Filter arguments accepted by ``search_datasets``. Each name is also the
    # key of the corresponding label list inside a dataset record.
    SEARCH_FIELDS = ("tissue", "disease", "organism", "cell_type")

    def __init__(self, tool_config: Dict[str, Any]):
        """
        Store the request timeout and the endpoint selected by the config.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds, default 60)
                and ``fields.endpoint`` (default ``"list_collections"``) are
                read here; the whole dict is also passed to ``BaseTool``.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 60)
        # ``or {}`` also covers an explicit ``"fields": null`` in the config.
        fields = tool_config.get("fields") or {}
        self.endpoint = fields.get("endpoint", "list_collections")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute the CellxGene Discovery API call.

        Args:
            arguments: Endpoint-specific arguments; ``None`` is treated as an
                empty dict.

        Returns:
            The endpoint result, or ``{"error": ...}`` if the request timed
            out, could not connect, returned an HTTP error status, or failed
            in any other way. Exceptions are never propagated to the caller.
        """
        arguments = arguments or {}
        try:
            return self._query(arguments)
        except requests.exceptions.Timeout:
            return {"error": f"CellxGene Discovery API timed out after {self.timeout}s"}
        except requests.exceptions.ConnectionError:
            return {"error": "Failed to connect to CellxGene Discovery API"}
        except requests.exceptions.HTTPError as e:
            code = e.response.status_code if e.response is not None else "unknown"
            collection_id = arguments.get("collection_id")
            # Only blame a missing collection when one was actually requested;
            # a 404 from the list/search endpoints is a plain HTTP failure.
            if code == 404 and collection_id:
                return {"error": f"Collection/dataset not found: {collection_id}"}
            return {"error": f"CellxGene Discovery API HTTP error: {code}"}
        except Exception as e:
            return {"error": f"Unexpected error querying CellxGene Discovery: {str(e)}"}

    def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Route to the handler for ``self.endpoint``.

        Returns:
            The handler's result, or an error dict for an unknown endpoint.
        """
        handlers = {
            "list_collections": self._list_collections,
            "get_collection": self._get_collection,
            "search_datasets": self._search_datasets,
        }
        handler = handlers.get(self.endpoint)
        if handler is None:
            return {"error": f"Unknown endpoint: {self.endpoint}"}
        return handler(arguments)

    @classmethod
    def _clamp_limit(cls, value: Any) -> int:
        """
        Convert a caller-supplied ``limit`` to an int in ``[0, MAX_LIMIT]``.

        Values that cannot be converted with ``int()`` (``None``, arbitrary
        strings, ...) fall back to ``DEFAULT_LIMIT``. Negative values become 0
        so that a slice ``items[:limit]`` never means "all but the last N".
        """
        try:
            limit = int(value)
        except (TypeError, ValueError):
            return cls.DEFAULT_LIMIT
        return max(0, min(limit, cls.MAX_LIMIT))

    @staticmethod
    def _labels(entries: Any) -> List[str]:
        """
        Return the non-empty ``label`` values from a list of dict entries.

        ``None`` (field missing or JSON ``null``) yields an empty list, and
        entries that are not dicts are skipped.
        """
        return [
            entry["label"]
            for entry in entries or []
            if isinstance(entry, dict) and entry.get("label")
        ]

    @classmethod
    def _matches(cls, dataset: Dict[str, Any], field: str, needle: str) -> bool:
        """
        Tell whether any label under ``dataset[field]`` contains ``needle``.

        ``needle`` must already be lower-cased; labels are lower-cased here,
        which makes the substring comparison case-insensitive.
        """
        return any(
            needle in str(label).lower() for label in cls._labels(dataset.get(field))
        )

    def _get_json(self, path: str, params: Any = None) -> Any:
        """
        GET ``CXG_BASE_URL + path`` and return the decoded JSON body.

        Raises:
            requests.exceptions.HTTPError: On a 4xx/5xx status (raised by
                ``raise_for_status``). Other ``requests`` errors propagate
                unchanged; ``run`` converts all of them to error dicts.
        """
        response = requests.get(
            f"{CXG_BASE_URL}{path}", params=params, timeout=self.timeout
        )
        response.raise_for_status()
        return response.json()

    def _list_collections(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        List curated single-cell collections.

        Args:
            arguments: May contain ``limit`` (default 20, capped at 100).

        Returns:
            ``data`` holds the total number of public collections, the number
            returned and a summary per collection. Descriptions are truncated
            to 200 characters.
        """
        limit = self._clamp_limit(arguments.get("limit"))
        collections = self._get_json(
            "/curation/v1/collections", params={"visibility": "PUBLIC"}
        )
        if not isinstance(collections, list):
            return {"error": "Unexpected response format from CellxGene Discovery API"}

        results = []
        for c in collections[:limit]:
            # ``or []`` covers both a missing key and an explicit JSON null.
            datasets = c.get("datasets") or []
            description = c.get("description")
            results.append(
                {
                    "collection_id": c.get("collection_id"),
                    "name": c.get("name"),
                    "description": description[:200] if description else None,
                    "doi": c.get("doi"),
                    "contact_name": c.get("contact_name"),
                    "curator_name": c.get("curator_name"),
                    "consortia": c.get("consortia"),
                    "dataset_count": len(datasets),
                    "total_cells": sum(ds.get("cell_count") or 0 for ds in datasets),
                    "created_at": c.get("created_at"),
                }
            )
        return {
            "data": {
                "total_collections": len(collections),
                "returned": len(results),
                "collections": results,
            },
            "metadata": {
                "source": "CZI CellxGene Discovery",
                "visibility": "PUBLIC",
            },
        }

    def _get_collection(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get detailed collection information with datasets.

        Args:
            arguments: Must contain a non-empty ``collection_id``.

        Returns:
            ``data`` holds the collection fields plus one summary per dataset
            (each ``cell_types`` list is limited to 20 entries), or an error
            dict when ``collection_id`` is missing.
        """
        # Local import: keeps this block self-contained without touching the
        # module-level import section.
        from urllib.parse import quote

        collection_id = str(arguments.get("collection_id") or "").strip()
        if not collection_id:
            return {"error": "collection_id is required (UUID format)"}

        # Percent-encode the id so characters such as "/", "?" or "#" cannot
        # change which endpoint is requested.
        c = self._get_json(f"/curation/v1/collections/{quote(collection_id, safe='')}")
        if not isinstance(c, dict):
            return {"error": "Unexpected response format from CellxGene Discovery API"}

        datasets = [
            {
                "dataset_id": ds.get("dataset_id"),
                "name": ds.get("title") or ds.get("name"),
                "cell_count": ds.get("cell_count"),
                "tissues": self._labels(ds.get("tissue")),
                "diseases": self._labels(ds.get("disease")),
                "organisms": self._labels(ds.get("organism")),
                "cell_types": self._labels(ds.get("cell_type"))[:20],  # Limit long lists
                "assays": self._labels(ds.get("assay")),
                "is_primary_data": ds.get("is_primary_data"),
            }
            for ds in c.get("datasets") or []
        ]
        return {
            "data": {
                "collection_id": c.get("collection_id"),
                "name": c.get("name"),
                "description": c.get("description"),
                "doi": c.get("doi"),
                "contact_name": c.get("contact_name"),
                "links": c.get("links"),
                "dataset_count": len(datasets),
                "datasets": datasets,
            },
            "metadata": {
                "source": "CZI CellxGene Discovery",
                "collection_id": collection_id,
            },
        }

    def _search_datasets(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Search single-cell datasets by tissue, disease, organism or cell type.

        The full dataset index is downloaded and filtered locally. Each
        supplied filter is a case-insensitive substring match against the
        labels of the field with the same name, and a dataset must satisfy
        every supplied filter. Matches are ordered by cell count, largest
        first.

        Args:
            arguments: At least one of ``tissue``, ``disease``, ``organism``,
                ``cell_type``; optionally ``limit`` (default 20, capped at 100).

        Returns:
            ``data`` holds the number of matches, the number returned and the
            dataset summaries; an error dict when no filter is given.
        """
        filters = {
            field: str(arguments.get(field) or "").strip()
            for field in self.SEARCH_FIELDS
        }
        limit = self._clamp_limit(arguments.get("limit"))
        if not any(filters.values()):
            return {
                "error": "At least one search parameter required: tissue, disease, organism, or cell_type"
            }

        # Fetch full dataset index and filter
        all_datasets = self._get_json("/dp/v1/datasets/index")
        if not isinstance(all_datasets, list):
            return {"error": "Unexpected response format from CellxGene Discovery API"}

        active = [(field, text.lower()) for field, text in filters.items() if text]
        matching = [
            d
            for d in all_datasets
            if all(self._matches(d, field, needle) for field, needle in active)
        ]

        # Sort by cell count descending; a missing or null count sorts as 0.
        ranked = sorted(
            matching, key=lambda d: d.get("cell_count", 0) or 0, reverse=True
        )
        results = [
            {
                "dataset_id": d.get("id"),
                "name": d.get("name", ""),
                "cell_count": d.get("cell_count"),
                "collection_id": d.get("collection_id"),
                "tissues": self._labels(d.get("tissue")),
                "diseases": self._labels(d.get("disease")),
                "organisms": self._labels(d.get("organism")),
                "explorer_url": d.get("explorer_url"),
            }
            for d in ranked[:limit]
        ]
        return {
            "data": {
                "total_matching": len(matching),
                "returned": len(results),
                "datasets": results,
            },
            "metadata": {
                "source": "CZI CellxGene Discovery",
                "filters": {field: text or None for field, text in filters.items()},
                "total_datasets_searched": len(all_datasets),
            },
        }