Source code for tooluniverse.scxa_tool
# scxa_tool.py
"""
Single Cell Expression Atlas (SCXA) REST API tool for ToolUniverse.
EBI's Single Cell Expression Atlas provides curated single-cell RNA-seq
experiments with cell type annotations, marker genes, and expression data
across 380+ experiments from multiple species.
API: https://www.ebi.ac.uk/gxa/sc/json/
No authentication required.
"""
import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool
# Root of the SCXA JSON endpoints; the tool methods below append
# "/experiments", "/experiments/{accession}/marker-genes/..." and "/search".
SCXA_BASE_URL = "https://www.ebi.ac.uk/gxa/sc/json"
[docs]
@register_tool("SCExpressionAtlasTool")
class SCExpressionAtlasTool(BaseTool):
    """
    Tool for querying EBI Single Cell Expression Atlas.

    Supports listing single-cell RNA-seq experiments with metadata
    (species, cell counts, technology, factors), searching for
    experiments where a gene is expressed at single-cell resolution,
    and fetching marker genes per cluster or per cell type for one
    experiment.

    Each instance performs exactly one operation, selected by
    ``tool_config["fields"]["operation"]``: ``"list_experiments"``
    (default), ``"search_gene"`` or ``"cluster_marker_genes"``.

    Every method returns a dict with a ``"status"`` key that is either
    ``"success"`` (with ``"data"`` and ``"metadata"``) or ``"error"``
    (with a human-readable ``"error"`` message).

    No authentication required.
    """

    # Page size used by list_experiments when the caller gives no limit,
    # and the hard cap applied to any caller-supplied limit.
    _DEFAULT_LIST_LIMIT = 20
    _MAX_LIST_LIMIT = 100

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the request timeout and the operation this instance runs.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds,
                default 60) and ``fields.operation`` (default
                ``"list_experiments"``) are read here; the whole dict is
                also forwarded to ``BaseTool.__init__``.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 60)
        # "fields" may be missing *or* explicitly null; both mean "no fields".
        fields = tool_config.get("fields") or {}
        self.operation = fields.get("operation", "list_experiments")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the SCXA API call for the configured operation.

        Args:
            arguments: Operation-specific arguments (see the individual
                ``_list_experiments`` / ``_search_gene`` /
                ``_cluster_marker_genes`` methods).

        Returns:
            The handler's result dict. Any exception raised while handling
            the request (timeout, connection failure, HTTP error, bad
            payload, ...) is converted into ``{"status": "error", ...}``
            instead of propagating.
        """
        try:
            handlers = {
                "list_experiments": self._list_experiments,
                "search_gene": self._search_gene,
                "cluster_marker_genes": self._cluster_marker_genes,
            }
            handler = handlers.get(self.operation)
            if handler is None:
                return {
                    "status": "error",
                    "error": f"Unknown operation: {self.operation}",
                }
            return handler(arguments)
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"SCXA API request timed out after {self.timeout}s",
            }
        except requests.exceptions.ConnectionError:
            return {
                "status": "error",
                "error": "Failed to connect to SCXA API. Check network.",
            }
        except Exception as e:
            # Top-level boundary: report everything else (HTTP errors,
            # malformed JSON, unexpected payload shapes) as an error result.
            return {
                "status": "error",
                "error": f"Error querying SCXA: {str(e)}",
            }

    def _list_experiments(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """List all single-cell RNA-seq experiments with optional filtering.

        Args:
            arguments: May contain
                ``species`` - case-insensitive substring matched against
                each experiment's species;
                ``keyword`` - case-insensitive substring matched against
                the experiment description or its experimental factors;
                ``limit`` - maximum number of experiments returned
                (default 20, capped at 100, negative values treated as 0).

        Returns:
            A success dict whose ``data`` is a list of experiment summaries
            and whose ``metadata.total_matching`` is the number of
            experiments that passed the filters *before* ``limit`` was
            applied; or an error dict when ``limit`` is not an integer.
        """
        # Validate the limit up front so bad input never costs a network call.
        limit = arguments.get("limit")
        if limit is None:
            limit = self._DEFAULT_LIST_LIMIT
        elif isinstance(limit, bool) or not isinstance(limit, int):
            return {
                "status": "error",
                "error": (
                    "limit must be an integer between 0 and "
                    f"{self._MAX_LIST_LIMIT}"
                ),
            }
        # Clamp to [0, cap]; a negative slice bound would otherwise return
        # "everything except the last n" instead of nothing.
        limit = max(0, min(limit, self._MAX_LIST_LIMIT))

        url = f"{SCXA_BASE_URL}/experiments"
        response = requests.get(url, timeout=self.timeout)
        response.raise_for_status()
        raw = response.json()
        experiments = raw.get("experiments") or []

        species_filter = arguments.get("species")
        if species_filter:
            species_lower = str(species_filter).lower()
            # ``or ""`` guards against a JSON null, which .get()'s default
            # does not cover.
            experiments = [
                e
                for e in experiments
                if species_lower in (e.get("species") or "").lower()
            ]

        keyword = arguments.get("keyword")
        if keyword:
            kw_lower = str(keyword).lower()

            def matches_keyword(exp: Dict[str, Any]) -> bool:
                description = (exp.get("experimentDescription") or "").lower()
                factors = " ".join(
                    str(f) for f in (exp.get("experimentalFactors") or [])
                ).lower()
                return kw_lower in description or kw_lower in factors

            experiments = [e for e in experiments if matches_keyword(e)]

        total = len(experiments)
        results = [
            {
                "accession": exp.get("experimentAccession"),
                "description": exp.get("experimentDescription"),
                "species": exp.get("species"),
                "technology": exp.get("technologyType"),
                "num_cells": exp.get("numberOfAssays"),
                "experimental_factors": exp.get("experimentalFactors"),
                "experiment_type": exp.get("rawExperimentType"),
                "last_updated": exp.get("lastUpdate"),
            }
            for exp in experiments[:limit]
        ]
        return {
            "status": "success",
            "data": results,
            "metadata": {
                "total_matching": total,
                "returned": len(results),
                "source": "EBI Single Cell Expression Atlas",
            },
        }

    def _cluster_marker_genes(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get computed marker genes per cell cluster (or per cell type).

        marker_type:
          - "clusters" (default): top marker genes per cell cluster at
            clustering resolution k. Requires `k`.
            Route: /marker-genes/clusters?k={k}
          - "cell_types": marker genes per inferred cell type for an
            organism part. Requires `organism_part`.
            Route: /marker-genes/cell-types?organismPart=...

        Any ``marker_type`` other than ``"cell_types"`` is handled as
        ``"clusters"``.

        Args:
            arguments: ``experiment_accession`` (required), ``marker_type``,
                ``k`` / ``organism_part`` as described above, and an
                optional positive integer ``limit`` truncating the rows.

        Returns:
            A success dict whose ``data`` is a list of marker rows, or an
            error dict when a required argument is missing or the API
            answers with an ``{"error": ...}`` payload.
        """
        # Function-scope import keeps this method self-contained; the
        # accession comes from the caller and is placed in the URL *path*.
        from urllib.parse import quote

        accession = str(arguments.get("experiment_accession") or "").strip()
        if not accession:
            return {
                "status": "error",
                "error": "experiment_accession is required (e.g., 'E-MTAB-5061'). "
                "Use SCXA_list_experiments to find accessions.",
            }

        # Percent-encode so characters such as "/", "?", "#" or ".." in the
        # supplied value cannot redirect the request to another endpoint.
        # Real accessions (letters, digits, "-") are left unchanged.
        marker_base = (
            f"{SCXA_BASE_URL}/experiments/{quote(accession, safe='')}/marker-genes"
        )

        marker_type = arguments.get("marker_type") or "clusters"
        k = None
        organism_part = None
        if marker_type == "cell_types":
            organism_part = arguments.get("organism_part")
            if not organism_part:
                return {
                    "status": "error",
                    "error": "organism_part is required for marker_type='cell_types' "
                    "(e.g., 'pancreas', 'lung').",
                }
            url = f"{marker_base}/cell-types"
            params = {"organismPart": organism_part}
        else:
            k = arguments.get("k")
            if k is None:
                return {
                    "status": "error",
                    "error": "k (clustering resolution / number of clusters) is "
                    "required for marker_type='clusters' (e.g., 8).",
                }
            url = f"{marker_base}/clusters"
            params = {"k": k}

        response = requests.get(url, params=params, timeout=self.timeout)
        response.raise_for_status()
        raw = response.json()

        # The cell-types route returns a dict {"error": ...} when a required
        # param is missing; the clusters route returns a flat list of rows.
        if isinstance(raw, dict) and "error" in raw:
            return {
                "status": "error",
                "error": f"SCXA marker-genes error: {raw.get('error')}",
            }
        rows = raw if isinstance(raw, list) else []

        limit = arguments.get("limit")
        # bool is a subclass of int; ``limit=True`` must not mean "one row".
        if isinstance(limit, int) and not isinstance(limit, bool) and limit > 0:
            rows = rows[:limit]

        markers = [
            {
                "gene_name": r.get("geneName"),
                "cell_group_value": r.get("cellGroupValue"),
                "cell_group_value_where_marker": r.get("cellGroupValueWhereMarker"),
                "value": r.get("value"),
                "p_value": r.get("pValue"),
                "expression_unit": r.get("expressionUnit"),
            }
            for r in rows
        ]
        return {
            "status": "success",
            "data": markers,
            "metadata": {
                "source": "EBI Single Cell Expression Atlas",
                "experiment_accession": accession,
                "marker_type": marker_type,
                # Only the parameter relevant to the chosen route is echoed.
                "k": k,
                "organism_part": organism_part,
                "num_markers": len(markers),
            },
        }

    @staticmethod
    def _is_ensembl_gene_id(gene: str) -> bool:
        """Return True when *gene* should be sent as an Ensembl gene ID.

        Accepts everything the human/mouse prefix check accepted
        (``ENSG...``, ``ENSMUS...``) plus IDs of the general Ensembl gene
        form ``ENS`` + upper-case species prefix + ``G`` + digits, e.g.
        ``ENSDARG00000035559`` or ``ENSRNOG00000010756``. Symbols that merely
        start with "ENS" (such as ``ENSA``) are not matched.
        """
        if gene.startswith(("ENSG", "ENSMUS")):
            return True
        if not gene.startswith("ENS"):
            return False
        species_prefix, sep, digits = gene[3:].rpartition("G")
        return (
            sep == "G"
            and species_prefix.isalpha()
            and species_prefix.isupper()
            and digits.isdigit()
        )

    def _search_gene(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search for SC experiments where a gene is expressed.

        Args:
            arguments: ``gene`` (required) - a gene symbol or an Ensembl
                gene ID; Ensembl IDs are sent as the ``ensgene`` query
                parameter, anything else as ``symbol``. ``species``
                (optional) is forwarded unchanged.

        Returns:
            A success dict whose ``data`` lists the matching experiments
            and whose ``metadata`` echoes the query and the gene ID the API
            matched; or an error dict when ``gene`` is missing, empty or
            not a string.
        """
        gene = arguments.get("gene")
        if isinstance(gene, str):
            gene = gene.strip()
        if not gene or not isinstance(gene, str):
            return {
                "status": "error",
                "error": (
                    "gene parameter is required (symbol like TP53 or "
                    "Ensembl ID like ENSG00000141510)"
                ),
            }

        params = {}
        if self._is_ensembl_gene_id(gene):
            params["ensgene"] = gene
        else:
            params["symbol"] = gene
        species = arguments.get("species")
        if species:
            params["species"] = species

        url = f"{SCXA_BASE_URL}/search"
        response = requests.get(url, params=params, timeout=self.timeout)
        response.raise_for_status()
        raw = response.json()

        matching_gene_id = raw.get("matchingGeneId", "")
        results = []
        for r in raw.get("results") or []:
            # Each hit wraps the experiment record under "element".
            elem = r.get("element") or {}
            results.append(
                {
                    "accession": elem.get("experimentAccession"),
                    "description": elem.get("experimentDescription"),
                    "species": elem.get("species"),
                    "num_cells": elem.get("numberOfAssays"),
                    "technology": elem.get("technologyType"),
                    "experimental_factors": elem.get("experimentalFactors"),
                    "pubmed_ids": elem.get("pubMedIds"),
                    "dois": elem.get("dois"),
                }
            )
        return {
            "status": "success",
            "data": results,
            "metadata": {
                "gene_query": gene,
                "matching_gene_id": matching_gene_id,
                "total_experiments": len(results),
                "source": "EBI Single Cell Expression Atlas",
            },
        }