Source code for tooluniverse.cellxgene_census_tool
"""
CELLxGENE Census API Tool
This tool provides access to single-cell RNA-seq data from the CELLxGENE Census.
The Census is a versioned container of single-cell data from CZ CELLxGENE Discover
containing 50M+ cells from human, mouse, and non-human primates.
"""
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool
[docs]
@register_tool("CELLxGENECensusTool")
class CELLxGENECensusTool(BaseTool):
"""
CELLxGENE Census API tool for accessing single-cell RNA-seq data.
Provides access to cell metadata, gene expression, and embeddings.
"""
[docs]
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
    """Dispatch a Census request to the handler named by ``operation``.

    Args:
        arguments: Tool arguments. ``operation`` selects the handler and
            must be one of the keys of the dispatch table below.
            ``census_version`` is optional and falls back to ``"stable"``
            when missing or empty. All other keys are passed through to
            the selected handler unchanged.

    Returns:
        The handler's result dictionary, or
        ``{"status": "error", "error": <message>}`` when the operation is
        missing or unknown, when ``cellxgene_census``/``tiledbsoma`` cannot
        be imported, or when any exception escapes the handler.
    """
    # Operation name -> name of the implementing method. Names are resolved
    # with getattr only after validation, so a bad request is rejected
    # without touching the instance or the optional dependencies.
    handler_names = {
        "get_census_versions": "_get_census_versions",
        "get_obs_metadata": "_get_obs_metadata",
        "get_var_metadata": "_get_var_metadata",
        "get_anndata": "_get_anndata",
        "get_presence_matrix": "_get_presence_matrix",
        "get_embeddings": "_get_embeddings",
        "download_h5ad": "_download_h5ad",
    }

    try:
        operation = arguments.get("operation")

        # The isinstance check keeps unhashable values (e.g. a list) from
        # raising TypeError in the dict membership test.
        if not isinstance(operation, str) or operation not in handler_names:
            if operation is None:
                problem = "Missing required argument 'operation'"
            else:
                problem = f"Unknown operation: {operation}"
            supported = ", ".join(handler_names)
            return {
                "status": "error",
                "error": f"{problem}. Supported operations: {supported}",
            }

        try:
            # Imported only to verify the optional dependencies are present;
            # the handlers perform their own imports.
            import cellxgene_census  # noqa: F401
            import tiledbsoma  # noqa: F401
        except ImportError:
            return {
                "status": "error",
                "error": "cellxgene_census package is required. Install with: pip install cellxgene-census",
            }

        # ``or`` also covers an explicit None/empty value, which a plain
        # ``.get(key, default)`` would forward to the handlers as-is.
        census_version = arguments.get("census_version") or "stable"

        handler = getattr(self, handler_names[operation])
        if operation == "get_census_versions":
            # The only handler that takes no arguments.
            return handler()
        return handler(arguments, census_version)
    except Exception as e:
        return {"status": "error", "error": str(e)}
[docs]
def _get_census_versions(self) -> Dict[str, Any]:
"""Get list of available Census versions."""
try:
import cellxgene_census
versions = cellxgene_census.get_census_version_directory()
return {"status": "success", "versions": versions}
except Exception as e:
return {"status": "error", "error": str(e)}
[docs]
def _get_obs_metadata(
self, arguments: Dict[str, Any], census_version: str
) -> Dict[str, Any]:
"""Get observation (cell) metadata."""
try:
import cellxgene_census
organism = arguments.get("organism", "Homo sapiens")
obs_value_filter = arguments.get("obs_value_filter")
column_names = arguments.get("column_names")
# Safeguard: Require filter to prevent querying 50M+ cells
if not obs_value_filter:
return {
"status": "error",
"error": "obs_value_filter is required. The Census contains 50M+ cells; "
"queries without filters will timeout. Examples: "
"'tissue == \"lung\"', 'cell_type == \"T cell\"', "
'\'disease == "COVID-19" and tissue == "blood"\'',
}
with cellxgene_census.open_soma(census_version=census_version) as census:
obs_df = cellxgene_census.get_obs(
census,
organism=organism,
value_filter=obs_value_filter,
column_names=column_names,
)
return {
"status": "success",
"organism": organism,
"num_cells": len(obs_df),
"columns": list(obs_df.columns),
"data": obs_df.head(100).to_dict(orient="records")
if len(obs_df) <= 100
else obs_df.head(100).to_dict(orient="records"),
"message": f"Showing first 100 of {len(obs_df)} cells"
if len(obs_df) > 100
else None,
}
except Exception as e:
return {"status": "error", "error": str(e)}
[docs]
def _get_var_metadata(
self, arguments: Dict[str, Any], census_version: str
) -> Dict[str, Any]:
"""Get variable (gene) metadata."""
try:
import cellxgene_census
organism = arguments.get("organism", "Homo sapiens")
var_value_filter = arguments.get("var_value_filter")
column_names = arguments.get("column_names")
with cellxgene_census.open_soma(census_version=census_version) as census:
var_df = cellxgene_census.get_var(
census,
organism=organism,
value_filter=var_value_filter,
column_names=column_names,
)
return {
"status": "success",
"organism": organism,
"num_genes": len(var_df),
"columns": list(var_df.columns),
"data": var_df.head(100).to_dict(orient="records")
if len(var_df) <= 100
else var_df.head(100).to_dict(orient="records"),
"message": f"Showing first 100 of {len(var_df)} genes"
if len(var_df) > 100
else None,
}
except Exception as e:
return {"status": "error", "error": str(e)}
[docs]
def _get_anndata(
self, arguments: Dict[str, Any], census_version: str
) -> Dict[str, Any]:
"""Get expression data as AnnData object summary."""
try:
import cellxgene_census
organism = arguments.get("organism", "Homo sapiens")
obs_value_filter = arguments.get("obs_value_filter")
var_value_filter = arguments.get("var_value_filter")
obs_column_names = arguments.get("obs_column_names")
var_column_names = arguments.get("var_column_names")
# Safeguard: Require at least one filter to prevent massive queries
if not obs_value_filter and not var_value_filter:
return {
"status": "error",
"error": "At least one filter (obs_value_filter or var_value_filter) is required. "
"The Census contains 50M+ cells and 60K+ genes; unfiltered queries will timeout. "
'Examples: obs_value_filter=\'tissue == "lung"\', var_value_filter=\'feature_name in ["TP53", "BRCA1"]\'',
}
with cellxgene_census.open_soma(census_version=census_version) as census:
adata = cellxgene_census.get_anndata(
census,
organism=organism,
obs_value_filter=obs_value_filter,
var_value_filter=var_value_filter,
obs_column_names=obs_column_names,
var_column_names=var_column_names,
)
return {
"status": "success",
"organism": organism,
"n_obs": adata.n_obs,
"n_vars": adata.n_vars,
"obs_columns": list(adata.obs.columns),
"var_columns": list(adata.var.columns),
"message": "AnnData object created. Use Python API directly to access full data.",
}
except Exception as e:
return {"status": "error", "error": str(e)}
[docs]
def _get_presence_matrix(
self, arguments: Dict[str, Any], census_version: str
) -> Dict[str, Any]:
"""Get feature presence matrix."""
try:
import cellxgene_census
organism = arguments.get("organism", "Homo sapiens")
with cellxgene_census.open_soma(census_version=census_version) as census:
presence_matrix = cellxgene_census.get_presence_matrix(
census, organism=organism
)
return {
"status": "success",
"organism": organism,
"shape": presence_matrix.shape,
"nnz": presence_matrix.nnz,
"density": presence_matrix.nnz
/ (presence_matrix.shape[0] * presence_matrix.shape[1]),
"message": "Presence matrix retrieved. Shape: (genes, datasets)",
}
except Exception as e:
return {"status": "error", "error": str(e)}
[docs]
def _get_embeddings(
self, arguments: Dict[str, Any], census_version: str
) -> Dict[str, Any]:
"""Get pre-calculated embeddings."""
try:
import cellxgene_census
organism = arguments.get("organism", "Homo sapiens")
embedding_name = arguments.get("embedding_name")
with cellxgene_census.open_soma(census_version=census_version) as census:
# Get available embeddings
available_embeddings = (
cellxgene_census.experimental.get_all_available_embeddings(
census_version=census_version
)
)
if embedding_name:
# Get specific embedding
embedding_data = cellxgene_census.experimental.get_embedding(
census, organism=organism, embedding_name=embedding_name
)
return {
"status": "success",
"organism": organism,
"embedding_name": embedding_name,
"shape": embedding_data.shape,
"message": "Embedding retrieved successfully",
}
else:
return {
"status": "success",
"available_embeddings": available_embeddings,
"message": "Specify 'embedding_name' to retrieve specific embedding",
}
except Exception as e:
return {"status": "error", "error": str(e)}
[docs]
def _download_h5ad(
self, arguments: Dict[str, Any], census_version: str
) -> Dict[str, Any]:
"""Download source H5AD file."""
try:
import cellxgene_census
dataset_id = arguments.get("dataset_id")
output_path = arguments.get("output_path")
if not dataset_id:
return {
"status": "error",
"error": "dataset_id is required for download_h5ad operation",
}
if output_path:
cellxgene_census.download_source_h5ad(
dataset_id=dataset_id,
to_path=output_path,
census_version=census_version,
)
return {
"status": "success",
"dataset_id": dataset_id,
"output_path": output_path,
"message": "H5AD file downloaded successfully",
}
else:
uri = cellxgene_census.get_source_h5ad_uri(
dataset_id=dataset_id, census_version=census_version
)
return {
"status": "success",
"dataset_id": dataset_id,
"uri": uri,
"message": "Use this URI to access the H5AD file",
}
except Exception as e:
return {"status": "error", "error": str(e)}