Source code for tooluniverse.expression_anova_tool
"""Per-gene expression ANOVA / fold-change tool.
Wraps the skill script at
``skills/tooluniverse-statistical-modeling/scripts/expression_anova.py``.
"""
from __future__ import annotations
import importlib.util
import os
import sys
from pathlib import Path
from typing import Any, Dict, List
from .base_tool import BaseTool
from .tool_registry import register_tool
_REPO_ROOT = Path(__file__).resolve().parents[2]
_SCRIPT_PATH = (
_REPO_ROOT
/ "skills"
/ "tooluniverse-statistical-modeling"
/ "scripts"
/ "expression_anova.py"
)
def _load_script_module():
if not _SCRIPT_PATH.exists():
return None
spec = importlib.util.spec_from_file_location(
"_expression_anova_script", str(_SCRIPT_PATH)
)
if spec is None or spec.loader is None:
return None
module = importlib.util.module_from_spec(spec)
saved_argv = sys.argv
sys.argv = [str(_SCRIPT_PATH)]
try:
spec.loader.exec_module(module)
finally:
sys.argv = saved_argv
return module
[docs]
@register_tool("ExpressionANOVAPerGeneTool")
class ExpressionANOVAPerGeneTool(BaseTool):
"""Per-gene ANOVA / fold-change across groups.
Each gene contributes one value per group (the mean expression of that
gene across group samples). ``mode='anova'`` runs one-way ANOVA across
the K groups of N gene-level means. ``mode='fold_change'`` computes
per-gene log2FC between ``group_a`` and ``group_b`` (pseudocount=1),
then returns the median across genes.
"""
[docs]
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
counts_file = arguments.get("counts_file", "")
meta_file = arguments.get("meta_file", "")
group_col = arguments.get("group_col", "")
mode = arguments.get("mode", "anova")
exclude_groups: List[str] = arguments.get("exclude_groups") or []
group_a = arguments.get("group_a", "") or ""
group_b = arguments.get("group_b", "") or ""
if not counts_file or not os.path.exists(counts_file):
return {"status": "error", "error": f"counts_file not found: {counts_file}"}
if not meta_file or not os.path.exists(meta_file):
return {"status": "error", "error": f"meta_file not found: {meta_file}"}
if not group_col:
return {"status": "error", "error": "group_col is required"}
if mode not in {"anova", "fold_change"}:
return {"status": "error", "error": f"Invalid mode: {mode}"}
if mode == "fold_change" and (not group_a or not group_b):
return {
"status": "error",
"error": "group_a and group_b required for mode='fold_change'",
}
module = _load_script_module()
if module is None:
return {
"status": "error",
"error": f"Skill script not available at {_SCRIPT_PATH}",
}
try:
counts, meta, gcol, groups = module.load_data(
counts_file, meta_file, group_col, list(exclude_groups)
)
except KeyError as exc:
return {"status": "error", "error": f"Missing column: {exc}"}
except Exception as exc: # noqa: BLE001
return {"status": "error", "error": f"Data load failed: {exc}"}
if len(counts.columns) == 0:
return {
"status": "error",
"error": "No overlapping samples between counts and metadata",
}
if mode == "anova":
if len(groups) < 2:
return {
"status": "error",
"error": f"ANOVA needs >=2 groups, got {list(groups)}",
}
try:
result = module.per_gene_anova(counts, meta, gcol, groups)
except Exception as exc: # noqa: BLE001
return {"status": "error", "error": f"ANOVA failed: {exc}"}
return {"status": "success", "data": result}
# fold_change — coerce group values to actual dtype of meta column
group_vals = list(groups)
str_map = {str(g): g for g in group_vals}
resolved_a = str_map.get(str(group_a), group_a)
resolved_b = str_map.get(str(group_b), group_b)
if resolved_a not in group_vals or resolved_b not in group_vals:
return {
"status": "error",
"error": f"group_a/group_b not found among groups {group_vals}",
}
try:
result = module.per_gene_fold_change(
counts, meta, gcol, resolved_a, resolved_b
)
except Exception as exc: # noqa: BLE001
return {"status": "error", "error": f"Fold-change failed: {exc}"}
return {"status": "success", "data": result}