Source code for tooluniverse.cellmarker_tool

"""
CellMarker 2.0 Tool

Provides access to the CellMarker 2.0 database for querying curated
cell type marker genes from single-cell RNA-seq and experimental studies.

CellMarker 2.0 is a comprehensive database of cell type markers curated
from >26,000 publications, covering >500 cell types across >400 tissue types
for human and mouse.

The CellMarker 2.0 web site was restructured and no longer exposes the JSP
search endpoints this tool previously scraped; the only public interface now is
the bulk marker download. This tool therefore downloads the full marker table
once (`Cell_marker_All.xlsx`), caches it on disk, and answers all queries
locally from the cached table.

Website: http://bio-bigdata.hrbmu.edu.cn/CellMarker/
No authentication required.

Reference: Hu et al., Nucleic Acids Research, 2023 (PMID: 36300619)
"""

import os
import tempfile
import threading
from typing import Dict, Any, List, Optional

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool


# Bulk marker table. The primary domain sometimes 404s the download path; the
# published IP mirror is used as a fallback.
_DOWNLOAD_URLS = [
    "http://bio-bigdata.hrbmu.edu.cn/CellMarker/CellMarker_download_files/file/Cell_marker_All.xlsx",
    "http://117.50.127.228/CellMarker/CellMarker_download_files/file/Cell_marker_All.xlsx",
]
_CACHE_PATH = os.path.join(tempfile.gettempdir(), "tooluniverse_cellmarker_all.xlsx")
_BROWSER_UA = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

_DF = None  # cached pandas DataFrame (loaded once per process)
_LOAD_LOCK = threading.Lock()


def _download_marker_table(timeout: int) -> None:
    """Download the bulk marker table to the on-disk cache."""
    last_err: Optional[Exception] = None
    for url in _DOWNLOAD_URLS:
        try:
            resp = requests.get(
                url, headers={"User-Agent": _BROWSER_UA}, timeout=timeout
            )
            resp.raise_for_status()
            with open(_CACHE_PATH, "wb") as fh:
                fh.write(resp.content)
            return
        except requests.RequestException as err:  # try the next mirror
            last_err = err
    raise RuntimeError(f"Could not download the CellMarker marker table: {last_err}")


def _load_dataframe(timeout: int = 180):
    """Load (and cache) the CellMarker marker table as a normalized DataFrame."""
    global _DF
    if _DF is not None:
        return _DF
    with _LOAD_LOCK:
        if _DF is not None:
            return _DF
        import pandas as pd

        if not os.path.exists(_CACHE_PATH) or os.path.getsize(_CACHE_PATH) == 0:
            _download_marker_table(timeout)

        df = pd.read_excel(
            _CACHE_PATH,
            usecols=[
                "species",
                "tissue_class",
                "tissue_type",
                "cancer_type",
                "cell_type",
                "cell_name",
                "marker",
                "Symbol",
                "marker_source",
            ],
        )
        # Marker gene symbol: prefer the curated Symbol, fall back to marker.
        df["cell_marker"] = (
            df["Symbol"].fillna(df["marker"]).fillna("").astype(str).str.strip()
        )
        df["source"] = df["marker_source"].fillna("").astype(str).str.strip()
        for col in ("species", "tissue_class", "tissue_type", "cell_type", "cell_name"):
            df[col] = df[col].fillna("").astype(str).str.strip()
        # "supports" = number of curated records for the same marker/cell/tissue,
        # i.e. how many studies back the marker–cell assignment.
        df["supports"] = df.groupby(
            ["species", "tissue_type", "cell_name", "cell_marker"]
        )["cell_marker"].transform("size")
        _DF = df
        return _DF


def _records(df, limit: int = 200) -> List[Dict[str, Any]]:
    """Convert a filtered DataFrame to the tool's record dict list."""
    out = []
    for _, row in df.head(limit).iterrows():
        out.append(
            {
                "species": row["species"],
                "tissue_class": row["tissue_class"],
                "tissue_type": row["tissue_type"],
                "cell_type": row["cell_type"],
                "cell_name": row["cell_name"],
                "cell_marker": row["cell_marker"],
                "source": row["source"],
                "supports": int(row["supports"]),
            }
        )
    return out


def _apply_species(df, species: Optional[str]):
    if species:
        return df[df["species"].str.lower() == species.strip().lower()]
    return df


def _apply_tissue(df, tissue_type: Optional[str]):
    if tissue_type:
        t = tissue_type.strip().lower()
        return df[
            df["tissue_type"].str.lower().str.contains(t, regex=False)
            | df["tissue_class"].str.lower().str.contains(t, regex=False)
        ]
    return df



[docs]
@register_tool("CellMarkerTool")
class CellMarkerTool(BaseTool):
    """
    Tool for querying the CellMarker 2.0 cell type marker database.

    Supported operations:
    - search_by_gene: Find cell types that express a given marker gene
    - search_by_cell_type: Find marker genes for a specific cell type
    - list_cell_types: List available cell types in a tissue
    - search_cancer_markers: Search cancer-specific cell markers
    """


[docs]
    def __init__(self, tool_config: Dict[str, Any]):
        super().__init__(tool_config)
        self.parameter = tool_config.get("parameter", {})
        self.required = self.parameter.get("required", [])
        self.timeout = 180



[docs]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        operation = arguments.get("operation")
        if not operation:
            return {"status": "error", "error": "Missing required parameter: operation"}

        handlers = {
            "search_by_gene": self._search_by_gene,
            "search_by_cell_type": self._search_by_cell_type,
            "list_cell_types": self._list_cell_types,
            "search_cancer_markers": self._search_cancer_markers,
        }
        handler = handlers.get(operation)
        if not handler:
            return {
                "status": "error",
                "error": "Unknown operation: {}".format(operation),
                "available_operations": list(handlers.keys()),
            }

        try:
            df = _load_dataframe(self.timeout)
            return handler(arguments, df)
        except Exception as e:  # noqa: BLE001 - report any failure via the envelope
            return {"status": "error", "error": "CellMarker error: {}".format(str(e))}



[docs]
    def _search_by_gene(self, arguments, df) -> Dict[str, Any]:
        gene_symbol = arguments.get("gene_symbol")
        if not gene_symbol:
            return {
                "status": "error",
                "error": "Missing required parameter: gene_symbol",
            }
        species = arguments.get("species")
        tissue_type = arguments.get("tissue_type")

        sub = df[df["cell_marker"].str.lower() == gene_symbol.strip().lower()]
        sub = _apply_species(sub, species)
        sub = _apply_tissue(sub, tissue_type)

        return {
            "status": "success",
            "data": {
                "gene_symbol": gene_symbol,
                "species": species if species else "all",
                "total_records": int(len(sub)),
                "records": _records(sub),
            },
        }



[docs]
    def _search_by_cell_type(self, arguments, df) -> Dict[str, Any]:
        cell_name = arguments.get("cell_name")
        if not cell_name:
            return {"status": "error", "error": "Missing required parameter: cell_name"}
        species = arguments.get("species")
        tissue_type = arguments.get("tissue_type")

        sub = df[
            df["cell_name"]
            .str.lower()
            .str.contains(cell_name.strip().lower(), regex=False)
        ]
        sub = _apply_species(sub, species)
        sub = _apply_tissue(sub, tissue_type)

        marker_genes = sorted(m for m in sub["cell_marker"].unique() if m)
        return {
            "status": "success",
            "data": {
                "cell_name": cell_name,
                "species": species if species else "all",
                "total_records": int(len(sub)),
                "unique_markers": len(marker_genes),
                "marker_genes": marker_genes[:500],
                "records": _records(sub),
            },
        }



[docs]
    def _list_cell_types(self, arguments, df) -> Dict[str, Any]:
        tissue_type = arguments.get("tissue_type")
        species = arguments.get("species", "Human")
        cell_class = arguments.get("cell_class")

        sub = _apply_species(df, species)
        sub = _apply_tissue(sub, tissue_type)
        if cell_class and cell_class.strip().lower() == "cancer":
            sub = sub[sub["cell_type"] == "Cancer cell"]

        seen = {}
        for _, row in sub.iterrows():
            name = row["cell_name"]
            if name and name not in seen:
                seen[name] = {"cell_name": name, "tissue_class": row["tissue_class"]}
        cell_types = list(seen.values())

        return {
            "status": "success",
            "data": {
                "species": species if species else "all",
                "tissue_type": tissue_type if tissue_type else "all",
                "total_cell_types": len(cell_types),
                "cell_types": cell_types,
            },
        }



[docs]
    def _search_cancer_markers(self, arguments, df) -> Dict[str, Any]:
        cancer_type = arguments.get("cancer_type")
        gene_symbol = arguments.get("gene_symbol")
        cell_type = arguments.get("cell_type")

        if not any([cancer_type, gene_symbol, cell_type]):
            return {
                "status": "error",
                "error": "At least one parameter required: cancer_type, gene_symbol, or cell_type",
            }

        sub = df[df["cell_type"] == "Cancer cell"]
        if gene_symbol:
            sub = sub[sub["cell_marker"].str.lower() == gene_symbol.strip().lower()]
        if cancer_type:
            sub = _apply_tissue(sub, cancer_type)
        if cell_type:
            sub = sub[
                sub["cell_name"]
                .str.lower()
                .str.contains(cell_type.strip().lower(), regex=False)
            ]

        query = {}
        if cancer_type:
            query["cancer_type"] = cancer_type
        if gene_symbol:
            query["gene_symbol"] = gene_symbol
        if cell_type:
            query["cell_type"] = cell_type

        return {
            "status": "success",
            "data": {
                "query": query,
                "total_records": int(len(sub)),
                "records": _records(sub),
            },
        }