# Source code for tooluniverse.cellmarker_tool
"""
CellMarker 2.0 Tool
Provides access to the CellMarker 2.0 database for querying curated
cell type marker genes from single-cell RNA-seq and experimental studies.
CellMarker 2.0 is a comprehensive database of cell type markers curated
from >26,000 publications, covering >500 cell types across >400 tissue types
for human and mouse. Data sources include single-cell sequencing, experiments,
reviews, and commercial antibody panels.
Website: http://bio-bigdata.hrbmu.edu.cn/CellMarker/
No authentication required.
Reference: Hu et al., Nucleic Acids Research, 2023 (PMID: 36300619)
"""
import re
import requests
from typing import Dict, Any, List, Optional
from .base_tool import BaseTool
from .tool_registry import register_tool
# Root URL of the CellMarker 2.0 web application; endpoint paths such as
# "/Marker_table.jsp" and "/CONTROL" are appended to it when building requests.
# NOTE(review): this is plain HTTP -- confirm whether the server offers HTTPS.
CELLMARKER_BASE_URL = "http://bio-bigdata.hrbmu.edu.cn/CellMarker"
def _parse_html_table_rows(html: str) -> List[Dict[str, str]]:
"""Parse HTML table rows from CellMarker JSP response.
CellMarker returns data as HTML table rows with columns:
Species | Tissue Class | Tissue Type | Cancer/Normal | Cell Name | Cell Marker | Source | Supports
"""
results = []
# Match all table rows with data
row_pattern = re.compile(r"<tr><td>(.*?)</tr>", re.DOTALL)
for match in row_pattern.finditer(html):
row_html = match.group(1)
# Extract cell contents - handle cells with nested HTML
cells = re.findall(r"<td>(.*?)</td>", "<td>" + row_html, re.DOTALL)
if len(cells) >= 6:
# Extract source from nested HTML (e.g., <p class='noplay'>Experiment</p><img...>)
source_raw = cells[6] if len(cells) > 6 else ""
source_match = re.search(r"class='noplay'>(.*?)</p>", source_raw)
source = source_match.group(1) if source_match else source_raw.strip()
# Extract support count
supports = cells[7].strip() if len(cells) > 7 else "0"
record = {
"species": cells[0].strip(),
"tissue_class": cells[1].strip(),
"tissue_type": cells[2].strip(),
"cell_type": cells[3].strip(), # "Normal cell" or "Cancer cell"
"cell_name": cells[4].strip(),
"cell_marker": cells[5].strip(),
"source": source,
"supports": int(supports) if supports.isdigit() else 0,
}
results.append(record)
return results
def _parse_cell_type_list(js_text: str) -> List[Dict[str, str]]:
"""Parse the JavaScript cell type list from CONTROL endpoint.
CONTROL returns pseudo-JSON like:
[{id:0,tissuet:"none",tissuec:"X",pId:0,name:"Y",namein:"Z",...},...]
"""
results = []
# Extract name values from the JS object array
name_pattern = re.compile(r'namein:"(.*?)"')
tissuec_pattern = re.compile(r'tissuec:"(.*?)"')
names = name_pattern.findall(js_text)
tissues = tissuec_pattern.findall(js_text)
for i, name in enumerate(names):
if name == "ALL":
continue
tissue = tissues[i] if i < len(tissues) else ""
results.append(
{
"cell_name": name,
"tissue_class": tissue,
}
)
return results
# [docs]
@register_tool("CellMarkerTool")
class CellMarkerTool(BaseTool):
    """
    Tool for querying the CellMarker 2.0 cell type marker database.

    CellMarker 2.0 provides curated marker genes for hundreds of cell types
    across tissues in human and mouse, sourced from single-cell RNA-seq
    studies, experiments, reviews, and commercial antibody panels.

    Supported operations:
    - search_by_gene: Find cell types that express a given marker gene
    - search_by_cell_type: Find marker genes for a specific cell type
    - list_cell_types: List available cell types in a tissue
    - search_cancer_markers: Search cancer-specific cell markers

    Every operation returns a dict with ``"status": "success"`` and a
    ``"data"`` payload, or ``"status": "error"`` and an ``"error"`` message.
    """

    # Caps on the lists returned to the caller. ``total_records`` and
    # ``unique_markers`` always report the untruncated counts.
    _MAX_RECORDS = 200
    _MAX_MARKER_GENES = 500

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the tool configuration and create a reusable HTTP session."""
        super().__init__(tool_config)
        self.parameter = tool_config.get("parameter", {})
        self.required = self.parameter.get("required", [])
        self.session = requests.Session()
        self.timeout = 30  # passed as ``timeout=`` to every request

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the CellMarker tool with given arguments.

        Args:
            arguments: Must contain ``operation`` (one of the supported
                operation names) plus that operation's own parameters.

        Returns:
            The handler's result dict, or an error dict when the operation is
            missing/unknown or the handler raises. Exceptions are never
            propagated to the caller.
        """
        operation = arguments.get("operation")
        if not operation:
            return {"status": "error", "error": "Missing required parameter: operation"}

        operation_handlers = {
            "search_by_gene": self._search_by_gene,
            "search_by_cell_type": self._search_by_cell_type,
            "list_cell_types": self._list_cell_types,
            "search_cancer_markers": self._search_cancer_markers,
        }
        handler = operation_handlers.get(operation)
        if not handler:
            return {
                "status": "error",
                "error": "Unknown operation: {}".format(operation),
                "available_operations": list(operation_handlers.keys()),
            }

        try:
            return handler(arguments)
        except requests.exceptions.Timeout:
            return {"status": "error", "error": "CellMarker request timed out"}
        except requests.exceptions.ConnectionError:
            return {
                "status": "error",
                "error": "Could not connect to CellMarker server",
            }
        except Exception as e:
            # Tool boundary: report any other failure (HTTP error status,
            # unexpected argument types, ...) as an error result.
            return {"status": "error", "error": "CellMarker error: {}".format(str(e))}

    @staticmethod
    def _api_species(species: Optional[str], default: str = "") -> str:
        """Return the lower-case species name sent to the API, or *default*."""
        return species.lower() if species else default

    @staticmethod
    def _filter_by_tissue(
        records: List[Dict[str, Any]], tissue_type: Optional[str]
    ) -> List[Dict[str, Any]]:
        """Keep records whose tissue type or tissue class contains *tissue_type*.

        The match is a case-insensitive substring test; a falsy *tissue_type*
        returns *records* unchanged.
        """
        if not tissue_type:
            return records
        needle = tissue_type.lower()
        return [
            r
            for r in records
            if needle in r["tissue_type"].lower()
            or needle in r["tissue_class"].lower()
        ]

    def _query_markers_by_gene(
        self, gene_symbol: str, api_species: str
    ) -> List[Dict[str, Any]]:
        """POST a marker-gene search to Marker_table.jsp and parse the rows."""
        url = "{}/Marker_table.jsp".format(CELLMARKER_BASE_URL)
        data = {
            "markerSpecies": api_species,
            "markerMarker": gene_symbol,
        }
        resp = self.session.post(url, data=data, timeout=self.timeout)
        resp.raise_for_status()
        return _parse_html_table_rows(resp.text)

    def _query_markers_by_keyword(self, keyword: str) -> List[Dict[str, Any]]:
        """GET a quick-keyword search from Marker_table.jsp and parse the rows."""
        url = "{}/Marker_table.jsp".format(CELLMARKER_BASE_URL)
        params = {
            "quickkeyword": keyword,
            "quick_v": "yes",
        }
        resp = self.session.get(url, params=params, timeout=self.timeout)
        resp.raise_for_status()
        return _parse_html_table_rows(resp.text)

    def _search_by_gene(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Find cell types that express a given marker gene.

        Uses Marker_table.jsp with POST: markerSpecies + markerMarker params.
        Returns HTML table rows parsed into structured data.

        Args:
            arguments: ``gene_symbol`` (required), ``species`` (optional; an
                empty value is sent as ""), ``tissue_type`` (optional
                substring filter on tissue type/class).
        """
        gene_symbol = arguments.get("gene_symbol")
        if not gene_symbol:
            return {
                "status": "error",
                "error": "Missing required parameter: gene_symbol",
            }
        species = arguments.get("species", "")

        records = self._query_markers_by_gene(gene_symbol, self._api_species(species))
        records = self._filter_by_tissue(records, arguments.get("tissue_type"))

        return {
            "status": "success",
            "data": {
                "gene_symbol": gene_symbol,
                "species": species if species else "all",
                "total_records": len(records),
                "records": records[: self._MAX_RECORDS],
            },
        }

    def _search_by_cell_type(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Find marker genes for a specific cell type.

        Uses Marker_table.jsp with quickkeyword + quick_v=yes for general search,
        then filters results to match the target cell type.

        Args:
            arguments: ``cell_name`` (required), ``species`` (optional exact,
                case-insensitive match), ``tissue_type`` (optional substring
                filter on tissue type/class).
        """
        cell_name = arguments.get("cell_name")
        if not cell_name:
            return {"status": "error", "error": "Missing required parameter: cell_name"}
        species = arguments.get("species")

        # The quick search is issued with the cell name as its keyword, so
        # narrow the rows to those whose cell name actually contains it.
        records = self._query_markers_by_keyword(cell_name)
        cell_lower = cell_name.lower()
        filtered = [r for r in records if cell_lower in r["cell_name"].lower()]

        if species:
            species_lower = species.lower()
            filtered = [r for r in filtered if r["species"].lower() == species_lower]

        filtered = self._filter_by_tissue(filtered, arguments.get("tissue_type"))

        marker_genes = sorted({r["cell_marker"] for r in filtered})
        return {
            "status": "success",
            "data": {
                "cell_name": cell_name,
                "species": species if species else "all",
                "total_records": len(filtered),
                "unique_markers": len(marker_genes),
                "marker_genes": marker_genes[: self._MAX_MARKER_GENES],
                "records": filtered[: self._MAX_RECORDS],
            },
        }

    def _list_cell_types(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """List available cell types in a tissue.

        Uses the CONTROL endpoint which returns a JavaScript array of cell types
        for a given tissue and species combination.

        Args:
            arguments: ``tissue_type`` (optional), ``species`` (optional,
                defaults to "Human"), ``cell_class`` (optional; "cancer"
                restricts the listing to cancer cell types).
        """
        # Treat an explicit None/"" the same as an omitted key so the reported
        # species and tissue always match what was actually queried.
        tissue_type = arguments.get("tissue_type") or ""
        species = arguments.get("species") or "Human"
        cell_class = arguments.get("cell_class")

        # "all" for normal+cancer, "cancer" for cancer only
        cancer_type = "all"
        if cell_class and cell_class.lower() == "cancer":
            cancer_type = "cancer"

        url = "{}/CONTROL".format(CELLMARKER_BASE_URL)
        params = {
            # NOTE(review): "spcies" is kept exactly as the request parameter
            # name in use; presumably it mirrors the server's own spelling --
            # confirm before "fixing" it.
            "spcies": self._api_species(species, "human"),
            "cancertype": cancer_type,
            "tissuet": tissue_type,
            "tissuec": tissue_type,
        }
        resp = self.session.get(url, params=params, timeout=self.timeout)
        resp.raise_for_status()
        cell_types = _parse_cell_type_list(resp.text)

        return {
            "status": "success",
            "data": {
                "species": species,
                "tissue_type": tissue_type if tissue_type else "all",
                "total_cell_types": len(cell_types),
                "cell_types": cell_types,
            },
        }

    def _search_cancer_markers(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search cancer-specific cell markers.

        Searches for markers in cancer cell contexts. Can filter by:
        - cancer_type (tissue where cancer occurs, e.g., "Breast", "Lung")
        - gene_symbol (specific marker gene)
        - cell_type (specific cancer cell type)

        Uses Marker_table.jsp with markerSpecies/markerMarker for gene-based search,
        or quickkeyword search for cell type / cancer type searches. The
        gene-based search is always issued for human. Only one request is
        made: ``gene_symbol`` takes precedence, then ``cell_type``, then
        ``cancer_type``; every supplied parameter is then applied as a filter
        on the returned rows.
        """
        cancer_type = arguments.get("cancer_type")
        gene_symbol = arguments.get("gene_symbol")
        cell_type = arguments.get("cell_type")
        if not any([cancer_type, gene_symbol, cell_type]):
            return {
                "status": "error",
                "error": "At least one parameter required: cancer_type, gene_symbol, or cell_type",
            }

        if gene_symbol:
            records = self._query_markers_by_gene(gene_symbol, "human")
        else:
            records = self._query_markers_by_keyword(cell_type or cancer_type)

        # Filter to cancer records only
        cancer_records = [r for r in records if r["cell_type"] == "Cancer cell"]
        cancer_records = self._filter_by_tissue(cancer_records, cancer_type)

        if cell_type:
            # Always narrow by cell name when a cell type is requested: rows
            # returned for a keyword or gene query are not limited to that
            # cell name otherwise.
            cell_lower = cell_type.lower()
            cancer_records = [
                r for r in cancer_records if cell_lower in r["cell_name"].lower()
            ]

        # Build query summary, omitting parameters that were not supplied
        supplied = (
            ("cancer_type", cancer_type),
            ("gene_symbol", gene_symbol),
            ("cell_type", cell_type),
        )
        query = {key: value for key, value in supplied if value}

        return {
            "status": "success",
            "data": {
                "query": query,
                "total_records": len(cancer_records),
                "records": cancer_records[: self._MAX_RECORDS],
            },
        }