Source code for tooluniverse.hubmap_sample_tool

# hubmap_sample_tool.py
"""
HuBMAP Sample/Donor spatial-context tool for ToolUniverse.

Complements the existing HuBMAPTool (dataset-level search) by exposing the
*biospecimen* layer of the Human BioMolecular Atlas Program: the physical
tissue Samples (anatomical blocks, sections, organs, suspensions) and the
human Donors they come from.

The standout content here is RUI (Registration User Interface) spatial
registration: tissue blocks/sections registered into the HuBMAP Common
Coordinate Framework (CCF) carry a `rui_location` JSON object with the
3-D dimensions, placement target reference organ, and CCF/UBERON
`ccf_annotations` describing exactly which anatomical structures the
specimen overlaps. None of the dataset-level tools surface this, nor the
UMLS-coded donor demographics (age / sex / race / BMI).

APIs (no authentication required for public entities):
- Search API: https://search.api.hubmapconsortium.org/v3/search  (Elasticsearch)

Operations:
- search_samples : find tissue Samples by organ code + sample_category, with
                   RUI spatial-registration flag.
- get_sample     : full Sample record including parsed rui_location (CCF
                   placement, dimensions, UBERON annotations) and donor link.
- search_donors  : find human Donors, with normalized demographics
                   (age / sex / race / BMI from metadata.organ_donor_data).
"""

import json
import requests
from typing import Any

from .base_tool import BaseTool
from .tool_registry import register_tool


# Endpoint of the HuBMAP Search API; queries are POSTed to it as JSON bodies.
HUBMAP_SEARCH_URL = "https://search.api.hubmapconsortium.org/v3/search"

# 2-letter RUI organ codes used by HuBMAP origin_samples.organ, mapped to a
# human-readable organ name. Codes missing from this table resolve to None
# wherever the table is consulted with `.get(code)`.
#
# NOTE(review): "LN" duplicates the "LY" label and "RN" is labelled as a
# second right-kidney code. HuBMAP's organ vocabulary may define these two
# as Knee (Left) / Knee (Right) -- verify against the official organ-type
# list before relying on the labels. "LU" should be confirmed the same way.
ORGAN_CODE_NAMES: dict[str, str] = {
    "LK": "Left Kidney",
    "RK": "Right Kidney",
    "LI": "Large Intestine",
    "SI": "Small Intestine",
    "HT": "Heart",
    "LV": "Liver",
    "LL": "Left Lung",
    "RL": "Right Lung",
    "LU": "Lung",
    "SP": "Spleen",
    "TH": "Thymus",
    "LY": "Lymph Node",
    "LN": "Lymph Node",
    "BL": "Bladder",
    "PA": "Pancreas",
    "SK": "Skin",
    "BR": "Brain",
    "BM": "Bone Marrow",
    "MU": "Muscle",
    "UT": "Uterus",
    "PL": "Placenta",
    "LE": "Left Eye",
    "RE": "Right Eye",
    "RF": "Right Fallopian Tube",
    "LF": "Left Fallopian Tube",
    "LO": "Left Ovary",
    "RO": "Right Ovary",
    "BD": "Blood",
    "BV": "Blood Vasculature",
    "RB": "Right Bronchus",
    "LB": "Left Bronchus",
    "RN": "Right Kidney (RN)",
}


@register_tool("HuBMAPSampleTool")
class HuBMAPSampleTool(BaseTool):
    """
    Query the HuBMAP biospecimen layer: tissue Samples (with CCF/RUI spatial
    registration) and human Donors (with UMLS-coded demographics).

    Dispatched by the `operation` field. No authentication required.

    Every operation returns a dict with a ``status`` key: ``"success"``
    (payload under ``data``) or ``"error"`` (message under ``error``).
    """

    # Bounds applied to the caller-supplied ``limit`` argument.
    _DEFAULT_LIMIT = 10
    _MAX_LIMIT = 50

    def __init__(self, tool_config: dict[str, Any]):
        """Read ``timeout`` and ``fields.operation`` from the tool config.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds) defaults
                to 30; ``fields.operation`` defaults to ``"search_samples"``.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        # `or {}` so an explicit `"fields": None` does not break `.get`.
        fields = tool_config.get("fields") or {}
        self.operation = fields.get("operation", "search_samples")

    # ------------------------------------------------------------------ #
    # dispatch
    # ------------------------------------------------------------------ #

    def run(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """Execute the configured operation and never raise.

        Args:
            arguments: Operation arguments; ``None`` is treated as empty.

        Returns:
            The operation's result dict, or an error dict when the operation
            is unknown, the request fails, or any other exception occurs.
        """
        try:
            handlers = {
                "search_samples": self._search_samples,
                "get_sample": self._get_sample,
                "search_donors": self._search_donors,
            }
            handler = handlers.get(self.operation)
            if handler is None:
                return {
                    "status": "error",
                    "error": f"Unknown operation: {self.operation}",
                }
            return handler(arguments or {})
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"HuBMAP API timed out after {self.timeout}s",
            }
        except requests.exceptions.ConnectionError:
            return {"status": "error", "error": "Failed to connect to HuBMAP API"}
        except Exception as e:  # never raise
            return {"status": "error", "error": str(e)}

    # ------------------------------------------------------------------ #
    # helpers
    # ------------------------------------------------------------------ #

    def _post(self, body: dict[str, Any]) -> dict[str, Any]:
        """POST a query body to the search endpoint and return the JSON reply.

        Raises:
            requests.exceptions.RequestException: On transport failure or a
                non-2xx status (via ``raise_for_status``).
        """
        resp = requests.post(
            HUBMAP_SEARCH_URL,
            json=body,
            headers={"Content-Type": "application/json"},
            timeout=self.timeout,
        )
        resp.raise_for_status()
        return resp.json()

    @classmethod
    def _clamp_limit(cls, raw: Any) -> int:
        """Coerce a caller-supplied limit into the range 1.._MAX_LIMIT.

        Missing/zero values fall back to the default. Values that ``int()``
        cannot convert raise, which ``run`` turns into an error result.
        """
        limit = int(raw or cls._DEFAULT_LIMIT)
        # Lower bound matters: a negative value would otherwise be sent as
        # the query `size`.
        return max(1, min(limit, cls._MAX_LIMIT))

    @staticmethod
    def _hit_list(data: dict[str, Any]) -> list[Any]:
        """Return the ``hits.hits`` list of a search response, or ``[]``."""
        return (data.get("hits") or {}).get("hits") or []

    @staticmethod
    def _total_hits(data: dict[str, Any]) -> Any:
        """Return the total hit count of a search response.

        Accepts both the object form (``{"value": N, ...}``) and a plain
        integer; anything else yields 0.
        """
        total = (data.get("hits") or {}).get("total")
        if isinstance(total, dict):
            return total.get("value", 0)
        if isinstance(total, int):
            return total
        return 0

    @staticmethod
    def _first_organ_code(src: dict[str, Any]) -> str | None:
        """Return the first mapped organ code from origin_samples, or None."""
        # `or []` covers a present-but-null key; the isinstance guard skips
        # malformed (non-dict) entries instead of failing on `.get`.
        for origin in src.get("origin_samples") or []:
            if isinstance(origin, dict) and origin.get("organ"):
                return origin["organ"]
        return None

    @staticmethod
    def _parse_rui(rui_raw: Any) -> dict[str, Any] | None:
        """Parse the rui_location JSON (stored as a string) into a compact dict.

        Returns None when the value is empty, is not valid JSON, or does not
        decode to an object.
        """
        if not rui_raw:
            return None
        rui = rui_raw
        if isinstance(rui_raw, str):
            try:
                rui = json.loads(rui_raw)
            except (ValueError, TypeError):
                return None
        if not isinstance(rui, dict):
            return None
        placement = rui.get("placement") or {}
        return {
            "registered": True,
            "placement_target": placement.get("target"),
            "x_dimension": rui.get("x_dimension"),
            "y_dimension": rui.get("y_dimension"),
            "z_dimension": rui.get("z_dimension"),
            "dimension_units": rui.get("dimension_units"),
            "ccf_annotations": rui.get("ccf_annotations") or [],
        }

    @staticmethod
    def _donor_demographics(src: dict[str, Any]) -> dict[str, Any]:
        """Normalize metadata.organ_donor_data UMLS rows into age/sex/race/bmi/etc.

        Each row's label (``grouping_concept_preferred_term``, falling back
        to ``preferred_term``) becomes a snake_case key. Numeric rows that
        carry units are rendered as ``"<value> <units>"``; other rows keep
        the raw ``data_value``. When several rows share a label, the last
        one wins.
        """
        demo: dict[str, Any] = {}
        rows = (src.get("metadata") or {}).get("organ_donor_data") or []
        for row in rows:
            if not isinstance(row, dict):
                continue
            label = (
                row.get("grouping_concept_preferred_term")
                or row.get("preferred_term")
                or ""
            )
            if not label:
                continue
            key = label.strip().lower().replace(" ", "_")
            if not key:
                # Whitespace-only label: nothing meaningful to key on.
                continue
            value = row.get("data_value")
            units = row.get("units")
            # Require an actual value so a missing one is reported as None
            # rather than the string "None <units>".
            if units and value is not None and row.get("data_type") == "Numeric":
                demo[key] = f"{value} {units}".strip()
            else:
                demo[key] = value
        return demo

    # ------------------------------------------------------------------ #
    # operations
    # ------------------------------------------------------------------ #

    def _search_samples(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """Search Sample entities.

        Args:
            arguments: Optional ``organ`` (code, upper-cased before
                matching), ``sample_category`` (lower-cased),
                ``registered_only`` (require a ``rui_location`` field) and
                ``limit`` (clamped to 1..50, default 10).

        Returns:
            Success dict with ``total``, ``returned`` and ``samples``.
        """
        organ = arguments.get("organ")
        sample_category = arguments.get("sample_category")
        registered_only = arguments.get("registered_only", False)
        limit = self._clamp_limit(arguments.get("limit", self._DEFAULT_LIMIT))

        must: list[dict[str, Any]] = [{"match": {"entity_type": "Sample"}}]
        if organ:
            must.append(
                {"match": {"origin_samples.organ": str(organ).strip().upper()}}
            )
        if sample_category:
            must.append(
                {"match": {"sample_category": str(sample_category).strip().lower()}}
            )
        if registered_only:
            must.append({"exists": {"field": "rui_location"}})

        body = {
            "size": limit,
            "query": {"bool": {"must": must}},
            "_source": [
                "hubmap_id",
                "uuid",
                "entity_type",
                "sample_category",
                "origin_samples",
                "group_name",
                "rui_location",
                "donor",
                "created_timestamp",
            ],
        }
        data = self._post(body)

        samples = []
        for hit in self._hit_list(data):
            src = hit.get("_source") or {}
            code = self._first_organ_code(src)
            donor = src.get("donor") or {}
            samples.append(
                {
                    "hubmap_id": src.get("hubmap_id"),
                    "uuid": src.get("uuid"),
                    "sample_category": src.get("sample_category"),
                    "organ_code": code,
                    "organ_name": ORGAN_CODE_NAMES.get(code) if code else None,
                    "group_name": src.get("group_name"),
                    "spatially_registered": bool(src.get("rui_location")),
                    "donor_hubmap_id": donor.get("hubmap_id"),
                }
            )

        return {
            "status": "success",
            "data": {
                "total": self._total_hits(data),
                "returned": len(samples),
                "samples": samples,
            },
        }

    def _get_sample(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """Fetch one Sample by its HuBMAP ID.

        Args:
            arguments: Must contain ``hubmap_id``; surrounding whitespace is
                ignored because the lookup is an exact ``term`` match.

        Returns:
            Success dict with the Sample fields and parsed ``rui_location``,
            or an error dict when the id is missing or nothing matches.
        """
        hubmap_id = str(arguments.get("hubmap_id") or "").strip()
        if not hubmap_id:
            return {"status": "error", "error": "hubmap_id is required"}

        body = {
            "size": 1,
            "query": {
                "bool": {
                    "must": [
                        {"match": {"entity_type": "Sample"}},
                        {"term": {"hubmap_id.keyword": hubmap_id}},
                    ]
                }
            },
            # Only the fields read below, mirroring the search operations.
            "_source": [
                "hubmap_id",
                "uuid",
                "entity_type",
                "sample_category",
                "origin_samples",
                "group_name",
                "data_access_level",
                "protocol_url",
                "donor",
                "rui_location",
                "created_timestamp",
            ],
        }
        hits = self._hit_list(self._post(body))
        if not hits:
            return {
                "status": "error",
                "error": f"No HuBMAP Sample found for id '{hubmap_id}'",
            }

        src = hits[0].get("_source") or {}
        code = self._first_organ_code(src)
        donor = src.get("donor") or {}
        return {
            "status": "success",
            "data": {
                "hubmap_id": src.get("hubmap_id"),
                "uuid": src.get("uuid"),
                "entity_type": src.get("entity_type"),
                "sample_category": src.get("sample_category"),
                "organ_code": code,
                "organ_name": ORGAN_CODE_NAMES.get(code) if code else None,
                "group_name": src.get("group_name"),
                "data_access_level": src.get("data_access_level"),
                "protocol_url": src.get("protocol_url"),
                "donor_hubmap_id": donor.get("hubmap_id"),
                "rui_location": self._parse_rui(src.get("rui_location")),
                "created_timestamp": src.get("created_timestamp"),
            },
        }

    def _search_donors(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """Search Donor entities.

        Args:
            arguments: Optional ``group_name`` (matched against the
                ``group_name`` field), ``query`` (free text matched across
                description, group name and donor metadata values/terms) and
                ``limit`` (clamped to 1..50, default 10).

        Returns:
            Success dict with ``total``, ``returned`` and ``donors``; each
            donor carries normalized ``demographics``.
        """
        group_name = arguments.get("group_name")
        query_text = arguments.get("query")
        limit = self._clamp_limit(arguments.get("limit", self._DEFAULT_LIMIT))

        must: list[dict[str, Any]] = [{"match": {"entity_type": "Donor"}}]
        if group_name:
            must.append({"match": {"group_name": group_name}})
        if query_text:
            must.append(
                {
                    "multi_match": {
                        "query": query_text,
                        "fields": [
                            "description",
                            "group_name",
                            "metadata.organ_donor_data.data_value",
                            "metadata.organ_donor_data.preferred_term",
                        ],
                    }
                }
            )

        body = {
            "size": limit,
            "query": {"bool": {"must": must}},
            "_source": [
                "hubmap_id",
                "uuid",
                "entity_type",
                "group_name",
                "metadata",
                "created_timestamp",
            ],
        }
        data = self._post(body)

        donors = []
        for hit in self._hit_list(data):
            src = hit.get("_source") or {}
            donors.append(
                {
                    "hubmap_id": src.get("hubmap_id"),
                    "uuid": src.get("uuid"),
                    "group_name": src.get("group_name"),
                    "demographics": self._donor_demographics(src),
                }
            )

        return {
            "status": "success",
            "data": {
                "total": self._total_hits(data),
                "returned": len(donors),
                "donors": donors,
            },
        }