tooluniverse.hubmap_tool 源代码

# hubmap_tool.py
"""
HuBMAP (Human BioMolecular Atlas Program) tool for ToolUniverse.

Provides access to HuBMAP APIs for searching spatial biology datasets,
listing available organs, and retrieving dataset metadata.

APIs:
- Search API: https://search.api.hubmapconsortium.org/v3/search
- Ontology API: https://ontology.api.hubmapconsortium.org
- Entity API: https://entity.api.hubmapconsortium.org

No authentication required for public datasets.
"""

import requests
from typing import Any

from .base_tool import BaseTool
from .tool_registry import register_tool


HUBMAP_SEARCH_URL = "https://search.api.hubmapconsortium.org/v3/search"
HUBMAP_ONTOLOGY_URL = "https://ontology.api.hubmapconsortium.org"
HUBMAP_ENTITY_URL = "https://entity.api.hubmapconsortium.org"



[文档]
@register_tool("HuBMAPTool")
class HuBMAPTool(BaseTool):
    """
    Tool for querying HuBMAP datasets, organs, and dataset details.

    Supports searching published human tissue datasets by organ, assay type,
    and free text; listing available organs; and getting dataset metadata.

    No authentication required.
    """


[文档]
    def __init__(self, tool_config: dict[str, Any]):
        """
        Initialise the tool from its configuration mapping.

        Args:
            tool_config: Tool configuration, passed on to the base class.
                The optional ``timeout`` key is stored as the HTTP timeout
                (default 30) and the optional ``fields.operation`` key
                selects the operation that ``run`` dispatches to
                (default ``"search_datasets"``).
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        # The operation name lives under the nested "fields" mapping; a
        # missing "fields" entry falls back to the default operation.
        self.operation = tool_config.get("fields", {}).get(
            "operation", "search_datasets"
        )



[文档]
    def run(self, arguments: dict[str, Any]) -> dict[str, Any]:
        try:
            if self.operation == "search_datasets":
                return self._search_datasets(arguments)
            elif self.operation == "list_organs":
                return self._list_organs(arguments)
            elif self.operation == "get_dataset":
                return self._get_dataset(arguments)
            else:
                return {
                    "status": "error",
                    "error": f"Unknown operation: {self.operation}",
                }
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"HuBMAP API timed out after {self.timeout}s",
            }
        except requests.exceptions.ConnectionError:
            return {
                "status": "error",
                "error": "Failed to connect to HuBMAP API",
            }
        except Exception as e:
            return {"status": "error", "error": str(e)}



[文档]
    def _search_datasets(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """Search HuBMAP datasets by organ, assay type, or free text."""
        organ = arguments.get("organ")
        dataset_type = arguments.get("dataset_type")
        query_text = arguments.get("query")
        limit = min(int(arguments.get("limit", 10)), 50)
        status_filter = arguments.get("status", "Published")

        must_clauses = [{"match": {"entity_type": "Dataset"}}]

        if status_filter:
            must_clauses.append({"match": {"status": status_filter}})

        if organ:
            must_clauses.append({"match": {"origin_samples.organ": organ.upper()}})

        if dataset_type:
            must_clauses.append({"match": {"dataset_type": dataset_type}})

        if query_text:
            must_clauses.append(
                {
                    "multi_match": {
                        "query": query_text,
                        "fields": [
                            "title",
                            "description",
                            "dataset_type",
                            "anatomy_0",
                            "anatomy_1",
                        ],
                    }
                }
            )

        body = {
            "size": limit,
            "query": {"bool": {"must": must_clauses}},
            "_source": [
                "hubmap_id",
                "dataset_type",
                "origin_samples.organ",
                "status",
                "title",
                "anatomy_0",
                "anatomy_1",
                "group_name",
                "created_timestamp",
                "doi_url",
                "data_types",
                "donor.mapped_metadata.sex",
                "donor.mapped_metadata.age_value",
            ],
        }

        resp = requests.post(
            HUBMAP_SEARCH_URL,
            json=body,
            headers={"Content-Type": "application/json"},
            timeout=self.timeout,
        )
        resp.raise_for_status()
        data = resp.json()

        hits = data.get("hits", {}).get("hits", [])
        total = data.get("hits", {}).get("total", {}).get("value", 0)

        results = []
        for h in hits:
            src = h.get("_source", {})
            organs = [
                s.get("organ", "")
                for s in src.get("origin_samples", [])
                if s.get("organ")
            ]
            donor = src.get("donor", {})
            mapped = donor.get("mapped_metadata", {}) if donor else {}

            results.append(
                {
                    "hubmap_id": src.get("hubmap_id"),
                    "title": src.get("title"),
                    "dataset_type": src.get("dataset_type"),
                    "organ": organs[0] if organs else None,
                    "status": src.get("status"),
                    "group_name": src.get("group_name"),
                    "anatomy": src.get("anatomy_0") or src.get("anatomy_1"),
                    "doi_url": src.get("doi_url"),
                    "data_types": src.get("data_types"),
                    "donor_sex": mapped.get("sex") if mapped else None,
                    "donor_age": mapped.get("age_value") if mapped else None,
                }
            )

        return {
            "status": "success",
            "data": {
                "total": total,
                "returned": len(results),
                "datasets": results,
            },
        }



[文档]
    def _list_organs(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """
        Fetch the organ list from the HuBMAP Ontology API.

        Args:
            arguments: Accepted so all handlers share one signature; it is
                not read by this operation.

        Returns:
            ``{"status": "success", "data": {"total": ..., "organs": [...]}}``
            with one dictionary per organ record in the response.

        Raises:
            requests.exceptions.RequestException: On transport failures or
                a non-success HTTP status.
        """
        response = requests.get(
            f"{HUBMAP_ONTOLOGY_URL}/organs?application_context=HUBMAP",
            timeout=self.timeout,
        )
        response.raise_for_status()

        # Output key -> key read from each record of the API response.
        field_map = {
            "code": "rui_code",
            "term": "term",
            "organ_uberon": "organ_uberon",
            "organ_cui": "organ_cui",
            "rui_supported": "rui_supported",
        }
        organ_rows = [
            {out_key: entry.get(src_key) for out_key, src_key in field_map.items()}
            for entry in response.json()
        ]

        payload = {"total": len(organ_rows), "organs": organ_rows}
        return {"status": "success", "data": payload}



[文档]
    def _get_dataset(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """Get detailed metadata for a specific HuBMAP dataset."""
        hubmap_id = arguments.get("hubmap_id")
        if not hubmap_id:
            return {"status": "error", "error": "hubmap_id is required"}

        url = f"{HUBMAP_ENTITY_URL}/entities/{hubmap_id}"
        resp = requests.get(url, timeout=self.timeout)

        if resp.status_code == 404:
            return {
                "status": "error",
                "error": f"Dataset not found: {hubmap_id}",
            }
        resp.raise_for_status()
        data = resp.json()

        organs = [
            s.get("organ", "")
            for s in data.get("origin_samples", [{}])
            if isinstance(s, dict) and s.get("organ")
        ]

        contacts = data.get("contacts", [])
        contributors = data.get("contributors", [])

        result = {
            "hubmap_id": data.get("hubmap_id"),
            "entity_type": data.get("entity_type"),
            "dataset_type": data.get("dataset_type"),
            "status": data.get("status"),
            "title": data.get("title"),
            "description": data.get("description"),
            "organ": organs[0] if organs else None,
            "group_name": data.get("group_name"),
            "data_types": data.get("data_types"),
            "doi_url": data.get("doi_url"),
            "dbgap_study_url": data.get("dbgap_study_url"),
            "contains_human_genetic_sequences": data.get(
                "contains_human_genetic_sequences"
            ),
            "data_access_level": data.get("data_access_level"),
            "created_timestamp": data.get("created_timestamp"),
            "contacts": [
                {"name": c.get("name"), "email": c.get("email")} for c in contacts[:5]
            ]
            if contacts
            else [],
            "contributors": [
                {"name": c.get("name"), "affiliation": c.get("affiliation")}
                for c in contributors[:10]
            ]
            if contributors
            else [],
        }

        return {"status": "success", "data": result}