# Source code for tooluniverse.hubmap_tool

# hubmap_tool.py
"""
HuBMAP (Human BioMolecular Atlas Program) tool for ToolUniverse.

Provides access to HuBMAP APIs for searching spatial biology datasets,
listing available organs, and retrieving dataset metadata.

APIs:
- Search API: https://search.api.hubmapconsortium.org/v3/search
- Ontology API: https://ontology.api.hubmapconsortium.org
- Entity API: https://entity.api.hubmapconsortium.org

No authentication required for public datasets.
"""

import requests
from typing import Any

from .base_tool import BaseTool
from .tool_registry import register_tool


# Base domain shared by every public HuBMAP API host used in this module.
_HUBMAP_API_DOMAIN = "api.hubmapconsortium.org"

# Search API endpoint (dataset search).
HUBMAP_SEARCH_URL = f"https://search.{_HUBMAP_API_DOMAIN}/v3/search"
# Ontology API root (organ listing).
HUBMAP_ONTOLOGY_URL = f"https://ontology.{_HUBMAP_API_DOMAIN}"
# Entity API root (per-entity metadata lookup).
HUBMAP_ENTITY_URL = f"https://entity.{_HUBMAP_API_DOMAIN}"


@register_tool("HuBMAPTool")
class HuBMAPTool(BaseTool):
    """
    Tool for querying HuBMAP datasets, organs, and dataset details.

    Supports searching published human tissue datasets by organ, assay type,
    and free text; listing available organs; and getting dataset metadata.
    No authentication required.

    The operation executed by :meth:`run` is fixed per instance and is read
    from ``tool_config["fields"]["operation"]``. Recognised values are
    ``"search_datasets"`` (the default), ``"list_organs"`` and
    ``"get_dataset"``.

    Every operation returns a dict whose ``"status"`` is either ``"success"``
    (payload under ``"data"``) or ``"error"`` (message under ``"error"``).
    """

    # Number of hits requested when the caller gives no ``limit``.
    _DEFAULT_SEARCH_LIMIT = 10
    # Hard upper bound on the number of hits requested per search.
    _MAX_SEARCH_LIMIT = 50

    def __init__(self, tool_config: dict[str, Any]):
        """
        Initialise the tool from its configuration.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds, default
                30) is used for every HTTP request; ``fields.operation``
                selects the operation (default ``"search_datasets"``).
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        # ``fields`` may be missing or explicitly null in a config.
        fields = tool_config.get("fields") or {}
        self.operation = fields.get("operation", "search_datasets")

    def run(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """
        Execute the configured operation.

        Exceptions raised while handling the request are not propagated;
        they are converted into an ``{"status": "error", ...}`` result.

        Args:
            arguments: Operation-specific arguments (see the private
                ``_search_datasets``, ``_list_organs`` and ``_get_dataset``
                methods).

        Returns:
            The operation's result dict, or an error dict.
        """
        try:
            handlers = {
                "search_datasets": self._search_datasets,
                "list_organs": self._list_organs,
                "get_dataset": self._get_dataset,
            }
            handler = handlers.get(self.operation)
            if handler is None:
                return {
                    "status": "error",
                    "error": f"Unknown operation: {self.operation}",
                }
            return handler(arguments)
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"HuBMAP API timed out after {self.timeout}s",
            }
        except requests.exceptions.ConnectionError:
            return {
                "status": "error",
                "error": "Failed to connect to HuBMAP API",
            }
        except Exception as e:
            # Boundary handler: HTTP errors, bad JSON, invalid arguments.
            return {"status": "error", "error": str(e)}

    @staticmethod
    def _first_organ(samples: Any) -> Any:
        """Return the first truthy ``organ`` among sample dicts, else None.

        Tolerates a missing/null sample list and non-dict entries.
        """
        if not isinstance(samples, list):
            return None
        for sample in samples:
            if isinstance(sample, dict) and sample.get("organ"):
                return sample["organ"]
        return None

    def _build_search_body(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """Translate search arguments into the JSON body for the Search API."""
        organ = arguments.get("organ")
        dataset_type = arguments.get("dataset_type")
        query_text = arguments.get("query")
        status_filter = arguments.get("status", "Published")

        # An explicit null limit means "use the default"; clamp the rest to
        # [0, _MAX_SEARCH_LIMIT] so a negative value is never sent as "size".
        raw_limit = arguments.get("limit")
        if raw_limit is None:
            limit = self._DEFAULT_SEARCH_LIMIT
        else:
            limit = int(raw_limit)
        limit = max(0, min(limit, self._MAX_SEARCH_LIMIT))

        must_clauses: list[dict[str, Any]] = [
            {"match": {"entity_type": "Dataset"}}
        ]
        if status_filter:
            must_clauses.append({"match": {"status": status_filter}})
        if organ:
            # The organ value is upper-cased before being matched.
            must_clauses.append(
                {"match": {"origin_samples.organ": str(organ).upper()}}
            )
        if dataset_type:
            must_clauses.append({"match": {"dataset_type": dataset_type}})
        if query_text:
            must_clauses.append(
                {
                    "multi_match": {
                        "query": query_text,
                        "fields": [
                            "title",
                            "description",
                            "dataset_type",
                            "anatomy_0",
                            "anatomy_1",
                        ],
                    }
                }
            )

        return {
            "size": limit,
            "query": {"bool": {"must": must_clauses}},
            # NOTE(review): created_timestamp is requested here but is not
            # part of the summarised result -- confirm whether it is needed.
            "_source": [
                "hubmap_id",
                "dataset_type",
                "origin_samples.organ",
                "status",
                "title",
                "anatomy_0",
                "anatomy_1",
                "group_name",
                "created_timestamp",
                "doi_url",
                "data_types",
                "donor.mapped_metadata.sex",
                "donor.mapped_metadata.age_value",
            ],
        }

    def _summarize_hit(self, hit: dict[str, Any]) -> dict[str, Any]:
        """Flatten one search hit into the summary record returned to callers."""
        src = hit.get("_source") or {}
        donor = src.get("donor") or {}
        mapped = donor.get("mapped_metadata") or {}
        return {
            "hubmap_id": src.get("hubmap_id"),
            "title": src.get("title"),
            "dataset_type": src.get("dataset_type"),
            "organ": self._first_organ(src.get("origin_samples")),
            "status": src.get("status"),
            "group_name": src.get("group_name"),
            "anatomy": src.get("anatomy_0") or src.get("anatomy_1"),
            "doi_url": src.get("doi_url"),
            "data_types": src.get("data_types"),
            "donor_sex": mapped.get("sex"),
            "donor_age": mapped.get("age_value"),
        }

    def _search_datasets(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """
        Search HuBMAP datasets by organ, assay type, or free text.

        Args:
            arguments: Optional keys ``organ``, ``dataset_type``, ``query``,
                ``limit`` (default 10, capped at 50) and ``status``
                (default ``"Published"``; a falsy value disables the
                status filter).

        Returns:
            Success dict with ``total`` (total matches reported by the API),
            ``returned`` (number of records in this response) and
            ``datasets`` (list of summary records).

        Raises:
            requests.exceptions.RequestException: On transport or HTTP
                errors (converted to an error dict by :meth:`run`).
            ValueError, TypeError: If ``limit`` cannot be converted to int.
        """
        resp = requests.post(
            HUBMAP_SEARCH_URL,
            json=self._build_search_body(arguments),
            headers={"Content-Type": "application/json"},
            timeout=self.timeout,
        )
        resp.raise_for_status()
        data = resp.json()

        hits_section = data.get("hits") or {}
        hits = hits_section.get("hits") or []
        # The total may be an object ({"value": N, ...}) or a bare integer
        # depending on the search backend's response format; accept both.
        total = hits_section.get("total") or 0
        if isinstance(total, dict):
            total = total.get("value", 0)

        results = [self._summarize_hit(hit) for hit in hits]
        return {
            "status": "success",
            "data": {
                "total": total,
                "returned": len(results),
                "datasets": results,
            },
        }

    def _list_organs(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """
        List all organs available in HuBMAP.

        Args:
            arguments: Unused; accepted for a uniform handler signature.

        Returns:
            Success dict with ``total`` and ``organs`` (one record per organ
            with ``code``, ``term``, ``organ_uberon``, ``organ_cui`` and
            ``rui_supported``), or an error dict if the payload is not a
            list.

        Raises:
            requests.exceptions.RequestException: On transport or HTTP
                errors (converted to an error dict by :meth:`run`).
        """
        url = f"{HUBMAP_ONTOLOGY_URL}/organs?application_context=HUBMAP"
        resp = requests.get(url, timeout=self.timeout)
        resp.raise_for_status()
        organs = resp.json()
        if not isinstance(organs, list):
            # Report the real problem instead of an opaque AttributeError.
            return {
                "status": "error",
                "error": "Unexpected response format from HuBMAP Ontology API",
            }

        results = [
            {
                "code": org.get("rui_code"),
                "term": org.get("term"),
                "organ_uberon": org.get("organ_uberon"),
                "organ_cui": org.get("organ_cui"),
                "rui_supported": org.get("rui_supported"),
            }
            for org in organs
            if isinstance(org, dict)
        ]
        return {
            "status": "success",
            "data": {"total": len(results), "organs": results},
        }

    def _get_dataset(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """
        Get detailed metadata for a specific HuBMAP dataset.

        Args:
            arguments: Must contain a non-empty ``hubmap_id``.

        Returns:
            Success dict with the dataset's metadata (at most 5 contacts and
            10 contributors are included), or an error dict when
            ``hubmap_id`` is missing or the API answers 404.

        Raises:
            requests.exceptions.RequestException: On transport errors or
                non-404 HTTP errors (converted to an error dict by
                :meth:`run`).
        """
        from urllib.parse import quote

        raw_id = arguments.get("hubmap_id")
        hubmap_id = str(raw_id).strip() if raw_id else ""
        if not hubmap_id:
            return {"status": "error", "error": "hubmap_id is required"}

        # Percent-encode the identifier so characters such as "/", "?" or
        # "#" in caller input cannot alter the request path or query.
        encoded_id = quote(hubmap_id, safe="")
        url = f"{HUBMAP_ENTITY_URL}/entities/{encoded_id}"
        resp = requests.get(url, timeout=self.timeout)
        if resp.status_code == 404:
            return {
                "status": "error",
                "error": f"Dataset not found: {hubmap_id}",
            }
        resp.raise_for_status()
        data = resp.json()

        # Null values are treated the same as absent lists.
        contacts = data.get("contacts") or []
        contributors = data.get("contributors") or []

        result = {
            "hubmap_id": data.get("hubmap_id"),
            "entity_type": data.get("entity_type"),
            "dataset_type": data.get("dataset_type"),
            "status": data.get("status"),
            "title": data.get("title"),
            "description": data.get("description"),
            "organ": self._first_organ(data.get("origin_samples")),
            "group_name": data.get("group_name"),
            "data_types": data.get("data_types"),
            "doi_url": data.get("doi_url"),
            "dbgap_study_url": data.get("dbgap_study_url"),
            "contains_human_genetic_sequences": data.get(
                "contains_human_genetic_sequences"
            ),
            "data_access_level": data.get("data_access_level"),
            "created_timestamp": data.get("created_timestamp"),
            "contacts": [
                {"name": c.get("name"), "email": c.get("email")}
                for c in contacts[:5]
            ],
            "contributors": [
                {"name": c.get("name"), "affiliation": c.get("affiliation")}
                for c in contributors[:10]
            ],
        }
        return {"status": "success", "data": result}