Source code for tooluniverse.harmonizome_tool

# harmonizome_tool.py
"""
Harmonizome tool for ToolUniverse.

Harmonizome (Ma'ayan Lab, Mount Sinai) integrates data from 100+ genomics
datasets covering gene expression, protein interactions, pathways, diseases,
drug targets, and more into a unified gene-centric resource.

API: https://maayanlab.cloud/Harmonizome/api/1.0/
No authentication required.
"""

import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool

HARMONIZOME_BASE_URL = "https://maayanlab.cloud/Harmonizome/api/1.0"


[docs] @register_tool("HarmonizomeTool") class HarmonizomeTool(BaseTool): """ Tool for querying Harmonizome gene and dataset information. Supports: - Gene details (symbol, name, description, synonyms, proteins) - Dataset catalog (100+ integrated genomics datasets) No authentication required. """
[docs] def __init__(self, tool_config: Dict[str, Any]): super().__init__(tool_config) self.timeout = tool_config.get("timeout", 30) fields = tool_config.get("fields", {}) self.endpoint = fields.get("endpoint", "get_gene")
[docs] def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Execute the Harmonizome API call.""" try: return self._query(arguments) except requests.exceptions.Timeout: return { "status": "error", "error": f"Harmonizome API timed out after {self.timeout}s", } except requests.exceptions.ConnectionError: return {"status": "error", "error": "Failed to connect to Harmonizome API"} except requests.exceptions.HTTPError as e: status = e.response.status_code if e.response is not None else "unknown" if status == 404: return { "status": "error", "error": "Gene not found in Harmonizome. Check the gene symbol.", } return {"status": "error", "error": f"Harmonizome API HTTP {status}"} except Exception as e: return {"status": "error", "error": f"Unexpected error: {str(e)}"}
[docs] def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Route to appropriate endpoint.""" handlers = { "get_gene": self._get_gene, "list_datasets": self._list_datasets, "get_dataset": self._get_dataset, "search": self._search, "search_genes": self._search_genes, "get_gene_set_members": self._get_gene_set_members, } handler = handlers.get(self.endpoint) if handler: return handler(arguments) return {"status": "error", "error": f"Unknown endpoint: {self.endpoint}"}
[docs] def _get_gene(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get gene details from Harmonizome.""" gene_symbol = arguments.get("gene_symbol", "") if not gene_symbol: return { "status": "error", "error": "gene_symbol is required (e.g., 'TP53').", } url = f"{HARMONIZOME_BASE_URL}/gene/{gene_symbol}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() data = response.json() # Check if we got an error response if data.get("status") == 404 or "message" in data: return { "status": "error", "error": f"Gene '{gene_symbol}' not found: {data.get('message', 'unknown')}", } proteins = [] for p in data.get("proteins", []): proteins.append( { "symbol": p.get("symbol"), "href": p.get("href"), } ) return { "status": "success", "data": { "symbol": data.get("symbol"), "name": data.get("name"), "ncbi_entrez_gene_id": data.get("ncbiEntrezGeneId"), "ncbi_entrez_gene_url": data.get("ncbiEntrezGeneUrl"), "description": data.get("description"), "synonyms": data.get("synonyms", []), "proteins": proteins, }, "metadata": { "source": "Harmonizome (maayanlab.cloud/Harmonizome)", }, }
[docs] def _list_datasets(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """List all available Harmonizome datasets.""" url = f"{HARMONIZOME_BASE_URL}/dataset" response = requests.get(url, timeout=self.timeout) response.raise_for_status() data = response.json() entities = data.get("entities", []) datasets = [] for e in entities: datasets.append( { "name": e.get("name"), "href": e.get("href"), } ) return { "status": "success", "data": datasets, "metadata": { "source": "Harmonizome (maayanlab.cloud/Harmonizome)", "total_datasets": len(datasets), }, }
[docs] def _get_dataset(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Get detailed information about a specific Harmonizome dataset.""" dataset_name = arguments.get("dataset_name", "") if not dataset_name: return { "status": "error", "error": "dataset_name is required (e.g., 'CTD Gene-Disease Associations'). Use Harmonizome_list_datasets to find names.", } # URL-encode: spaces become + in Harmonizome API encoded_name = dataset_name.replace(" ", "+") url = f"{HARMONIZOME_BASE_URL}/dataset/{encoded_name}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() data = response.json() if "message" in data: return { "status": "error", "error": f"Dataset not found: {data.get('message', dataset_name)}", } # Extract gene set names (can be large, limit to first 50) gene_sets_raw = data.get("geneSets", []) gene_set_limit = arguments.get("gene_set_limit") or 50 gene_sets = [ {"name": gs.get("name", "").split("/")[0]} for gs in gene_sets_raw[:gene_set_limit] ] return { "status": "success", "data": { "name": data.get("name"), "description": data.get("description"), "association": data.get("association"), "measurement": data.get("measurement"), "attribute_type": data.get("attributeType"), "attribute_group": data.get("attributeGroup"), "dataset_group": data.get("datasetGroup"), "pubmed_ids": data.get("pubMedIds", []), "gene_sets": gene_sets, "total_gene_sets": len(gene_sets_raw), }, "metadata": { "source": "Harmonizome (maayanlab.cloud/Harmonizome)", "dataset_name": dataset_name, }, }
[docs] def _get_gene_set_members(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Retrieve Harmonizome gene-set members or per-gene associations. Two modes via `mode`: - "gene_set" (default): member genes of a curated attribute set, each with thresholdValue + standardizedValue. Requires `attribute` and `dataset`. Endpoint: /gene_set/{attribute}/{dataset} - "gene": the full per-gene association table across all datasets. Requires `gene_symbol`. Endpoint: /gene/{symbol}?showAssociations=true """ mode = arguments.get("mode", "gene_set") limit = arguments.get("limit") or 100 def _encode(value: str) -> str: # Harmonizome path encoding: spaces -> '+'. Other reserved chars are # already correctly URL-encoded in the hrefs the API returns. return requests.utils.quote(value, safe="+/").replace("%20", "+") if mode == "gene": gene_symbol = arguments.get("gene_symbol", "") if not gene_symbol: return { "status": "error", "error": "gene_symbol is required for mode='gene' (e.g., 'DTD2').", } url = f"{HARMONIZOME_BASE_URL}/gene/{requests.utils.quote(gene_symbol, safe='')}" response = requests.get( url, params={"showAssociations": "true"}, timeout=self.timeout ) response.raise_for_status() data = response.json() if data.get("status") == 404 or "message" in data: return { "status": "error", "error": f"Gene '{gene_symbol}' not found: {data.get('message', 'unknown')}", } associations_raw = data.get("associations", []) associations = [ { "gene_set": (a.get("geneSet") or {}).get("name"), "gene_set_href": (a.get("geneSet") or {}).get("href"), "threshold_value": a.get("thresholdValue"), "standardized_value": a.get("standardizedValue"), } for a in associations_raw[:limit] ] return { "status": "success", "data": { "symbol": data.get("symbol"), "name": data.get("name"), "ncbi_entrez_gene_id": data.get("ncbiEntrezGeneId"), "associations": associations, }, "metadata": { "source": "Harmonizome (maayanlab.cloud/Harmonizome)", "mode": "gene", "gene_symbol": gene_symbol, "total_associations": len(associations_raw), "returned": len(associations), }, } # mode == "gene_set" attribute = arguments.get("attribute", "") dataset = arguments.get("dataset", "") if not attribute or not dataset: return { "status": "error", "error": "attribute and dataset are required for mode='gene_set' " "(e.g., attribute='heart', dataset='GTEx Tissue Gene Expression Profiles').", } url = f"{HARMONIZOME_BASE_URL}/gene_set/{_encode(attribute)}/{_encode(dataset)}" response = requests.get(url, timeout=self.timeout) response.raise_for_status() data = response.json() if "message" in data: return { "status": "error", "error": f"Gene set not found: {data.get('message', f'{attribute}/{dataset}')}", } associations_raw = data.get("associations", []) members = [ { "gene": (a.get("gene") or {}).get("symbol"), "gene_href": (a.get("gene") or {}).get("href"), "threshold_value": a.get("thresholdValue"), "standardized_value": a.get("standardizedValue"), } for a in associations_raw[:limit] ] return { "status": "success", "data": { "attribute": (data.get("attribute") or {}).get("name", attribute), "dataset": (data.get("dataset") or {}).get("name", dataset), "members": members, }, "metadata": { "source": "Harmonizome (maayanlab.cloud/Harmonizome)", "mode": "gene_set", "attribute": attribute, "dataset": dataset, "total_associations": len(associations_raw), "returned": len(members), }, }
[docs] def _search_genes(self, arguments: Dict[str, Any]) -> Dict[str, Any]: """Search for genes by keyword in Harmonizome.""" query = arguments.get("query", "") if not query: return { "status": "error", "error": "query is required (e.g., 'kinase', 'tumor suppressor')", } limit = arguments.get("limit") or 20 url = f"{HARMONIZOME_BASE_URL}/gene" params = {"search": query} response = requests.get(url, params=params, timeout=self.timeout) response.raise_for_status() data = response.json() entities = data.get("entities", []) genes = [ {"symbol": e.get("symbol"), "href": e.get("href")} for e in entities[:limit] ] return { "status": "success", "data": genes, "metadata": { "source": "Harmonizome (maayanlab.cloud/Harmonizome)", "query": query, "total_results": data.get("count", len(genes)), }, }