# Source code for tooluniverse.ensembl_phenotype_tool

# ensembl_phenotype_tool.py
"""
Ensembl REST API Phenotype association tool for ToolUniverse.

Provides access to phenotype/disease associations for:
- Genes (phenotype/gene endpoint)
- Genomic regions (phenotype/region endpoint)
- Variants (variation endpoint with phenotypes=1)

Returns disease/trait associations from multiple sources including
Cancer Gene Census, OMIM, ClinVar, NHGRI-EBI GWAS catalog, and Orphanet.

API: https://rest.ensembl.org/
No authentication required. Rate limit: 15 requests/second.
"""

import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool

ENSEMBL_BASE_URL = "https://rest.ensembl.org"
ENSEMBL_HEADERS = {"User-Agent": "ToolUniverse/1.0", "Accept": "application/json"}


@register_tool("EnsemblPhenotypeTool")
class EnsemblPhenotypeTool(BaseTool):
    """
    Tool for querying phenotype/disease associations from Ensembl REST API.

    Provides gene-phenotype, region-phenotype, and variant-phenotype lookups.
    Which lookup an instance performs is fixed at construction time by
    ``tool_config["fields"]["endpoint_type"]`` ("gene", "region" or
    "variant"; defaults to "gene").
    No authentication required.
    """

    # Cap on the number of phenotype records returned per call. The
    # ``phenotype_count`` field of a result reports the number collected
    # *before* this cap is applied.
    _MAX_PHENOTYPES = 200

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the request timeout and endpoint type from ``tool_config``.

        Args:
            tool_config: Tool configuration. Reads the optional ``timeout``
                key (default 45, passed to ``requests.get``) and the
                optional ``fields.endpoint_type`` key (default "gene").
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 45)
        # Tolerate an explicit ``"fields": None`` in the configuration.
        fields = tool_config.get("fields") or {}
        self.endpoint_type = fields.get("endpoint_type", "gene")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the Ensembl Phenotype API call.

        Args:
            arguments: Call arguments forwarded to the endpoint handler.

        Returns:
            The handler's result dict on success, or ``{"error": <message>}``
            when the request times out, cannot connect, returns an HTTP
            error status, or any other exception is raised. Exceptions are
            never propagated to the caller.
        """
        try:
            return self._dispatch(arguments)
        except requests.exceptions.Timeout:
            return {
                "error": f"Ensembl REST API request timed out after {self.timeout}s. "
                "Try a smaller region or a less-studied gene."
            }
        except requests.exceptions.ConnectionError:
            return {"error": "Failed to connect to Ensembl REST API"}
        except requests.exceptions.HTTPError as e:
            # Compare against None explicitly: a requests.Response is falsy
            # for every 4xx/5xx status (its truth value mirrors ``.ok``), so
            # a plain truthiness test would discard the status code for
            # exactly the responses handled here.
            status = e.response.status_code if e.response is not None else "unknown"
            if status == 400:
                return {
                    "error": "Bad request: check gene name, region format, or variant ID"
                }
            if status == 404:
                return {
                    "error": "Not found: the gene, region, or variant was not found in Ensembl"
                }
            return {"error": f"Ensembl REST API HTTP error: {status}"}
        except Exception as e:
            return {"error": f"Unexpected error: {str(e)}"}

    def _dispatch(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to appropriate endpoint.

        Returns:
            The selected handler's result, or an ``{"error": ...}`` dict when
            ``self.endpoint_type`` is not "gene", "region" or "variant".
        """
        if self.endpoint_type == "gene":
            return self._phenotype_gene(arguments)
        if self.endpoint_type == "region":
            return self._phenotype_region(arguments)
        if self.endpoint_type == "variant":
            return self._phenotype_variant(arguments)
        return {"error": f"Unknown endpoint_type: {self.endpoint_type}"}

    def _get_json(self, path: str, params: Dict[str, Any]) -> Any:
        """GET ``ENSEMBL_BASE_URL/<path>`` and return the decoded JSON body.

        Args:
            path: Endpoint path relative to ``ENSEMBL_BASE_URL`` (no leading
                slash).
            params: Query-string parameters for the request.

        Raises:
            requests.exceptions.HTTPError: On a 4xx/5xx response.
            requests.exceptions.RequestException: On timeout or connection
                failure.
        """
        response = requests.get(
            f"{ENSEMBL_BASE_URL}/{path}",
            params=params,
            headers=ENSEMBL_HEADERS,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    def _result(
        self,
        subject_key: str,
        subject: Any,
        species: Any,
        phenotypes: list,
        endpoint: str,
    ) -> Dict[str, Any]:
        """Build the common ``{"data": ..., "metadata": ...}`` envelope.

        ``phenotype_count`` is the full length of ``phenotypes``; the
        ``phenotypes`` list itself is truncated to ``_MAX_PHENOTYPES`` items.
        """
        return {
            "data": {
                subject_key: subject,
                "species": species,
                "phenotype_count": len(phenotypes),
                "phenotypes": phenotypes[: self._MAX_PHENOTYPES],
            },
            "metadata": {
                "source": "Ensembl REST API",
                "endpoint": endpoint,
            },
        }

    def _phenotype_gene(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get phenotype associations for a gene.

        Args:
            arguments: Reads ``gene`` (required) and ``species`` (default
                "homo_sapiens").

        Returns:
            Result envelope whose phenotypes are de-duplicated by
            (description, source), keeping the first occurrence, or an
            ``{"error": ...}`` dict when ``gene`` is missing or empty.
        """
        species = arguments.get("species", "homo_sapiens")
        gene = arguments.get("gene", "")
        if not gene:
            return {"error": "gene parameter is required (e.g., 'BRCA1')"}

        path = f"phenotype/gene/{species}/{gene}"
        raw = self._get_json(path, {"content-type": "application/json"})
        if not isinstance(raw, list):
            raw = []

        seen = set()
        unique_phenos = []
        for entry in raw:
            if not isinstance(entry, dict):
                # Skip malformed records instead of failing the whole call.
                continue
            description = entry.get("description", "")
            source = entry.get("source", "")
            key = (description, source)
            if key in seen:
                continue
            seen.add(key)
            # ``attributes`` may be absent or null in the payload.
            attrs = entry.get("attributes") or {}
            unique_phenos.append(
                {
                    "description": description,
                    "source": source,
                    "location": entry.get("location"),
                    "gene_ensembl_id": entry.get("Gene"),
                    "ontology_accessions": entry.get("ontology_accessions", []),
                    "external_references": attrs.get("external_reference"),
                }
            )

        return self._result("gene", gene, species, unique_phenos, path)

    def _phenotype_region(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get phenotype associations for a genomic region.

        Args:
            arguments: Reads ``region`` (required), ``species`` (default
                "homo_sapiens") and the optional ``feature_type`` filter,
                which is forwarded as a query parameter when truthy.

        Returns:
            Result envelope whose phenotypes are de-duplicated by
            (description, source, feature id) and exclude associations with
            an empty description, or an ``{"error": ...}`` dict when
            ``region`` is missing or empty.
        """
        species = arguments.get("species", "homo_sapiens")
        region = arguments.get("region", "")
        feature_type = arguments.get("feature_type")
        if not region:
            return {"error": "region is required (e.g., '17:7661779-7687538')"}

        path = f"phenotype/region/{species}/{region}"
        params = {"content-type": "application/json"}
        if feature_type:
            params["feature_type"] = feature_type
        raw = self._get_json(path, params)
        if not isinstance(raw, list):
            raw = []

        # The region endpoint returns entries with nested phenotype_associations
        phenotypes = []
        seen = set()
        for entry in raw:
            if not isinstance(entry, dict):
                continue
            variant_id = entry.get("id", "")
            # ``phenotype_associations`` may be absent or null.
            for assoc in entry.get("phenotype_associations") or []:
                if not isinstance(assoc, dict):
                    continue
                desc = assoc.get("description", "")
                source = assoc.get("source", "")
                key = (desc, source, variant_id)
                if not desc or key in seen:
                    continue
                seen.add(key)
                phenotypes.append(
                    {
                        "description": desc,
                        "source": source,
                        "id": variant_id,
                        "location": assoc.get("location"),
                        "ontology_accessions": assoc.get("ontology_accessions", []),
                    }
                )

        return self._result("region", region, species, phenotypes, path)

    def _phenotype_variant(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get phenotype associations for a variant via the variation endpoint.

        Args:
            arguments: Reads ``variant_id`` (required) and ``species``
                (default "homo_sapiens").

        Returns:
            Result envelope with one record per item of the response's
            ``phenotypes`` list (no de-duplication), or an ``{"error": ...}``
            dict when ``variant_id`` is missing or empty.
        """
        species = arguments.get("species", "homo_sapiens")
        variant_id = arguments.get("variant_id", "")
        if not variant_id:
            return {"error": "variant_id is required (e.g., 'rs429358')"}

        # Use the variation endpoint with phenotypes=1
        path = f"variation/{species}/{variant_id}"
        raw = self._get_json(
            path, {"content-type": "application/json", "phenotypes": 1}
        )
        # Guard against a non-object payload before reading ``phenotypes``.
        pheno_list = raw.get("phenotypes", []) if isinstance(raw, dict) else []
        if not isinstance(pheno_list, list):
            pheno_list = []

        phenotypes = [
            {
                "trait": entry.get("trait", ""),
                "source": entry.get("source", ""),
                "risk_allele": entry.get("risk_allele"),
                "pvalue": entry.get("pvalue"),
                "beta_coefficient": entry.get("beta_coefficient"),
                "study": entry.get("study"),
                "genes": entry.get("genes"),
                "ontology_accessions": entry.get("ontology_accessions", []),
            }
            for entry in pheno_list
            if isinstance(entry, dict)
        ]

        # The returned list is capped (can be very large for well-studied
        # variants).
        return self._result(
            "variant_id",
            variant_id,
            species,
            phenotypes,
            f"{path}?phenotypes=1",
        )