Source code for tooluniverse.ensembl_phenotype_tool

# ensembl_phenotype_tool.py
"""
Ensembl REST API Phenotype association tool for ToolUniverse.

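Provides access to phenotype/disease associations for:
- Genes (phenotype/gene endpoint)
- Genomic regions (phenotype/region endpoint)
- Variants (variation endpoint with phenotypes=1)
- Phenotype terms or ontology accessions (phenotype/term and
  phenotype/accession endpoints, i.e. reverse lookup from trait to variants/genes)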

Returns disease/trait associations from multiple sources including
Cancer Gene Census, OMIM, ClinVar, NHGRI-EBI GWAS catalog, and Orphanet.

API: https://rest.ensembl.org/
No authentication required. Rate limit: 15 requests/second.
"""

import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool

ENSEMBL_BASE_URL = "https://rest.ensembl.org"
ENSEMBL_HEADERS = {"User-Agent": "ToolUniverse/1.0", "Accept": "application/json"}


@register_tool("EnsemblPhenotypeTool")
class EnsemblPhenotypeTool(BaseTool):
    """
    Tool for querying phenotype/disease associations from Ensembl REST API.

    Provides gene-phenotype, region-phenotype, and variant-phenotype lookups,
    plus a reverse lookup from a phenotype term or ontology accession.
    No authentication required.

    The endpoint served by an instance is fixed at construction time via
    ``tool_config["fields"]["endpoint_type"]`` (one of ``"gene"``,
    ``"region"``, ``"variant"`` or ``"term"``; defaults to ``"gene"``).
    """

    # Upper bounds on how many records are embedded in a single result, so
    # heavily studied genes/variants/traits do not produce oversized payloads.
    _MAX_PHENOTYPES = 200
    _MAX_ASSOCIATIONS = 500

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the request timeout and the endpoint type from *tool_config*.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds, default 45)
                and ``fields.endpoint_type`` (default ``"gene"``) are read here;
                the whole mapping is also forwarded to ``BaseTool.__init__``.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 45)
        self.endpoint_type = tool_config.get("fields", {}).get("endpoint_type", "gene")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the Ensembl Phenotype API call.

        Args:
            arguments: Endpoint-specific query arguments (see the
                ``_phenotype_*`` methods).

        Returns:
            A dict with ``"status": "success"`` plus ``data``/``metadata``, or
            ``"status": "error"`` plus a human-readable ``error`` message.
            Exceptions are never propagated to the caller.
        """
        try:
            return self._dispatch(arguments)
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"Ensembl REST API request timed out after {self.timeout}s. "
                "Try a smaller region or a less-studied gene.",
            }
        except requests.exceptions.ConnectionError:
            return {"status": "error", "error": "Failed to connect to Ensembl REST API"}
        except requests.exceptions.HTTPError as e:
            # A Response is falsy whenever its status is 4xx/5xx (its truth
            # value mirrors ``Response.ok``), so a plain truthiness test would
            # always report "unknown" here. Compare against None explicitly.
            status = e.response.status_code if e.response is not None else "unknown"
            if status == 400:
                return {
                    "status": "error",
                    "error": "Bad request: check gene name, region format, or variant ID",
                }
            if status == 404:
                return {
                    "status": "error",
                    "error": "Not found: the gene, region, or variant was not found in Ensembl",
                }
            return {
                "status": "error",
                "error": f"Ensembl REST API HTTP error: {status}",
            }
        except Exception as e:
            # Top-level boundary: report anything unexpected as an error result.
            return {"status": "error", "error": f"Unexpected error: {str(e)}"}

    def _dispatch(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to appropriate endpoint based on ``self.endpoint_type``."""
        if self.endpoint_type == "gene":
            return self._phenotype_gene(arguments)
        elif self.endpoint_type == "region":
            return self._phenotype_region(arguments)
        elif self.endpoint_type == "variant":
            return self._phenotype_variant(arguments)
        elif self.endpoint_type == "term":
            return self._phenotype_term(arguments)
        return {
            "status": "error",
            "error": f"Unknown endpoint_type: {self.endpoint_type}",
        }

    def _get_json(self, url: str, params: Dict[str, Any] = None) -> Any:
        """GET *url* and return the decoded JSON body.

        Args:
            url: Fully qualified Ensembl REST URL.
            params: Extra query parameters, added after ``content-type``.

        Returns:
            Whatever ``response.json()`` yields for the response body.

        Raises:
            requests.exceptions.HTTPError: On a 4xx/5xx response.
            requests.exceptions.RequestException: On timeout/connection errors.
        """
        query = {"content-type": "application/json"}
        if params:
            query.update(params)
        response = requests.get(
            url, params=query, headers=ENSEMBL_HEADERS, timeout=self.timeout
        )
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _as_records(payload: Any) -> list:
        """Return the dict items of *payload*, or ``[]`` if it is not a list.

        Guards the parsers against ``null`` values and unexpected payload
        shapes so they degrade to an empty result instead of raising.
        """
        if not isinstance(payload, list):
            return []
        return [item for item in payload if isinstance(item, dict)]

    def _phenotype_term(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Reverse phenotype lookup: trait/disease name OR ontology accession.

        Given a phenotype term (e.g. 'Alzheimer disease') or an ontology
        accession (e.g. 'EFO:0000249', 'HP:0002511', 'MONDO:0004975'), return
        all associated variants/genes with risk allele, p-value, source, etc.
        Uses the Ensembl phenotype/term and phenotype/accession endpoints.

        Args:
            arguments: May contain ``term``, ``accession`` and ``species``
                (default ``"homo_sapiens"``). ``accession`` wins when both
                are given.

        Returns:
            Success dict with up to 500 associations (``association_count``
            reports the untruncated total), or an error dict when neither
            ``term`` nor ``accession`` is provided.
        """
        species = arguments.get("species", "homo_sapiens")
        term = arguments.get("term")
        accession = arguments.get("accession")

        # Allow callers to pass the single query under either key, and
        # auto-detect ontology accessions (PREFIX:NUMBER) passed as 'term'.
        if not accession and term and self._looks_like_accession(term):
            accession = term
            term = None

        if accession:
            query = accession
            query_kind = "accession"
        elif term:
            query = term
            query_kind = "term"
        else:
            return {
                "status": "error",
                "error": (
                    "Provide 'term' (e.g. 'Alzheimer disease') or 'accession' "
                    "(e.g. 'EFO:0000249', 'HP:0002511')."
                ),
            }

        endpoint = f"phenotype/{query_kind}/{species}/{query}"
        raw = self._get_json(f"{ENSEMBL_BASE_URL}/{endpoint}")

        associations = []
        for entry in self._as_records(raw):
            attrs = entry.get("attributes", {}) or {}
            associations.append(
                {
                    "description": entry.get("description", ""),
                    "variant": entry.get("Variation"),
                    "gene": attrs.get("associated_gene"),
                    "risk_allele": attrs.get("risk_allele"),
                    "p_value": attrs.get("p_value"),
                    "beta": attrs.get("beta_coefficient") or attrs.get("beta"),
                    "odds_ratio": attrs.get("odds_ratio"),
                    "clinical_significance": attrs.get("clinical_significance"),
                    "source": entry.get("source", ""),
                    "location": entry.get("location"),
                    "mapped_to_accession": entry.get("mapped_to_accession"),
                    "external_reference": attrs.get("external_reference")
                    or attrs.get("external_id"),
                }
            )

        limit = self._MAX_ASSOCIATIONS
        return {
            "status": "success",
            "data": {
                "query": query,
                "query_kind": query_kind,
                "species": species,
                "association_count": len(associations),
                "associations": associations[:limit],
            },
            "metadata": {
                "source": "Ensembl REST API",
                "endpoint": endpoint,
                "total_returned": min(len(associations), limit),
            },
        }

    @staticmethod
    def _looks_like_accession(value: str) -> bool:
        """Heuristic: ontology accessions look like 'EFO:0000249' / 'HP:0002511'.

        True when *value* is a string of the form ``PREFIX:REST`` where PREFIX
        is purely alphabetic and REST is non-empty and alphanumeric once
        underscores are removed.
        """
        if not isinstance(value, str) or ":" not in value:
            return False
        prefix, _, rest = value.partition(":")
        return prefix.isalpha() and bool(rest) and rest.replace("_", "").isalnum()

    def _phenotype_gene(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get phenotype associations for a gene.

        Args:
            arguments: Requires ``gene`` (e.g. 'BRCA1'); optional ``species``
                (default ``"homo_sapiens"``).

        Returns:
            Success dict with up to 200 phenotypes, deduplicated by
            (description, source); ``phenotype_count`` is the deduplicated
            total before truncation. Error dict when ``gene`` is missing.
        """
        species = arguments.get("species", "homo_sapiens")
        gene = arguments.get("gene", "")
        if not gene:
            return {
                "status": "error",
                "error": "gene parameter is required (e.g., 'BRCA1')",
            }

        raw = self._get_json(f"{ENSEMBL_BASE_URL}/phenotype/gene/{species}/{gene}")

        # Deduplicate by description+source, keeping the first occurrence.
        seen = set()
        unique_phenos = []
        for entry in self._as_records(raw):
            # 'attributes' may be absent or null; treat both as empty, in line
            # with the term/accession lookup.
            attrs = entry.get("attributes", {}) or {}
            description = entry.get("description", "")
            source = entry.get("source", "")
            key = (description, source)
            if key in seen:
                continue
            seen.add(key)
            unique_phenos.append(
                {
                    "description": description,
                    "source": source,
                    "location": entry.get("location"),
                    "gene_ensembl_id": entry.get("Gene"),
                    "ontology_accessions": entry.get("ontology_accessions", []),
                    "external_references": attrs.get("external_reference"),
                }
            )

        return {
            "status": "success",
            "data": {
                "gene": gene,
                "species": species,
                "phenotype_count": len(unique_phenos),
                "phenotypes": unique_phenos[: self._MAX_PHENOTYPES],
            },
            "metadata": {
                "source": "Ensembl REST API",
                "endpoint": f"phenotype/gene/{species}/{gene}",
            },
        }

    def _phenotype_region(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get phenotype associations for a genomic region.

        Args:
            arguments: Requires ``region`` (e.g. '17:7661779-7687538');
                optional ``species`` (default ``"homo_sapiens"``) and
                ``feature_type`` (forwarded as a query parameter when set).

        Returns:
            Success dict with up to 200 phenotypes, deduplicated by
            (description, source, feature id) and skipping associations with
            an empty description. Error dict when ``region`` is missing.
        """
        species = arguments.get("species", "homo_sapiens")
        region = arguments.get("region", "")
        feature_type = arguments.get("feature_type")
        if not region:
            return {
                "status": "error",
                "error": "region is required (e.g., '17:7661779-7687538')",
            }

        extra = {"feature_type": feature_type} if feature_type else None
        raw = self._get_json(
            f"{ENSEMBL_BASE_URL}/phenotype/region/{species}/{region}", extra
        )

        # The region endpoint returns entries with nested phenotype_associations
        phenotypes = []
        seen = set()
        for entry in self._as_records(raw):
            variant_id = entry.get("id", "")
            # Each entry has a list of phenotype_associations (null-safe).
            for assoc in self._as_records(entry.get("phenotype_associations")):
                desc = assoc.get("description", "")
                source = assoc.get("source", "")
                key = (desc, source, variant_id)
                if not desc or key in seen:
                    continue
                seen.add(key)
                phenotypes.append(
                    {
                        "description": desc,
                        "source": source,
                        "id": variant_id,
                        "location": assoc.get("location"),
                        "ontology_accessions": assoc.get("ontology_accessions", []),
                    }
                )

        return {
            "status": "success",
            "data": {
                "region": region,
                "species": species,
                "phenotype_count": len(phenotypes),
                "phenotypes": phenotypes[: self._MAX_PHENOTYPES],
            },
            "metadata": {
                "source": "Ensembl REST API",
                "endpoint": f"phenotype/region/{species}/{region}",
            },
        }

    def _phenotype_variant(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get phenotype associations for a variant via the variation endpoint.

        Args:
            arguments: Requires ``variant_id`` (e.g. 'rs429358'); optional
                ``species`` (default ``"homo_sapiens"``).

        Returns:
            Success dict with up to 200 phenotypes (``phenotype_count`` is the
            untruncated total). Error dict when ``variant_id`` is missing.
        """
        species = arguments.get("species", "homo_sapiens")
        variant_id = arguments.get("variant_id", "")
        if not variant_id:
            return {
                "status": "error",
                "error": "variant_id is required (e.g., 'rs429358')",
            }

        # Use the variation endpoint with phenotypes=1
        raw = self._get_json(
            f"{ENSEMBL_BASE_URL}/variation/{species}/{variant_id}", {"phenotypes": 1}
        )

        # Tolerate a non-object payload the same way the list endpoints
        # tolerate a non-list payload: treat it as "no phenotypes".
        pheno_list = raw.get("phenotypes", []) if isinstance(raw, dict) else []

        phenotypes = [
            {
                "trait": entry.get("trait", ""),
                "source": entry.get("source", ""),
                "risk_allele": entry.get("risk_allele"),
                "pvalue": entry.get("pvalue"),
                "beta_coefficient": entry.get("beta_coefficient"),
                "study": entry.get("study"),
                "genes": entry.get("genes"),
                "ontology_accessions": entry.get("ontology_accessions", []),
            }
            for entry in self._as_records(pheno_list)
        ]

        # Limit to top 200 (can be very large for well-studied variants)
        return {
            "status": "success",
            "data": {
                "variant_id": variant_id,
                "species": species,
                "phenotype_count": len(phenotypes),
                "phenotypes": phenotypes[: self._MAX_PHENOTYPES],
            },
            "metadata": {
                "source": "Ensembl REST API",
                "endpoint": f"variation/{species}/{variant_id}?phenotypes=1",
            },
        }