Source code for tooluniverse.ensembl_phenotype_tool
# ensembl_phenotype_tool.py
"""
Ensembl REST API Phenotype association tool for ToolUniverse.
Provides access to phenotype/disease associations for:
- Genes (phenotype/gene endpoint)
- Genomic regions (phenotype/region endpoint)
- Variants (variation endpoint with phenotypes=1)
- Phenotype terms and ontology accessions (phenotype/term and phenotype/accession endpoints)
Returns disease/trait associations from multiple sources including
Cancer Gene Census, OMIM, ClinVar, NHGRI-EBI GWAS catalog, and Orphanet.
API: https://rest.ensembl.org/
No authentication required. Rate limit: 15 requests/second.
"""
import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool
ENSEMBL_BASE_URL = "https://rest.ensembl.org"
ENSEMBL_HEADERS = {"User-Agent": "ToolUniverse/1.0", "Accept": "application/json"}
[docs]
@register_tool("EnsemblPhenotypeTool")
class EnsemblPhenotypeTool(BaseTool):
"""
Tool for querying phenotype/disease associations from Ensembl REST API.
Provides gene-phenotype, region-phenotype, and variant-phenotype lookups.
No authentication required.
"""
[docs]
def __init__(self, tool_config: Dict[str, Any]):
    """Initialise the tool from its configuration mapping.

    Args:
        tool_config: Tool configuration. Two optional entries are read here:
            ``timeout`` (passed to ``requests.get``; defaults to 45) and
            ``fields.endpoint_type`` (selects the handler used by
            ``_dispatch``; defaults to ``"gene"``).
    """
    super().__init__(tool_config)
    self.timeout = tool_config.get("timeout", 45)
    # "fields" can be present but null/empty in a config; treat that the same
    # as a missing key instead of failing on ``None.get``.
    fields = tool_config.get("fields") or {}
    self.endpoint_type = fields.get("endpoint_type", "gene")
[docs]
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Execute the Ensembl Phenotype API call."""
try:
return self._dispatch(arguments)
except requests.exceptions.Timeout:
return {
"status": "error",
"error": f"Ensembl REST API request timed out after {self.timeout}s. "
"Try a smaller region or a less-studied gene.",
}
except requests.exceptions.ConnectionError:
return {"status": "error", "error": "Failed to connect to Ensembl REST API"}
except requests.exceptions.HTTPError as e:
status = e.response.status_code if e.response else "unknown"
if status == 400:
return {
"status": "error",
"error": "Bad request: check gene name, region format, or variant ID",
}
if status == 404:
return {
"status": "error",
"error": "Not found: the gene, region, or variant was not found in Ensembl",
}
return {
"status": "error",
"error": f"Ensembl REST API HTTP error: {status}",
}
except Exception as e:
return {"status": "error", "error": f"Unexpected error: {str(e)}"}
[docs]
def _dispatch(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Route to appropriate endpoint."""
if self.endpoint_type == "gene":
return self._phenotype_gene(arguments)
elif self.endpoint_type == "region":
return self._phenotype_region(arguments)
elif self.endpoint_type == "variant":
return self._phenotype_variant(arguments)
elif self.endpoint_type == "term":
return self._phenotype_term(arguments)
return {
"status": "error",
"error": f"Unknown endpoint_type: {self.endpoint_type}",
}
[docs]
def _phenotype_term(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Reverse phenotype lookup: trait/disease name OR ontology accession.
Given a phenotype term (e.g. 'Alzheimer disease') or an ontology
accession (e.g. 'EFO:0000249', 'HP:0002511', 'MONDO:0004975'), return
all associated variants/genes with risk allele, p-value, source, etc.
Uses the Ensembl phenotype/term and phenotype/accession endpoints.
"""
species = arguments.get("species", "homo_sapiens")
term = arguments.get("term")
accession = arguments.get("accession")
# Allow callers to pass the single query under either key, and
# auto-detect ontology accessions (PREFIX:NUMBER) passed as 'term'.
if not accession and term and self._looks_like_accession(term):
accession = term
term = None
if accession:
query = accession
url = f"{ENSEMBL_BASE_URL}/phenotype/accession/{species}/{accession}"
endpoint = f"phenotype/accession/{species}/{accession}"
query_kind = "accession"
elif term:
query = term
url = f"{ENSEMBL_BASE_URL}/phenotype/term/{species}/{term}"
endpoint = f"phenotype/term/{species}/{term}"
query_kind = "term"
else:
return {
"status": "error",
"error": (
"Provide 'term' (e.g. 'Alzheimer disease') or 'accession' "
"(e.g. 'EFO:0000249', 'HP:0002511')."
),
}
params = {"content-type": "application/json"}
response = requests.get(
url, params=params, headers=ENSEMBL_HEADERS, timeout=self.timeout
)
response.raise_for_status()
raw = response.json()
if not isinstance(raw, list):
raw = []
associations = []
for entry in raw:
attrs = entry.get("attributes", {}) or {}
associations.append(
{
"description": entry.get("description", ""),
"variant": entry.get("Variation"),
"gene": attrs.get("associated_gene"),
"risk_allele": attrs.get("risk_allele"),
"p_value": attrs.get("p_value"),
"beta": attrs.get("beta_coefficient") or attrs.get("beta"),
"odds_ratio": attrs.get("odds_ratio"),
"clinical_significance": attrs.get("clinical_significance"),
"source": entry.get("source", ""),
"location": entry.get("location"),
"mapped_to_accession": entry.get("mapped_to_accession"),
"external_reference": attrs.get("external_reference")
or attrs.get("external_id"),
}
)
return {
"status": "success",
"data": {
"query": query,
"query_kind": query_kind,
"species": species,
"association_count": len(associations),
"associations": associations[:500],
},
"metadata": {
"source": "Ensembl REST API",
"endpoint": endpoint,
"total_returned": min(len(associations), 500),
},
}
[docs]
@staticmethod
def _looks_like_accession(value: str) -> bool:
"""Heuristic: ontology accessions look like 'EFO:0000249' / 'HP:0002511'."""
if not isinstance(value, str) or ":" not in value:
return False
prefix, _, rest = value.partition(":")
return prefix.isalpha() and bool(rest) and rest.replace("_", "").isalnum()
[docs]
def _phenotype_gene(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get phenotype associations for a gene."""
species = arguments.get("species", "homo_sapiens")
gene = arguments.get("gene", "")
if not gene:
return {
"status": "error",
"error": "gene parameter is required (e.g., 'BRCA1')",
}
url = f"{ENSEMBL_BASE_URL}/phenotype/gene/{species}/{gene}"
params = {"content-type": "application/json"}
response = requests.get(
url, params=params, headers=ENSEMBL_HEADERS, timeout=self.timeout
)
response.raise_for_status()
raw = response.json()
if not isinstance(raw, list):
raw = []
phenotypes = []
for entry in raw:
attrs = entry.get("attributes", {})
phenotypes.append(
{
"description": entry.get("description", ""),
"source": entry.get("source", ""),
"location": entry.get("location"),
"gene_ensembl_id": entry.get("Gene"),
"ontology_accessions": entry.get("ontology_accessions", []),
"external_references": attrs.get("external_reference"),
}
)
# Deduplicate by description+source
seen = set()
unique_phenos = []
for p in phenotypes:
key = (p["description"], p["source"])
if key not in seen:
seen.add(key)
unique_phenos.append(p)
return {
"status": "success",
"data": {
"gene": gene,
"species": species,
"phenotype_count": len(unique_phenos),
"phenotypes": unique_phenos[:200],
},
"metadata": {
"source": "Ensembl REST API",
"endpoint": f"phenotype/gene/{species}/{gene}",
},
}
[docs]
def _phenotype_region(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get phenotype associations for a genomic region."""
species = arguments.get("species", "homo_sapiens")
region = arguments.get("region", "")
feature_type = arguments.get("feature_type")
if not region:
return {
"status": "error",
"error": "region is required (e.g., '17:7661779-7687538')",
}
url = f"{ENSEMBL_BASE_URL}/phenotype/region/{species}/{region}"
params = {"content-type": "application/json"}
if feature_type:
params["feature_type"] = feature_type
response = requests.get(
url, params=params, headers=ENSEMBL_HEADERS, timeout=self.timeout
)
response.raise_for_status()
raw = response.json()
if not isinstance(raw, list):
raw = []
# The region endpoint returns entries with nested phenotype_associations
phenotypes = []
seen = set()
for entry in raw:
variant_id = entry.get("id", "")
# Each entry has a list of phenotype_associations
for assoc in entry.get("phenotype_associations", []):
desc = assoc.get("description", "")
source = assoc.get("source", "")
key = (desc, source, variant_id)
if key not in seen and desc:
seen.add(key)
phenotypes.append(
{
"description": desc,
"source": source,
"id": variant_id,
"location": assoc.get("location"),
"ontology_accessions": assoc.get("ontology_accessions", []),
}
)
return {
"status": "success",
"data": {
"region": region,
"species": species,
"phenotype_count": len(phenotypes),
"phenotypes": phenotypes[:200],
},
"metadata": {
"source": "Ensembl REST API",
"endpoint": f"phenotype/region/{species}/{region}",
},
}
[docs]
def _phenotype_variant(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Get phenotype associations for a variant via the variation endpoint."""
species = arguments.get("species", "homo_sapiens")
variant_id = arguments.get("variant_id", "")
if not variant_id:
return {
"status": "error",
"error": "variant_id is required (e.g., 'rs429358')",
}
# Use the variation endpoint with phenotypes=1
url = f"{ENSEMBL_BASE_URL}/variation/{species}/{variant_id}"
params = {"content-type": "application/json", "phenotypes": 1}
response = requests.get(
url, params=params, headers=ENSEMBL_HEADERS, timeout=self.timeout
)
response.raise_for_status()
raw = response.json()
pheno_list = raw.get("phenotypes", [])
if not isinstance(pheno_list, list):
pheno_list = []
phenotypes = []
for entry in pheno_list:
phenotypes.append(
{
"trait": entry.get("trait", ""),
"source": entry.get("source", ""),
"risk_allele": entry.get("risk_allele"),
"pvalue": entry.get("pvalue"),
"beta_coefficient": entry.get("beta_coefficient"),
"study": entry.get("study"),
"genes": entry.get("genes"),
"ontology_accessions": entry.get("ontology_accessions", []),
}
)
# Limit to top 200 (can be very large for well-studied variants)
return {
"status": "success",
"data": {
"variant_id": variant_id,
"species": species,
"phenotype_count": len(phenotypes),
"phenotypes": phenotypes[:200],
},
"metadata": {
"source": "Ensembl REST API",
"endpoint": f"variation/{species}/{variant_id}?phenotypes=1",
},
}