# Source code for tooluniverse.ensembl_overlap_tool
# ensembl_overlap_tool.py
"""
Ensembl Overlap tool for ToolUniverse.
The Ensembl Overlap API retrieves genomic features (genes, transcripts,
regulatory elements, variants, repeats) overlapping a given genomic region
or gene. This is fundamental for understanding what functional elements
exist in a region of interest.
API: https://rest.ensembl.org/overlap/
No authentication required.
"""
from typing import Dict, Any
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool
# Root URL that every Ensembl REST request in this module is built from.
ENSEMBL_REST_BASE = "https://rest.ensembl.org"

# Headers sent with each request: identify the client and ask for JSON.
ENSEMBL_HEADERS = {
    "User-Agent": "ToolUniverse/1.0",
    "Accept": "application/json",
}
@register_tool("EnsemblOverlapTool")
class EnsemblOverlapTool(BaseTool):
    """
    Tool for querying the Ensembl Overlap API.

    Supports:
    - Get features overlapping a genomic region (genes, transcripts, regulatory)
    - Get features overlapping an Ensembl gene ID

    The endpoint is selected by ``tool_config["fields"]["endpoint"]``
    (``"region"`` or ``"gene_id"``; defaults to ``"region"``).
    No authentication required.
    """

    # At most this many features are returned in "features"; the full count
    # is still reported in metadata["total_features"] and "type_summary".
    MAX_FEATURES = 100
    # Descriptions are truncated to this many characters.
    MAX_DESCRIPTION_LENGTH = 200
    # Feature type requested when the caller does not supply any.
    DEFAULT_FEATURE_TYPES = "gene"
    # Keys copied verbatim (None when absent) from every returned item.
    _CORE_KEYS = (
        "feature_type",
        "id",
        "start",
        "end",
        "strand",
        "seq_region_name",
        "biotype",
        "source",
    )

    def __init__(self, tool_config: Dict[str, Any]):
        """Read the request timeout and the target endpoint from *tool_config*."""
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 90)
        # Tolerate an explicit ``"fields": None`` in the configuration.
        fields = tool_config.get("fields") or {}
        self.endpoint = fields.get("endpoint", "region")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the Ensembl Overlap API call.

        Never raises: every failure is converted into ``{"error": "..."}``.
        On success the result has ``"data"`` and ``"metadata"`` keys.
        """
        try:
            return self._query(arguments)
        except requests.exceptions.Timeout:
            return {
                "error": f"Ensembl API timed out after {self.timeout}s. Try a smaller region."
            }
        except requests.exceptions.ConnectionError:
            return {"error": "Failed to connect to Ensembl REST API"}
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code if e.response is not None else "unknown"
            if status == 400:
                return {
                    "error": "Bad request. Check region format (e.g., '17:7661779-7687546') and feature types."
                }
            if status == 404:
                return {
                    "error": "Region or gene not found. Verify species and coordinates."
                }
            return {"error": f"Ensembl REST API HTTP {status}"}
        except Exception as e:
            # Last-resort boundary so the tool always returns a dict.
            return {"error": f"Unexpected error: {str(e)}"}

    def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to the handler for the configured endpoint."""
        if self.endpoint == "region":
            return self._overlap_region(arguments)
        if self.endpoint == "gene_id":
            return self._overlap_gene(arguments)
        return {"error": f"Unknown endpoint: {self.endpoint}"}

    def _overlap_region(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get features overlapping a genomic region.

        Reads ``region`` (required), ``species`` (default ``"human"``) and
        ``feature_types`` (default ``"gene"``) from *arguments*.
        """
        species = arguments.get("species") or "human"
        region = arguments.get("region", "")
        if not region:
            return {
                "error": "region is required (format: 'chr:start-end', e.g., '17:7661779-7687546')."
            }
        # Percent-encode caller-supplied path segments so characters such as
        # '/', '?' or '#' cannot alter the request target. ':' stays literal
        # because it is part of the region syntax.
        path = (
            f"/overlap/region/{quote(str(species), safe='')}"
            f"/{quote(str(region), safe=':')}"
        )
        return self._fetch_and_format(
            path, arguments, {"region": region, "species": species}
        )

    def _overlap_gene(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get features overlapping an Ensembl gene ID.

        Reads ``gene_id`` (required) and ``feature_types`` (default
        ``"gene"``) from *arguments*. Returned features additionally carry
        ``transcript_id`` when the API item has one.
        """
        gene_id = arguments.get("gene_id", "")
        if not gene_id:
            return {
                "error": "gene_id is required (Ensembl gene ID, e.g., 'ENSG00000141510' for TP53)."
            }
        path = f"/overlap/id/{quote(str(gene_id), safe='')}"
        return self._fetch_and_format(
            path, arguments, {"gene_id": gene_id}, extra_keys=("transcript_id",)
        )

    def _parse_feature_types(self, feature_types: Any) -> list:
        """Normalize *feature_types* into a list of non-empty names.

        Accepts a comma-separated string or an iterable of names; a missing
        or empty value falls back to ``DEFAULT_FEATURE_TYPES``. Blank entries
        (e.g. from a trailing comma) are dropped.
        """
        if not feature_types:
            feature_types = self.DEFAULT_FEATURE_TYPES
        if isinstance(feature_types, str):
            candidates = feature_types.split(",")
        else:
            candidates = list(feature_types)
        stripped = (str(candidate).strip() for candidate in candidates)
        return [name for name in stripped if name]

    def _format_feature(self, item: Dict[str, Any], extra_keys: tuple = ()) -> Dict[str, Any]:
        """Reduce one API item to the fields this tool reports.

        Core keys are always present (``None`` when missing); the optional
        ``external_name``, ``description`` (truncated) and any *extra_keys*
        are included only when the item has a truthy value for them.
        """
        feature = {key: item.get(key) for key in self._CORE_KEYS}
        if item.get("external_name"):
            feature["external_name"] = item["external_name"]
        if item.get("description"):
            feature["description"] = item["description"][: self.MAX_DESCRIPTION_LENGTH]
        for key in extra_keys:
            if item.get(key):
                feature[key] = item[key]
        return feature

    def _fetch_and_format(
        self,
        path: str,
        arguments: Dict[str, Any],
        identity: Dict[str, Any],
        extra_keys: tuple = (),
    ) -> Dict[str, Any]:
        """Call an overlap endpoint and build the tool result.

        Args:
            path: Already-encoded URL path below ``ENSEMBL_REST_BASE``.
            arguments: Tool arguments; only ``feature_types`` is read here.
            identity: Query-identifying entries placed first in ``"data"``.
            extra_keys: Optional per-feature keys to copy when present.

        Returns:
            ``{"data": ..., "metadata": ...}`` on success, or
            ``{"error": ...}`` when no feature type is given or the response
            is not a JSON list.

        Raises:
            requests.exceptions.RequestException: network/HTTP failures;
                ``run`` converts these into error dicts.
        """
        features = self._parse_feature_types(arguments.get("feature_types"))
        if not features:
            return {
                "error": "feature_types must name at least one feature type (e.g., 'gene' or 'gene,transcript')."
            }

        # Use semicolon-separated params for Ensembl REST; the query string
        # is assembled by hand to keep that separator.
        feature_str = ";".join(f"feature={quote(name, safe='')}" for name in features)
        url = f"{ENSEMBL_REST_BASE}{path}?{feature_str};content-type=application/json"

        response = requests.get(url, headers=ENSEMBL_HEADERS, timeout=self.timeout)
        response.raise_for_status()
        data = response.json()
        if not isinstance(data, list):
            return {"error": "Unexpected response format from Ensembl API."}

        # Count every item by feature type (not just the returned slice).
        type_summary: Dict[str, int] = {}
        for item in data:
            feature_type = item.get("feature_type", "unknown")
            type_summary[feature_type] = type_summary.get(feature_type, 0) + 1

        formatted_features = [
            self._format_feature(item, extra_keys)
            for item in data[: self.MAX_FEATURES]
        ]

        return {
            "data": {
                **identity,
                "features": formatted_features,
                "type_summary": type_summary,
            },
            "metadata": {
                "source": "Ensembl REST API (rest.ensembl.org)",
                "total_features": len(data),
                "returned": len(formatted_features),
            },
        }