# Source code for tooluniverse.ensembl_regulation_tool
# ensembl_regulation_tool.py
"""
Ensembl REST API Regulation and Conservation tool for ToolUniverse.
Provides access to:
- Transcription factor binding motif features (motif instances in genomic regions)
- Evolutionarily constrained elements (regions under purifying selection)
- Binding matrix details (position weight matrices for TF binding)
These endpoints complement the existing ensembl_get_regulatory_features tool
by adding TF-specific binding data and evolutionary conservation scores.
API: https://rest.ensembl.org/
No authentication required. Rate limit: 15 requests/second.
"""
import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool
# Root URL of the Ensembl REST service; every endpoint path used in this
# module is appended to it.
ENSEMBL_BASE_URL = "https://rest.ensembl.org"
# Headers passed to every requests.get() call in this module: identifies the
# client and asks for JSON responses.
ENSEMBL_HEADERS = {"User-Agent": "ToolUniverse/1.0", "Accept": "application/json"}
# [docs] (Sphinx viewcode anchor left over from HTML extraction)
@register_tool("EnsemblRegulationTool")
class EnsemblRegulationTool(BaseTool):
    """
    Tool for querying regulatory and conservation features from Ensembl REST API.

    Provides TF binding motifs, constrained elements, and binding matrices.
    No authentication required.

    Which endpoint an instance serves is fixed at construction time by
    ``tool_config["fields"]["endpoint_type"]`` (one of ``"motif_features"``,
    ``"constrained_elements"`` or ``"binding_matrix"``).

    ``run`` always returns a dict: ``{"data": ..., "metadata": ...}`` on
    success, or ``{"error": "<message>"}`` on failure.
    """

    # Maximum number of features included in a single overlap response; the
    # accompanying ``*_count`` field still reports the untruncated total.
    _MAX_FEATURES = 200

    def __init__(self, tool_config: Dict[str, Any]):
        """
        Store the request timeout and the endpoint this instance dispatches to.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds) defaults to
                30; ``fields.endpoint_type`` defaults to ``"motif_features"``.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        self.endpoint_type = tool_config.get("fields", {}).get(
            "endpoint_type", "motif_features"
        )

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute the Ensembl Regulation API call.

        Args:
            arguments: Call arguments forwarded to the endpoint handler.

        Returns:
            The handler's result dict, or ``{"error": ...}`` when the request
            times out, cannot connect, returns an HTTP error status, or any
            other exception is raised. Exceptions are never propagated.
        """
        try:
            return self._dispatch(arguments)
        except requests.exceptions.Timeout:
            return {
                "error": f"Ensembl REST API request timed out after {self.timeout}s"
            }
        except requests.exceptions.ConnectionError:
            return {"error": "Failed to connect to Ensembl REST API"}
        except requests.exceptions.HTTPError as e:
            # A requests.Response is falsy for every 4xx/5xx status (its
            # __bool__ returns .ok), so the response must be compared against
            # None; a truthiness test would always report "unknown".
            response = e.response
            status = response.status_code if response is not None else "unknown"
            if status == 400:
                if self.endpoint_type == "binding_matrix":
                    return {"error": "Bad request: check binding_matrix_id and species"}
                return {"error": "Bad request: check region format (chr:start-end)"}
            return {"error": f"Ensembl REST API HTTP error: {status}"}
        except Exception as e:
            return {"error": f"Unexpected error: {str(e)}"}

    def _dispatch(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to the handler matching ``self.endpoint_type``."""
        if self.endpoint_type == "motif_features":
            return self._motif_features(arguments)
        if self.endpoint_type == "constrained_elements":
            return self._constrained_elements(arguments)
        if self.endpoint_type == "binding_matrix":
            return self._binding_matrix(arguments)
        return {"error": f"Unknown endpoint_type: {self.endpoint_type}"}

    def _get_json(self, path: str, **query: str) -> Any:
        """
        GET ``ENSEMBL_BASE_URL/<path>`` and return the decoded JSON body.

        Args:
            path: Endpoint path relative to the base URL (no leading slash).
            **query: Extra query-string parameters; ``content-type`` is
                always appended.

        Raises:
            requests.exceptions.RequestException: On timeout, connection
                failure, or a 4xx/5xx status (via ``raise_for_status``).
        """
        params = {**query, "content-type": "application/json"}
        response = requests.get(
            f"{ENSEMBL_BASE_URL}/{path}",
            params=params,
            headers=ENSEMBL_HEADERS,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    def _overlap_features(self, species: str, region: str, feature: str) -> list:
        """
        Fetch ``overlap/region`` features of one type.

        Returns the decoded list, or an empty list when the payload is not a
        JSON array.
        """
        payload = self._get_json(
            f"overlap/region/{species}/{region}", feature=feature
        )
        return payload if isinstance(payload, list) else []

    def _motif_features(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get TF binding motif features in a genomic region.

        Args:
            arguments: ``region`` (required, e.g. ``'7:140424943-140524564'``)
                and ``species`` (optional; a missing or empty value falls
                back to ``homo_sapiens``).

        Returns:
            ``{"data", "metadata"}`` where ``motif_count`` is the total number
            of motifs received and ``motif_features`` holds at most
            ``_MAX_FEATURES`` of them; or ``{"error": ...}`` if ``region`` is
            missing.
        """
        # "or" rather than a .get() default so an explicit None/"" also
        # falls back instead of being interpolated into the URL.
        species = arguments.get("species") or "homo_sapiens"
        region = arguments.get("region", "")
        if not region:
            return {"error": "region is required (e.g., '7:140424943-140524564')"}

        raw = self._overlap_features(species, region, "motif")
        motifs = [
            {
                "stable_id": entry.get("stable_id", ""),
                "transcription_factor_complex": entry.get(
                    "transcription_factor_complex"
                ),
                "binding_matrix_stable_id": entry.get("binding_matrix_stable_id"),
                "score": entry.get("score"),
                "start": entry.get("start", 0),
                "end": entry.get("end", 0),
                "strand": entry.get("strand", 0),
                "seq_region_name": entry.get("seq_region_name", ""),
            }
            for entry in raw
        ]
        return {
            "data": {
                "region": region,
                "species": species,
                "motif_count": len(motifs),
                "motif_features": motifs[: self._MAX_FEATURES],
            },
            "metadata": {
                "source": "Ensembl REST API",
                "endpoint": f"overlap/region/{species}/{region}?feature=motif",
            },
        }

    def _constrained_elements(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get evolutionarily constrained elements in a genomic region.

        Args:
            arguments: ``region`` (required, e.g. ``'17:7661779-7687538'``)
                and ``species`` (optional; a missing or empty value falls
                back to ``homo_sapiens``).

        Returns:
            ``{"data", "metadata"}`` where ``element_count`` is the total
            number of elements received and ``constrained_elements`` holds
            the ``_MAX_FEATURES`` highest-scoring ones, sorted by score in
            descending order; or ``{"error": ...}`` if ``region`` is missing.
        """
        species = arguments.get("species") or "homo_sapiens"
        region = arguments.get("region", "")
        if not region:
            return {"error": "region is required (e.g., '17:7661779-7687538')"}

        raw = self._overlap_features(species, region, "constrained")
        elements = [
            {
                "id": entry.get("ID"),
                "start": entry.get("start", 0),
                "end": entry.get("end", 0),
                "score": entry.get("score", 0),
                "strand": entry.get("strand", 0),
                "seq_region_name": entry.get("seq_region_name", ""),
            }
            for entry in raw
        ]
        # Sort by score descending. A null score is ranked as 0 so that a
        # None value cannot make the comparison raise TypeError.
        elements.sort(key=lambda element: element["score"] or 0, reverse=True)
        return {
            "data": {
                "region": region,
                "species": species,
                "element_count": len(elements),
                "constrained_elements": elements[: self._MAX_FEATURES],
            },
            "metadata": {
                "source": "Ensembl Compara",
                "endpoint": f"overlap/region/{species}/{region}?feature=constrained",
            },
        }

    def _binding_matrix(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get a TF binding matrix by stable ID.

        Args:
            arguments: ``binding_matrix_id`` (required, e.g. ``'ENSPFM0320'``)
                and ``species`` (optional; a missing or empty value falls
                back to ``homo_sapiens``).

        Returns:
            ``{"data", "metadata"}`` describing the matrix, with
            ``associated_tfs`` flattened to a list of names; or
            ``{"error": ...}`` if the ID is missing or the payload is not a
            JSON object.
        """
        species = arguments.get("species") or "homo_sapiens"
        matrix_id = arguments.get("binding_matrix_id", "")
        if not matrix_id:
            return {"error": "binding_matrix_id is required (e.g., 'ENSPFM0320')"}

        raw = self._get_json(f"species/{species}/binding_matrix/{matrix_id}")
        if not isinstance(raw, dict):
            return {
                "error": "Unexpected response format from Ensembl binding_matrix endpoint"
            }

        # Extract associated TF names; entries may be dicts carrying a
        # "name" or plain values. "or []" also covers an explicit null.
        tfs = [
            tf.get("name", str(tf)) if isinstance(tf, dict) else str(tf)
            for tf in raw.get("associated_transcription_factor_complexes") or []
        ]
        return {
            "data": {
                "stable_id": raw.get("stable_id", matrix_id),
                "name": raw.get("name"),
                "source": raw.get("source"),
                "length": raw.get("length", 0),
                "threshold": raw.get("threshold"),
                "unit": raw.get("unit"),
                "max_position_sum": raw.get("max_position_sum"),
                "elements_string": raw.get("elements_string"),
                "associated_tfs": tfs,
                "matrix": raw.get("elements", {}),
            },
            "metadata": {
                "source": "Ensembl REST API",
                "endpoint": f"species/{species}/binding_matrix/{matrix_id}",
            },
        }