# Source code for tooluniverse.ensembl_map_tool
# ensembl_map_tool.py
"""
Ensembl Assembly Mapping tool for ToolUniverse.
Provides coordinate conversion between genome assemblies (e.g., GRCh37 to GRCh38)
and mapping of protein/cDNA positions to genomic coordinates.
API: https://rest.ensembl.org
No authentication required.
"""
import re
from typing import Dict, Any
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool
# Root URL of the Ensembl REST service; endpoint paths are appended to it.
ENSEMBL_BASE_URL = "https://rest.ensembl.org"

# HTTP headers passed along with Ensembl requests: a client identifier and a
# request for JSON responses.
ENSEMBL_HEADERS = {
    "User-Agent": "ToolUniverse/1.0",
    "Accept": "application/json",
}
@register_tool("EnsemblMapTool")
class EnsemblMapTool(BaseTool):
    """
    Tool for Ensembl coordinate mapping operations.

    Supports:
    - Assembly-to-assembly coordinate conversion (GRCh37 <-> GRCh38)
    - Protein position to genomic coordinate mapping
    - cDNA position to genomic coordinate mapping

    Which operation an instance performs is chosen by ``fields.endpoint`` in
    the tool config: ``"assembly_map"`` (the default) or
    ``"translate_coords"``.

    No authentication required.
    """

    # Ensembl stable IDs are laid out as "ENS" + optional species prefix +
    # feature-type letter(s) + digits.  A protein ID is one whose letter
    # immediately before the digits is "P".  A plain startswith("ENSP") check
    # is wrong in both directions: it misses ENSMUSP... (mouse protein) and
    # wrongly accepts ENSPTRT... (chimpanzee transcript).
    _PROTEIN_ID_RE = re.compile(r"^ENS[A-Z]*P\d")

    def __init__(self, tool_config: Dict[str, Any]):
        """Read the request timeout and the target endpoint from *tool_config*.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds, default 90)
                and ``fields.endpoint`` (default ``"assembly_map"``) are the
                keys read here; the rest is handled by ``BaseTool``.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 90)
        # ``or {}`` also covers an explicit ``"fields": None`` in the config.
        fields = tool_config.get("fields") or {}
        self.endpoint = fields.get("endpoint", "assembly_map")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the Ensembl mapping API call.

        Never raises: every failure is converted into an ``{"error": ...}``
        dictionary so callers always receive a dict.

        Args:
            arguments: Endpoint-specific arguments (see ``_assembly_map`` and
                ``_translate_coords``).

        Returns:
            The endpoint result, or ``{"error": <message>}`` on failure.
        """
        try:
            return self._query(arguments)
        except requests.exceptions.Timeout:
            return {"error": f"Ensembl API timed out after {self.timeout}s"}
        except requests.exceptions.ConnectionError:
            return {"error": "Failed to connect to Ensembl REST API"}
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code if e.response is not None else "unknown"
            text = ""
            if e.response is not None:
                try:
                    # Prefer the "error" field of the JSON error body.
                    text = e.response.json().get("error", "")
                except Exception:
                    # Body is not a JSON object; fall back to a short excerpt.
                    text = e.response.text[:200]
            return {"error": f"Ensembl API HTTP {status}: {text}"}
        except Exception as e:
            return {"error": f"Unexpected error: {str(e)}"}

    def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to the handler selected by ``self.endpoint``."""
        if self.endpoint == "assembly_map":
            return self._assembly_map(arguments)
        if self.endpoint == "translate_coords":
            return self._translate_coords(arguments)
        return {"error": f"Unknown endpoint: {self.endpoint}"}

    @staticmethod
    def _is_missing(value: Any) -> bool:
        """Return True when *value* is ``None`` or a blank string."""
        return value is None or (isinstance(value, str) and not value.strip())

    @staticmethod
    def _segment(value: Any) -> str:
        """Percent-encode *value* for use as a single URL path component.

        ``safe=""`` makes ``/`` get encoded too, so a caller-supplied value
        cannot add path segments or a query string to the request URL.
        """
        return quote(str(value).strip(), safe="")

    def _get_json(self, path: str) -> Any:
        """GET ``ENSEMBL_BASE_URL + path`` and return the decoded JSON body.

        Raises:
            requests.exceptions.RequestException: on timeout, connection
                failure, or a non-2xx status (via ``raise_for_status``).
            ValueError: if the response body is not valid JSON.
        """
        response = requests.get(
            f"{ENSEMBL_BASE_URL}{path}",
            params={"content-type": "application/json"},
            headers=ENSEMBL_HEADERS,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()

    def _assembly_map(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Convert coordinates between genome assemblies.

        Args:
            arguments: Must contain ``source_assembly``, ``chromosome``,
                ``start``, ``end`` and ``target_assembly``; ``species`` is
                optional and defaults to ``"human"``.

        Returns:
            ``{"data": {"mappings": [{"original": ..., "mapped": ...}, ...]},
            "metadata": {...}}``, or ``{"error": ...}`` when a required
            argument is missing.
        """
        species = arguments.get("species") or "human"
        source_asm = arguments.get("source_assembly")
        chrom = arguments.get("chromosome")
        start = arguments.get("start")
        end = arguments.get("end")
        target_asm = arguments.get("target_assembly")

        required = (source_asm, chrom, start, end, target_asm)
        if any(self._is_missing(value) for value in required):
            return {
                "error": "source_assembly, chromosome, start, end, and target_assembly are all required."
            }

        seg = self._segment
        region = f"{seg(chrom)}:{seg(start)}..{seg(end)}"
        data = self._get_json(
            f"/map/{seg(species)}/{seg(source_asm)}/{region}/{seg(target_asm)}"
        )

        # ``or []`` tolerates a response where "mappings" is present but null.
        mappings = data.get("mappings") or []
        return {
            "data": {
                "mappings": [
                    {
                        "original": m.get("original", {}),
                        "mapped": m.get("mapped", {}),
                    }
                    for m in mappings
                ]
            },
            "metadata": {
                "source": "Ensembl REST API (rest.ensembl.org)",
                "total_mappings": len(mappings),
            },
        }

    def _translate_coords(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Map protein or cDNA positions to genomic coordinates.

        The coordinate type is derived from ``ensembl_id``: IDs recognised as
        Ensembl protein IDs use the ``translation`` mapping, everything else
        (transcript IDs and unrecognised IDs) is treated as cDNA.

        Args:
            arguments: Must contain ``ensembl_id``, ``start`` and ``end``.

        Returns:
            ``{"data": {"mappings": [...]}, "metadata": {...}}`` where each
            mapping carries ``seq_region_name``, ``start``, ``end``,
            ``strand`` and ``coord_system``; or ``{"error": ...}`` when a
            required argument is missing.
        """
        raw_id = arguments.get("ensembl_id")
        start = arguments.get("start")
        end = arguments.get("end")
        if any(self._is_missing(value) for value in (raw_id, start, end)):
            return {"error": "ensembl_id, start, and end are all required."}

        ensembl_id = str(raw_id).strip()
        # Protein IDs go to the "translation" endpoint; anything else
        # defaults to cDNA.
        if self._PROTEIN_ID_RE.match(ensembl_id):
            coord_type = "translation"
        else:
            coord_type = "cdna"

        seg = self._segment
        data = self._get_json(
            f"/map/{coord_type}/{seg(ensembl_id)}/{seg(start)}..{seg(end)}"
        )

        # ``or []`` tolerates a response where "mappings" is present but null.
        mappings = data.get("mappings") or []
        return {
            "data": {
                "mappings": [
                    {
                        "seq_region_name": m.get("seq_region_name"),
                        "start": m.get("start"),
                        "end": m.get("end"),
                        "strand": m.get("strand"),
                        "coord_system": m.get("coord_system"),
                    }
                    for m in mappings
                ]
            },
            "metadata": {
                "source": "Ensembl REST API (rest.ensembl.org)",
                "query_id": ensembl_id,
                "coordinate_type": "protein" if coord_type == "translation" else "cDNA",
                "total_mappings": len(mappings),
            },
        }