Source code for tooluniverse.ensembl_vep_tool
# ensembl_vep_tool.py
"""
Ensembl VEP (Variant Effect Predictor) and Variant Recoder API tools for ToolUniverse.
Ensembl VEP predicts functional consequences of genetic variants including
impact on genes, transcripts, and proteins, with SIFT/PolyPhen scores.
The Variant Recoder converts between variant identifier formats.
API: https://rest.ensembl.org/
No authentication required. Rate limited to 15 requests/second.
"""
import requests
from typing import Dict, Any
from .base_tool import BaseTool
from .tool_registry import register_tool
ENSEMBL_BASE_URL = "https://rest.ensembl.org"
ENSEMBL_HEADERS = {"User-Agent": "ToolUniverse/1.0", "Accept": "application/json"}
[docs]
@register_tool("EnsemblVEPTool")
class EnsemblVEPTool(BaseTool):
"""
Tool for Ensembl VEP variant annotation and Variant Recoder ID conversion.
Supports three modes:
- vep_hgvs: Annotate variants using HGVS notation (e.g., BRAF:p.Val600Glu)
- vep_id: Annotate variants using dbSNP rsID (e.g., rs7903146)
- variant_recoder: Convert variant IDs between formats (rsID -> HGVS, SPDI)
No authentication required.
"""
[docs]
def __init__(self, tool_config: Dict[str, Any]):
super().__init__(tool_config)
self.timeout = tool_config.get("timeout", 30)
fields = tool_config.get("fields", {})
self.mode = fields.get("mode", "vep_hgvs")
[docs]
def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Execute the Ensembl VEP or Variant Recoder API call."""
try:
return self._query(arguments)
except requests.exceptions.Timeout:
return {
"error": f"Ensembl API request timed out after {self.timeout} seconds"
}
except requests.exceptions.ConnectionError:
return {
"error": "Failed to connect to Ensembl REST API. Check network connectivity."
}
except requests.exceptions.HTTPError as e:
return {
"error": f"Ensembl API HTTP error: {e.response.status_code} - {e.response.text[:200]}"
}
except Exception as e:
return {"error": f"Unexpected error querying Ensembl: {str(e)}"}
[docs]
def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Route to appropriate Ensembl endpoint based on mode."""
if self.mode == "vep_hgvs":
return self._vep_hgvs(arguments)
elif self.mode == "vep_id":
return self._vep_id(arguments)
elif self.mode == "variant_recoder":
return self._variant_recoder(arguments)
else:
return {"error": f"Unknown mode: {self.mode}"}
[docs]
def _vep_hgvs(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Annotate a variant using HGVS notation."""
hgvs = arguments.get("hgvs_notation", "")
if not hgvs:
return {"error": "hgvs_notation parameter is required"}
species = arguments.get("species", "human")
url = f"{ENSEMBL_BASE_URL}/vep/{species}/hgvs/{hgvs}"
params = {"content-type": "application/json"}
response = requests.get(
url, params=params, headers=ENSEMBL_HEADERS, timeout=self.timeout
)
response.raise_for_status()
data = response.json()
if isinstance(data, list) and data:
result = data[0]
return {
"data": self._format_vep_result(result),
"metadata": {
"source": "Ensembl VEP",
"species": species,
"input": hgvs,
"api_version": "REST",
},
}
return {
"data": {},
"metadata": {
"source": "Ensembl VEP",
"species": species,
"input": hgvs,
"num_results": 0,
},
}
[docs]
def _vep_id(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Annotate a variant using dbSNP rsID."""
variant_id = arguments.get("variant_id", "")
if not variant_id:
return {"error": "variant_id parameter is required"}
species = arguments.get("species", "human")
url = f"{ENSEMBL_BASE_URL}/vep/{species}/id/{variant_id}"
params = {"content-type": "application/json"}
response = requests.get(
url, params=params, headers=ENSEMBL_HEADERS, timeout=self.timeout
)
response.raise_for_status()
data = response.json()
if isinstance(data, list) and data:
result = data[0]
return {
"data": self._format_vep_result(result),
"metadata": {
"source": "Ensembl VEP",
"species": species,
"input": variant_id,
"api_version": "REST",
},
}
return {
"data": {},
"metadata": {
"source": "Ensembl VEP",
"species": species,
"input": variant_id,
"num_results": 0,
},
}
[docs]
def _variant_recoder(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
"""Convert variant identifiers between formats."""
variant_id = arguments.get("variant_id", "")
if not variant_id:
return {"error": "variant_id parameter is required"}
species = arguments.get("species", "human")
url = f"{ENSEMBL_BASE_URL}/variant_recoder/{species}/{variant_id}"
params = {"content-type": "application/json"}
response = requests.get(
url, params=params, headers=ENSEMBL_HEADERS, timeout=self.timeout
)
response.raise_for_status()
data = response.json()
# Variant recoder returns list of dicts, each with allele keys
alleles = []
if isinstance(data, list):
for entry in data:
for allele_key, allele_info in entry.items():
if allele_key == "warnings":
continue
if isinstance(allele_info, dict):
alleles.append(
{
"allele": allele_key,
"input": allele_info.get("input", variant_id),
"id": allele_info.get("id", []),
"hgvsg": allele_info.get("hgvsg", []),
"hgvsc": allele_info.get("hgvsc", []),
"hgvsp": allele_info.get("hgvsp", []),
"spdi": allele_info.get("spdi", []),
}
)
return {
"data": alleles,
"metadata": {
"source": "Ensembl Variant Recoder",
"species": species,
"input": variant_id,
"num_alleles": len(alleles),
},
}
[docs]
def _format_vep_result(self, result: Dict) -> Dict[str, Any]:
"""Format a VEP result to extract key information."""
# Extract most informative transcript consequences
transcript_consequences = []
for tc in result.get("transcript_consequences", []):
formatted = {
"gene_symbol": tc.get("gene_symbol"),
"gene_id": tc.get("gene_id"),
"transcript_id": tc.get("transcript_id"),
"biotype": tc.get("biotype"),
"consequence_terms": tc.get("consequence_terms", []),
"impact": tc.get("impact"),
"amino_acids": tc.get("amino_acids"),
"codons": tc.get("codons"),
"protein_start": tc.get("protein_start"),
"protein_end": tc.get("protein_end"),
"sift_prediction": tc.get("sift_prediction"),
"sift_score": tc.get("sift_score"),
"polyphen_prediction": tc.get("polyphen_prediction"),
"polyphen_score": tc.get("polyphen_score"),
"strand": tc.get("strand"),
}
# Remove None values for cleaner output
formatted = {k: v for k, v in formatted.items() if v is not None}
transcript_consequences.append(formatted)
# Extract colocated variants (known variants at this position)
colocated = []
for cv in result.get("colocated_variants", []):
colocated.append(
{
"id": cv.get("id"),
"allele_string": cv.get("allele_string"),
"frequencies": cv.get("frequencies"),
}
)
return {
"input": result.get("input") or result.get("id"),
"assembly_name": result.get("assembly_name"),
"seq_region_name": result.get("seq_region_name"),
"start": result.get("start"),
"end": result.get("end"),
"strand": result.get("strand"),
"allele_string": result.get("allele_string"),
"most_severe_consequence": result.get("most_severe_consequence"),
"transcript_consequences": transcript_consequences,
"colocated_variants": colocated[:10], # Limit to avoid huge responses
}