# Source code for tooluniverse.vep_pathogenicity_tool
"""
Ensembl VEP variant pathogenicity predictor tool for ToolUniverse.
The public Ensembl VEP (Variant Effect Predictor) REST endpoint can return
per-transcript missense pathogenicity predictions in a single no-key call:
  - AlphaMissense (DeepMind, 2023): am_pathogenicity (0-1) + am_class
    (likely_benign / ambiguous / likely_pathogenic)
  - SIFT: prediction (tolerated / deleterious) + numeric score
  - PolyPhen-2: prediction (benign / possibly_damaging / probably_damaging)
    + numeric score
  - Most-severe consequence term for the variant
The existing ToolUniverse EnsemblVEP_* tools return only basic consequence
terms and do NOT surface these predictor scores. This tool fills that gap,
giving "one call -> many predictor scores" for variant interpretation,
similar in spirit to MyVariant.info but sourced live from Ensembl with the
AlphaMissense plugin pre-enabled.
Key facts about the public endpoint:

  - No authentication / API key required.
  - GRCh38 host: https://rest.ensembl.org
  - GRCh37 host: https://grch37.rest.ensembl.org (AlphaMissense also available)
  - content-type MUST be passed as a query parameter (?content-type=...), not
    a header, for this endpoint family.
  - AlphaMissense=1 is the plugin flag that is server-enabled on the public
    REST service. CADD / dbNSFP plugin flags are NOT server-enabled and will
    error, so they are intentionally not exposed here (use the dedicated CADD
    tool for CADD scores).
  - Transient HTTP 500 / non-JSON HTML responses can occur; these are caught
    and reported as a clean error rather than raising.
API documentation: https://rest.ensembl.org/documentation/info/vep_id
AlphaMissense: Cheng et al., Science 2023.
"""
import time
from typing import Any, Dict, List, Optional
from urllib.parse import quote
import requests
from .base_tool import BaseTool
from .tool_registry import register_tool
# Base URL of the public Ensembl REST service for each supported human
# assembly, keyed by the canonical assembly name.
REST_HOSTS: Dict[str, str] = {
    "GRCh38": "https://rest.ensembl.org",
    "GRCh37": "https://grch37.rest.ensembl.org",
}
@register_tool("VEPPathogenicityTool")
class VEPPathogenicityTool(BaseTool):
    """
    Query Ensembl VEP for missense pathogenicity predictor scores.

    Operations (selected via the `operation` field in the tool config, an
    ``operation`` argument, or inferred from whichever inputs are supplied):

    - predict_by_rsid:   input a dbSNP rsID (e.g. rs699)
    - predict_by_hgvs:   input HGVS notation (e.g. ENST00000269305.9:c.524G>A)
    - predict_by_region: input chrom + pos + alt allele (e.g. 7, 140753336, T)

    Each returns a list of per-transcript predictions plus the variant's
    most-severe consequence. ``run`` always returns a dict whose ``status`` is
    ``"success"`` or ``"error"``; failures are reported through the ``error``
    key instead of being raised.
    """

    def __init__(self, tool_config: Dict[str, Any]):
        """
        Store the tool configuration.

        Args:
            tool_config: Tool definition dict. ``fields.operation`` pins the
                operation this instance performs (empty means "decide per
                call"); ``fields.timeout`` is the request timeout handed to
                ``requests`` (default 30).
        """
        super().__init__(tool_config)
        self.parameter = tool_config.get("parameter", {})
        # ``fields`` may be missing or explicitly null in the config.
        fields = tool_config.get("fields") or {}
        self.operation = fields.get("operation", "")
        # Ensembl REST via python-requests is markedly slower than curl; keep a
        # generous but bounded timeout (must stay <= 30s tool budget).
        self.timeout = fields.get("timeout", 30)

    # ------------------------------------------------------------------ run
    def run(self, arguments: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Execute the configured (or inferred) operation.

        Args:
            arguments: Call arguments. Recognised keys are ``operation``,
                ``rsid`` / ``variant_id``, ``hgvs_notation`` / ``hgvs``,
                ``chrom``, ``pos``, ``alt`` and ``genome_build``.

        Returns:
            A result dict with ``status`` ``"success"`` (plus ``data`` and
            ``metadata``) or ``"error"`` (plus ``error``). Request failures
            and unexpected exceptions are converted to error results.
        """
        arguments = arguments or {}
        operation = self.operation or arguments.get("operation")
        if not operation:
            # Single collapsed tool: route by whichever input was supplied.
            operation = self._infer_operation(arguments)
            if operation is None:
                return {
                    "status": "error",
                    "error": "Provide one of: 'rsid', 'hgvs_notation', "
                    "or ('chrom' + 'pos' + 'alt').",
                }

        handlers = {
            "predict_by_rsid": self._predict_by_rsid,
            "predict_by_hgvs": self._predict_by_hgvs,
            "predict_by_region": self._predict_by_region,
        }
        handler = handlers.get(operation)
        if not handler:
            return {
                "status": "error",
                "error": f"Unknown operation: {operation}. "
                f"Valid operations: {sorted(handlers)}",
            }

        try:
            return handler(arguments)
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": "Ensembl VEP request timed out (server slow or "
                "overloaded); please retry.",
            }
        except requests.exceptions.RequestException as exc:
            return {"status": "error", "error": f"Ensembl VEP request failed: {exc}"}
        except Exception as exc:  # pragma: no cover - defensive catch-all
            return {"status": "error", "error": f"Unexpected error: {exc}"}

    @staticmethod
    def _infer_operation(arguments: Dict[str, Any]) -> Optional[str]:
        """Pick an operation name from the supplied inputs, or None if none fit."""
        if arguments.get("rsid"):
            return "predict_by_rsid"
        if arguments.get("hgvs_notation"):
            return "predict_by_hgvs"
        if (
            arguments.get("chrom")
            and arguments.get("pos") is not None
            and arguments.get("alt")
        ):
            return "predict_by_region"
        # The handlers also accept these alias keys, so route on them too.
        # They are checked last so the primary keys keep their priority.
        if arguments.get("variant_id"):
            return "predict_by_rsid"
        if arguments.get("hgvs"):
            return "predict_by_hgvs"
        return None

    # ----------------------------------------------------------- assemblies
    @staticmethod
    def _resolve_host(arguments: Dict[str, Any]) -> Optional[str]:
        """
        Map the ``genome_build`` argument to an Ensembl REST base URL.

        A missing, ``None`` or empty ``genome_build`` selects GRCh38. Returns
        ``None`` when the build is not recognised.
        """
        # ``or`` (rather than a .get default) so an explicit None/"" also
        # falls back to GRCh38 instead of becoming the string "None".
        build = str(arguments.get("genome_build") or "GRCh38").strip()
        # Accept common aliases.
        aliases = {
            "hg38": "GRCh38",
            "grch38": "GRCh38",
            "hg19": "GRCh37",
            "grch37": "GRCh37",
        }
        build = aliases.get(build.lower(), build)
        return REST_HOSTS.get(build)

    # ----------------------------------------------------------- http call
    def _fetch_vep(self, host: str, path: str) -> Dict[str, Any]:
        """
        Call a VEP endpoint and return either {"records": [...]} on success or
        {"error": "..."} on failure. Path already includes the leading slash.

        Exceptions raised by ``requests`` (timeouts, connection errors) are
        not caught here; ``run`` converts them into error results.
        """
        sep = "&" if "?" in path else "?"
        url = f"{host}{path}{sep}content-type=application/json&AlphaMissense=1"

        # The public Ensembl REST service intermittently returns HTTP 5xx under
        # load. Retry transient server errors a couple of times with a short
        # backoff before giving up. 4xx responses (bad rsID/HGVS) are NOT
        # retried since they are deterministic client errors.
        # Heavy variants (many overlapping transcripts) take ~7-12s via
        # python-requests against the public Ensembl REST service. Allow a
        # generous per-attempt timeout but keep at most 2 attempts plus a short
        # backoff so the worst case (2 x 14s + 1s) stays within the 30s budget.
        per_attempt = min(self.timeout, 14)
        max_attempts = 2
        for attempt in range(1, max_attempts + 1):
            resp = requests.get(url, timeout=per_attempt)
            if resp.status_code < 500 or attempt == max_attempts:
                break
            time.sleep(1.0)

        if resp.status_code >= 400:
            # Ensembl returns JSON errors for parse problems, HTML for 5xx.
            detail = resp.text[:200].replace("\n", " ")
            return {"error": f"Ensembl VEP HTTP {resp.status_code}: {detail}"}

        try:
            data = resp.json()
        except ValueError:
            return {"error": "Ensembl VEP returned a non-JSON response"}
        if not isinstance(data, list):
            return {"error": "Unexpected Ensembl VEP response (expected a list)"}
        return {"records": data}

    # ----------------------------------------------------------- parsing
    @staticmethod
    def _parse_consequences(tc_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Flatten VEP ``transcript_consequences`` entries into prediction dicts.

        Keys absent from an entry come back as ``None``; the nested
        ``alphamissense`` object is unpacked into ``alphamissense_class`` and
        ``alphamissense_pathogenicity``.
        """
        predictions: List[Dict[str, Any]] = []
        for tc in tc_list:
            # Entries without an AlphaMissense annotation lack the key (or
            # carry a falsy value); treat both as "no score".
            am = tc.get("alphamissense") or {}
            predictions.append(
                {
                    "transcript_id": tc.get("transcript_id"),
                    "gene_symbol": tc.get("gene_symbol"),
                    "gene_id": tc.get("gene_id"),
                    "biotype": tc.get("biotype"),
                    "consequence_terms": tc.get("consequence_terms"),
                    "amino_acids": tc.get("amino_acids"),
                    "codons": tc.get("codons"),
                    "impact": tc.get("impact"),
                    "sift_prediction": tc.get("sift_prediction"),
                    "sift_score": tc.get("sift_score"),
                    "polyphen_prediction": tc.get("polyphen_prediction"),
                    "polyphen_score": tc.get("polyphen_score"),
                    "alphamissense_class": am.get("am_class"),
                    "alphamissense_pathogenicity": am.get("am_pathogenicity"),
                }
            )
        return predictions

    def _build_result(
        self, fetched: Dict[str, Any], query: str, host: str
    ) -> Dict[str, Any]:
        """
        Turn the output of ``_fetch_vep`` into the tool's result dict.

        Args:
            fetched: ``{"records": [...]}`` or ``{"error": "..."}``.
            query: Human-readable description of the input, echoed in metadata.
            host: REST base URL that was queried, echoed in metadata.

        Returns:
            An error result if ``fetched`` carries an error; otherwise a
            success result with one entry per VEP record.
        """
        if "error" in fetched:
            return {"status": "error", "error": fetched["error"]}

        records = fetched["records"]
        if not records:
            return {
                "status": "success",
                "data": [],
                "metadata": {
                    "query": query,
                    "source": "Ensembl VEP",
                    # Reported here as well so both success shapes agree.
                    "host": host,
                    "count": 0,
                    "note": "No VEP annotation returned for this input.",
                },
            }

        out: List[Dict[str, Any]] = []
        for rec in records:
            tc_list = rec.get("transcript_consequences", []) or []
            predictions = self._parse_consequences(tc_list)
            # Whether any AlphaMissense score is present for this variant.
            has_am = any(
                p.get("alphamissense_pathogenicity") is not None for p in predictions
            )
            out.append(
                {
                    "input": rec.get("input"),
                    "variant_id": rec.get("id"),
                    "assembly_name": rec.get("assembly_name"),
                    "seq_region_name": rec.get("seq_region_name"),
                    "start": rec.get("start"),
                    "end": rec.get("end"),
                    "allele_string": rec.get("allele_string"),
                    "most_severe_consequence": rec.get("most_severe_consequence"),
                    "has_alphamissense": has_am,
                    "predictions": predictions,
                }
            )
        return {
            "status": "success",
            "data": out,
            "metadata": {
                "query": query,
                "source": "Ensembl VEP",
                "host": host,
                "count": len(out),
                "predictors": ["AlphaMissense", "SIFT", "PolyPhen-2"],
            },
        }

    # ----------------------------------------------------------- operations
    def _predict_by_rsid(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Look up predictions for a variant identifier (``rsid`` or ``variant_id``)."""
        # Normalise before validating so a whitespace-only value is rejected
        # here instead of producing a request to ".../id/".
        rsid = str(arguments.get("rsid") or arguments.get("variant_id") or "").strip()
        if not rsid:
            return {"status": "error", "error": "Missing required parameter: rsid"}
        host = self._resolve_host(arguments)
        if not host:
            return self._bad_build(arguments)
        path = f"/vep/human/id/{quote(rsid, safe='')}"
        fetched = self._fetch_vep(host, path)
        return self._build_result(fetched, rsid, host)

    def _predict_by_hgvs(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Look up predictions for an HGVS string (``hgvs_notation`` or ``hgvs``)."""
        # Normalise before validating (see _predict_by_rsid).
        hgvs = str(
            arguments.get("hgvs_notation") or arguments.get("hgvs") or ""
        ).strip()
        if not hgvs:
            return {
                "status": "error",
                "error": "Missing required parameter: hgvs_notation",
            }
        host = self._resolve_host(arguments)
        if not host:
            return self._bad_build(arguments)
        path = f"/vep/human/hgvs/{quote(hgvs, safe='')}"
        fetched = self._fetch_vep(host, path)
        return self._build_result(fetched, hgvs, host)

    def _predict_by_region(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Look up predictions for a single position plus alternate allele.

        Reads ``chrom`` (a leading "chr" is stripped), ``pos`` (must convert
        to a positive integer) and ``alt`` (upper-cased) from ``arguments``.
        """
        chrom = arguments.get("chrom")
        pos = arguments.get("pos")
        alt = arguments.get("alt")

        # Normalise first so empty / whitespace-only / bare "chr" values are
        # reported as missing instead of building a malformed region path.
        chrom_norm = "" if chrom is None else str(chrom).strip()
        if chrom_norm.lower().startswith("chr"):
            chrom_norm = chrom_norm[3:]
        alt_norm = str(alt).strip().upper() if alt else ""
        if not chrom_norm or pos is None or not alt_norm:
            return {
                "status": "error",
                "error": "Missing required parameters: chrom, pos, alt",
            }

        try:
            pos_int = int(pos)
        except (TypeError, ValueError):
            return {"status": "error", "error": f"pos must be an integer, got: {pos}"}
        if pos_int < 1:
            # Reject locally rather than sending a nonsensical region such as
            # "7:-5--5" to the server.
            return {
                "status": "error",
                "error": f"pos must be a positive integer, got: {pos}",
            }

        host = self._resolve_host(arguments)
        if not host:
            return self._bad_build(arguments)

        # Ensembl region+allele VEP format: region/{chr}:{start}-{end}/{alt}
        region = f"{chrom_norm}:{pos_int}-{pos_int}"
        path = (
            f"/vep/human/region/{quote(region, safe=':-')}/{quote(alt_norm, safe='')}"
        )
        fetched = self._fetch_vep(host, path)
        query = f"{chrom_norm}:{pos_int} {alt_norm} ({host.split('//')[-1]})"
        return self._build_result(fetched, query, host)

    # ----------------------------------------------------------- helpers
    @staticmethod
    def _bad_build(arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Build the error result for an unrecognised ``genome_build`` value."""
        return {
            "status": "error",
            "error": f"Unsupported genome_build '{arguments.get('genome_build')}'. "
            f"Valid builds: {sorted(REST_HOSTS)}",
        }