Source code for tooluniverse.spliceai_tool

"""
SpliceAI Lookup REST API Tool

This tool provides access to SpliceAI and Pangolin splice prediction scores
via the Broad Institute's SpliceAI Lookup web service.

SpliceAI is a deep learning model that predicts splice-altering variants from
pre-mRNA sequences. It was developed by Illumina and provides delta scores
for acceptor gain/loss and donor gain/loss.

Pangolin is an alternative splice prediction model with similar functionality.

Note: The public API has rate limits (a few queries per minute per user).
For batch processing, users should set up a local instance.

API Documentation: https://github.com/broadinstitute/SpliceAI-lookup
"""

import requests
import re
from typing import Dict, Any, Optional
from .base_tool import BaseTool
from .tool_registry import register_tool

# Base URLs for SpliceAI and Pangolin APIs
#
# Each table is keyed by the genome build as a string ("37" or "38"); the
# tool looks the endpoint up with the validated `genome` argument. Every
# build is served from its own host, so the build appears both in the
# hostname here and in the `hg` query parameter sent with each request.
SPLICEAI_URLS: Dict[str, str] = {
    "37": "https://spliceai-37-xwkwwwxdwq-uc.a.run.app/spliceai/",
    "38": "https://spliceai-38-xwkwwwxdwq-uc.a.run.app/spliceai/",
}

# Same layout as SPLICEAI_URLS, but pointing at the Pangolin model endpoints.
PANGOLIN_URLS: Dict[str, str] = {
    "37": "https://pangolin-37-xwkwwwxdwq-uc.a.run.app/pangolin/",
    "38": "https://pangolin-38-xwkwwwxdwq-uc.a.run.app/pangolin/",
}


@register_tool("SpliceAITool")
class SpliceAITool(BaseTool):
    """
    SpliceAI and Pangolin Splice Prediction API tool.

    Provides access to deep learning-based splice prediction scores
    from the Broad Institute's SpliceAI Lookup service.

    Every operation returns a dict with a ``"status"`` key that is either
    ``"success"`` (with ``"data"`` and ``"source"``) or ``"error"`` (with an
    ``"error"`` message). Failures are reported through that dict rather than
    raised.
    """

    # Genome builds for which an endpoint is configured.
    _SUPPORTED_GENOMES = ("37", "38")

    # Per-transcript delta-score fields inspected for the maximum:
    # acceptor gain/loss and donor gain/loss.
    _DELTA_SCORE_KEYS = ("DS_AG", "DS_AL", "DS_DG", "DS_DL")

    # (lower bound, label) pairs, checked from the highest bound down.
    _SCORE_BANDS = (
        (0.8, "high_pathogenicity"),
        (0.5, "moderate_pathogenicity"),
        (0.2, "low_pathogenicity"),
    )

    # Canonical chr-pos-ref-alt form; compiled once instead of on every call.
    _VARIANT_PATTERN = re.compile(
        r"^chr[\dXYMT]+-\d+-[ACGTN]+-[ACGTN]+$", re.IGNORECASE
    )

    def __init__(self, tool_config):
        """Read the operation name and request timeout from ``tool_config``.

        Args:
            tool_config: Tool configuration mapping. ``fields.operation``
                selects the default operation (empty string if absent) and
                ``fields.timeout`` the request timeout in seconds
                (default 60).
        """
        super().__init__(tool_config)
        self.parameter = tool_config.get("parameter", {})
        self.required = self.parameter.get("required", [])

        fields = tool_config.get("fields", {})
        self.operation = fields.get("operation", "")
        self.timeout = fields.get("timeout", 60)

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to operation handler based on config.

        The operation configured on the tool takes precedence; otherwise the
        ``"operation"`` entry of ``arguments`` is used.

        Args:
            arguments: Call arguments. ``None`` is treated as an empty dict.

        Returns:
            The handler's result dict, or an error dict when the operation is
            missing or unknown.
        """
        if arguments is None:
            arguments = {}

        operation = self.operation or arguments.get("operation")
        if not operation:
            return {"status": "error", "error": "Missing: operation"}

        operation_map = {
            "predict_splice": self._predict_splice,
            "predict_splice_pangolin": self._predict_splice_pangolin,
            "get_max_delta": self._get_max_delta,
        }
        handler = operation_map.get(operation)
        if not handler:
            return {"status": "error", "error": f"Unknown operation: {operation}"}
        return handler(arguments)

    def _normalize_variant(self, variant: str) -> str:
        """Normalize variant format to chr-pos-ref-alt.

        Accepted inputs::

            chr8-140300616-T-G   (already normalized)
            chr8:140300616:T:G   (colon separated)
            8-140300616-T-G      (without chr prefix)
            8:140300616:T:G      (without chr prefix, colon)

        The ``chr`` prefix is emitted in lowercase and the remainder in
        uppercase, so inputs such as ``CHR8-140300616-t-g`` (which the
        case-insensitive validator accepts) are sent in canonical form.
        """
        variant = variant.strip().replace(":", "-")
        # Drop an existing prefix in any letter case before re-adding it.
        if variant[:3].lower() == "chr":
            variant = variant[3:]
        return "chr" + variant.upper()

    def _validate_variant(self, variant: str) -> Optional[str]:
        """Validate variant format. Returns error message if invalid, None if valid."""
        # fullmatch (unlike match with "$") also rejects a trailing newline.
        if self._VARIANT_PATTERN.fullmatch(variant):
            return None
        return f"Invalid variant format: {variant}. Expected format: chr-pos-ref-alt (e.g., chr8-140300616-T-G)"

    def _interpret_score(self, delta_score: Optional[float]) -> str:
        """Interpret SpliceAI delta score.

        Returns ``"unknown"`` for ``None``, otherwise the label of the first
        band in ``_SCORE_BANDS`` whose lower bound the score reaches, or
        ``"benign"`` when it is below all of them.
        """
        if delta_score is None:
            return "unknown"
        for lower_bound, label in self._SCORE_BANDS:
            if delta_score >= lower_bound:
                return label
        return "benign"

    @staticmethod
    def _to_float(value: Any) -> Optional[float]:
        """Return ``value`` as a float, or None when it is not a usable number.

        Accepts ints, floats and numeric strings; rejects bools, NaN and
        anything that cannot be parsed.
        """
        # bool is a subclass of int but is never a score.
        if value is None or isinstance(value, bool):
            return None
        if not isinstance(value, (int, float, str)):
            return None
        try:
            number = float(value)
        except (ValueError, OverflowError):
            return None
        # NaN compares unequal to itself and would poison max().
        if number != number:
            return None
        return number

    @classmethod
    def _max_delta_score(cls, scores: Any) -> Optional[float]:
        """Return the largest delta score found in ``scores``.

        Only dict entries are inspected, and only the keys listed in
        ``_DELTA_SCORE_KEYS``. The running maximum starts at 0.0, so the
        result is never negative. Returns None when no usable score exists,
        so that "nothing was scored" is not reported as a score of zero.
        """
        if not isinstance(scores, (list, tuple)):
            return None

        found = False
        best = 0.0
        for entry in scores:
            if not isinstance(entry, dict):
                continue
            for key in cls._DELTA_SCORE_KEYS:
                # NOTE(review): scores may arrive as numbers or as numeric
                # strings depending on the service version - confirm against
                # a live response. Both are handled here.
                number = cls._to_float(entry.get(key))
                if number is None:
                    continue
                found = True
                best = max(best, number)
        return best if found else None

    def _fetch_prediction(
        self, base_urls: Dict[str, str], arguments: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Validate the arguments and query one prediction endpoint.

        Shared by the SpliceAI and Pangolin operations, which differ only in
        the URL table and in how the response is post-processed.

        Args:
            base_urls: Endpoint table keyed by genome build ("37"/"38").
            arguments: Call arguments: ``variant`` (required), ``genome``
                (default "38"), and optional ``distance`` and ``mask``.

        Returns:
            An error result dict (``status == "error"``), or
            ``{"status": "success", "variant", "genome", "payload"}`` where
            ``payload`` is the decoded JSON object returned by the service.
        """
        variant = arguments.get("variant")
        if not variant:
            return {"status": "error", "error": "Missing required parameter: variant"}
        if not isinstance(variant, str):
            return {
                "status": "error",
                "error": "variant must be a string in chr-pos-ref-alt format",
            }

        genome = str(arguments.get("genome", "38"))
        if genome not in self._SUPPORTED_GENOMES:
            return {"status": "error", "error": "genome must be '37' or '38'"}

        # Normalize and validate variant
        variant = self._normalize_variant(variant)
        error = self._validate_variant(variant)
        if error:
            return {"status": "error", "error": error}

        params = {"hg": genome, "variant": variant}
        # Optional parameters are forwarded only when the caller supplied them.
        if "distance" in arguments:
            params["distance"] = arguments["distance"]
        if "mask" in arguments:
            params["mask"] = 1 if arguments["mask"] else 0

        try:
            response = requests.get(
                base_urls[genome], params=params, timeout=self.timeout
            )
            response.raise_for_status()
            payload = response.json()
        except requests.exceptions.Timeout:
            return {"status": "error", "error": f"Timeout after {self.timeout}s"}
        except requests.exceptions.HTTPError as e:
            return {"status": "error", "error": self._describe_http_error(e)}
        except Exception as e:
            # Tool boundary: connection failures, invalid JSON, etc. are
            # reported to the caller instead of being raised.
            return {"status": "error", "error": str(e)}

        if not isinstance(payload, dict):
            return {
                "status": "error",
                "error": "Unexpected response format: expected a JSON object",
            }

        # Check for error in response
        if "error" in payload:
            return {"status": "error", "error": payload["error"]}

        return {
            "status": "success",
            "variant": variant,
            "genome": genome,
            "payload": payload,
        }

    @staticmethod
    def _describe_http_error(error: Exception) -> str:
        """Build the ``HTTP <code>: <body excerpt>`` message for an HTTP error."""
        response = getattr(error, "response", None)
        if response is None:
            return f"HTTP error: {error}"

        # The body excerpt is best-effort; the status code alone is still
        # reported if the body cannot be read.
        try:
            error_text = response.text[:200]
        except Exception:
            error_text = ""
        return f"HTTP {response.status_code}: {error_text}"

    def _predict_splice(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get SpliceAI predictions for a variant.

        On success ``data`` holds the normalized variant, the genome build,
        the service's ``scores`` list, the maximum delta score across all
        entries, its interpretation and the raw response.
        ``max_delta_score`` is None (interpretation ``"unknown"``) when the
        response contains no usable delta score.
        """
        fetched = self._fetch_prediction(SPLICEAI_URLS, arguments)
        if fetched["status"] != "success":
            return fetched

        payload = fetched["payload"]
        scores = payload.get("scores", [])
        max_delta = self._max_delta_score(scores)

        return {
            "status": "success",
            "data": {
                "variant": fetched["variant"],
                "genome": f"GRCh{fetched['genome']}",
                "scores": scores,
                "max_delta_score": max_delta,
                "interpretation": self._interpret_score(max_delta),
                "raw_response": payload,
            },
            "source": "SpliceAI Lookup (Broad Institute)",
        }

    def _predict_splice_pangolin(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get Pangolin splice predictions for a variant.

        On success ``data`` holds the normalized variant, the genome build,
        the service's ``scores`` list and the raw response. No maximum or
        interpretation is derived for Pangolin.
        """
        fetched = self._fetch_prediction(PANGOLIN_URLS, arguments)
        if fetched["status"] != "success":
            return fetched

        payload = fetched["payload"]
        return {
            "status": "success",
            "data": {
                "variant": fetched["variant"],
                "genome": f"GRCh{fetched['genome']}",
                "scores": payload.get("scores", []),
                "raw_response": payload,
            },
            "source": "Pangolin (via SpliceAI Lookup)",
        }

    def _get_max_delta(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get just the maximum delta score and interpretation for a variant.

        Runs the SpliceAI prediction and returns a trimmed-down result; any
        error from the prediction is passed through unchanged.
        """
        result = self._predict_splice(arguments)
        if result["status"] != "success":
            return result

        data = result["data"]
        return {
            "status": "success",
            "data": {
                "variant": data["variant"],
                "genome": data["genome"],
                "max_delta_score": data["max_delta_score"],
                "interpretation": data["interpretation"],
                "pathogenicity_threshold": "≥0.2 (low), ≥0.5 (moderate), ≥0.8 (high)",
            },
            "source": "SpliceAI Lookup (Broad Institute)",
        }