# Source code for tooluniverse.tark_tool

"""
Ensembl Tark tools for ToolUniverse — transcript archive + MANE mapping.

Tark (Transcript Archive, EMBL-EBI/Ensembl) is the authoritative archive of
transcript sequences and versions across Ensembl, RefSeq and GENCODE, including
the MANE Select / MANE Plus Clinical pairings (ENST <-> NM). It answers two
clinical-genomics questions the main Ensembl REST does not expose directly:
  - "What is the MANE Select transcript for this gene, and its RefSeq equivalent?"
  - "Give me the archived record (versions, checksums, UTRs) for this transcript."

API: https://tark.ensembl.org/api  (public, no authentication, JSON)
"""

from typing import Any, Dict, List, Optional

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool

# Root URL of the public Tark REST API; endpoint paths used in this module
# (e.g. "/transcript/", "/transcript/manelist/") are appended to it.
TARK_BASE = "https://tark.ensembl.org/api"

# The MANE list is a flat ~19k-row table returned in a single request. Cache it
# at module level so repeated gene lookups don't re-download it each call.
# ``None`` means "not fetched yet"; _load_mane_list() fills it on first use.
_MANE_CACHE: Optional[List[Dict[str, Any]]] = None


def _load_mane_list(timeout: int) -> List[Dict[str, Any]]:
    """Return the Tark MANE list, downloading it on first use and caching it.

    The table is fetched from ``{TARK_BASE}/transcript/manelist/`` once and then
    kept in the module-level ``_MANE_CACHE`` so later calls are served from
    memory.

    Args:
        timeout: Timeout in seconds handed to ``requests.get``. Only relevant
            while the cache is still empty.

    Returns:
        The MANE rows as a list of dicts. The same list object is returned on
        every call, so callers should treat it as read-only.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or a non-success HTTP status (via ``raise_for_status``).
        ValueError: If the body is not valid JSON, or is JSON whose shape is
            neither a list nor a dict carrying a list under ``"results"``.
    """
    global _MANE_CACHE
    if _MANE_CACHE is None:
        resp = requests.get(
            f"{TARK_BASE}/transcript/manelist/",
            headers={"Accept": "application/json"},
            timeout=timeout,
        )
        resp.raise_for_status()
        data = resp.json()

        # Accept either a bare JSON array or an envelope of the form
        # {"results": [...]}; a dict without "results" yields an empty table.
        rows = data.get("results", []) if isinstance(data, dict) else data

        # Anything else (scalar, null, {"results": null}, ...) is unusable.
        # Raise ValueError, which callers already handle, rather than letting
        # an AttributeError/TypeError escape or returning None.
        if not isinstance(rows, list):
            raise ValueError(
                "Unexpected Tark MANE list payload: expected a JSON array, "
                f"got {type(rows).__name__}"
            )

        # Callers access every row with .get(); drop stray non-object entries
        # so one malformed row cannot break all lookups.
        _MANE_CACHE = [row for row in rows if isinstance(row, dict)]
    return _MANE_CACHE


@register_tool("TarkManeTranscriptsTool")
class TarkManeTranscriptsTool(BaseTool):
    """Resolve MANE Select / Plus Clinical pairings (ENST <-> RefSeq NM).

    Lookups run against the cached Tark MANE list and can be keyed by gene
    symbol, Ensembl transcript id, or RefSeq transcript id. Comparison is
    case-insensitive and ignores any ``.version`` suffix on the supplied ids.
    """

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the request timeout from ``tool_config["fields"]["timeout"]`` (default 30)."""
        super().__init__(tool_config)
        fields = tool_config.get("fields", {})
        self.timeout = fields.get("timeout", 30)

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Return MANE rows matching any of the supplied identifiers.

        Args:
            arguments: May contain ``gene``, ``ensembl_id`` and/or
                ``refseq_id``; at least one must be non-empty.

        Returns:
            A dict with ``status`` set to ``"success"`` (plus ``data`` and
            ``metadata``) or ``"error"`` (plus an ``error`` message).
        """
        gene, ensembl_id, refseq_id = (
            (arguments.get(key) or "").strip()
            for key in ("gene", "ensembl_id", "refseq_id")
        )
        if not any((gene, ensembl_id, refseq_id)):
            return {
                "status": "error",
                "error": "Provide one of 'gene', 'ensembl_id', or 'refseq_id'.",
            }

        # Timeout must be handled before its parent class RequestException.
        try:
            mane_rows = _load_mane_list(self.timeout)
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"Tark request timed out after {self.timeout}s",
            }
        except requests.exceptions.RequestException as e:
            return {"status": "error", "error": f"Tark request failed: {e}"}
        except ValueError:
            return {"status": "error", "error": "Tark returned a non-JSON response"}

        # Normalise the query terms once: upper-case, version suffix removed.
        wanted_gene = gene.upper()
        wanted_enst = ensembl_id.split(".")[0].upper()
        wanted_refseq = refseq_id.split(".")[0].upper()

        def _field_matches(row: Dict[str, Any], field: str, wanted: str) -> bool:
            # An empty query term never matches (and the row is not inspected).
            return bool(wanted) and (row.get(field) or "").upper() == wanted

        results = []
        for row in mane_rows:
            if not (
                _field_matches(row, "ens_gene_name", wanted_gene)
                or _field_matches(row, "ens_stable_id", wanted_enst)
                or _field_matches(row, "refseq_stable_id", wanted_refseq)
            ):
                continue
            results.append(
                {
                    "gene": row.get("ens_gene_name"),
                    "mane_type": row.get("mane_type"),
                    "ensembl_transcript": _versioned(
                        row.get("ens_stable_id"), row.get("ens_stable_id_version")
                    ),
                    "refseq_transcript": _versioned(
                        row.get("refseq_stable_id"),
                        row.get("refseq_stable_id_version"),
                    ),
                }
            )

        query = {
            "gene": gene,
            "ensembl_id": ensembl_id,
            "refseq_id": refseq_id,
        }
        return {
            "status": "success",
            "data": results,
            "metadata": {
                "total_results": len(results),
                "query": query,
                "source": "Ensembl Tark MANE list",
            },
        }


@register_tool("TarkTranscriptTool")
class TarkTranscriptTool(BaseTool):
    """Fetch archived transcript records (versions, checksums, releases) by ENST id."""

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the request timeout from ``tool_config["fields"]["timeout"]`` (default 30)."""
        super().__init__(tool_config)
        fields = tool_config.get("fields", {})
        self.timeout = fields.get("timeout", 30)

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Query ``/transcript/`` for a stable id and summarise each record.

        Args:
            arguments: Must contain a non-empty ``stable_id``; any ``.version``
                suffix is removed before the request is made.

        Returns:
            A dict with ``status`` set to ``"success"`` (plus ``data`` and
            ``metadata``) or ``"error"`` (plus an ``error`` message).
        """
        requested = (arguments.get("stable_id") or "").strip()
        if not requested:
            return {
                "status": "error",
                "error": "'stable_id' (e.g. 'ENST00000380152') is required",
            }

        # The endpoint is queried with the unversioned id.
        stable_id = requested.partition(".")[0]

        query_params = {"stable_id": stable_id, "expand": "transcript_release_set"}
        # Timeout must be handled before its parent class RequestException.
        try:
            resp = requests.get(
                f"{TARK_BASE}/transcript/",
                params=query_params,
                headers={"Accept": "application/json"},
                timeout=self.timeout,
            )
            resp.raise_for_status()
            payload = resp.json()
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"Tark request timed out after {self.timeout}s",
            }
        except requests.exceptions.RequestException as e:
            return {"status": "error", "error": f"Tark request failed: {e}"}
        except ValueError:
            return {"status": "error", "error": "Tark returned a non-JSON response"}

        # Only a dict payload is inspected; its "results" entry holds the hits.
        if isinstance(payload, dict):
            hits = payload.get("results", [])
        else:
            hits = []

        if not hits:
            return {
                "status": "success",
                "data": [],
                "metadata": {"total_results": 0, "query_stable_id": stable_id},
            }

        records = []
        for hit in hits:
            release_set = hit.get("transcript_release_set") or []
            # Skip release entries that are not objects.
            release_names = [
                release.get("shortname")
                for release in release_set
                if isinstance(release, dict)
            ]
            records.append(
                {
                    "stable_id": _versioned(
                        hit.get("stable_id"), hit.get("stable_id_version")
                    ),
                    "assembly": hit.get("assembly"),
                    "biotype": hit.get("biotype"),
                    "region": hit.get("loc_region"),
                    "start": hit.get("loc_start"),
                    "end": hit.get("loc_end"),
                    "strand": hit.get("loc_strand"),
                    "transcript_checksum": hit.get("transcript_checksum"),
                    "releases": release_names,
                }
            )

        return {
            "status": "success",
            "data": records,
            "metadata": {
                "total_results": len(records),
                "query_stable_id": stable_id,
                "source": "Ensembl Tark transcript archive",
            },
        }


def _versioned(stable_id: Optional[str], version: Any) -> Optional[str]: """Join a stable id with its version (e.g. ENST00000380152 + 8 -> ...152.8).""" if not stable_id: return None return f"{stable_id}.{version}" if version not in (None, "") else stable_id