Source code for tooluniverse.rnacentral_genome_tool

"""RNAcentral genomic-location, publication, and sequence retrieval tools.

These complement the existing RNAcentral text-search / get-by-accession tools by
exposing three additional, stable RNAcentral REST endpoints that were previously
unwrapped:

  * ``genome_locations`` -> ``GET /rna/{urs}/genome-locations/{taxid}``
    Per-organism genomic coordinates (chromosome, strand, start, end, assembly)
    for a non-coding RNA. Requires a numeric NCBI taxid (e.g. 9606 = human).

  * ``publications`` -> ``GET /rna/{urs}/publications``
    Full literature list (title, authors, journal, year, PubMed ID, DOI) for an
    ncRNA, with the total count.

  * ``sequence`` -> ``GET /rna/{urs}.fasta``
    The canonical RNA sequence in FASTA format (header + sequence string).

All requests are anonymous (no API key). ``run()`` always returns a dict with a
``status`` key and never raises.
"""

import json
from typing import Any, Dict
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

from .base_tool import BaseTool
from .tool_registry import register_tool

_BASE_URL = "https://rnacentral.org/api/v1"
_TIMEOUT = 30


def _http_get_json(url: str, timeout: int = _TIMEOUT) -> Dict[str, Any]:
    """Issue a GET for ``url`` and return the decoded JSON document.

    The request advertises ``Accept: application/json``. The body is decoded
    as UTF-8 with undecodable bytes dropped before being parsed.

    Raises whatever ``urlopen`` raises on transport/HTTP failure and
    ``json.JSONDecodeError`` when the body is not valid JSON; callers are
    expected to translate those into error dicts.
    """
    request = Request(url, headers={"Accept": "application/json"})
    with urlopen(request, timeout=timeout) as response:
        body = response.read()
    # Parsing happens after the connection is released.
    return json.loads(body.decode("utf-8", errors="ignore"))


def _http_get_text(url: str, timeout: int = _TIMEOUT) -> str:
    """Issue a GET for ``url`` and return the body as text.

    The body is decoded as UTF-8 with undecodable bytes dropped. Transport and
    HTTP errors from ``urlopen`` propagate to the caller.
    """
    # No Accept header on purpose: the '.fasta' URL suffix already selects the
    # FASTA representation, and an explicit Accept (e.g. text/plain) makes
    # RNAcentral answer HTTP 406.
    request = Request(url)
    with urlopen(request, timeout=timeout) as response:
        body = response.read()
    return body.decode("utf-8", errors="ignore")



[docs]
@register_tool("RNAcentralGenomeTool")
class RNAcentralGenomeTool(BaseTool):
    """Retrieve genomic locations, publications, or FASTA sequence for an
    RNAcentral non-coding RNA (URS identifier).

    Configuration is read from the ``tool_config`` dict passed to the
    constructor:

    * ``settings.base_url`` -- API root; a trailing ``/`` is stripped.
    * ``settings.timeout``  -- per-request timeout, coerced with ``int()``.
    * ``fields.operation``  -- operation used when a call does not supply one
      (falls back to ``"genome_locations"``).

    The constructor stores these as ``base_url``, ``timeout`` and
    ``default_operation`` on the instance.
    """


[docs]
    def __init__(self, tool_config=None):
        """Store the tool configuration and derive per-instance settings.

        ``tool_config`` may be ``None``; it is then treated as an empty dict.
        ``settings.base_url`` (trailing ``/`` removed) and ``settings.timeout``
        (coerced to ``int``) override the module defaults.
        """
        super().__init__(tool_config)
        config = tool_config or {}
        self.tool_config = config

        settings = config.get("settings", {}) or {}
        self.base_url = settings.get("base_url", _BASE_URL).rstrip("/")
        self.timeout = int(settings.get("timeout", _TIMEOUT))

        # The runner does not inject the schema default for `operation`, so
        # each wrapper pins its operation through fields.operation; that lets
        # a call carrying only urs_id reach the intended endpoint.
        fields = config.get("fields", {}) or {}
        self.default_operation = fields.get("operation", "genome_locations")


    # ------------------------------------------------------------------ helpers

[docs]
    def _error(self, message: str, operation: str = None) -> Dict[str, Any]:
        out = {"status": "error", "error": message, "source": "RNAcentral"}
        if operation:
            out["operation"] = operation
        return out



[docs]
    @staticmethod
    def _clean_urs(value: str) -> str:
        # Accept either a bare URS id ('URS00003B7674') or a species-specific id
        # ('URS00003B7674_9606'); genome-locations/publications/sequence all use
        # the bare URS id, so strip any trailing '_taxid' suffix.
        urs = (value or "").strip()
        if "_" in urs:
            urs = urs.split("_", 1)[0]
        return urs


    # ------------------------------------------------------------------- run

[docs]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        arguments = arguments or {}
        operation = (arguments.get("operation") or self.default_operation).strip()

        urs_raw = arguments.get("urs_id")
        urs = self._clean_urs(urs_raw) if urs_raw else ""
        if not urs:
            return self._error("urs_id is required (e.g. 'URS00003B7674').", operation)

        if operation == "genome_locations":
            return self._genome_locations(urs, arguments)
        if operation == "sequence":
            return self._sequence(urs)

        return self._error(
            f"Unknown operation '{operation}'. Use one of: genome_locations, sequence.",
            operation,
        )


    # -------------------------------------------------------------- operations

[docs]
    def _genome_locations(self, urs: str, arguments: Dict[str, Any]) -> Dict[str, Any]:
        taxid = arguments.get("taxid")
        if taxid in (None, ""):
            return self._error(
                "taxid is required for genome_locations (e.g. 9606 for human). "
                "RNAcentral maps genome coordinates per organism.",
                "genome_locations",
            )
        try:
            taxid = int(taxid)
        except (TypeError, ValueError):
            return self._error(
                f"taxid must be a numeric NCBI taxonomy id, got '{taxid}'.",
                "genome_locations",
            )

        url = f"{self.base_url}/rna/{urs}/genome-locations/{taxid}"
        try:
            payload = _http_get_json(url, self.timeout)
        except HTTPError as e:
            if e.code == 404:
                return self._error(
                    f"No genome locations found for {urs} in taxid {taxid} "
                    "(unknown URS, or this organism has no mapping).",
                    "genome_locations",
                )
            return self._error(f"HTTP {e.code} from RNAcentral.", "genome_locations")
        except (URLError, TimeoutError) as e:
            return self._error(f"Network error: {e}", "genome_locations")
        except json.JSONDecodeError:
            return self._error(
                "RNAcentral returned a non-JSON response.", "genome_locations"
            )

        results = payload.get("results", []) if isinstance(payload, dict) else []
        locations = []
        for r in results:
            asm = r.get("ensembl_assembly") or {}
            locations.append(
                {
                    "chromosome": r.get("chromosome"),
                    "strand": r.get("strand"),
                    "start": r.get("start"),
                    "end": r.get("end"),
                    "identity": r.get("identity"),
                    "ucsc_chromosome": r.get("ucsc_chromosome"),
                    "assembly_id": asm.get("assembly_id"),
                    "assembly_ucsc": asm.get("assembly_ucsc"),
                    "gca_accession": asm.get("gca_accession"),
                    "common_name": asm.get("common_name"),
                    "taxid": asm.get("taxid"),
                }
            )

        return {
            "status": "success",
            "source": "RNAcentral",
            "operation": "genome_locations",
            "urs_id": urs,
            "taxid": taxid,
            "data": {
                "count": payload.get("count", len(locations))
                if isinstance(payload, dict)
                else len(locations),
                "locations": locations,
            },
        }



[docs]
    def _sequence(self, urs: str) -> Dict[str, Any]:
        """Fetch the FASTA record for ``urs`` and split it into header + sequence.

        Calls ``GET {base_url}/rna/{urs}.fasta``.

        Returns:
            On success, a dict whose ``data`` holds ``fasta_header`` (header
            line without the leading ``>``), ``sequence`` (sequence lines
            joined without separators) and ``length``. On HTTP or network
            failure, or when the response contains no sequence, an error dict
            from ``self._error``.
        """
        # Function-scope imports keep this method self-contained.
        from http.client import HTTPException
        from urllib.parse import quote

        op = "sequence"
        # Percent-encode the id so stray characters cannot produce an invalid
        # URL or alter the request path.
        url = f"{self.base_url}/rna/{quote(urs, safe='')}.fasta"
        try:
            text = _http_get_text(url, self.timeout)
        except HTTPError as e:
            if e.code == 404:
                return self._error(f"Unknown RNAcentral id '{urs}'.", op)
            return self._error(f"HTTP {e.code} from RNAcentral.", op)
        except (OSError, HTTPException) as e:
            # OSError covers URLError, timeouts and connection resets.
            return self._error(f"Network error: {e}", op)

        header = ""
        seen_header = False
        seq_parts = []
        for raw_line in text.splitlines():
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                # Only the first record is parsed: a second header would
                # otherwise replace the first while the sequences of both
                # records were concatenated.
                if seen_header:
                    break
                header = line[1:].strip()
                seen_header = True
            else:
                seq_parts.append(line)
        sequence = "".join(seq_parts)

        if not sequence:
            return self._error(f"No sequence returned for '{urs}'.", op)

        return {
            "status": "success",
            "source": "RNAcentral",
            "operation": op,
            "urs_id": urs,
            "data": {
                "fasta_header": header,
                "sequence": sequence,
                "length": len(sequence),
            },
        }