Source code for tooluniverse.rnacentral_genome_tool

"""RNAcentral genomic-location, publication, and sequence retrieval tools.

These complement the existing RNAcentral text-search / get-by-accession tools by
exposing three additional, stable RNAcentral REST endpoints that were previously
unwrapped:

  * ``genome_locations`` -> ``GET /rna/{urs}/genome-locations/{taxid}``
    Per-organism genomic coordinates (chromosome, strand, start, end, assembly)
    for a non-coding RNA. Requires a numeric NCBI taxid (e.g. 9606 = human).

  * ``publications`` -> ``GET /rna/{urs}/publications``
    Full literature list (title, authors, journal, year, PubMed ID, DOI) for an
    ncRNA, with the total count.

  * ``sequence`` -> ``GET /rna/{urs}.fasta``
    The canonical RNA sequence in FASTA format (header + sequence string).

All requests are anonymous (no API key). ``run()`` always returns a dict with a
``status`` key and never raises.
"""

import json
from http.client import HTTPException
from typing import Any, Dict, Optional
from urllib.error import HTTPError, URLError
from urllib.parse import quote
from urllib.request import Request, urlopen

from .base_tool import BaseTool
from .tool_registry import register_tool

_BASE_URL = "https://rnacentral.org/api/v1"
_TIMEOUT = 30


def _http_get_json(url: str, timeout: int = _TIMEOUT) -> Dict[str, Any]:
    """Fetch *url* with ``Accept: application/json`` and return the parsed body.

    The response bytes are decoded as UTF-8 (undecodable bytes are dropped)
    and handed to ``json.loads``. Exceptions from ``urlopen`` and
    ``json.loads`` propagate to the caller.
    """
    request = Request(url)
    request.add_header("Accept", "application/json")
    with urlopen(request, timeout=timeout) as response:
        body = response.read()
    return json.loads(body.decode("utf-8", errors="ignore"))


def _http_get_text(url: str, timeout: int = _TIMEOUT) -> str:
    """Fetch *url* and return the response body as text.

    The body is decoded as UTF-8 with undecodable bytes dropped. Exceptions
    raised by ``urlopen`` propagate to the caller.
    """
    # No Accept header on purpose: the '.fasta' URL suffix already selects the
    # FASTA representation, and an explicit Accept header (e.g. text/plain)
    # makes RNAcentral answer with HTTP 406.
    with urlopen(Request(url), timeout=timeout) as response:
        payload = response.read()
    return payload.decode("utf-8", errors="ignore")


@register_tool("RNAcentralGenomeTool")
class RNAcentralGenomeTool(BaseTool):
    """Retrieve genomic locations, publications, or FASTA sequence for an
    RNAcentral non-coding RNA (URS identifier).

    ``run`` currently dispatches only the ``genome_locations`` and
    ``sequence`` operations; any other operation name (including
    ``publications``) is answered with an error dict.

    Every public entry point returns a dict carrying a ``status`` key
    (``"success"`` or ``"error"``); request failures are reported through that
    dict instead of being raised.
    """

    # Transport-level failures that must be turned into error dicts.
    # OSError covers URLError, socket timeouts, connection resets and TLS
    # errors; HTTPException covers malformed or truncated HTTP exchanges
    # (e.g. InvalidURL, IncompleteRead), which are not OSError subclasses.
    _NETWORK_ERRORS = (OSError, HTTPException)

    def __init__(self, tool_config=None):
        """Read ``settings`` (base_url, timeout) and ``fields.operation``.

        Missing or ``None`` values fall back to the module defaults and to
        the ``genome_locations`` operation.
        """
        super().__init__(tool_config)
        self.tool_config = tool_config or {}
        settings = self.tool_config.get("settings", {}) or {}
        self.base_url = (settings.get("base_url") or _BASE_URL).rstrip("/")
        timeout = settings.get("timeout")
        self.timeout = int(_TIMEOUT if timeout is None else timeout)
        # Each wrapper fixes its operation via fields.operation so that a call
        # with only urs_id routes to the right endpoint (the schema default for
        # `operation` is not injected by the runner).
        fields = self.tool_config.get("fields", {}) or {}
        self.default_operation = fields.get("operation") or "genome_locations"

    # ------------------------------------------------------------------ helpers
    def _error(self, message: str, operation: Optional[str] = None) -> Dict[str, Any]:
        """Build the standard error dict, tagging it with *operation* if given."""
        out = {"status": "error", "error": message, "source": "RNAcentral"}
        if operation:
            out["operation"] = operation
        return out

    @staticmethod
    def _clean_urs(value: str) -> str:
        """Return the bare URS id from *value*.

        Accepts either a bare URS id ('URS00003B7674') or a species-specific
        id ('URS00003B7674_9606'); both endpoints used here take the bare id,
        so everything from the first underscore onwards is dropped.
        """
        return (value or "").strip().partition("_")[0].strip()

    def _fetch(self, fetch, url: str, operation: str, not_found: str):
        """Call ``fetch(url, timeout)`` and translate failures to error dicts.

        Returns ``(payload, None)`` on success and ``(None, error_dict)`` on
        failure. An HTTP 404 is reported with the *not_found* message; other
        HTTP statuses, transport failures and non-JSON bodies get generic
        messages.
        """
        try:
            return fetch(url, self.timeout), None
        except HTTPError as exc:
            # HTTPError is a subclass of URLError/OSError, so it must be
            # handled before the generic transport errors below.
            if exc.code == 404:
                return None, self._error(not_found, operation)
            return None, self._error(f"HTTP {exc.code} from RNAcentral.", operation)
        except self._NETWORK_ERRORS as exc:
            return None, self._error(f"Network error: {exc}", operation)
        except json.JSONDecodeError:
            return None, self._error(
                "RNAcentral returned a non-JSON response.", operation
            )

    # ------------------------------------------------------------------- run
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Dispatch to the requested operation.

        Reads ``operation`` (falling back to the configured default) and
        ``urs_id`` from *arguments*. Returns the operation's result dict, or
        an error dict when ``urs_id`` is missing or the operation is unknown.
        """
        arguments = arguments or {}
        raw_operation = arguments.get("operation") or self.default_operation
        # Coerce instead of calling .strip() on a possibly non-string value.
        operation = (
            raw_operation.strip()
            if isinstance(raw_operation, str)
            else str(raw_operation)
        )

        urs_raw = arguments.get("urs_id")
        # A non-string urs_id cannot be a valid identifier; treat it as missing.
        urs = self._clean_urs(urs_raw) if isinstance(urs_raw, str) else ""
        if not urs:
            return self._error("urs_id is required (e.g. 'URS00003B7674').", operation)

        if operation == "genome_locations":
            return self._genome_locations(urs, arguments)
        if operation == "sequence":
            return self._sequence(urs)
        return self._error(
            f"Unknown operation '{operation}'. Use one of: genome_locations, sequence.",
            operation,
        )

    # -------------------------------------------------------------- operations
    def _genome_locations(self, urs: str, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Fetch genomic coordinates of *urs* for the organism in ``taxid``.

        ``arguments["taxid"]`` is required and must be convertible to ``int``.
        On success the ``data`` entry holds ``count`` and a ``locations`` list
        with one flattened dict per entry of the response's ``results``.
        """
        operation = "genome_locations"
        taxid = arguments.get("taxid")
        if taxid in (None, ""):
            return self._error(
                "taxid is required for genome_locations (e.g. 9606 for human). "
                "RNAcentral maps genome coordinates per organism.",
                operation,
            )
        try:
            taxid = int(taxid)
        except (TypeError, ValueError):
            return self._error(
                f"taxid must be a numeric NCBI taxonomy id, got '{taxid}'.",
                operation,
            )

        # Percent-encode the id so stray whitespace, slashes or control
        # characters cannot alter the request path or make urlopen raise.
        url = f"{self.base_url}/rna/{quote(urs, safe='')}/genome-locations/{taxid}"
        payload, failure = self._fetch(
            _http_get_json,
            url,
            operation,
            f"No genome locations found for {urs} in taxid {taxid} "
            "(unknown URS, or this organism has no mapping).",
        )
        if failure is not None:
            return failure

        results = payload.get("results") if isinstance(payload, dict) else None
        if not isinstance(results, list):
            # Missing, null or unexpectedly shaped 'results' -> no locations.
            results = []

        locations = []
        for entry in results:
            if not isinstance(entry, dict):
                continue
            assembly = entry.get("ensembl_assembly")
            if not isinstance(assembly, dict):
                assembly = {}
            locations.append(
                {
                    "chromosome": entry.get("chromosome"),
                    "strand": entry.get("strand"),
                    "start": entry.get("start"),
                    "end": entry.get("end"),
                    "identity": entry.get("identity"),
                    "ucsc_chromosome": entry.get("ucsc_chromosome"),
                    "assembly_id": assembly.get("assembly_id"),
                    "assembly_ucsc": assembly.get("assembly_ucsc"),
                    "gca_accession": assembly.get("gca_accession"),
                    "common_name": assembly.get("common_name"),
                    "taxid": assembly.get("taxid"),
                }
            )

        # Prefer the count reported in the payload; otherwise count what was parsed.
        count = payload.get("count") if isinstance(payload, dict) else None
        if count is None:
            count = len(locations)

        return {
            "status": "success",
            "source": "RNAcentral",
            "operation": operation,
            "urs_id": urs,
            "taxid": taxid,
            "data": {
                "count": count,
                "locations": locations,
            },
        }

    def _sequence(self, urs: str) -> Dict[str, Any]:
        """Fetch the FASTA record of *urs* and split it into header and sequence.

        Lines starting with '>' are treated as the header (the last one wins);
        all other non-blank lines are concatenated into the sequence. An empty
        sequence is reported as an error.
        """
        operation = "sequence"
        # Percent-encode the id for the same reason as in _genome_locations.
        url = f"{self.base_url}/rna/{quote(urs, safe='')}.fasta"
        text, failure = self._fetch(
            _http_get_text, url, operation, f"Unknown RNAcentral id '{urs}'."
        )
        if failure is not None:
            return failure

        header = ""
        chunks = []
        for line in text.splitlines():
            if not line.strip():
                continue
            if line.startswith(">"):
                header = line[1:].strip()
            else:
                chunks.append(line.strip())
        sequence = "".join(chunks)
        if not sequence:
            return self._error(f"No sequence returned for '{urs}'.", operation)

        return {
            "status": "success",
            "source": "RNAcentral",
            "operation": operation,
            "urs_id": urs,
            "data": {
                "fasta_header": header,
                "sequence": sequence,
                "length": len(sequence),
            },
        }