Source code for tooluniverse.unibind_tool

"""UniBind REST API tool.

UniBind (https://unibind.uio.no) is a database of curated, experimentally
derived **direct** transcription factor-DNA binding sites (TFBS), predicted
from ChIP-seq peaks using the DAMO / ChIP-eat pipeline together with JASPAR
profiles. Each "dataset" corresponds to one ChIP-seq experiment for one TF in
one cell type/condition, and exposes the high-confidence TFBS positions
(BED/FASTA), the JASPAR motif(s) used, CentriMo enrichment p-values, score
thresholds and total TFBS counts.

This is distinct from existing ToolUniverse motif tools:
  * JASPAR  -> position frequency MATRICES (the motif models themselves).
  * HOCOMOCO -> position weight matrices for TFs.
  * ReMap   -> raw ChIP-seq peak regions (genomic intervals of binding).
UniBind sits between them: motif-anchored, experimentally supported, direct
TF-DNA binding sites at base-pair resolution, organised per experiment.

Public, no API key. Django REST Framework backend; endpoints require a
trailing slash and ``?format=json``.
"""

import requests
from typing import Any, Dict, List
from .base_tool import BaseTool
from .tool_registry import register_tool

UNIBIND_BASE = "https://unibind.uio.no/api/v1"



[docs]
@register_tool("UniBindRESTTool")
class UniBindRESTTool(BaseTool):
    """Access curated direct TF-DNA binding sites from the UniBind database.

    A single class dispatches three tools via the ``operation`` field declared
    in each JSON config (or passed as an argument):

      * ``search_datasets`` -> filter the TFBS dataset catalog.
      * ``get_dataset``     -> full binding-site detail for one dataset.
      * ``list_tfs``        -> list/filter the catalog of profiled TFs.
    """


[docs]
    def __init__(self, tool_config: Dict):
        """Create the tool, its HTTP session and its default operation.

        Args:
            tool_config: Tool configuration mapping. The optional ``fields``
                sub-mapping may carry an ``operation`` entry naming the
                operation this instance performs when ``run`` is not given
                one explicitly; it defaults to ``search_datasets``.
        """
        super().__init__(tool_config)
        # One shared session for all requests made by this instance.
        self.session = requests.Session()
        self.session.headers.update({"Accept": "application/json"})
        # Per-request timeout, in seconds, passed to every ``session.get``.
        self.timeout = 30
        # ``fields`` may be absent *or* explicitly null in a config; treat
        # both as "no fields" instead of failing on ``None.get``.
        fields = tool_config.get("fields") or {}
        self.operation = fields.get("operation", "search_datasets")


    # ------------------------------------------------------------------ #
    # Dispatch
    # ------------------------------------------------------------------ #

[docs]
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        operation = arguments.get("operation", self.operation)
        try:
            if operation == "get_dataset":
                return self._get_dataset(arguments)
            if operation == "list_tfs":
                return self._list_tfs(arguments)
            return self._search_datasets(arguments)
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": "UniBind API request timed out after 30 seconds.",
            }
        except requests.exceptions.RequestException as exc:
            return {
                "status": "error",
                "error": f"UniBind API request failed: {exc}",
            }
        except Exception as exc:  # noqa: BLE001 - never raise to the caller
            return {
                "status": "error",
                "error": f"Unexpected error querying UniBind: {exc}",
            }


    # ------------------------------------------------------------------ #
    # Helpers
    # ------------------------------------------------------------------ #

[docs]
    def _get_json(self, url: str, params: Dict[str, Any]) -> Dict[str, Any]:
        params = dict(params)
        params["format"] = "json"
        resp = self.session.get(url, params=params, timeout=self.timeout)
        resp.raise_for_status()
        return resp.json()



[docs]
    @staticmethod
    def _int_arg(
        arguments: Dict[str, Any], key: str, default: int, lo: int, hi: int
    ) -> int:
        try:
            val = int(arguments.get(key, default))
        except (TypeError, ValueError):
            val = default
        return max(lo, min(hi, val))


    # ------------------------------------------------------------------ #
    # search_datasets
    # ------------------------------------------------------------------ #

[docs]
    def _search_datasets(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Query the UniBind dataset catalog with optional filters.

        The filters ``tf_name``, ``species`` (scientific name, e.g.
        'Homo sapiens'), ``cell_line`` (cell type/tissue) and ``collection``
        ('Robust' or 'Permissive') are all optional and are forwarded to the
        server together. ``order``, ``page`` and ``page_size`` are forwarded
        as well; ``page`` and ``page_size`` are clamped by ``_int_arg``.

        Returns:
            A success dict whose ``data`` is a list of compact dataset rows
            and whose ``metadata`` echoes paging information and the filters
            that were applied.
        """
        filter_names = ("tf_name", "species", "cell_line", "collection")
        # Only truthy filter values are sent to the server.
        filters: Dict[str, Any] = {
            name: value
            for name, value in ((n, arguments.get(n)) for n in filter_names)
            if value
        }

        page = self._int_arg(arguments, "page", 1, 1, 10_000_000)
        page_size = self._int_arg(arguments, "page_size", 25, 1, 1000)

        # Work on a copy so ``filters`` can be echoed back unchanged.
        query: Dict[str, Any] = {**filters}
        ordering = arguments.get("order")
        if ordering:
            query["order"] = ordering
        query.update(page=page, page_size=page_size)

        payload = self._get_json(f"{UNIBIND_BASE}/datasets/", query)

        rows = payload.get("results", []) or []
        results: List[Dict[str, Any]] = [
            {
                "tf_name": row.get("tf_name"),
                "total_peaks": row.get("total_peaks"),
                "dataset_url": row.get("url"),
                "dataset_id": self._dataset_id_from_url(row.get("url")),
            }
            for row in rows
        ]

        metadata = {
            "total_count": payload.get("count"),
            "returned": len(results),
            "page": page,
            "page_size": page_size,
            "has_next": bool(payload.get("next")),
            "filters": filters,
            "source": "UniBind (unibind.uio.no)",
        }
        return {"status": "success", "data": results, "metadata": metadata}



[docs]
    @staticmethod
    def _dataset_id_from_url(url: Any) -> Any:
        """Extract the dataset identifier from a dataset detail URL."""
        if not url or not isinstance(url, str):
            return None
        trimmed = url.rstrip("/")
        # Detail URLs end in /datasets/<id>; list-link URLs end in ?tf_name=...
        if "/datasets/" in trimmed and "?" not in trimmed:
            return trimmed.rsplit("/datasets/", 1)[-1]
        return None


    # ------------------------------------------------------------------ #
    # get_dataset
    # ------------------------------------------------------------------ #

[docs]
    def _get_dataset(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Full binding-site detail for one UniBind dataset.

        ``dataset_id`` is the UniBind dataset identifier, e.g.
        ``EXP030726.neural_stem_cells.SMAD3`` (visible as ``dataset_id`` /
        ``dataset_url`` in search_datasets results).
        """
        dataset_id = arguments.get("dataset_id")
        if not dataset_id or not str(dataset_id).strip():
            return {
                "status": "error",
                "error": "Missing required argument 'dataset_id'.",
            }
        dataset_id = str(dataset_id).strip().rstrip("/")

        url = f"{UNIBIND_BASE}/datasets/{dataset_id}/"
        resp = self.session.get(url, params={"format": "json"}, timeout=self.timeout)
        if resp.status_code == 404:
            return {
                "status": "error",
                "error": f"UniBind dataset '{dataset_id}' not found.",
            }
        resp.raise_for_status()
        raw = resp.json()

        tfbs_models = self._flatten_tfbs(raw.get("tfbs", []) or [])

        data = {
            "dataset_id": raw.get("tf_id") or dataset_id,
            "tf_name": raw.get("tf_name"),
            "cell_line": raw.get("cell_line") or [],
            "biological_condition": raw.get("biological_condition") or [],
            "identifier": raw.get("identifier") or [],
            "jaspar_id": raw.get("jaspar_id") or [],
            "prediction_models": raw.get("prediction_models") or [],
            "tfbs_models": tfbs_models,
        }
        return {
            "status": "success",
            "data": data,
            "metadata": {
                "n_tfbs_models": len(tfbs_models),
                "source": "UniBind (unibind.uio.no)",
            },
        }



[docs]
    @staticmethod
    def _flatten_tfbs(tfbs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Flatten the nested ``tfbs`` -> {model_name: [entries]} structure.

        UniBind nests TFBS predictions by prediction model (e.g. 'DAMO'); each
        model maps to a list of per-JASPAR-motif binding-site result blocks.
        """
        flat: List[Dict[str, Any]] = []
        for block in tfbs:
            if not isinstance(block, dict):
                continue
            for model_name, entries in block.items():
                if not isinstance(entries, list):
                    continue
                for entry in entries:
                    if not isinstance(entry, dict):
                        continue
                    flat.append(
                        {
                            "prediction_model": model_name,
                            "jaspar_id": entry.get("jaspar_id"),
                            "jaspar_version": entry.get("jaspar_version"),
                            "total_tfbs": entry.get("total_tfbs"),
                            "score_threshold": entry.get("score_threshold"),
                            "distance_threshold": entry.get("distance_threshold"),
                            "adj_centrimo_pvalue": entry.get("adj_centrimo_pvalue"),
                            "bed_url": entry.get("bed_url"),
                            "fasta_url": entry.get("fasta_url"),
                            "summary_plot_url": entry.get("summary_plot_url"),
                        }
                    )
        return flat


    # ------------------------------------------------------------------ #
    # list_tfs
    # ------------------------------------------------------------------ #

[docs]
    def _list_tfs(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Return the names of TFs profiled in UniBind, optionally filtered.

        The ``/tfs/`` endpoint is noted to ignore its ``search`` parameter
        and return the full list, so ``search`` is applied here as a
        case-insensitive substring match on the TF name. Names are
        de-duplicated, sorted, and cut to ``limit`` (default 200, clamped to
        1..1000); ``metadata.truncated`` tells whether the cut dropped names.
        """
        # A single request with page_size=1000 is expected to cover the whole
        # catalog (592 entries at the time of writing).
        payload = self._get_json(
            f"{UNIBIND_BASE}/tfs/", {"page_size": 1000, "page": 1}
        )
        tf_names = [
            row.get("tf_name")
            for row in (payload.get("results", []) or [])
            if row.get("tf_name")
        ]

        search = arguments.get("search")
        if search:
            needle = str(search).strip().lower()
            tf_names = [name for name in tf_names if needle in name.lower()]

        unique_names = sorted(set(tf_names))
        limit = self._int_arg(arguments, "limit", 200, 1, 1000)
        was_cut = len(unique_names) > limit
        page_of_names = unique_names[:limit]

        metadata = {
            "total_tfs_in_unibind": payload.get("count"),
            "returned": len(page_of_names),
            "search": search,
            "truncated": was_cut,
            "source": "UniBind (unibind.uio.no)",
        }
        return {"status": "success", "data": page_of_names, "metadata": metadata}