# Source code for tooluniverse.unibind_tool
"""UniBind REST API tool.
UniBind (https://unibind.uio.no) is a database of curated, experimentally
derived **direct** transcription factor-DNA binding sites (TFBS), predicted
from ChIP-seq peaks using the DAMO / ChIP-eat pipeline together with JASPAR
profiles. Each "dataset" corresponds to one ChIP-seq experiment for one TF in
one cell type/condition, and exposes the high-confidence TFBS positions
(BED/FASTA), the JASPAR motif(s) used, CentriMo enrichment p-values, score
thresholds and total TFBS counts.
This is distinct from existing ToolUniverse motif tools:
* JASPAR -> position frequency MATRICES (the motif models themselves).
* HOCOMOCO -> position weight matrices for TFs.
* ReMap -> raw ChIP-seq peak regions (genomic intervals of binding).
UniBind sits between them: motif-anchored, experimentally supported, direct
TF-DNA binding sites at base-pair resolution, organised per experiment.
Public, no API key. Django REST Framework backend; endpoints require a
trailing slash and ``?format=json``.
"""
from typing import Any, Dict, List
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool
# Root URL of the UniBind REST API (v1); endpoint paths such as ``/datasets/``
# and ``/tfs/`` are appended to it when building request URLs.
UNIBIND_BASE = "https://unibind.uio.no/api/v1"
@register_tool("UniBindRESTTool")
class UniBindRESTTool(BaseTool):
    """Access curated direct TF-DNA binding sites from the UniBind database.

    A single class dispatches three tools via the ``operation`` field declared
    in each JSON config (or passed as an argument):

    * ``search_datasets`` -> filter the TFBS dataset catalog.
    * ``get_dataset`` -> full binding-site detail for one dataset.
    * ``list_tfs`` -> list/filter the catalog of profiled TFs.

    ``run`` always returns a dict with a ``status`` key (``"success"`` or
    ``"error"``); failures are reported in the ``error`` field rather than
    raised.
    """

    def __init__(self, tool_config: Dict):
        """Create the HTTP session and read the default operation.

        Args:
            tool_config: Tool configuration. Its optional ``fields`` mapping
                may carry an ``operation`` entry; when absent or empty the
                default operation is ``search_datasets``.
        """
        super().__init__(tool_config)
        self.session = requests.Session()
        self.session.headers.update({"Accept": "application/json"})
        self.timeout = 30
        # ``or`` guards against an explicit ``"fields": null`` in the config.
        fields = tool_config.get("fields") or {}
        self.operation = fields.get("operation") or "search_datasets"

    # ------------------------------------------------------------------ #
    # Dispatch
    # ------------------------------------------------------------------ #
    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the requested operation and never raise to the caller.

        Args:
            arguments: Call arguments. An ``operation`` entry overrides the
                operation configured at construction time; the remaining
                entries are passed to the selected handler.

        Returns:
            The handler's result dict, or ``{"status": "error", "error": ...}``
            when the operation is unknown or the request fails.
        """
        arguments = arguments or {}
        handlers = {
            "search_datasets": self._search_datasets,
            "get_dataset": self._get_dataset,
            "list_tfs": self._list_tfs,
        }
        try:
            operation = arguments.get("operation") or self.operation
            handler = handlers.get(operation)
            if handler is None:
                # A misspelled operation must not silently run a search.
                expected = ", ".join(sorted(handlers))
                return {
                    "status": "error",
                    "error": (
                        f"Unknown operation '{operation}'. "
                        f"Expected one of: {expected}."
                    ),
                }
            return handler(arguments)
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": (
                    f"UniBind API request timed out after {self.timeout} seconds."
                ),
            }
        except requests.exceptions.RequestException as exc:
            return {
                "status": "error",
                "error": f"UniBind API request failed: {exc}",
            }
        except Exception as exc:  # noqa: BLE001 - never raise to the caller
            return {
                "status": "error",
                "error": f"Unexpected error querying UniBind: {exc}",
            }

    # ------------------------------------------------------------------ #
    # Helpers
    # ------------------------------------------------------------------ #
    def _get_json(self, url: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """GET ``url`` with ``format=json`` added and return the decoded body.

        The caller's ``params`` mapping is copied, not mutated.

        Raises:
            requests.exceptions.RequestException: on transport errors,
                timeouts, or a non-2xx status (via ``raise_for_status``).
        """
        query = dict(params)
        query["format"] = "json"
        resp = self.session.get(url, params=query, timeout=self.timeout)
        resp.raise_for_status()
        return resp.json()

    @staticmethod
    def _int_arg(
        arguments: Dict[str, Any], key: str, default: int, lo: int, hi: int
    ) -> int:
        """Read ``arguments[key]`` as an int clamped to ``[lo, hi]``.

        Falls back to ``default`` when the value is missing or cannot be
        converted, including ``None``, non-numeric strings, NaN and infinity.
        """
        try:
            val = int(arguments.get(key, default))
        except (TypeError, ValueError, OverflowError):
            # int(float("inf")) raises OverflowError, not ValueError.
            val = default
        return max(lo, min(hi, val))

    # ------------------------------------------------------------------ #
    # search_datasets
    # ------------------------------------------------------------------ #
    def _search_datasets(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Filter the UniBind dataset catalog.

        Server-side filters (all optional, all compose): ``tf_name``,
        ``species`` (scientific name, e.g. 'Homo sapiens'), ``cell_line``
        (cell type/tissue), ``collection`` ('Robust' or 'Permissive').

        Also honours ``order`` (passed through unchanged), ``page``
        (default 1) and ``page_size`` (default 25, clamped to 1..1000).

        Returns:
            A success dict whose ``data`` is a list of
            ``{tf_name, total_peaks, dataset_url, dataset_id}`` rows and whose
            ``metadata`` describes the page and the filters that were applied.
        """
        filters: Dict[str, Any] = {
            key: arguments[key]
            for key in ("tf_name", "species", "cell_line", "collection")
            if arguments.get(key)
        }
        page = self._int_arg(arguments, "page", 1, 1, 10_000_000)
        page_size = self._int_arg(arguments, "page_size", 25, 1, 1000)

        params = dict(filters)
        order = arguments.get("order")
        if order:
            params["order"] = order
        params["page"] = page
        params["page_size"] = page_size

        data = self._get_json(f"{UNIBIND_BASE}/datasets/", params)
        results: List[Dict[str, Any]] = [
            {
                "tf_name": row.get("tf_name"),
                "total_peaks": row.get("total_peaks"),
                "dataset_url": row.get("url"),
                "dataset_id": self._dataset_id_from_url(row.get("url")),
            }
            for row in data.get("results") or []
        ]
        return {
            "status": "success",
            "data": results,
            "metadata": {
                "total_count": data.get("count"),
                "returned": len(results),
                "page": page,
                "page_size": page_size,
                "has_next": bool(data.get("next")),
                "filters": filters,
                "source": "UniBind (unibind.uio.no)",
            },
        }

    @staticmethod
    def _dataset_id_from_url(url: Any) -> Any:
        """Extract the dataset identifier from a dataset detail URL.

        Returns ``None`` for empty or non-string input, for URLs without a
        ``/datasets/`` segment, and for URLs that carry a query string.
        """
        if not url or not isinstance(url, str):
            return None
        trimmed = url.rstrip("/")
        # Detail URLs end in /datasets/<id>; list-link URLs end in ?tf_name=...
        if "/datasets/" in trimmed and "?" not in trimmed:
            return trimmed.rsplit("/datasets/", 1)[-1]
        return None

    # ------------------------------------------------------------------ #
    # get_dataset
    # ------------------------------------------------------------------ #
    def _get_dataset(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Full binding-site detail for one UniBind dataset.

        ``dataset_id`` is the UniBind dataset identifier, e.g.
        ``EXP030726.neural_stem_cells.SMAD3`` (visible as ``dataset_id`` /
        ``dataset_url`` in search_datasets results).

        Returns:
            A success dict with the dataset record and its flattened TFBS
            models, or an error dict when ``dataset_id`` is missing or the
            server answers 404.

        Raises:
            requests.exceptions.RequestException: on transport errors or any
                other non-2xx status; ``run`` converts these to error dicts.
        """
        # Normalise before validating so that input such as "/" or "  " is
        # rejected instead of silently querying the dataset *list* endpoint.
        dataset_id = str(arguments.get("dataset_id") or "").strip().strip("/")
        if not dataset_id:
            return {
                "status": "error",
                "error": "Missing required argument 'dataset_id'.",
            }

        # Percent-encode the identifier so characters such as '?', '#' or '/'
        # cannot alter the request path or query.
        url = f"{UNIBIND_BASE}/datasets/{quote(dataset_id, safe='')}/"
        resp = self.session.get(
            url, params={"format": "json"}, timeout=self.timeout
        )
        if resp.status_code == 404:
            return {
                "status": "error",
                "error": f"UniBind dataset '{dataset_id}' not found.",
            }
        resp.raise_for_status()
        raw = resp.json()

        tfbs_models = self._flatten_tfbs(raw.get("tfbs", []) or [])
        data = {
            "dataset_id": raw.get("tf_id") or dataset_id,
            "tf_name": raw.get("tf_name"),
            "cell_line": raw.get("cell_line") or [],
            "biological_condition": raw.get("biological_condition") or [],
            "identifier": raw.get("identifier") or [],
            "jaspar_id": raw.get("jaspar_id") or [],
            "prediction_models": raw.get("prediction_models") or [],
            "tfbs_models": tfbs_models,
        }
        return {
            "status": "success",
            "data": data,
            "metadata": {
                "n_tfbs_models": len(tfbs_models),
                "source": "UniBind (unibind.uio.no)",
            },
        }

    @staticmethod
    def _flatten_tfbs(tfbs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Flatten the nested ``tfbs`` -> {model_name: [entries]} structure.

        UniBind nests TFBS predictions by prediction model (e.g. 'DAMO'); each
        model maps to a list of per-JASPAR-motif binding-site result blocks.

        A single mapping is accepted in place of a list of mappings; any
        element that does not have the expected shape is skipped.
        """
        if isinstance(tfbs, dict):
            tfbs = [tfbs]
        elif not isinstance(tfbs, (list, tuple)):
            return []

        flat: List[Dict[str, Any]] = []
        for block in tfbs:
            if not isinstance(block, dict):
                continue
            for model_name, entries in block.items():
                if not isinstance(entries, list):
                    continue
                for entry in entries:
                    if not isinstance(entry, dict):
                        continue
                    flat.append(
                        {
                            "prediction_model": model_name,
                            "jaspar_id": entry.get("jaspar_id"),
                            "jaspar_version": entry.get("jaspar_version"),
                            "total_tfbs": entry.get("total_tfbs"),
                            "score_threshold": entry.get("score_threshold"),
                            "distance_threshold": entry.get("distance_threshold"),
                            "adj_centrimo_pvalue": entry.get("adj_centrimo_pvalue"),
                            "bed_url": entry.get("bed_url"),
                            "fasta_url": entry.get("fasta_url"),
                            "summary_plot_url": entry.get("summary_plot_url"),
                        }
                    )
        return flat

    # ------------------------------------------------------------------ #
    # list_tfs
    # ------------------------------------------------------------------ #
    def _list_tfs(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """List the catalog of TFs profiled in UniBind.

        The UniBind ``/tfs/`` endpoint ignores its ``search`` parameter and
        always returns the full list, so any ``search`` term is applied as a
        case-insensitive client-side substring filter on the TF name.

        ``limit`` (default 200, clamped to 1..1000) caps the number of names
        returned; ``metadata["truncated"]`` reports whether the cap cut the
        list short.
        """
        # Pull the full TF list (592 entries -> one page with page_size=1000).
        data = self._get_json(
            f"{UNIBIND_BASE}/tfs/", {"page_size": 1000, "page": 1}
        )
        rows = data.get("results") or []
        # Keep only non-empty string names so filtering and sorting cannot
        # fail on malformed rows.
        unique_names = {
            row["tf_name"]
            for row in rows
            if isinstance(row, dict)
            and isinstance(row.get("tf_name"), str)
            and row["tf_name"]
        }

        search = arguments.get("search")
        if search:
            needle = str(search).strip().lower()
            unique_names = {n for n in unique_names if needle in n.lower()}

        names = sorted(unique_names)
        limit = self._int_arg(arguments, "limit", 200, 1, 1000)
        truncated = len(names) > limit
        names = names[:limit]
        return {
            "status": "success",
            "data": names,
            "metadata": {
                "total_tfs_in_unibind": data.get("count"),
                "returned": len(names),
                "search": search,
                "truncated": truncated,
                "source": "UniBind (unibind.uio.no)",
            },
        }