# Source code for tooluniverse.therasabdab_tool
# therasabdab_tool.py
"""
Thera-SAbDab (Therapeutic Structural Antibody Database) API tool for ToolUniverse.
Thera-SAbDab is a database of therapeutic antibody sequences and structural information,
containing WHO INN (International Nonproprietary Names) antibody therapeutics.
Features:
- Therapeutic antibody sequences (heavy and light chains)
- Structural coverage from PDB
- Target antigen information
- Clinical trial status
- Format types (IgG, bispecific, ADC, etc.)
Website: https://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/therasabdab/
"""
import re
from html import unescape
from typing import Dict, Any, List, Optional

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool
# Base URL for Thera-SAbDab. Endpoint paths such as "/search/" are appended to
# it when requests are built, so it carries no trailing slash.
THERASABDAB_BASE_URL = "https://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/therasabdab"
# [docs]
@register_tool("TheraSAbDabTool")
class TheraSAbDabTool(BaseTool):
    """
    Tool for querying the Thera-SAbDab therapeutic antibody database.

    The tool downloads the Thera-SAbDab search page once, parses its HTML
    table into records and answers every operation from that parsed list.
    Each record is a dict with the keys ``inn_name``, ``target``, ``format``,
    ``phase`` and ``pdb_ids``.

    The operation is selected by ``tool_config["fields"]["operation"]``:

    - ``search_therapeutics`` (default): substring match on name or target
    - ``get_all_therapeutics``: counts by format and phase plus a sample
    - ``search_by_target``: substring match on target only

    Requests are sent without credentials. All public results are dicts with
    a ``status`` of ``"success"`` or ``"error"``; failures are reported in the
    result rather than raised.
    """

    # Parsed therapeutics shared by every instance of the class. Stays None
    # until a load yields at least one record.
    _therapeutics_cache: Optional[List[Dict[str, Any]]] = None

    # Label attached to every successful response.
    _SOURCE_LABEL = "Thera-SAbDab (Oxford OPIG)"
    # Maximum number of records returned by the two search operations.
    _MAX_RESULTS = 20
    # Number of records included in the "get all" summary.
    _SAMPLE_SIZE = 10

    # Patterns are compiled once here instead of once per table row. The
    # word boundary keeps "<tr"/"<td"/"<th" from matching longer tag names,
    # and IGNORECASE accepts upper-case markup.
    _ROW_RE = re.compile(r"<tr\b[^>]*>.*?</tr>", re.DOTALL | re.IGNORECASE)
    _HEADER_CELL_RE = re.compile(r"<th\b", re.IGNORECASE)
    _CELL_RE = re.compile(r"<td\b[^>]*>(.*?)</td>", re.DOTALL | re.IGNORECASE)
    _TAG_RE = re.compile(r"<[^>]+>")
    # A digit followed by three alphanumerics, not embedded in a longer
    # alphanumeric run (so "2024abc" no longer yields "2024").
    _PDB_ID_RE = re.compile(
        r"(?<![A-Za-z0-9])[0-9][A-Za-z0-9]{3}(?![A-Za-z0-9])"
    )

    def __init__(self, tool_config: Dict[str, Any]):
        """
        Read the timeout and the operation name from ``tool_config``.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds, default
                30) and ``fields.operation`` (default
                ``"search_therapeutics"``) are read here; the whole dict is
                also passed to the base class.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        # "fields" may be present but None; fall back to an empty mapping.
        fields = tool_config.get("fields") or {}
        self.operation = fields.get("operation", "search_therapeutics")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute the configured Thera-SAbDab operation.

        Args:
            arguments: Operation arguments (``query`` or ``target``,
                depending on the operation). ``None`` is treated as empty.

        Returns:
            The result dict of the selected operation, or an error dict when
            the configured operation name is not recognised.
        """
        handlers = {
            "search_therapeutics": self._search_therapeutics,
            "get_all_therapeutics": self._get_all_therapeutics,
            "search_by_target": self._search_by_target,
        }
        handler = handlers.get(self.operation)
        if handler is None:
            return {
                "status": "error",
                "error": f"Unknown operation: {self.operation}",
            }
        return handler(arguments or {})

    @classmethod
    def _clean_cell(cls, text: str) -> str:
        """Return the visible text of one table cell.

        Tags are removed before entities are decoded so that an escaped
        "&lt;b&gt;" stays literal text instead of being stripped as a tag.
        Non-breaking spaces become ordinary spaces.
        """
        without_tags = cls._TAG_RE.sub("", text)
        return unescape(without_tags).replace("\xa0", " ").strip()

    def _parse_search_results(self, html: str) -> List[Dict[str, Any]]:
        """
        Parse Thera-SAbDab search results from HTML.

        The page is expected to hold a table with the columns INN, Target,
        Format, Phase and PDB. Rows containing a ``<th>`` cell, rows with
        fewer than four ``<td>`` cells and rows whose name is empty are
        skipped.

        Args:
            html: Raw HTML of the search page.

        Returns:
            One dict per accepted row with ``inn_name``, ``target``,
            ``format``, ``phase`` (cleaned text) and ``pdb_ids`` (unique
            IDs in order of first appearance; empty without a fifth cell).
        """
        therapeutics = []
        for row in self._ROW_RE.findall(html):
            # Skip header rows
            if self._HEADER_CELL_RE.search(row):
                continue
            cells = self._CELL_RE.findall(row)
            if len(cells) < 4:
                continue

            inn_name, target, fmt, phase = (
                self._clean_cell(cell) for cell in cells[:4]
            )
            # Only keep rows that actually name a therapeutic
            if not inn_name:
                continue

            pdb_ids: List[str] = []
            if len(cells) > 4:
                # The raw cell markup is scanned (not the cleaned text), so
                # IDs that occur only inside link attributes are still found.
                # dict.fromkeys de-duplicates while keeping a stable order.
                pdb_ids = list(dict.fromkeys(self._PDB_ID_RE.findall(cells[4])))

            therapeutics.append(
                {
                    "inn_name": inn_name,
                    "target": target,
                    "format": fmt,
                    "phase": phase,
                    "pdb_ids": pdb_ids,
                }
            )
        return therapeutics

    def _load_all_therapeutics(self) -> List[Dict[str, Any]]:
        """
        Load all therapeutics from Thera-SAbDab (with caching).

        Returns:
            The parsed records, taken from the class-level cache when it is
            already filled.

        Raises:
            requests.exceptions.RequestException: If the request fails,
                times out or returns an HTTP error status.
        """
        cached = TheraSAbDabTool._therapeutics_cache
        if cached is not None:
            return cached

        # Query the "all therapeutics" endpoint
        response = requests.get(
            f"{THERASABDAB_BASE_URL}/search/",
            params={"all": "true"},
            timeout=self.timeout,
        )
        response.raise_for_status()
        therapeutics = self._parse_search_results(response.text)

        # An empty parse most likely means the page held no usable table;
        # caching it would make every later call return nothing for the
        # lifetime of the process, so only non-empty results are stored.
        if therapeutics:
            TheraSAbDabTool._therapeutics_cache = therapeutics
        return therapeutics

    def _error_response(self, exc: Exception) -> Dict[str, Any]:
        """Translate an exception raised while serving a request into an error dict."""
        # Timeout is a RequestException subclass, so it is tested first.
        if isinstance(exc, requests.exceptions.Timeout):
            message = f"Thera-SAbDab timeout after {self.timeout}s"
        elif isinstance(exc, requests.exceptions.RequestException):
            message = f"Thera-SAbDab request failed: {str(exc)}"
        else:
            message = f"Unexpected error: {str(exc)}"
        return {"status": "error", "error": message}

    def _search_therapeutics(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Search therapeutic antibodies by name or keyword.

        Args:
            arguments: Must contain ``query``; it is matched
                case-insensitively as a substring of the name or target.

        Returns:
            On success, ``data`` holds the query, at most 20 matching
            records, the total match ``count`` and the source label.
            Otherwise an error dict.
        """
        query = arguments.get("query")
        if not query:
            return {"status": "error", "error": "query parameter is required"}
        try:
            needle = query.lower()
            matches = [
                record
                for record in self._load_all_therapeutics()
                if needle in (record.get("inn_name") or "").lower()
                or needle in (record.get("target") or "").lower()
            ]
        except Exception as e:
            # Tool boundary: failures are reported in the result, not raised.
            return self._error_response(e)
        return {
            "status": "success",
            "data": {
                "query": query,
                "therapeutics": matches[: self._MAX_RESULTS],  # Limit results
                "count": len(matches),
                "source": self._SOURCE_LABEL,
            },
        }

    def _get_all_therapeutics(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get a summary of all therapeutic antibodies in Thera-SAbDab.

        Args:
            arguments: Unused; accepted for a uniform handler signature.

        Returns:
            On success, ``data`` holds the total count, record counts keyed
            by format and by phase (missing values counted as "Unknown"),
            the first 10 records as a sample, the source label and a note.
            Otherwise an error dict.
        """
        try:
            therapeutics = self._load_all_therapeutics()
        except Exception as e:
            # Tool boundary: failures are reported in the result, not raised.
            return self._error_response(e)

        formats: Dict[str, int] = {}
        phases: Dict[str, int] = {}
        for record in therapeutics:
            fmt = record.get("format") or "Unknown"
            phase = record.get("phase") or "Unknown"
            formats[fmt] = formats.get(fmt, 0) + 1
            phases[phase] = phases.get(phase, 0) + 1

        return {
            "status": "success",
            "data": {
                "total_count": len(therapeutics),
                "by_format": formats,
                "by_phase": phases,
                "sample": therapeutics[: self._SAMPLE_SIZE],
                "source": self._SOURCE_LABEL,
                "note": "Full data can be downloaded from the Thera-SAbDab website",
            },
        }

    def _search_by_target(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Search therapeutic antibodies by target antigen.

        Args:
            arguments: Must contain ``target``; it is matched
                case-insensitively as a substring of each record's target.

        Returns:
            On success, ``data`` holds the target, at most 20 matching
            records, the total match ``count`` and the source label.
            Otherwise an error dict.
        """
        target = arguments.get("target")
        if not target:
            return {"status": "error", "error": "target parameter is required"}
        try:
            needle = target.lower()
            matches = [
                record
                for record in self._load_all_therapeutics()
                if needle in (record.get("target") or "").lower()
            ]
        except Exception as e:
            # Tool boundary: failures are reported in the result, not raised.
            return self._error_response(e)
        return {
            "status": "success",
            "data": {
                "target": target,
                "therapeutics": matches[: self._MAX_RESULTS],
                "count": len(matches),
                "source": self._SOURCE_LABEL,
            },
        }