# Source code for tooluniverse.vdjdb_tool

"""
VDJdb - TCR/BCR Clonotype Database Tool

Provides access to the VDJdb API for querying T-cell receptor (TCR) and
B-cell receptor (BCR) sequences with known antigen specificities.

VDJdb is a curated database of TCR sequences with experimentally
verified antigen specificity data, integrating 226,000+ records
from 300+ studies across human, mouse, and macaque species.

API base: https://vdjdb.com/api/
No authentication required.

Reference: Bagaev et al., NAR 2020 (PMID: 31782517)
"""

import json
import requests
from typing import Any, Dict, List, Optional
from .base_tool import BaseTool
from .tool_registry import register_tool


# Root URL of the VDJdb API; endpoint paths are appended to it.
VDJDB_BASE_URL = "https://vdjdb.com/api"

# Ordered column names, based on /api/database/meta. The position of a name
# in this list is the index of the matching value in a search row's entries.
COLUMN_NAMES = [
    "gene", "cdr3", "v.segm", "j.segm", "species",
    # mhc.* columns
    "mhc.a", "mhc.b", "mhc.class",
    # antigen.* columns
    "antigen.epitope", "antigen.gene", "antigen.species",
    "reference.id",
    # These three are decoded as JSON by the row parser.
    "method", "meta", "cdr3fix",
    # Converted to an integer by the row parser when possible.
    "vdjdb.score",
]


def _parse_row(entries: List[str], metadata: Dict) -> Dict[str, Any]:
    """Parse a raw VDJdb search row into a structured dictionary."""
    record = {}
    for i, col_name in enumerate(COLUMN_NAMES):
        if i < len(entries):
            val = entries[i]
            # Parse JSON fields
            if col_name in ("method", "meta", "cdr3fix"):
                try:
                    val = json.loads(val)
                except (json.JSONDecodeError, TypeError):
                    pass
            # Convert score to integer
            elif col_name == "vdjdb.score":
                try:
                    val = int(val)
                except (ValueError, TypeError):
                    pass
            record[col_name] = val

    # Add metadata
    if metadata:
        record["paired_id"] = metadata.get("pairedID", "0")
        record["cdr3_v_end"] = metadata.get("cdr3vEnd")
        record["cdr3_j_start"] = metadata.get("cdr3jStart")

    return record


@register_tool("VDJDBTool")
class VDJDBTool(BaseTool):
    """
    Tool for querying the VDJdb T-cell receptor sequence database.

    VDJdb is a curated database of TCR sequences with experimentally
    verified antigen specificity. It links CDR3 sequences to specific
    epitopes, MHC alleles, and antigen sources.

    Supported operations:
    - search_cdr3: Search by CDR3 amino acid sequence
    - get_antigen_specificity: Search by epitope sequence
    - get_database_summary: Get database statistics
    """

    # Values accepted for the ``match_type`` argument of search_cdr3.
    _MATCH_TYPES = ("exact", "fuzzy", "pattern")

    def __init__(self, tool_config: Dict[str, Any]):
        """Create the tool with a reusable HTTP session and a 30 s timeout."""
        super().__init__(tool_config)
        self.session = requests.Session()
        self.timeout = 30

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the VDJdb API tool with given arguments.

        Args:
            arguments: Must contain ``operation``; the remaining keys are
                read by the selected operation handler.

        Returns:
            A dict with ``status`` set to ``"success"`` or ``"error"``.
            Exceptions raised by a handler are converted to error dicts
            rather than propagated.
        """
        operation = arguments.get("operation")
        if not operation:
            return {"status": "error", "error": "Missing required parameter: operation"}

        operation_handlers = {
            "search_cdr3": self._search_cdr3,
            "get_antigen_specificity": self._get_antigen_specificity,
            "get_database_summary": self._get_database_summary,
        }

        handler = operation_handlers.get(operation)
        if not handler:
            return {
                "status": "error",
                "error": f"Unknown operation: {operation}",
                "available_operations": list(operation_handlers.keys()),
            }

        try:
            return handler(arguments)
        except requests.exceptions.Timeout:
            return {"status": "error", "error": "VDJdb API request timed out"}
        except requests.exceptions.ConnectionError:
            return {"status": "error", "error": "Failed to connect to VDJdb API"}
        except Exception as e:
            # Top-level boundary: report any other failure as an error dict.
            return {"status": "error", "error": f"VDJdb operation failed: {str(e)}"}

    @staticmethod
    def _make_filter(
        column: str, value: Any, filter_type: str = "exact"
    ) -> Dict[str, Any]:
        """Build one non-negated filter entry for the search request body."""
        return {
            "column": column,
            "value": value,
            "filterType": filter_type,
            "negative": False,
        }

    def _post_search(
        self,
        filters: List[Dict],
        page: Optional[int] = None,
        page_size: int = 25,
        paired: bool = False,
        sort: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Make a POST search request to VDJdb API.

        Args:
            filters: Filter entries placed under ``filters`` in the body.
            page: Page number; paging fields are sent only when not None.
            page_size: Sent as ``pageSize`` together with ``page``.
            paired: When true, ``paired`` is set in the request body.
            sort: Optional value sent as ``sort``.

        Returns:
            ``{"ok": True, "data": <decoded JSON>}`` on success, otherwise
            ``{"ok": False, "error": ...}`` (with ``detail`` for non-200
            responses).
        """
        url = f"{VDJDB_BASE_URL}/database/search"
        body: Dict[str, Any] = {"filters": filters}
        if page is not None:
            body["page"] = page
            body["pageSize"] = page_size
        if paired:
            body["paired"] = True
        if sort:
            body["sort"] = sort

        response = self.session.post(
            url,
            json=body,
            headers={"Content-Type": "application/json"},
            timeout=self.timeout,
        )

        if response.status_code != 200:
            return {
                "ok": False,
                "error": f"VDJdb API returned status {response.status_code}",
                "detail": response.text[:500],
            }

        try:
            data = response.json()
        except ValueError:
            # JSON decode errors from requests are ValueError subclasses.
            return {"ok": False, "error": "Invalid JSON response from VDJdb"}

        return {"ok": True, "data": data}

    def _run_search(
        self, filters: List[Dict], arguments: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Run a paged search and convert the response to the tool result.

        Reads ``page`` (default 0) and ``page_size`` (default 25) from
        ``arguments``; shared by both search operations.
        """
        result = self._post_search(
            filters,
            page=arguments.get("page", 0),
            page_size=arguments.get("page_size", 25),
        )
        if not result["ok"]:
            return {"status": "error", "error": result["error"]}

        raw = result["data"]
        records = [
            _parse_row(row["entries"], row.get("metadata", {}))
            for row in raw.get("rows", [])
        ]
        return {
            "status": "success",
            "data": {
                "records": records,
                "records_found": raw.get("recordsFound", 0),
                "page": raw.get("page", -1),
                "page_size": raw.get("pageSize", -1),
                "page_count": raw.get("pageCount", -1),
            },
        }

    def _search_cdr3(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search VDJdb by CDR3 amino acid sequence.

        Reads ``cdr3`` (required), ``match_type`` (``exact``, ``fuzzy`` or
        ``pattern``; defaults to ``exact``), optional ``species`` and
        ``gene`` filters, and paging arguments. For ``fuzzy`` matching,
        ``substitutions``, ``insertions`` and ``deletions`` default to 1.

        Returns an error result when ``cdr3`` is missing or ``match_type``
        is not recognised, so a mistyped match type can never silently
        drop the CDR3 filter from the query.
        """
        cdr3 = arguments.get("cdr3")
        if not cdr3:
            return {"status": "error", "error": "cdr3 parameter is required"}

        # An explicit None is treated like an omitted match_type.
        match_type = arguments.get("match_type") or "exact"

        if match_type == "exact":
            cdr3_filter = self._make_filter("cdr3", cdr3)
        elif match_type == "fuzzy":
            # Fuzzy match: value format is
            # "sequence:substitutions:insertions:deletions"
            subs = arguments.get("substitutions", 1)
            ins = arguments.get("insertions", 1)
            dels = arguments.get("deletions", 1)
            cdr3_filter = self._make_filter(
                "cdr3", f"{cdr3}:{subs}:{ins}:{dels}", "sequence"
            )
        elif match_type == "pattern":
            cdr3_filter = self._make_filter("cdr3", cdr3, "pattern")
        else:
            return {
                "status": "error",
                "error": f"Unknown match_type: {match_type}",
                "available_match_types": list(self._MATCH_TYPES),
            }

        filters = [cdr3_filter]

        # Optional exact-match filters (gene is e.g. TRA or TRB).
        for column, key in (("species", "species"), ("gene", "gene")):
            value = arguments.get(key)
            if value:
                filters.append(self._make_filter(column, value))

        return self._run_search(filters, arguments)

    def _get_antigen_specificity(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search VDJdb by epitope sequence to find TCRs recognizing an antigen.

        Reads ``epitope`` (required), optional ``species``, ``gene`` and
        ``mhc_class`` exact-match filters, optional ``min_score`` (sent as
        a ``level`` filter on ``vdjdb.score``), and paging arguments.
        """
        epitope = arguments.get("epitope")
        if not epitope:
            return {"status": "error", "error": "epitope parameter is required"}

        filters = [self._make_filter("antigen.epitope", epitope)]

        optional_exact = (
            ("species", "species"),
            ("gene", "gene"),
            ("mhc.class", "mhc_class"),
        )
        for column, key in optional_exact:
            value = arguments.get(key)
            if value:
                filters.append(self._make_filter(column, value))

        # Compared against None so that a score of 0 is still applied.
        min_score = arguments.get("min_score")
        if min_score is not None:
            filters.append(
                self._make_filter("vdjdb.score", str(min_score), "level")
            )

        return self._run_search(filters, arguments)

    def _get_database_summary(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get VDJdb database metadata and column information.

        ``arguments`` is accepted for handler-signature uniformity and is
        not read. Returns the record count, column count and a per-column
        description list taken from the ``/database/meta`` response.
        """
        url = f"{VDJDB_BASE_URL}/database/meta"
        response = self.session.get(url, timeout=self.timeout)

        if response.status_code != 200:
            return {
                "status": "error",
                "error": f"VDJdb meta endpoint returned status {response.status_code}",
            }

        try:
            data = response.json()
        except ValueError:
            # JSON decode errors from requests are ValueError subclasses.
            return {"status": "error", "error": "Invalid JSON from VDJdb meta endpoint"}

        meta = data.get("metadata", {})
        column_info = [
            {
                "name": col.get("name"),
                "title": col.get("title"),
                "data_type": col.get("dataType"),
                "column_type": col.get("columnType"),
                "comment": col.get("comment"),
            }
            for col in meta.get("columns", [])
        ]

        return {
            "status": "success",
            "data": {
                "total_records": meta.get("numberOfRecords", 0),
                "number_of_columns": meta.get("numberOfColumns", 0),
                "columns": column_info,
            },
        }