# Source code for tooluniverse.tcdb_tool

"""
TCDB (Transporter Classification Database) tool for ToolUniverse.

Provides search and lookup for membrane transporter proteins classified
by the IUBMB-approved TC system. 20,000+ proteins in 1,536 families.

TC# format: class.subclass.family.subfamily.protein (e.g., 2.A.1.7.1)

API: https://www.tcdb.org/ (CGI flat-file endpoints, no authentication)
Data is fetched as bulk TSV/CSV and cached in memory for the session.
"""

import csv
import io
import requests
from typing import Any

from .base_tool import BaseTool
from .tool_registry import register_tool

# Root of the public TCDB web site; every endpoint below is built from it.
TCDB_BASE = "https://www.tcdb.org"

# Bulk endpoints that are downloaded whole and parsed by TCDBTool.
ACC2TCID_URL = TCDB_BASE + "/cgi-bin/projectv/public/acc2tcid.py"
FAMILIES_URL = TCDB_BASE + "/cgi-bin/projectv/public/families.py"
SUBSTRATES_URL = TCDB_BASE + "/cgi-bin/substrates/getSubstrates.py"
PDB_URL = TCDB_BASE + "/cgi-bin/projectv/public/pdb.py"
HUMAN_CSV_URL = TCDB_BASE + "/public/human.csv"



[文档]
@register_tool("TCDBTool")
class TCDBTool(BaseTool):
    """
    Tool for querying TCDB (Transporter Classification Database).

    Supports lookup by UniProt accession, family search by TC# or name,
    and substrate-based search. Data is fetched from bulk flat files and
    cached in memory.

    The operation an instance performs is fixed at construction time from
    ``tool_config["fields"]["operation"]`` and is one of ``get_transporter``,
    ``search_family`` or ``search_by_substrate``.
    """

    # Class-level cache, so it is shared by every TCDBTool instance in the
    # process. It stores both the raw downloaded text (e.g. "acc2tcid") and
    # the parsed structure derived from it (e.g. "acc2tcid_parsed").
    # Nothing in this class evicts or refreshes entries.
    _cache: dict = {}


[文档]
    def __init__(self, tool_config: dict[str, Any]):
        """Configure the tool from its ToolUniverse config dictionary.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds, default 60)
                is used for HTTP requests; ``fields.operation`` selects which
                query ``run`` performs (default ``"get_transporter"``).
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 60)
        # The operation lives in the nested "fields" mapping of the config.
        self.operation = tool_config.get("fields", {}).get(
            "operation", "get_transporter"
        )



[文档]
    def _fetch_and_cache(self, key: str, url: str) -> str:
        """Fetch a bulk data file and cache it. Returns raw text."""
        if key not in TCDBTool._cache:
            resp = requests.get(url, timeout=self.timeout)
            resp.raise_for_status()
            TCDBTool._cache[key] = resp.text
        return TCDBTool._cache[key]



[文档]
    def _get_acc2tcid(self) -> dict[str, list[str]]:
        """Parse acc2tcid into {uniprot_acc: [tc_numbers]}."""
        cache_key = "acc2tcid_parsed"
        if cache_key in TCDBTool._cache:
            return TCDBTool._cache[cache_key]
        raw = self._fetch_and_cache("acc2tcid", ACC2TCID_URL)
        mapping: dict[str, list[str]] = {}
        for line in raw.strip().split("\n"):
            parts = line.strip().split("\t")
            if len(parts) >= 2:
                acc, tc = parts[0].strip(), parts[1].strip()
                mapping.setdefault(acc, []).append(tc)
        TCDBTool._cache[cache_key] = mapping
        return mapping



[文档]
    def _get_families(self) -> dict[str, str]:
        """Parse families into {family_id: description}."""
        cache_key = "families_parsed"
        if cache_key in TCDBTool._cache:
            return TCDBTool._cache[cache_key]
        raw = self._fetch_and_cache("families", FAMILIES_URL)
        families: dict[str, str] = {}
        for line in raw.strip().split("\n"):
            parts = line.strip().split("\t")
            if len(parts) >= 2:
                families[parts[0].strip()] = parts[1].strip()
        TCDBTool._cache[cache_key] = families
        return families



[文档]
    def _get_substrates(self) -> list[dict]:
        """Parse substrates into list of {tc_number, substrates: [{chebi_id, name}]}."""
        cache_key = "substrates_parsed"
        if cache_key in TCDBTool._cache:
            return TCDBTool._cache[cache_key]
        raw = self._fetch_and_cache("substrates", SUBSTRATES_URL)
        entries = []
        for line in raw.strip().split("\n"):
            parts = line.strip().split("\t")
            if len(parts) < 2:
                continue
            tc_num = parts[0].strip()
            substrate_parts = parts[1].strip().split("|")
            substrates = []
            for sp in substrate_parts:
                sp = sp.strip()
                if ";" in sp:
                    chebi_id, name = sp.split(";", 1)
                    substrates.append(
                        {"chebi_id": chebi_id.strip(), "name": name.strip()}
                    )
                elif sp:
                    substrates.append({"chebi_id": None, "name": sp})
            entries.append({"tc_number": tc_num, "substrates": substrates})
        TCDBTool._cache[cache_key] = entries
        return entries



[文档]
    def _get_pdb_mapping(self) -> dict[str, list[str]]:
        """Parse PDB data into {tc_number: [pdb_ids]}."""
        cache_key = "pdb_parsed"
        if cache_key in TCDBTool._cache:
            return TCDBTool._cache[cache_key]
        raw = self._fetch_and_cache("pdb", PDB_URL)
        mapping: dict[str, list[str]] = {}
        for line in raw.strip().split("\n"):
            parts = line.strip().split("\t")
            if len(parts) >= 2:
                pdb_id, tc_num = parts[0].strip(), parts[1].strip()
                mapping.setdefault(tc_num, []).append(pdb_id)
        TCDBTool._cache[cache_key] = mapping
        return mapping



[文档]
    def _get_human_transporters(self) -> dict[str, dict]:
        """Parse human.csv into {uniprot_acc: {name, symbol, aliases, tc_number}}."""
        cache_key = "human_parsed"
        if cache_key in TCDBTool._cache:
            return TCDBTool._cache[cache_key]
        raw = self._fetch_and_cache("human", HUMAN_CSV_URL)
        humans: dict[str, dict] = {}
        reader = csv.reader(io.StringIO(raw))
        next(reader, None)
        for row in reader:
            if len(row) < 5:
                continue
            acc = row[3].strip() if row[3] else ""
            if acc:
                humans[acc] = {
                    "name": row[0].strip() if row[0] else None,
                    "symbol": row[1].strip() if row[1] else None,
                    "aliases": row[2].strip() if row[2] else None,
                    "tc_number": row[4].strip() if row[4] else None,
                }
        TCDBTool._cache[cache_key] = humans
        return humans



[文档]
    def _family_for_tc(self, tc_number: str, families: dict[str, str]) -> str | None:
        """Find the family description for a TC number by matching prefixes."""
        parts = tc_number.split(".")
        for length in range(len(parts), 0, -1):
            prefix = ".".join(parts[:length])
            if prefix in families:
                return families[prefix]
        return None



[文档]
    def run(self, arguments: dict[str, Any]) -> dict[str, Any]:
        try:
            if self.operation == "get_transporter":
                return self._get_transporter(arguments)
            elif self.operation == "search_family":
                return self._search_family(arguments)
            elif self.operation == "search_by_substrate":
                return self._search_by_substrate(arguments)
            return {
                "status": "error",
                "error": f"Unknown operation: {self.operation}",
            }
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"TCDB API timed out after {self.timeout}s",
            }
        except requests.exceptions.ConnectionError:
            return {"status": "error", "error": "Failed to connect to TCDB"}
        except Exception as e:
            return {"status": "error", "error": str(e)}



[文档]
    def _get_transporter(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """Look up transporter info by UniProt accession."""
        accession = (
            (
                arguments.get("uniprot_accession")
                or arguments.get("uniprot_id")
                or arguments.get("accession")
                or ""
            )
            .strip()
            .upper()
        )
        if not accession:
            return {
                "status": "error",
                "error": "uniprot_accession is required",
            }

        acc2tcid = self._get_acc2tcid()
        tc_numbers = acc2tcid.get(accession)
        if not tc_numbers:
            return {
                "status": "error",
                "error": f"UniProt accession {accession} not found in TCDB",
            }

        families = self._get_families()
        pdb_mapping = self._get_pdb_mapping()
        human_data = self._get_human_transporters()

        results = []
        for tc in tc_numbers:
            entry = {
                "tc_number": tc,
                "family_description": self._family_for_tc(tc, families),
            }
            pdb_ids = pdb_mapping.get(tc, [])
            if pdb_ids:
                entry["pdb_structures"] = pdb_ids[:20]
            results.append(entry)

        data = {
            "uniprot_accession": accession,
            "tc_entries": results,
        }

        human_info = human_data.get(accession)
        if human_info:
            data["human_info"] = human_info

        return {"status": "success", "data": data}



[文档]
    def _search_family(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """Search families by TC family ID prefix or name text."""
        family_id = (arguments.get("family_id") or "").strip()
        family_name = (arguments.get("family_name") or "").strip()
        limit = min(int(arguments.get("limit", 20)), 100)

        if not family_id and not family_name:
            return {
                "status": "error",
                "error": "Either family_id or family_name is required",
            }

        families = self._get_families()
        acc2tcid = self._get_acc2tcid()

        matches = []
        for fid, desc in families.items():
            if family_id and not fid.startswith(family_id):
                continue
            if family_name and family_name.lower() not in desc.lower():
                continue
            member_count = sum(
                1 for tcs in acc2tcid.values() for tc in tcs if tc.startswith(fid)
            )
            matches.append(
                {
                    "family_id": fid,
                    "description": desc,
                    "member_count": member_count,
                }
            )

        matches.sort(key=lambda x: x["family_id"])
        matches = matches[:limit]

        return {
            "status": "success",
            "data": {
                "total_matches": len(matches),
                "families": matches,
                "query": {
                    "family_id": family_id or None,
                    "family_name": family_name or None,
                },
            },
        }



[文档]
    def _search_by_substrate(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """Search transporters by substrate name."""
        substrate_name = (
            arguments.get("substrate_name") or arguments.get("substrate") or ""
        ).strip()
        limit = min(int(arguments.get("limit", 20)), 100)

        if not substrate_name:
            return {
                "status": "error",
                "error": "substrate_name is required",
            }

        substrates = self._get_substrates()
        families = self._get_families()
        query_lower = substrate_name.lower()

        matches = []
        for entry in substrates:
            matching_substrates = [
                s for s in entry["substrates"] if query_lower in s["name"].lower()
            ]
            if matching_substrates:
                family_desc = self._family_for_tc(entry["tc_number"], families)
                matches.append(
                    {
                        "tc_number": entry["tc_number"],
                        "family_description": family_desc,
                        "matching_substrates": matching_substrates,
                    }
                )

        matches = matches[:limit]
        return {
            "status": "success",
            "data": {
                "total_matches": len(matches),
                "transporters": matches,
                "query": substrate_name,
            },
        }