# Source code for tooluniverse.tcdb_tool

"""
TCDB (Transporter Classification Database) tool for ToolUniverse.

Provides search and lookup for membrane transporter proteins classified
by the IUBMB-approved TC system. 20,000+ proteins in 1,536 families.

TC# format: class.subclass.family.subfamily.protein (e.g., 2.A.1.7.1)

API: https://www.tcdb.org/ (CGI flat-file endpoints, no authentication)
Data is fetched as bulk TSV/CSV and cached in memory for the session.
"""

import csv
import io
import requests
from typing import Any

from .base_tool import BaseTool
from .tool_registry import register_tool

# Root of the TCDB website; every endpoint below is built from it.
TCDB_BASE = "https://www.tcdb.org"
# Parsed by TCDBTool as tab-separated rows: UniProt accession, TC number.
ACC2TCID_URL = f"{TCDB_BASE}/cgi-bin/projectv/public/acc2tcid.py"
# Parsed by TCDBTool as tab-separated rows: family ID, description.
FAMILIES_URL = f"{TCDB_BASE}/cgi-bin/projectv/public/families.py"
# Parsed by TCDBTool as tab-separated rows: TC number, then a "|"-separated
# list of substrates, each either "chebi_id;name" or a bare name.
SUBSTRATES_URL = f"{TCDB_BASE}/cgi-bin/substrates/getSubstrates.py"
# Parsed by TCDBTool as tab-separated rows: PDB ID, TC number.
PDB_URL = f"{TCDB_BASE}/cgi-bin/projectv/public/pdb.py"
# Parsed by TCDBTool as CSV with one header row; the first five columns are
# read as name, symbol, aliases, UniProt accession, TC number.
HUMAN_CSV_URL = f"{TCDB_BASE}/public/human.csv"


@register_tool("TCDBTool")
class TCDBTool(BaseTool):
    """
    Tool for querying TCDB (Transporter Classification Database).

    The operation is chosen by ``fields.operation`` in the tool configuration:

    * ``get_transporter`` - look up TC numbers for a UniProt accession.
    * ``search_family`` - search families by TC family ID prefix and/or name.
    * ``search_by_substrate`` - search transporters by substrate name.

    Data is fetched from bulk flat files and cached in memory. The cache is a
    class attribute, so it is shared by every instance in the process.
    """

    # Shared cache. Raw response text is stored under a short key (e.g.
    # "families"); the parsed structure under "<key>_parsed".
    _cache: dict = {}

    # Bounds applied to the caller-supplied ``limit`` argument.
    _DEFAULT_LIMIT = 20
    _MAX_LIMIT = 100

    def __init__(self, tool_config: dict[str, Any]):
        """Read the request timeout and the operation name from *tool_config*.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds, default 60)
                and ``fields.operation`` (default ``"get_transporter"``) are
                the keys read here.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 60)
        # "fields" may be present but null in a config; treat that as empty.
        fields = tool_config.get("fields") or {}
        self.operation = fields.get("operation", "get_transporter")

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _split_tsv(raw: str, min_fields: int = 2) -> list[list[str]]:
        """Split *raw* into stripped tab-separated rows with >= *min_fields* columns."""
        rows = []
        for line in raw.strip().split("\n"):
            fields = [field.strip() for field in line.strip().split("\t")]
            if len(fields) >= min_fields:
                rows.append(fields)
        return rows

    @classmethod
    def _parse_limit(cls, arguments: dict[str, Any]) -> int:
        """Return the ``limit`` argument clamped to ``0.._MAX_LIMIT``.

        A missing or ``None`` limit yields the default. Negative values are
        clamped to 0 so they cannot be interpreted as a from-the-end slice.

        Raises:
            ValueError: If the supplied limit cannot be converted to an int.
        """
        raw_limit = arguments.get("limit")
        if raw_limit is None:
            return cls._DEFAULT_LIMIT
        try:
            limit = int(raw_limit)
        except (TypeError, ValueError):
            raise ValueError(
                f"limit must be an integer, got {raw_limit!r}"
            ) from None
        return max(0, min(limit, cls._MAX_LIMIT))

    # ------------------------------------------------------------------
    # Fetching and parsing of the bulk files
    # ------------------------------------------------------------------

    def _fetch_and_cache(self, key: str, url: str) -> str:
        """Return the text at *url*, downloading it only on the first request.

        Args:
            key: Cache key under which the raw response text is stored.
            url: Address fetched when *key* is not cached yet.

        Returns:
            The response body as text.

        Raises:
            requests.exceptions.RequestException: On timeout, connection
                failure, or an HTTP error status.
        """
        cache = TCDBTool._cache
        if key not in cache:
            resp = requests.get(url, timeout=self.timeout)
            resp.raise_for_status()
            cache[key] = resp.text
        return cache[key]

    def _get_acc2tcid(self) -> dict[str, list[str]]:
        """Parse acc2tcid into {uniprot_acc: [tc_numbers]} (cached)."""
        cache_key = "acc2tcid_parsed"
        if cache_key not in TCDBTool._cache:
            raw = self._fetch_and_cache("acc2tcid", ACC2TCID_URL)
            mapping: dict[str, list[str]] = {}
            for acc, tc, *_ in self._split_tsv(raw):
                mapping.setdefault(acc, []).append(tc)
            TCDBTool._cache[cache_key] = mapping
        return TCDBTool._cache[cache_key]

    def _get_families(self) -> dict[str, str]:
        """Parse families into {family_id: description} (cached)."""
        cache_key = "families_parsed"
        if cache_key not in TCDBTool._cache:
            raw = self._fetch_and_cache("families", FAMILIES_URL)
            # A repeated family ID keeps the last description seen.
            TCDBTool._cache[cache_key] = {
                fid: desc for fid, desc, *_ in self._split_tsv(raw)
            }
        return TCDBTool._cache[cache_key]

    def _get_substrates(self) -> list[dict]:
        """Parse substrates into list of {tc_number, substrates: [{chebi_id, name}]} (cached)."""
        cache_key = "substrates_parsed"
        if cache_key not in TCDBTool._cache:
            raw = self._fetch_and_cache("substrates", SUBSTRATES_URL)
            entries = []
            for tc_num, substrate_field, *_ in self._split_tsv(raw):
                substrates = []
                for token in substrate_field.split("|"):
                    token = token.strip()
                    if not token:
                        continue
                    # "chebi_id;name" when a ";" is present, else a bare name.
                    chebi_id, sep, name = token.partition(";")
                    if sep:
                        substrates.append(
                            {"chebi_id": chebi_id.strip(), "name": name.strip()}
                        )
                    else:
                        substrates.append({"chebi_id": None, "name": token})
                entries.append({"tc_number": tc_num, "substrates": substrates})
            TCDBTool._cache[cache_key] = entries
        return TCDBTool._cache[cache_key]

    def _get_pdb_mapping(self) -> dict[str, list[str]]:
        """Parse PDB data into {tc_number: [pdb_ids]} (cached)."""
        cache_key = "pdb_parsed"
        if cache_key not in TCDBTool._cache:
            raw = self._fetch_and_cache("pdb", PDB_URL)
            mapping: dict[str, list[str]] = {}
            for pdb_id, tc_num, *_ in self._split_tsv(raw):
                mapping.setdefault(tc_num, []).append(pdb_id)
            TCDBTool._cache[cache_key] = mapping
        return TCDBTool._cache[cache_key]

    def _get_human_transporters(self) -> dict[str, dict]:
        """Parse human.csv into {uniprot_acc: {name, symbol, aliases, tc_number}} (cached).

        The first CSV row is skipped as a header. Rows with fewer than five
        columns or an empty accession are ignored. Empty or whitespace-only
        cells are stored as ``None``.
        """
        cache_key = "human_parsed"
        if cache_key not in TCDBTool._cache:
            raw = self._fetch_and_cache("human", HUMAN_CSV_URL)
            humans: dict[str, dict] = {}
            reader = csv.reader(io.StringIO(raw))
            next(reader, None)  # skip the header row
            for row in reader:
                if len(row) < 5:
                    continue
                name, symbol, aliases, acc, tc_number = (
                    (cell or "").strip() for cell in row[:5]
                )
                if not acc:
                    continue
                humans[acc] = {
                    "name": name or None,
                    "symbol": symbol or None,
                    "aliases": aliases or None,
                    "tc_number": tc_number or None,
                }
            TCDBTool._cache[cache_key] = humans
        return TCDBTool._cache[cache_key]

    def _family_for_tc(self, tc_number: str, families: dict[str, str]) -> str | None:
        """Find the family description for a TC number by matching prefixes.

        Dot-separated prefixes of *tc_number* are tried from longest to
        shortest; the first one present in *families* wins.

        Returns:
            The description, or ``None`` when no prefix is a known family.
        """
        parts = tc_number.split(".")
        for length in range(len(parts), 0, -1):
            description = families.get(".".join(parts[:length]))
            if description is not None:
                return description
        return None

    # ------------------------------------------------------------------
    # Public entry point and operations
    # ------------------------------------------------------------------

    def run(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """Execute the configured operation and return a status dict.

        Never raises: every failure is reported as
        ``{"status": "error", "error": <message>}``.

        Args:
            arguments: Operation arguments; ``None`` is treated as empty.
        """
        handlers = {
            "get_transporter": self._get_transporter,
            "search_family": self._search_family,
            "search_by_substrate": self._search_by_substrate,
        }
        try:
            handler = handlers.get(self.operation)
            if handler is None:
                return {
                    "status": "error",
                    "error": f"Unknown operation: {self.operation}",
                }
            return handler(arguments or {})
        except requests.exceptions.Timeout:
            return {
                "status": "error",
                "error": f"TCDB API timed out after {self.timeout}s",
            }
        except requests.exceptions.ConnectionError:
            return {"status": "error", "error": "Failed to connect to TCDB"}
        except Exception as e:
            # Tool boundary: convert anything else into an error payload.
            return {"status": "error", "error": str(e)}

    def _get_transporter(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """Look up transporter info by UniProt accession.

        The accession is read from ``uniprot_accession``, ``uniprot_id`` or
        ``accession`` (first non-empty wins) and upper-cased before lookup.
        Each TC entry carries its family description and, when available, up
        to 20 PDB IDs. ``human_info`` is added when the accession appears in
        the human transporter table.
        """
        raw_accession = (
            arguments.get("uniprot_accession")
            or arguments.get("uniprot_id")
            or arguments.get("accession")
            or ""
        )
        accession = raw_accession.strip().upper()
        if not accession:
            return {
                "status": "error",
                "error": "uniprot_accession is required",
            }

        tc_numbers = self._get_acc2tcid().get(accession)
        if not tc_numbers:
            return {
                "status": "error",
                "error": f"UniProt accession {accession} not found in TCDB",
            }

        families = self._get_families()
        pdb_mapping = self._get_pdb_mapping()
        human_data = self._get_human_transporters()

        tc_entries = []
        for tc in tc_numbers:
            entry: dict[str, Any] = {
                "tc_number": tc,
                "family_description": self._family_for_tc(tc, families),
            }
            pdb_ids = pdb_mapping.get(tc, [])
            if pdb_ids:
                entry["pdb_structures"] = pdb_ids[:20]
            tc_entries.append(entry)

        data: dict[str, Any] = {
            "uniprot_accession": accession,
            "tc_entries": tc_entries,
        }
        human_info = human_data.get(accession)
        if human_info:
            data["human_info"] = human_info
        return {"status": "success", "data": data}

    def _search_family(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """Search families by TC family ID prefix or name text.

        ``family_id`` is matched as a string prefix of the family ID and
        ``family_name`` as a case-insensitive substring of the description;
        when both are given a family must satisfy both. Results are sorted by
        family ID and truncated to ``limit`` (default 20, max 100).

        ``total_matches`` is the number of matching families before
        truncation. ``member_count`` is the number of accession/TC pairs whose
        TC number lies inside the family.
        """
        family_id = (arguments.get("family_id") or "").strip()
        family_name = (arguments.get("family_name") or "").strip()
        if not family_id and not family_name:
            return {
                "status": "error",
                "error": "Either family_id or family_name is required",
            }
        limit = self._parse_limit(arguments)

        families = self._get_families()
        name_query = family_name.lower()
        # Empty filters match everything: "" is a prefix/substring of any str.
        matched_ids = sorted(
            fid
            for fid, desc in families.items()
            if fid.startswith(family_id) and name_query in desc.lower()
        )
        total = len(matched_ids)
        page = matched_ids[:limit]

        # Count members for the returned families only, in one pass. Checking
        # the dot-separated prefixes of each TC number respects component
        # boundaries, so "1.A.10.1.1" is not counted under family "1.A.1".
        member_counts = dict.fromkeys(page, 0)
        if member_counts:
            for tcs in self._get_acc2tcid().values():
                for tc in tcs:
                    parts = tc.split(".")
                    for length in range(1, len(parts) + 1):
                        prefix = ".".join(parts[:length])
                        if prefix in member_counts:
                            member_counts[prefix] += 1

        matches = [
            {
                "family_id": fid,
                "description": families[fid],
                "member_count": member_counts[fid],
            }
            for fid in page
        ]
        return {
            "status": "success",
            "data": {
                "total_matches": total,
                "families": matches,
                "query": {
                    "family_id": family_id or None,
                    "family_name": family_name or None,
                },
            },
        }

    def _search_by_substrate(self, arguments: dict[str, Any]) -> dict[str, Any]:
        """Search transporters by substrate name.

        The name is read from ``substrate_name`` or ``substrate`` and matched
        as a case-insensitive substring of each substrate name. Results keep
        the order of the substrate table and are truncated to ``limit``
        (default 20, max 100). ``total_matches`` is the number of matching
        transporters before truncation.
        """
        substrate_name = (
            arguments.get("substrate_name") or arguments.get("substrate") or ""
        ).strip()
        if not substrate_name:
            return {
                "status": "error",
                "error": "substrate_name is required",
            }
        limit = self._parse_limit(arguments)

        query_lower = substrate_name.lower()
        hits = []
        for entry in self._get_substrates():
            matching_substrates = [
                s for s in entry["substrates"] if query_lower in s["name"].lower()
            ]
            if matching_substrates:
                hits.append((entry["tc_number"], matching_substrates))
        total = len(hits)

        # Family descriptions are resolved for the returned page only.
        families = self._get_families()
        transporters = [
            {
                "tc_number": tc_number,
                "family_description": self._family_for_tc(tc_number, families),
                "matching_substrates": matching_substrates,
            }
            for tc_number, matching_substrates in hits[:limit]
        ]
        return {
            "status": "success",
            "data": {
                "total_matches": total,
                "transporters": transporters,
                "query": substrate_name,
            },
        }