# Source code for tooluniverse.cath_tool

# cath_tool.py
"""
CATH Protein Structure Classification Database API tool for ToolUniverse.

CATH is a hierarchical classification of protein domain structures that
clusters proteins at four major levels: Class (C), Architecture (A),
Topology (T), and Homologous superfamily (H). CATH classifies domains
from the PDB and AlphaFold Protein Structure Database.

API: https://www.cathdb.info/version/v4_3_0/api/rest/
No authentication required. Free public access.
"""

from typing import Dict, Any
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool

# Root of the CATH REST API, pinned to the v4_3_0 release path. Endpoint paths
# are appended after a "/" separator, so the value carries no trailing slash.
CATH_BASE_URL = "https://www.cathdb.info/version/v4_3_0/api/rest"


@register_tool("CATHTool")
class CATHTool(BaseTool):
    """
    Tool for querying the CATH protein structure classification database.

    CATH classifies protein domain structures into a hierarchy:
    Class -> Architecture -> Topology -> Homologous superfamily.
    Covers 500,000+ domains from PDB and AFDB structures.

    No authentication required.

    Each instance serves the single endpoint named by ``fields.endpoint`` in
    its tool configuration: ``superfamily`` (the default), ``domain_summary``,
    ``list_funfams`` or ``get_funfam``. Results are dicts whose ``status`` is
    ``"success"`` (with ``data`` and ``metadata``) or ``"error"`` (with an
    ``error`` message).
    """

    # Version label reported in the ``metadata.source`` field of every result.
    _CATH_SOURCE_LABEL = "CATH v4.3.0"

    # Number of FunFams returned by ``list_funfams`` when the caller does not
    # pass ``max_results``.
    _DEFAULT_MAX_RESULTS = 25

    # Display names for the class level (first component of a CATH ID).
    _CATH_CLASS_NAMES = {
        "1": "Mainly Alpha",
        "2": "Mainly Beta",
        "3": "Alpha Beta",
        "4": "Few Secondary Structures",
    }

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the request timeout and the endpoint this instance serves.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds, default 30)
                and ``fields.endpoint`` (default ``"superfamily"``) are read
                here; the full dict is also passed to ``BaseTool``.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        # ``or {}`` also covers an explicit ``"fields": null`` in the config.
        fields = tool_config.get("fields") or {}
        self.endpoint = fields.get("endpoint", "superfamily")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the CATH API call.

        Args:
            arguments: Endpoint-specific parameters (see the ``_get_*`` and
                ``_list_*`` methods).

        Returns:
            The endpoint result, or an ``{"status": "error", "error": ...}``
            dict describing a timeout, connection failure, HTTP error status
            or any other exception raised while querying.
        """
        try:
            return self._query(arguments)
        except requests.exceptions.Timeout:
            return self._error_response(
                f"CATH API request timed out after {self.timeout} seconds"
            )
        except requests.exceptions.ConnectionError:
            return self._error_response(
                "Failed to connect to CATH API. Check network connectivity."
            )
        except requests.exceptions.HTTPError as e:
            # Guard against an HTTPError that carries no response object so
            # the error handler itself cannot raise.
            status_code = getattr(e.response, "status_code", "unknown")
            return self._error_response(f"CATH API HTTP error: {status_code}")
        except Exception as e:
            return self._error_response(
                f"Unexpected error querying CATH: {str(e)}"
            )

    def _query(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to the handler for the configured CATH endpoint."""
        handlers = {
            "superfamily": self._get_superfamily,
            "domain_summary": self._get_domain_summary,
            "list_funfams": self._list_funfams,
            "get_funfam": self._get_funfam,
        }
        handler = handlers.get(self.endpoint)
        if handler is None:
            return self._error_response(f"Unknown endpoint: {self.endpoint}")
        return handler(arguments)

    @staticmethod
    def _clean_id(value: Any) -> str:
        """Return *value* as a whitespace-stripped string (``None`` -> ``""``)."""
        if value is None:
            return ""
        return str(value).strip()

    @staticmethod
    def _error_response(message: str) -> Dict[str, Any]:
        """Build the error envelope returned by every failure path."""
        return {"status": "error", "error": message}

    def _success_response(self, data: Any, query: str) -> Dict[str, Any]:
        """Build the success envelope, echoing the query in the metadata."""
        return {
            "status": "success",
            "data": data,
            "metadata": {
                "source": self._CATH_SOURCE_LABEL,
                "query": query,
            },
        }

    def _fetch_json(self, *segments: str, accept_json: bool = False) -> Dict[str, Any]:
        """GET ``CATH_BASE_URL/<segments...>`` and return the decoded JSON object.

        Args:
            *segments: Path segments appended to the base URL, in order.
            accept_json: When true, send an ``Accept: application/json``
                header with the request.

        Returns:
            The decoded JSON body.

        Raises:
            requests.exceptions.RequestException: On timeout, connection
                failure, or a 4xx/5xx status (via ``raise_for_status``).
            ValueError: If the body cannot be decoded as JSON or is not a
                JSON object.
        """
        # Percent-encode each caller-supplied segment so characters such as
        # "/", "?" or "#" cannot alter the request path or add a query string.
        path = "/".join(quote(segment, safe="") for segment in segments)
        headers = {"Accept": "application/json"} if accept_json else None
        response = requests.get(
            f"{CATH_BASE_URL}/{path}", headers=headers, timeout=self.timeout
        )
        response.raise_for_status()
        payload = response.json()
        if not isinstance(payload, dict):
            raise ValueError(
                "CATH API returned an unexpected (non-object) JSON payload"
            )
        return payload

    def _get_superfamily(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get CATH superfamily information by CATH ID.

        Args:
            arguments: Must contain ``superfamily_id`` (e.g. ``2.40.50.140``).

        Returns:
            A success envelope with the superfamily's names, example domain
            and child counts, or an error envelope when the ID is missing or
            the API reports an unsuccessful response.
        """
        cath_id = self._clean_id(arguments.get("superfamily_id"))
        if not cath_id:
            return self._error_response(
                "superfamily_id parameter is required (e.g. 2.40.50.140 for Nucleic acid-binding proteins)"
            )

        payload = self._fetch_json("superfamily", cath_id)
        if not payload.get("success"):
            return self._error_response(
                f"CATH API returned unsuccessful response for {cath_id}"
            )

        # ``or {}`` treats a JSON null ``data`` like a missing key.
        data = payload.get("data") or {}
        result = {
            "cath_id": data.get("cath_id"),
            "superfamily_id": data.get("superfamily_id"),
            "classification_name": data.get("classification_name"),
            "classification_description": data.get("classification_description"),
            "example_domain_id": data.get("example_domain_id"),
            "num_s35_families": data.get("child_count_s35_code"),
            "num_s60_families": data.get("child_count_s60_code"),
            "num_s95_families": data.get("child_count_s95_code"),
            "num_s100_domains": data.get("child_count_s100_code"),
            "total_domain_count": data.get("child_count_s100_count"),
        }
        return self._success_response(result, cath_id)

    def _get_domain_summary(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get domain summary for a CATH domain ID (PDB chain domain).

        Args:
            arguments: Must contain ``domain_id`` (e.g. ``1cukA01``).

        Returns:
            A success envelope with the domain's CATH ID split into its
            class / architecture / topology / homologous-superfamily
            prefixes, plus the number of entries in ``residues``; or an error
            envelope when the ID is missing.
        """
        domain_id = self._clean_id(arguments.get("domain_id"))
        if not domain_id:
            return self._error_response(
                "domain_id parameter is required (e.g. 1cukA01 for PDB 1CUK chain A domain 1)"
            )

        payload = self._fetch_json("domain_summary", domain_id)
        data = payload.get("data") or {}

        # Each hierarchy level is the dotted prefix of the CATH ID of that
        # depth; a level is None when the ID has too few components.
        cath_id = data.get("cath_id", "")
        parts = cath_id.split(".") if cath_id else []

        def level(depth: int):
            return ".".join(parts[:depth]) if len(parts) >= depth else None

        result = {
            "domain_id": domain_id,
            "cath_id": cath_id,
            "superfamily_id": data.get("superfamily_id"),
            "class": level(1),
            "architecture": level(2),
            "topology": level(3),
            "homologous_superfamily": level(4),
            "residue_count": len(data.get("residues") or []),
        }

        # ``class_name`` is only added for the class numbers listed in the table.
        class_name = self._CATH_CLASS_NAMES.get(result["class"])
        if class_name is not None:
            result["class_name"] = class_name

        return self._success_response(result, domain_id)

    def _list_funfams(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """List functional families (FunFams) within a CATH superfamily.

        Args:
            arguments: Must contain ``superfamily_id``. Optional
                ``max_results`` (default 25) caps the number of FunFams
                returned; an explicit ``None`` disables the cap.

        Returns:
            A success envelope with ``total_funfams`` (count before the cap)
            and the capped ``funfams`` list, or an error envelope when the ID
            is missing or ``max_results`` is not a non-negative integer.
        """
        superfamily_id = self._clean_id(arguments.get("superfamily_id"))
        if not superfamily_id:
            # NOTE(review): the example pairs 1.10.510.10 with "Globin-like";
            # confirm that ID/name pairing against CATH.
            return self._error_response(
                "superfamily_id is required (e.g., '1.10.510.10' for Globin-like)"
            )

        # Validate the cap before any network traffic. A negative value would
        # otherwise act as a from-the-end slice and silently drop entries.
        limit = arguments.get("max_results", self._DEFAULT_MAX_RESULTS)
        if limit is not None:
            try:
                limit = int(limit)
            except (TypeError, ValueError):
                return self._error_response(
                    "max_results must be a non-negative integer"
                )
            if limit < 0:
                return self._error_response(
                    "max_results must be a non-negative integer"
                )

        payload = self._fetch_json(
            "superfamily", superfamily_id, "funfam", accept_json=True
        )
        entries = payload.get("data") or []

        funfams = [
            {
                "funfam_number": entry.get("funfam_number"),
                "name": entry.get("name"),
                "num_members": entry.get("num_members_in_funfam"),
                "rep_id": entry.get("rep_id"),
                "superfamily_id": entry.get("superfamily_id"),
            }
            for entry in entries[:limit]
        ]

        return self._success_response(
            {
                "superfamily_id": superfamily_id,
                "total_funfams": len(entries),
                "funfams": funfams,
            },
            superfamily_id,
        )

    def _get_funfam(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get details for a specific FunFam within a CATH superfamily.

        Args:
            arguments: Must contain ``superfamily_id`` and ``funfam_number``.

        Returns:
            A success envelope with the FunFam's name, description, member
            counts, DOPS score, representative ID and EC/GO terms, or an
            error envelope when either identifier is missing.
        """
        superfamily_id = self._clean_id(arguments.get("superfamily_id"))
        funfam_number = self._clean_id(arguments.get("funfam_number"))
        if not superfamily_id or not funfam_number:
            return self._error_response(
                "Both superfamily_id and funfam_number are required"
            )

        payload = self._fetch_json(
            "superfamily", superfamily_id, "funfam", funfam_number,
            accept_json=True,
        )
        data = payload.get("data") or {}

        result = {
            "funfam_number": data.get("funfam_number"),
            "superfamily_id": data.get("superfamily_id"),
            "name": data.get("name"),
            "description": data.get("description"),
            "num_members": data.get("num_members_in_funfam"),
            "num_seed_members": data.get("num_members_in_seed_aln"),
            "dops_score": data.get("seed_dops_score"),
            "rep_id": data.get("rep_id"),
            "ec_terms": data.get("ec_terms", []),
            "go_terms": data.get("go_terms", []),
        }
        return self._success_response(
            result, f"{superfamily_id}/funfam/{funfam_number}"
        )