# Source code for tooluniverse.gpcrdb_tool

"""
GPCRdb API tool for ToolUniverse.

GPCRdb is a comprehensive database for G protein-coupled receptors (GPCRs),
which are the targets of ~35% of all approved drugs.

API Documentation: https://docs.gpcrdb.org/web_services.html
No authentication required.
"""

import html
import re
from typing import Dict, Any
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool

# Base URL for GPCRdb API; endpoint paths are appended to this prefix.
GPCRDB_API_URL = "https://gpcrdb.org/services"
# Matches one HTML-style tag: "<", one or more characters other than ">", then ">".
# Used with .sub("", ...) to strip markup from name fields.
_HTML_TAG_RE = re.compile(r"<[^>]+>")


@register_tool("GPCRdbTool")
class GPCRdbTool(BaseTool):
    """
    Tool for querying the GPCRdb GPCR database.

    GPCRdb provides:
    - GPCR protein information and classification
    - Structure data for GPCR crystal/cryo-EM structures
    - Ligand binding data
    - Mutation data and effects
    - Sequence alignments

    No authentication required. Free public access.

    Every operation returns a dict whose ``"status"`` is ``"success"`` (with
    ``"data"`` and ``"metadata"``) or ``"error"`` (with an ``"error"`` message).
    Network and HTTP failures are reported through that dict rather than raised.
    """

    # Headers sent with every GPCRdb request.
    _REQUEST_HEADERS = {
        "Accept": "application/json",
        "User-Agent": "ToolUniverse/GPCRdb",
    }

    # Map human-readable class/family names to GPCRdb numeric slugs (Feature-122A-001)
    _FAMILY_NAME_TO_SLUG = {
        "class a": "001",
        "class a (rhodopsin)": "001",
        "rhodopsin": "001",
        "class b": "002",
        "class b1": "002",
        "secretin": "002",
        "class b2": "003",
        "adhesion": "003",
        "class c": "004",
        "glutamate": "004",
        "class f": "005",
        "frizzled": "005",
        "class t": "006",
        "taste2": "006",
        "aminergic": "001_001",
        "aminergic receptors": "001_001",
        "peptide receptors": "001_003",
        "chemokine receptors": "001_003_002",
        "chemokine": "001_003_002",
        "purine receptors": "001_004",
        "lipid receptors": "001_007",
        "serotonin": "001_001_001",
        "5-hydroxytryptamine": "001_001_001",
        "dopamine": "001_001_004",
        "adrenoceptor": "001_001_003",
        "adrenergic": "001_001_003",
        "adrenergic receptors": "001_001_003",
        "muscarinic": "001_001_002",
        "histamine": "001_001_005",
        "beta-adrenergic": "001_001_003_008",
        "opioid": "001_003_015",
        "endothelin": "001_003_006",
    }

    def __init__(self, tool_config: Dict[str, Any]):
        """
        Store the per-tool settings read from ``tool_config``.

        Args:
            tool_config: Tool configuration dict. ``timeout`` (seconds, default 30)
                is applied to every HTTP request; ``parameter`` (default ``{}``)
                is kept on the instance as-is.
        """
        super().__init__(tool_config)
        self.timeout: int = tool_config.get("timeout", 30)
        self.parameter = tool_config.get("parameter", {})

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _error(message: str) -> Dict[str, Any]:
        """Build the standard error result carrying ``message``."""
        return {"status": "error", "error": message}

    @staticmethod
    def _status_code(exc: Exception) -> Any:
        """Return the HTTP status code attached to ``exc``, or None if there is none."""
        return getattr(getattr(exc, "response", None), "status_code", None)

    @staticmethod
    def _endpoint(*segments: Any) -> str:
        """
        Build a GPCRdb service URL from path segments.

        Each segment is percent-encoded (including "/") so that a caller-supplied
        value can never change which endpoint is addressed.
        """
        path = "/".join(quote(str(segment), safe="") for segment in segments)
        return f"{GPCRDB_API_URL}/{path}/"

    def _fetch_json(self, url: str) -> Any:
        """GET ``url`` and return the decoded JSON body; raises on HTTP/network errors."""
        response = requests.get(
            url,
            timeout=self.timeout,
            headers=dict(self._REQUEST_HEADERS),
        )
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _clean_text(value: Any) -> Any:
        """
        Decode HTML entities and strip HTML tags from ``value``.

        GPCRdb returns names such as "&beta;<sub>2</sub>-adrenoceptor".
        Non-string values (e.g. a null name) are returned unchanged instead of
        raising.
        """
        if not isinstance(value, str):
            return value
        return _HTML_TAG_RE.sub("", html.unescape(value))

    @staticmethod
    def _records(data: Any, key: str) -> list:
        """
        Extract the record list from an API payload.

        A list payload is returned as-is; a dict payload yields the list stored
        under ``key``; anything else yields an empty list.
        """
        if isinstance(data, list):
            return data
        if isinstance(data, dict):
            nested = data.get(key)
            if isinstance(nested, list):
                return nested
        return []

    @staticmethod
    def _as_float(value: Any) -> Any:
        """Return ``value`` as a float, or None when it is missing or not numeric."""
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _parse_limit(value: Any) -> Any:
        """
        Return ``value`` as a positive int, or None when no usable limit was given.

        Unparseable and non-positive values are ignored; a negative number must
        not be used as a slice bound because it would drop records from the end.
        """
        if value is None:
            return None
        try:
            limit = int(value)
        except (TypeError, ValueError):
            return None
        return limit if limit > 0 else None

    def _protein_success(
        self, data: Any, metadata: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Build the success result for a protein record, cleaning its name field."""
        if isinstance(data, dict) and "name" in data:
            data["name"] = self._clean_text(data["name"])
        return {"status": "success", "data": data, "metadata": metadata}

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute GPCRdb API call based on operation type.

        Args:
            arguments: Dict containing:
                - operation: One of get_protein, list_proteins, get_structures,
                  get_ligands, get_mutations. When absent, the operation is taken
                  from ``self.get_schema_const_operation()``.
                - protein (or alias protein_id / receptor_name / protein_name)
                - any operation-specific parameters

        Returns:
            The result dict of the selected operation, or an error dict for an
            unknown operation.
        """
        # Work on a copy so alias normalization never mutates the caller's dict;
        # a missing argument dict is treated as empty.
        arguments = dict(arguments or {})

        # Normalize aliases -> protein
        if not arguments.get("protein"):
            alias = (
                arguments.get("protein_id")
                or arguments.get("receptor_name")
                or arguments.get("protein_name")
            )
            if alias:
                arguments["protein"] = alias

        operation = arguments.get("operation", "")
        # Auto-fill operation from tool config const if not provided by user
        if not operation:
            operation = self.get_schema_const_operation()

        handlers = {
            "get_protein": self._get_protein,
            "list_proteins": self._list_proteins,
            "get_structures": self._get_structures,
            "get_ligands": self._get_ligands,
            "get_mutations": self._get_mutations,
        }
        # Only strings can name an operation (also avoids hashing errors).
        handler = handlers.get(operation) if isinstance(operation, str) else None
        if handler is None:
            return self._error(
                f"Unknown operation: {operation}. Supported: get_protein, list_proteins, get_structures, get_ligands, get_mutations"
            )
        return handler(arguments)

    def _normalize_protein(self, protein: str) -> str:
        """
        Resolve gene symbol (e.g. ADRB2) to GPCRdb entry name (e.g. adrb2_human).

        A value without an underscore is lower-cased and suffixed with "_human";
        a value that already contains an underscore is returned as given, apart
        from surrounding whitespace being removed. Falsy input is returned
        unchanged.

        NOTE: a UniProt accession (e.g. P07550) has no underscore and is
        therefore also suffixed. Accessions cannot be told apart from gene
        symbols by pattern alone, so no detection is attempted here.
        """
        if not protein:
            return protein
        protein = str(protein).strip()
        if protein and "_" not in protein:
            return f"{protein.lower()}_human"
        return protein

    # ------------------------------------------------------------------
    # Operations
    # ------------------------------------------------------------------

    def _get_protein(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get detailed protein information for a GPCR.

        The identifier is first tried as an entry name. On HTTP 404 it is retried
        as a UniProt accession and then, if it contains no underscore, as
        ``{lowercase}_human``.

        Args:
            arguments: Dict containing:
                - protein: Protein entry name (e.g., adrb2_human) or UniProt accession

        Returns:
            Success dict with the protein record (HTML removed from ``name``),
            or an error dict.
        """
        protein = str(arguments.get("protein") or "").strip()
        if not protein:
            return self._error("Missing required parameter: protein")

        try:
            data = self._fetch_json(self._endpoint("protein", protein))
        except requests.exceptions.HTTPError as e:
            status = self._status_code(e)
            if status != 404:
                return self._error(f"HTTP error: {status}")
            # 404: fall through to the alternative lookups below.
        except requests.exceptions.RequestException as e:
            return self._error(f"Request failed: {str(e)}")
        except Exception as e:
            return self._error(f"Unexpected error: {str(e)}")
        else:
            return self._protein_success(
                data, {"source": "GPCRdb", "protein": protein}
            )

        # Try accession endpoint (for UniProt IDs like P07550)
        fallbacks = [
            (
                self._endpoint("protein", "accession", protein),
                {"source": "GPCRdb", "protein": protein},
            )
        ]
        # Fallback: try {lowercase_symbol}_human (e.g. CCR5 -> ccr5_human)
        if "_" not in protein:
            human_entry = f"{protein.lower()}_human"
            fallbacks.append(
                (
                    self._endpoint("protein", human_entry),
                    {
                        "source": "GPCRdb",
                        "protein": human_entry,
                        "resolved_from": protein,
                    },
                )
            )

        for url, metadata in fallbacks:
            try:
                data = self._fetch_json(url)
            except Exception:
                # Best-effort lookup: any failure just means "try the next
                # candidate"; the caller gets "not found" if none succeed.
                continue
            return self._protein_success(data, metadata)

        return self._error(
            f"Protein not found: {protein}. Use GPCRdb entry name (e.g. adrb2_human) or UniProt accession (e.g. P07550)."
        )

    def _list_proteins(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        List GPCR protein families from GPCRdb.

        Args:
            arguments: Dict containing:
                - family: GPCR family slug (e.g., '001') or human-readable name
                  (e.g., 'Chemokine receptors'). If provided, returns proteins
                  in that family.
                - protein_class: Alias for family; accepts human-readable names.

        Note: GPCRdb API does not support listing all proteins by species alone.
        Without family, returns list of protein families.

        Returns:
            Success dict with ``proteins``, ``count``, ``family`` and an optional
            ``note``, or an error dict.
        """
        raw_family = arguments.get("family") or arguments.get("protein_class", "")
        # Coerce to text so numeric slugs passed as ints do not crash.
        family = str(raw_family).strip() if raw_family else ""

        # Resolve human-readable class names to numeric slugs (Feature-122A-001)
        if family and not family.replace("_", "").isdigit():
            family = self._FAMILY_NAME_TO_SLUG.get(family.lower()) or family

        try:
            if family:
                # List proteins in specific family
                url = self._endpoint("proteinfamily", "proteins", family)
            else:
                # List protein families (no endpoint for all proteins by species)
                url = self._endpoint("proteinfamily")

            data = self._fetch_json(url)
            proteins = data if isinstance(data, list) else [data]

            # Strip HTML entities and tags from name fields (Feature-123B-002)
            for item in proteins:
                if isinstance(item, dict) and "name" in item:
                    item["name"] = self._clean_text(item["name"])

            note = None
            if family and not proteins:
                note = (
                    f"No proteins found for family slug '{family}'. "
                    "The 'family' parameter requires a numeric GPCRdb slug (e.g., '001_003_002' "
                    "for Chemokine receptors). Use protein_class with a human-readable name "
                    "(e.g., 'Chemokine receptors') or call without 'family' to discover slugs."
                )
            elif not family:
                note = (
                    "To list proteins in a specific family, pass its numeric slug as 'family' "
                    "(e.g., '001_003_002') or use protein_class with a human-readable name "
                    "(e.g., 'Chemokine receptors'). Call without arguments to discover all slugs."
                )

            return {
                "status": "success",
                "data": {
                    "proteins": proteins,
                    "count": len(proteins),
                    "family": family if family else "all families",
                    **({"note": note} if note else {}),
                },
                "metadata": {
                    "source": "GPCRdb",
                },
            }
        except requests.exceptions.RequestException as e:
            return self._error(f"Request failed: {str(e)}")
        except Exception as e:
            return self._error(f"Unexpected error: {str(e)}")

    def _get_structures(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get GPCR structure information.

        Args:
            arguments: Dict containing:
                - protein: Protein entry name (e.g., adrb2_human) or gene symbol
                  (e.g., ADRB2) - optional; without it all structures are listed
                - state: Receptor state filter (active, inactive, intermediate),
                  matched case-insensitively
                - resolution: Maximum resolution; structures without a numeric
                  resolution are excluded when this is given. An unparseable
                  value is ignored.

        Returns:
            Success dict with ``structures``, ``count``, ``protein`` and
            ``state_filter`` (an HTTP 404 yields an empty success result), or an
            error dict.
        """
        protein = self._normalize_protein(arguments.get("protein", ""))
        state = arguments.get("state") or ""
        wanted_state = str(state).strip().lower()
        # None when no (usable) resolution cutoff was supplied.
        max_resolution = self._as_float(arguments.get("resolution"))

        protein_label = protein if protein else "all"
        state_label = state if state else "all"

        try:
            if protein:
                url = self._endpoint("structure", "protein", protein)
            else:
                url = self._endpoint("structure")

            data = self._fetch_json(url)
            structures = data if isinstance(data, list) else [data]

            # Filter by state if specified; a null state never matches.
            if wanted_state:
                structures = [
                    s
                    for s in structures
                    if isinstance(s, dict)
                    and str(s.get("state") or "").lower() == wanted_state
                ]

            # Filter by max resolution (client-side, GPCRdb API has no resolution
            # param). Each record is parsed on its own so one malformed value
            # cannot disable the whole filter.
            if max_resolution is not None:
                kept = []
                for s in structures:
                    if not isinstance(s, dict):
                        continue
                    value = self._as_float(s.get("resolution"))
                    if value is not None and value <= max_resolution:
                        kept.append(s)
                structures = kept

            return {
                "status": "success",
                "data": {
                    "structures": structures,
                    "count": len(structures),
                    "protein": protein_label,
                    "state_filter": state_label,
                },
                "metadata": {
                    "source": "GPCRdb",
                },
            }
        except requests.exceptions.HTTPError as e:
            status = self._status_code(e)
            if status == 404:
                return {
                    "status": "success",
                    "data": {
                        "structures": [],
                        "count": 0,
                        "protein": protein_label,
                        "state_filter": state_label,
                    },
                    "metadata": {"note": "No structures found"},
                }
            return self._error(f"HTTP error: {status}")
        except requests.exceptions.RequestException as e:
            return self._error(f"Request failed: {str(e)}")
        except Exception as e:
            return self._error(f"Unexpected error: {str(e)}")

    def _get_ligands(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get ligands associated with a GPCR.

        Args:
            arguments: Dict containing:
                - protein: Protein entry name (e.g., adrb2_human) or gene symbol
                  (e.g., ADRB2)
                - type / ligand_type: Optional case-insensitive substring matched
                  against each record's "Ligand type" (or "type") field
                - limit / max_results: Optional positive maximum number of
                  records to return; other values are ignored

        Returns:
            Success dict with ``ligands``, ``count`` (returned), ``total_count``
            (before the limit) and an optional ``note`` (an HTTP 404 yields an
            empty success result), or an error dict.
        """
        protein = self._normalize_protein(arguments.get("protein", ""))
        if not protein:
            return self._error("Missing required parameter: protein")

        try:
            data = self._fetch_json(self._endpoint("ligands", protein))
            ligands = self._records(data, "ligands")

            # Filter by ligand type if specified (e.g., agonist, antagonist, inhibitor)
            # GPCRdb API returns "Ligand type" field (e.g., "small molecule", "peptide")
            ligand_type = str(
                arguments.get("type") or arguments.get("ligand_type") or ""
            ).lower()
            if ligand_type:
                ligands = [
                    lig
                    for lig in ligands
                    if isinstance(lig, dict)
                    and ligand_type
                    in str(lig.get("Ligand type") or lig.get("type") or "").lower()
                ]

            total_count = len(ligands)

            # Apply limit/max_results client-side (Feature-122A-003)
            limit = self._parse_limit(
                arguments.get("limit") or arguments.get("max_results")
            )
            if limit is not None:
                ligands = ligands[:limit]

            # Sanitize HTML entities and fix nan DOIs in each ligand record
            for lig in ligands:
                if not isinstance(lig, dict):
                    continue
                for field in ("Ligand name", "Protein name"):
                    if field in lig:
                        lig[field] = self._clean_text(lig[field])
                doi = lig.get("DOI", "")
                if isinstance(doi, str) and doi.lower().endswith("/nan"):
                    lig["DOI"] = None

            result: Dict[str, Any] = {
                "protein": protein,
                "ligands": ligands,
                "count": len(ligands),
                "total_count": total_count,
            }
            # Only a limit can make the returned list shorter than the total.
            if total_count > len(ligands):
                result["note"] = (
                    f"Showing {len(ligands)} of {total_count} ligands. Increase limit to retrieve more."
                )

            return {
                "status": "success",
                "data": result,
                "metadata": {
                    "source": "GPCRdb",
                    "protein": protein,
                },
            }
        except requests.exceptions.HTTPError as e:
            status = self._status_code(e)
            if status == 404:
                return {
                    "status": "success",
                    "data": {
                        "protein": protein,
                        "ligands": [],
                        "count": 0,
                        "total_count": 0,
                    },
                    "metadata": {"note": "No ligands found for this protein"},
                }
            return self._error(f"HTTP error: {status}")
        except requests.exceptions.RequestException as e:
            return self._error(f"Request failed: {str(e)}")
        except Exception as e:
            return self._error(f"Unexpected error: {str(e)}")

    def _get_mutations(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Get mutation data for a GPCR.

        Args:
            arguments: Dict containing:
                - protein: Protein entry name (e.g., adrb2_human) or gene symbol
                  (e.g., ADRB2)

        Returns:
            Success dict with ``mutations`` and ``count``, plus a ``note`` when
            the list is empty (an HTTP 404 yields an empty success result), or
            an error dict.
        """
        protein = self._normalize_protein(arguments.get("protein", ""))
        if not protein:
            return self._error("Missing required parameter: protein")

        try:
            data = self._fetch_json(self._endpoint("mutants", "protein", protein))
            mutations = self._records(data, "mutations")

            result: Dict[str, Any] = {
                "protein": protein,
                "mutations": mutations,
                "count": len(mutations),
            }
            if not mutations:
                result["note"] = (
                    "The GPCRdb mutations API (/services/mutants/) currently returns empty results for all receptors. For mutation data, visit https://gpcrdb.org/mutations/."
                )

            return {
                "status": "success",
                "data": result,
                "metadata": {
                    "source": "GPCRdb",
                    "protein": protein,
                },
            }
        except requests.exceptions.HTTPError as e:
            status = self._status_code(e)
            if status == 404:
                return {
                    "status": "success",
                    "data": {"protein": protein, "mutations": [], "count": 0},
                    "metadata": {"note": "No mutation data found"},
                }
            return self._error(f"HTTP error: {status}")
        except requests.exceptions.RequestException as e:
            return self._error(f"Request failed: {str(e)}")
        except Exception as e:
            return self._error(f"Unexpected error: {str(e)}")