Source code for tooluniverse.pdbe_search_tool

# pdbe_search_tool.py
"""
PDBe Search (Solr) API tool for ToolUniverse.

PDBe Search provides a powerful Solr-based search interface for the
Protein Data Bank in Europe. It supports full-text and field-specific
queries across all PDB entries, with faceting and filtering capabilities.

API: https://www.ebi.ac.uk/pdbe/search/pdb/select
Also: https://www.ebi.ac.uk/pdbe/api/pdb/compound/summary/{ligand_id}
No authentication required. Free for all use.
"""

from typing import Dict, Any
from urllib.parse import quote

import requests

from .base_tool import BaseTool
from .tool_registry import register_tool

PDBE_SEARCH_URL = "https://www.ebi.ac.uk/pdbe/search/pdb/select"
PDBE_API_URL = "https://www.ebi.ac.uk/pdbe/api/pdb"


@register_tool("PDBeSearchTool")
class PDBeSearchTool(BaseTool):
    """
    Tool for searching PDBe, the Protein Data Bank in Europe.

    Provides full-text and field-specific Solr queries across all PDB
    entries, plus compound/ligand lookup by identifier.

    The operation performed by :meth:`run` is selected by
    ``tool_config["fields"]["endpoint_type"]``, one of
    ``"search_structures"`` (the default), ``"get_compound"`` or
    ``"search_by_organism"``.

    No authentication required.
    """

    # Number of rows requested when the caller gives no usable ``limit``,
    # and the hard upper bound applied to any caller-supplied ``limit``.
    DEFAULT_LIMIT = 10
    MAX_LIMIT = 50

    # Solr ``fl`` field lists for the two structure-search endpoints.
    _STRUCTURE_FIELDS = (
        "pdb_id,title,resolution,experimental_method,deposition_date,"
        "number_of_entities,organism_scientific_name"
    )
    _ORGANISM_FIELDS = (
        "pdb_id,title,resolution,experimental_method,deposition_date,"
        "organism_scientific_name"
    )

    def __init__(self, tool_config: Dict[str, Any]):
        """Store the request timeout and endpoint type from ``tool_config``.

        Args:
            tool_config: Tool configuration. ``timeout`` (seconds, default
                30) and ``fields.endpoint_type`` (default
                ``"search_structures"``) are read from it.
        """
        super().__init__(tool_config)
        self.timeout = tool_config.get("timeout", 30)
        # "fields" may be present but null in a config; treat that as empty
        # instead of failing on ``None.get``.
        fields = tool_config.get("fields") or {}
        self.endpoint_type = fields.get("endpoint_type", "search_structures")

    def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the PDBe Search API call.

        Args:
            arguments: Endpoint-specific arguments (see the ``_search_*`` and
                ``_get_compound`` methods).

        Returns:
            A dict with ``data`` and ``metadata`` on success, or a dict with
            a single ``error`` message. Exceptions are converted to error
            dicts rather than propagated.
        """
        try:
            return self._dispatch(arguments)
        except requests.exceptions.Timeout:
            return {
                "error": f"PDBe Search API request timed out after {self.timeout} seconds"
            }
        except requests.exceptions.ConnectionError:
            return {
                "error": "Failed to connect to PDBe Search API. Check network connectivity."
            }
        except requests.exceptions.HTTPError as e:
            # Compare against None explicitly: a Response for a 4xx/5xx
            # status is falsy, and the attribute may be unset altogether.
            status = e.response.status_code if e.response is not None else "unknown"
            return {"error": f"PDBe Search API HTTP error: {status}"}
        except Exception as e:
            return {"error": f"Unexpected error querying PDBe Search: {str(e)}"}

    def _dispatch(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Route to the handler matching ``self.endpoint_type``."""
        if self.endpoint_type == "search_structures":
            return self._search_structures(arguments)
        elif self.endpoint_type == "get_compound":
            return self._get_compound(arguments)
        elif self.endpoint_type == "search_by_organism":
            return self._search_by_organism(arguments)
        else:
            return {"error": f"Unknown endpoint_type: {self.endpoint_type}"}

    @classmethod
    def _clamp_limit(cls, value: Any) -> int:
        """Coerce a caller-supplied ``limit`` to an int in ``[0, MAX_LIMIT]``.

        Values that cannot be converted to an integer (``None``, non-numeric
        strings, infinities) fall back to ``DEFAULT_LIMIT``.
        """
        try:
            limit = int(value)
        except (TypeError, ValueError, OverflowError):
            return cls.DEFAULT_LIMIT
        return max(0, min(limit, cls.MAX_LIMIT))

    def _fetch_structures(
        self,
        solr_query: str,
        limit: int,
        field_list: str,
        include_entities: bool,
    ) -> tuple:
        """Run one Solr select request and normalise the returned documents.

        Args:
            solr_query: Value sent as the Solr ``q`` parameter.
            limit: Value sent as the Solr ``rows`` parameter.
            field_list: Comma-separated value sent as the Solr ``fl`` parameter.
            include_entities: Whether each entry carries ``number_of_entities``.

        Returns:
            ``(total_found, structures)`` where ``structures`` is a list of
            dicts built from the response documents.

        Raises:
            requests.exceptions.RequestException: On transport or HTTP errors.
        """
        params = {
            "q": solr_query,
            "rows": limit,
            "fl": field_list,
            "wt": "json",
            "sort": "resolution asc",
        }
        response = requests.get(PDBE_SEARCH_URL, params=params, timeout=self.timeout)
        response.raise_for_status()
        solr_response = response.json().get("response") or {}

        structures = []
        for doc in solr_response.get("docs", []):
            entry = {
                "pdb_id": doc.get("pdb_id", ""),
                "title": doc.get("title", ""),
                "resolution": doc.get("resolution"),
                "experimental_method": doc.get("experimental_method", []),
                "deposition_date": doc.get("deposition_date"),
            }
            if include_entities:
                entry["number_of_entities"] = doc.get("number_of_entities")
            entry["organism"] = doc.get("organism_scientific_name", [])
            structures.append(entry)
        return solr_response.get("numFound", 0), structures

    def _search_structures(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search PDB structures by keyword or protein name.

        Args:
            arguments: ``query`` (required) and optional ``limit``
                (default 10, capped at 50).

        Returns:
            ``{"data": [...], "metadata": {...}}`` or ``{"error": ...}`` when
            ``query`` is missing or blank.
        """
        query = str(arguments.get("query") or "").strip()
        if not query:
            return {
                "error": "query parameter is required (e.g., 'insulin', 'kinase', 'BRCA1')"
            }
        limit = self._clamp_limit(arguments.get("limit", self.DEFAULT_LIMIT))

        total, structures = self._fetch_structures(
            query, limit, self._STRUCTURE_FIELDS, include_entities=True
        )
        return {
            "data": structures,
            "metadata": {
                "source": "PDBe Search",
                "total_found": total,
                "returned": len(structures),
                "query": query,
                "endpoint": "search_structures",
            },
        }

    def _get_compound(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Get PDB ligand/compound information by compound ID.

        Args:
            arguments: ``compound_id`` (required), e.g. ``"ATP"``.

        Returns:
            ``{"data": {...}, "metadata": {...}}`` where ``data`` always
            contains ``compound_id`` and, when the response holds a record
            for it, the fields extracted from the first such record; or
            ``{"error": ...}`` when ``compound_id`` is missing or blank.
        """
        compound_id = str(arguments.get("compound_id") or "").strip()
        if not compound_id:
            return {
                "error": "compound_id parameter is required (e.g., 'ATP', 'HEM', 'NAG')"
            }

        # Percent-encode the identifier so characters such as "/", "?" or "#"
        # cannot change which endpoint is requested.
        url = f"{PDBE_API_URL}/compound/summary/{quote(compound_id, safe='')}"
        response = requests.get(url, timeout=self.timeout)
        response.raise_for_status()
        raw = response.json()

        # Response keyed by compound ID; retry with the upper-cased key when
        # the identifier as given has no (or an empty) entry.
        records = raw.get(compound_id) or raw.get(compound_id.upper()) or []
        record = next((c for c in records if isinstance(c, dict)), None)

        result: Dict[str, Any] = {}
        if record is not None:
            result = {
                "name": record.get("name", ""),
                "formula": record.get("formula", ""),
                "weight": record.get("weight"),
                "compound_type": record.get("compound_type", ""),
                "inchi": record.get("inchi", ""),
                "inchi_key": record.get("inchi_key", ""),
            }

            # SMILES: only the first list element is consulted.
            smiles_list = record.get("smiles", [])
            if isinstance(smiles_list, list) and smiles_list:
                first_smiles = smiles_list[0]
                if isinstance(first_smiles, dict):
                    result["smiles"] = first_smiles.get("name", "")

            # Systematic name: first dict among the first two list elements.
            sys_names = record.get("systematic_names", [])
            if isinstance(sys_names, list):
                for sys_name in sys_names[:2]:
                    if isinstance(sys_name, dict):
                        result["systematic_name"] = sys_name.get("name", "")
                        break

        result["compound_id"] = compound_id
        return {
            "data": result,
            "metadata": {
                "source": "PDBe",
                "query": compound_id,
                "endpoint": "get_compound",
            },
        }

    def _search_by_organism(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Search PDB structures filtered by organism.

        Args:
            arguments: ``organism`` (required, matched as an exact phrase
                against ``organism_scientific_name``), optional ``query``
                (default ``"*:*"``) and optional ``limit`` (default 10,
                capped at 50).

        Returns:
            ``{"data": [...], "metadata": {...}}`` or ``{"error": ...}`` when
            ``organism`` is missing or blank.
        """
        organism = str(arguments.get("organism") or "").strip()
        if not organism:
            return {
                "error": "organism parameter is required (e.g., 'Homo sapiens', 'Escherichia coli')"
            }
        # A blank or null query would otherwise yield the invalid " AND ...".
        query = str(arguments.get("query") or "").strip() or "*:*"
        limit = self._clamp_limit(arguments.get("limit", self.DEFAULT_LIMIT))

        # Escape backslashes and double quotes so the organism cannot
        # terminate the quoted phrase and inject extra query syntax.
        escaped_organism = organism.replace("\\", "\\\\").replace('"', '\\"')
        # Parenthesise the user query so operators inside it (e.g. OR) cannot
        # regroup with the organism clause.
        full_query = f'({query}) AND organism_scientific_name:"{escaped_organism}"'

        total, structures = self._fetch_structures(
            full_query, limit, self._ORGANISM_FIELDS, include_entities=False
        )
        return {
            "data": structures,
            "metadata": {
                "source": "PDBe Search",
                "total_found": total,
                "returned": len(structures),
                "query": query,
                "organism_filter": organism,
                "endpoint": "search_by_organism",
            },
        }