Source code for tooluniverse.rcsb_search_tool

"""
RCSB PDB Structure Search Tool

Tool for searching similar protein structures using RCSB PDB Search API v2.
Supports both sequence-based and structure-based similarity search.
"""

import requests
from typing import Dict, Any, Optional
from .base_tool import BaseTool
from .tool_registry import register_tool



[docs]
@register_tool("RCSBSearchTool")
class RCSBSearchTool(BaseTool):
    """
    Tool for searching similar protein structures using RCSB PDB Search API v2.

    Supports:
    - Sequence-based similarity search
    - Structure-based similarity search (using PDB ID)
    - Text-based search (by name, keyword, etc.)
    """


[docs]
    def __init__(self, tool_config: Dict):
        """
        Initialize the tool and record the search endpoint and timeout.

        Args:
            tool_config: Tool configuration dictionary; passed unchanged to
                the base class initializer.
        """
        super().__init__(tool_config)
        # Endpoint of the RCSB PDB Search API v2 that search queries are sent to
        self.api_url = "https://search.rcsb.org/rcsbsearch/v2/query"
        self.timeout = 60  # API request timeout in seconds



[docs]
    def _validate_pdb_id(self, pdb_id: str) -> bool:
        """Validate PDB ID format (4 characters, alphanumeric)"""
        if not isinstance(pdb_id, str):
            return False
        pdb_id = pdb_id.strip().upper()
        return len(pdb_id) == 4 and pdb_id.isalnum()



[docs]
    def _validate_sequence(self, sequence: str) -> bool:
        """Validate protein sequence (amino acids only)"""
        if not isinstance(sequence, str):
            return False
        sequence = sequence.strip().upper()
        if len(sequence) < 10:
            return False
        # Valid amino acid codes
        valid_aa = set("ACDEFGHIKLMNPQRSTVWY")
        return all(c in valid_aa for c in sequence)



[docs]
    def _build_sequence_query(
        self, sequence: str, identity_cutoff: float, max_results: int
    ) -> Dict[str, Any]:
        """
        Build sequence similarity search query.

        Uses the correct RCSB Search API v2 format:
        - Uses "value" parameter (not "target")
        - Includes evalue_cutoff (required, default 0.1)
        - Includes identity_cutoff (optional, 0-1)
        - Includes sequence_type ("protein")
        """
        # Convert identity_cutoff to evalue_cutoff if needed
        # Lower identity_cutoff means higher similarity requirement
        # We use a reasonable evalue_cutoff based on identity
        evalue_cutoff = 0.1  # Default evalue cutoff
        if identity_cutoff > 0.9:
            evalue_cutoff = 0.001  # High similarity
        elif identity_cutoff > 0.7:
            evalue_cutoff = 0.01  # Medium-high similarity
        else:
            evalue_cutoff = 0.1  # Lower similarity

        return {
            "query": {
                "type": "terminal",
                "service": "sequence",
                "parameters": {
                    "value": sequence.upper(),
                    "evalue_cutoff": evalue_cutoff,
                    "identity_cutoff": identity_cutoff,
                    "sequence_type": "protein",
                },
            },
            "return_type": "entry",
            "request_options": {
                "paginate": {
                    "start": 0,
                    "rows": max_results,
                },
                "sort": [{"sort_by": "score", "direction": "desc"}],
            },
        }



[docs]
    def _build_structure_query(
        self, pdb_id: str, similarity_threshold: float, max_results: int
    ) -> Dict[str, Any]:
        """
        Build structure similarity search query.

        Uses the correct RCSB Search API v2 format:
        - Uses "value" as an object with "entry_id" and "assembly_id"
        - Includes "operator" (default: "strict_shape_match")
        - Includes "target_search_space" (default: "assembly")
        """
        return {
            "query": {
                "type": "terminal",
                "service": "structure",
                "parameters": {
                    "value": {
                        "entry_id": pdb_id.upper(),
                        "assembly_id": "1",  # Default to first assembly
                    },
                    "operator": "strict_shape_match",
                    "target_search_space": "assembly",
                },
            },
            "return_type": "entry",
            "request_options": {
                "paginate": {
                    "start": 0,
                    "rows": max_results,
                },
                "sort": [{"sort_by": "score", "direction": "desc"}],
            },
        }



[docs]
    def _build_text_query(self, search_text: str, max_results: int) -> Dict[str, Any]:
        """
        Build text search query.

        Uses the correct RCSB Search API v2 format:
        - Searches in multiple attributes
          (struct.title, struct_keywords.pdbx_keywords)
        - Uses OR logic to combine search conditions
        - Supports pagination and sorting
        """
        # Search in multiple attributes using OR logic
        search_nodes = [
            {
                "type": "terminal",
                "service": "text",
                "parameters": {
                    "attribute": "struct.title",
                    "operator": "contains_words",
                    "value": search_text,
                },
            },
            {
                "type": "terminal",
                "service": "text",
                "parameters": {
                    "attribute": "struct_keywords.pdbx_keywords",
                    "operator": "contains_words",
                    "value": search_text,
                },
            },
        ]

        return {
            "query": {
                "type": "group",
                "logical_operator": "or",
                "nodes": search_nodes,
            },
            "return_type": "entry",
            "request_options": {
                "paginate": {
                    "start": 0,
                    "rows": max_results,
                },
                "sort": [{"sort_by": "score", "direction": "desc"}],
            },
        }



[docs]
    def _parse_search_results(self, response_data: Dict[str, Any]) -> list:
        """
        Parse RCSB Search API response.

        Expected response format:
        {
            "query_id": "...",
            "result_type": "entry",
            "total_count": 123,
            "result_set": [
                {"identifier": "6B3Q", "score": 1.0},
                ...
            ]
        }
        """
        results = []

        if not isinstance(response_data, dict):
            return results

        # Extract result identifiers from result_set
        result_set = response_data.get("result_set", [])

        if not result_set:
            return results

        for idx, entry in enumerate(result_set):
            # Entry is a dict with "identifier" and optionally "score"
            if isinstance(entry, dict):
                pdb_id = entry.get("identifier", entry.get("pdb_id", ""))
                score = entry.get("score")
            elif isinstance(entry, str):
                # Fallback: if entry is just a string, use it as PDB ID
                pdb_id = entry
                score = None
            else:
                continue

            if pdb_id:
                result = {
                    "pdb_id": pdb_id,
                    "rank": idx + 1,
                }

                if score is not None:
                    result["score"] = score

                results.append(result)

        return results



[docs]
    def run(
        self,
        arguments: Optional[Dict[str, Any]] = None,
        stream_callback: Optional[Any] = None,
        use_cache: bool = False,
        validate: bool = True,
    ) -> Dict[str, Any]:
        """
        Execute structure similarity search.

        Args:
            arguments: Dictionary containing:
                - query: PDB ID, protein sequence, or search text
                - search_type: "sequence", "structure", or "text"
                  (default: "sequence")
                - similarity_threshold: Similarity threshold 0-1 (default: 0.7)
                  (not used for text search)
                - max_results: Maximum number of results (default: 20)
            stream_callback: Optional callback for streaming
            use_cache: Whether to use caching
            validate: Whether to validate parameters

        Returns:
            Dictionary with search results or error information
        """
        if arguments is None:
            arguments = {}

        query = arguments.get("query", "")
        if query:
            query = str(query).strip()
        search_type = arguments.get("search_type", "sequence")
        if search_type:
            search_type = str(search_type).lower()
        else:
            search_type = "sequence"

        # Get and validate similarity_threshold with clamping
        similarity_threshold_raw = arguments.get("similarity_threshold", 0.7)
        try:
            similarity_threshold = float(similarity_threshold_raw)
            similarity_threshold = max(0.0, min(1.0, similarity_threshold))
        except (ValueError, TypeError):
            similarity_threshold = 0.7

        # Get and validate max_results with clamping
        max_results_raw = arguments.get("max_results", 20)
        try:
            max_results = int(max_results_raw)
            max_results = max(1, min(100, max_results))
        except (ValueError, TypeError):
            max_results = 20

        # Validate parameters
        if not query:
            return {
                "error": (
                    "Missing required parameter: query. "
                    "Provide either a PDB ID (e.g., '1ABC'), "
                    "a protein sequence (amino acids), "
                    "or search text (e.g., drug name, keyword)."
                ),
            }

        # Build query based on search type
        if search_type == "structure":
            # Structure-based search using PDB ID
            if not self._validate_pdb_id(query):
                return {
                    "error": (
                        f"Invalid PDB ID format: '{query}'. "
                        "PDB ID must be 4 alphanumeric characters "
                        "(e.g., '1ABC')."
                    ),
                }

            api_query = self._build_structure_query(
                query, similarity_threshold, max_results
            )
            query_type = "structure"

        elif search_type == "sequence":
            # Sequence-based search
            if not self._validate_sequence(query):
                return {
                    "error": (
                        f"Invalid protein sequence: '{query[:50]}...'. "
                        "Sequence must be at least 10 amino acids long "
                        "and contain only valid amino acid codes "
                        "(A, C, D, E, F, G, H, I, K, L, M, N, P, Q, "
                        "R, S, T, V, W, Y)."
                    ),
                }

            api_query = self._build_sequence_query(
                query, similarity_threshold, max_results
            )
            query_type = "sequence"

        elif search_type == "text":
            # Text-based search (by name, keyword, etc.)
            if not query or not query.strip():
                return {
                    "error": (
                        "Invalid search text. "
                        "Provide a non-empty search term "
                        "(e.g., drug name, protein name, keyword)."
                    ),
                }

            api_query = self._build_text_query(query.strip(), max_results)
            query_type = "text"

        else:
            return {
                "error": (
                    f"Invalid search_type: '{search_type}'. "
                    "Must be 'sequence', 'structure', or 'text'."
                ),
            }

        # Make API request
        try:
            response = requests.post(
                self.api_url,
                json=api_query,
                headers={"Content-Type": "application/json"},
                timeout=self.timeout,
            )  # noqa: E501
            response.raise_for_status()

            # Handle HTTP 204 No Content (empty result set)
            # RCSB API returns 204 when no results are found
            if response.status_code == 204 or len(response.content) == 0:
                response_data = {
                    "result_set": [],
                    "total_count": 0,
                }
            else:
                response_data = response.json()

        except requests.exceptions.Timeout:
            return {
                "error": (
                    "Request timeout. The RCSB PDB Search API "
                    "did not respond in time. Please try again later."
                ),
            }
        except requests.exceptions.HTTPError as e:
            # Try to extract detailed error message from API response
            error_detail = str(e)
            try:
                if hasattr(e, "response") and e.response is not None:
                    error_response = e.response.json()
                    if isinstance(error_response, dict):
                        api_message = error_response.get("message", "")
                        if api_message:
                            error_detail = f"{str(e)}. API message: {api_message}"
            except Exception:
                pass  # Use default error message if parsing fails

            if e.response.status_code == 400:
                return {
                    "error": (
                        f"Invalid request to RCSB PDB Search API: "
                        f"{error_detail}. "
                        "Please check your query parameters. "
                        "Note: The API query format may need adjustment. "
                        "See documentation at "
                        "https://search.rcsb.org/redoc/index.html"
                    ),
                }
            elif e.response.status_code == 404:
                # 404 can mean the PDB ID doesn't exist or
                # doesn't support this search type
                pdb_id_msg = query if search_type == "structure" else "provided"
                error_msg = (
                    "Structure not found or does not support "
                    "similarity search. "
                    f"The PDB ID '{pdb_id_msg}' "
                    "may not exist in the database or may not support "
                    "structure similarity search. "
                    "Please verify the PDB ID is correct."
                )
                return {"error": error_msg}
            else:
                return {
                    "error": (
                        f"RCSB PDB Search API error "
                        f"(HTTP {e.response.status_code}): {error_detail}"
                    ),
                }
        except requests.exceptions.RequestException as e:
            return {
                "error": (
                    f"Network error while connecting to RCSB PDB Search API: {str(e)}"
                ),
            }
        except Exception as e:
            return {
                "error": f"Unexpected error during search: {str(e)}",
            }

        # Parse results
        try:
            results = self._parse_search_results(response_data)

            # Get total_count from API response if available
            # This represents the total number of matches in the database,
            # not just the number of results returned
            # (which may be limited by max_results)
            total_found = response_data.get("total_count", len(results))

            if not results:
                if query_type == "text":
                    message = f"No structures found matching '{query}'."
                else:
                    message = (
                        f"No similar structures found with "
                        f"similarity threshold >= {similarity_threshold}."
                    )
                return {
                    "query": query,
                    "search_type": query_type,
                    "similarity_threshold": (
                        similarity_threshold if query_type != "text" else None
                    ),
                    "total_found": total_found,
                    "results": [],
                    "message": message,
                }

            result_dict = {
                "query": query,
                "search_type": query_type,
                "total_found": total_found,
                "results": results,
            }

            # Only include similarity_threshold for sequence/structure searches
            if query_type != "text":
                result_dict["similarity_threshold"] = similarity_threshold

            return result_dict

        except Exception as e:
            return {
                "error": f"Error parsing search results: {str(e)}",
                "raw_response": str(response_data)[:500],
            }